diff options
Diffstat (limited to 'src')
107 files changed, 9196 insertions, 2511 deletions
diff --git a/src/citra/CMakeLists.txt b/src/citra/CMakeLists.txt index 918687312..1d6aac9a9 100644 --- a/src/citra/CMakeLists.txt +++ b/src/citra/CMakeLists.txt @@ -14,7 +14,7 @@ set(HEADERS create_directory_groups(${SRCS} ${HEADERS}) add_executable(citra ${SRCS} ${HEADERS}) -target_link_libraries(citra core common video_core) +target_link_libraries(citra core video_core common) target_link_libraries(citra ${GLFW_LIBRARIES} ${OPENGL_gl_LIBRARY} inih) if (MSVC) target_link_libraries(citra getopt) diff --git a/src/citra/citra.cpp b/src/citra/citra.cpp index 182646f4c..d6fcb66a5 100644 --- a/src/citra/citra.cpp +++ b/src/citra/citra.cpp @@ -71,6 +71,7 @@ int main(int argc, char **argv) { EmuWindow_GLFW* emu_window = new EmuWindow_GLFW; VideoCore::g_hw_renderer_enabled = Settings::values.use_hw_renderer; + VideoCore::g_shader_jit_enabled = Settings::values.use_shader_jit; System::Init(emu_window); diff --git a/src/citra/config.cpp b/src/citra/config.cpp index 506cb7939..8a98bda87 100644 --- a/src/citra/config.cpp +++ b/src/citra/config.cpp @@ -40,37 +40,28 @@ bool Config::LoadINI(INIReader* config, const char* location, const std::string& return true; } +static const std::array<int, Settings::NativeInput::NUM_INPUTS> defaults = { + GLFW_KEY_A, GLFW_KEY_S, GLFW_KEY_Z, GLFW_KEY_X, + GLFW_KEY_Q, GLFW_KEY_W, GLFW_KEY_1, GLFW_KEY_2, + GLFW_KEY_M, GLFW_KEY_N, GLFW_KEY_B, + GLFW_KEY_T, GLFW_KEY_G, GLFW_KEY_F, GLFW_KEY_H, + GLFW_KEY_UP, GLFW_KEY_DOWN, GLFW_KEY_LEFT, GLFW_KEY_RIGHT, + GLFW_KEY_I, GLFW_KEY_K, GLFW_KEY_J, GLFW_KEY_L +}; + void Config::ReadValues() { // Controls - Settings::values.pad_a_key = glfw_config->GetInteger("Controls", "pad_a", GLFW_KEY_A); - Settings::values.pad_b_key = glfw_config->GetInteger("Controls", "pad_b", GLFW_KEY_S); - Settings::values.pad_x_key = glfw_config->GetInteger("Controls", "pad_x", GLFW_KEY_Z); - Settings::values.pad_y_key = glfw_config->GetInteger("Controls", "pad_y", GLFW_KEY_X); - Settings::values.pad_l_key = 
glfw_config->GetInteger("Controls", "pad_l", GLFW_KEY_Q); - Settings::values.pad_r_key = glfw_config->GetInteger("Controls", "pad_r", GLFW_KEY_W); - Settings::values.pad_zl_key = glfw_config->GetInteger("Controls", "pad_zl", GLFW_KEY_1); - Settings::values.pad_zr_key = glfw_config->GetInteger("Controls", "pad_zr", GLFW_KEY_2); - Settings::values.pad_start_key = glfw_config->GetInteger("Controls", "pad_start", GLFW_KEY_M); - Settings::values.pad_select_key = glfw_config->GetInteger("Controls", "pad_select", GLFW_KEY_N); - Settings::values.pad_home_key = glfw_config->GetInteger("Controls", "pad_home", GLFW_KEY_B); - Settings::values.pad_dup_key = glfw_config->GetInteger("Controls", "pad_dup", GLFW_KEY_T); - Settings::values.pad_ddown_key = glfw_config->GetInteger("Controls", "pad_ddown", GLFW_KEY_G); - Settings::values.pad_dleft_key = glfw_config->GetInteger("Controls", "pad_dleft", GLFW_KEY_F); - Settings::values.pad_dright_key = glfw_config->GetInteger("Controls", "pad_dright", GLFW_KEY_H); - Settings::values.pad_sup_key = glfw_config->GetInteger("Controls", "pad_sup", GLFW_KEY_UP); - Settings::values.pad_sdown_key = glfw_config->GetInteger("Controls", "pad_sdown", GLFW_KEY_DOWN); - Settings::values.pad_sleft_key = glfw_config->GetInteger("Controls", "pad_sleft", GLFW_KEY_LEFT); - Settings::values.pad_sright_key = glfw_config->GetInteger("Controls", "pad_sright", GLFW_KEY_RIGHT); - Settings::values.pad_cup_key = glfw_config->GetInteger("Controls", "pad_cup", GLFW_KEY_I); - Settings::values.pad_cdown_key = glfw_config->GetInteger("Controls", "pad_cdown", GLFW_KEY_K); - Settings::values.pad_cleft_key = glfw_config->GetInteger("Controls", "pad_cleft", GLFW_KEY_J); - Settings::values.pad_cright_key = glfw_config->GetInteger("Controls", "pad_cright", GLFW_KEY_L); + for (int i = 0; i < Settings::NativeInput::NUM_INPUTS; ++i) { + Settings::values.input_mappings[Settings::NativeInput::All[i]] = + glfw_config->GetInteger("Controls", Settings::NativeInput::Mapping[i], 
defaults[i]); + } // Core Settings::values.frame_skip = glfw_config->GetInteger("Core", "frame_skip", 0); // Renderer Settings::values.use_hw_renderer = glfw_config->GetBoolean("Renderer", "use_hw_renderer", false); + Settings::values.use_shader_jit = glfw_config->GetBoolean("Renderer", "use_shader_jit", true); Settings::values.bg_red = (float)glfw_config->GetReal("Renderer", "bg_red", 1.0); Settings::values.bg_green = (float)glfw_config->GetReal("Renderer", "bg_green", 1.0); diff --git a/src/citra/default_ini.h b/src/citra/default_ini.h index fd5a90d56..7e5d49729 100644 --- a/src/citra/default_ini.h +++ b/src/citra/default_ini.h @@ -33,10 +33,6 @@ pad_cleft = pad_cright = [Core] -# The refresh rate for the GPU -# Defaults to 30 -gpu_refresh_rate = - # The applied frameskip amount. Must be a power of two. # 0 (default): No frameskip, 1: x2 frameskip, 2: x4 frameskip, 3: x8 frameskip, etc. frame_skip = @@ -46,6 +42,10 @@ frame_skip = # 0 (default): Software, 1: Hardware use_hw_renderer = +# Whether to use the Just-In-Time (JIT) compiler for shader emulation +# 0 : Interpreter (slow), 1 (default): JIT (fast) +use_shader_jit = + # The clear color for the renderer. What shows up on the sides of the bottom screen. # Must be in range of 0.0-1.0. Defaults to 1.0 for all. 
bg_red = diff --git a/src/citra/emu_window/emu_window_glfw.cpp b/src/citra/emu_window/emu_window_glfw.cpp index 42fb683a9..6d6656b5a 100644 --- a/src/citra/emu_window/emu_window_glfw.cpp +++ b/src/citra/emu_window/emu_window_glfw.cpp @@ -150,32 +150,9 @@ void EmuWindow_GLFW::DoneCurrent() { } void EmuWindow_GLFW::ReloadSetKeymaps() { - KeyMap::SetKeyMapping({Settings::values.pad_a_key, keyboard_id}, Service::HID::PAD_A); - KeyMap::SetKeyMapping({Settings::values.pad_b_key, keyboard_id}, Service::HID::PAD_B); - KeyMap::SetKeyMapping({Settings::values.pad_select_key, keyboard_id}, Service::HID::PAD_SELECT); - KeyMap::SetKeyMapping({Settings::values.pad_start_key, keyboard_id}, Service::HID::PAD_START); - KeyMap::SetKeyMapping({Settings::values.pad_dright_key, keyboard_id}, Service::HID::PAD_RIGHT); - KeyMap::SetKeyMapping({Settings::values.pad_dleft_key, keyboard_id}, Service::HID::PAD_LEFT); - KeyMap::SetKeyMapping({Settings::values.pad_dup_key, keyboard_id}, Service::HID::PAD_UP); - KeyMap::SetKeyMapping({Settings::values.pad_ddown_key, keyboard_id}, Service::HID::PAD_DOWN); - KeyMap::SetKeyMapping({Settings::values.pad_r_key, keyboard_id}, Service::HID::PAD_R); - KeyMap::SetKeyMapping({Settings::values.pad_l_key, keyboard_id}, Service::HID::PAD_L); - KeyMap::SetKeyMapping({Settings::values.pad_x_key, keyboard_id}, Service::HID::PAD_X); - KeyMap::SetKeyMapping({Settings::values.pad_y_key, keyboard_id}, Service::HID::PAD_Y); - - KeyMap::SetKeyMapping({Settings::values.pad_zl_key, keyboard_id}, Service::HID::PAD_ZL); - KeyMap::SetKeyMapping({Settings::values.pad_zr_key, keyboard_id}, Service::HID::PAD_ZR); - - // KeyMap::SetKeyMapping({Settings::values.pad_touch_key, keyboard_id}, Service::HID::PAD_TOUCH); - - KeyMap::SetKeyMapping({Settings::values.pad_cright_key, keyboard_id}, Service::HID::PAD_C_RIGHT); - KeyMap::SetKeyMapping({Settings::values.pad_cleft_key, keyboard_id}, Service::HID::PAD_C_LEFT); - KeyMap::SetKeyMapping({Settings::values.pad_cup_key, 
keyboard_id}, Service::HID::PAD_C_UP); - KeyMap::SetKeyMapping({Settings::values.pad_cdown_key, keyboard_id}, Service::HID::PAD_C_DOWN); - KeyMap::SetKeyMapping({Settings::values.pad_sright_key, keyboard_id}, Service::HID::PAD_CIRCLE_RIGHT); - KeyMap::SetKeyMapping({Settings::values.pad_sleft_key, keyboard_id}, Service::HID::PAD_CIRCLE_LEFT); - KeyMap::SetKeyMapping({Settings::values.pad_sup_key, keyboard_id}, Service::HID::PAD_CIRCLE_UP); - KeyMap::SetKeyMapping({Settings::values.pad_sdown_key, keyboard_id}, Service::HID::PAD_CIRCLE_DOWN); + for (int i = 0; i < Settings::NativeInput::NUM_INPUTS; ++i) { + KeyMap::SetKeyMapping({Settings::values.input_mappings[Settings::NativeInput::All[i]], keyboard_id}, Service::HID::pad_mapping[i]); + } } void EmuWindow_GLFW::OnMinimalClientAreaChangeRequest(const std::pair<unsigned,unsigned>& minimal_size) { diff --git a/src/citra_qt/CMakeLists.txt b/src/citra_qt/CMakeLists.txt index 47aaeca24..0c0515054 100644 --- a/src/citra_qt/CMakeLists.txt +++ b/src/citra_qt/CMakeLists.txt @@ -71,7 +71,7 @@ if (APPLE) else() add_executable(citra-qt ${SRCS} ${HEADERS} ${UI_HDRS}) endif() -target_link_libraries(citra-qt core common video_core qhexedit) +target_link_libraries(citra-qt core video_core common qhexedit) target_link_libraries(citra-qt ${OPENGL_gl_LIBRARY} ${CITRA_QT_LIBS}) target_link_libraries(citra-qt ${PLATFORM_LIBRARIES}) diff --git a/src/citra_qt/bootmanager.cpp b/src/citra_qt/bootmanager.cpp index fa7bce466..a96fbea5f 100644 --- a/src/citra_qt/bootmanager.cpp +++ b/src/citra_qt/bootmanager.cpp @@ -94,7 +94,7 @@ private: }; GRenderWindow::GRenderWindow(QWidget* parent, EmuThread* emu_thread) : - QWidget(parent), emu_thread(emu_thread), keyboard_id(0) { + QWidget(parent), keyboard_id(0), emu_thread(emu_thread) { std::string window_title = Common::StringFromFormat("Citra | %s-%s", Common::g_scm_branch, Common::g_scm_desc); setWindowTitle(QString::fromStdString(window_title)); @@ -248,32 +248,9 @@ void 
GRenderWindow::mouseReleaseEvent(QMouseEvent *event) void GRenderWindow::ReloadSetKeymaps() { - KeyMap::SetKeyMapping({Settings::values.pad_a_key, keyboard_id}, Service::HID::PAD_A); - KeyMap::SetKeyMapping({Settings::values.pad_b_key, keyboard_id}, Service::HID::PAD_B); - KeyMap::SetKeyMapping({Settings::values.pad_select_key, keyboard_id}, Service::HID::PAD_SELECT); - KeyMap::SetKeyMapping({Settings::values.pad_start_key, keyboard_id}, Service::HID::PAD_START); - KeyMap::SetKeyMapping({Settings::values.pad_dright_key, keyboard_id}, Service::HID::PAD_RIGHT); - KeyMap::SetKeyMapping({Settings::values.pad_dleft_key, keyboard_id}, Service::HID::PAD_LEFT); - KeyMap::SetKeyMapping({Settings::values.pad_dup_key, keyboard_id}, Service::HID::PAD_UP); - KeyMap::SetKeyMapping({Settings::values.pad_ddown_key, keyboard_id}, Service::HID::PAD_DOWN); - KeyMap::SetKeyMapping({Settings::values.pad_r_key, keyboard_id}, Service::HID::PAD_R); - KeyMap::SetKeyMapping({Settings::values.pad_l_key, keyboard_id}, Service::HID::PAD_L); - KeyMap::SetKeyMapping({Settings::values.pad_x_key, keyboard_id}, Service::HID::PAD_X); - KeyMap::SetKeyMapping({Settings::values.pad_y_key, keyboard_id}, Service::HID::PAD_Y); - - KeyMap::SetKeyMapping({Settings::values.pad_zl_key, keyboard_id}, Service::HID::PAD_ZL); - KeyMap::SetKeyMapping({Settings::values.pad_zr_key, keyboard_id}, Service::HID::PAD_ZR); - - // KeyMap::SetKeyMapping({Settings::values.pad_touch_key, keyboard_id}, Service::HID::PAD_TOUCH); - - KeyMap::SetKeyMapping({Settings::values.pad_cright_key, keyboard_id}, Service::HID::PAD_C_RIGHT); - KeyMap::SetKeyMapping({Settings::values.pad_cleft_key, keyboard_id}, Service::HID::PAD_C_LEFT); - KeyMap::SetKeyMapping({Settings::values.pad_cup_key, keyboard_id}, Service::HID::PAD_C_UP); - KeyMap::SetKeyMapping({Settings::values.pad_cdown_key, keyboard_id}, Service::HID::PAD_C_DOWN); - KeyMap::SetKeyMapping({Settings::values.pad_sright_key, keyboard_id}, Service::HID::PAD_CIRCLE_RIGHT); - 
KeyMap::SetKeyMapping({Settings::values.pad_sleft_key, keyboard_id}, Service::HID::PAD_CIRCLE_LEFT); - KeyMap::SetKeyMapping({Settings::values.pad_sup_key, keyboard_id}, Service::HID::PAD_CIRCLE_UP); - KeyMap::SetKeyMapping({Settings::values.pad_sdown_key, keyboard_id}, Service::HID::PAD_CIRCLE_DOWN); + for (int i = 0; i < Settings::NativeInput::NUM_INPUTS; ++i) { + KeyMap::SetKeyMapping({Settings::values.input_mappings[Settings::NativeInput::All[i]], keyboard_id}, Service::HID::pad_mapping[i]); + } } void GRenderWindow::OnClientAreaResized(unsigned width, unsigned height) diff --git a/src/citra_qt/config.cpp b/src/citra_qt/config.cpp index 5c056446e..a20351fb8 100644 --- a/src/citra_qt/config.cpp +++ b/src/citra_qt/config.cpp @@ -21,31 +21,21 @@ Config::Config() { Reload(); } +static const std::array<QVariant, Settings::NativeInput::NUM_INPUTS> defaults = { + Qt::Key_A, Qt::Key_S, Qt::Key_Z, Qt::Key_X, + Qt::Key_Q, Qt::Key_W, Qt::Key_1, Qt::Key_2, + Qt::Key_M, Qt::Key_N, Qt::Key_B, + Qt::Key_T, Qt::Key_G, Qt::Key_F, Qt::Key_H, + Qt::Key_Up, Qt::Key_Down, Qt::Key_Left, Qt::Key_Right, + Qt::Key_I, Qt::Key_K, Qt::Key_J, Qt::Key_L +}; + void Config::ReadValues() { qt_config->beginGroup("Controls"); - Settings::values.pad_a_key = qt_config->value("pad_a", Qt::Key_A).toInt(); - Settings::values.pad_b_key = qt_config->value("pad_b", Qt::Key_S).toInt(); - Settings::values.pad_x_key = qt_config->value("pad_x", Qt::Key_Z).toInt(); - Settings::values.pad_y_key = qt_config->value("pad_y", Qt::Key_X).toInt(); - Settings::values.pad_l_key = qt_config->value("pad_l", Qt::Key_Q).toInt(); - Settings::values.pad_r_key = qt_config->value("pad_r", Qt::Key_W).toInt(); - Settings::values.pad_zl_key = qt_config->value("pad_zl", Qt::Key_1).toInt(); - Settings::values.pad_zr_key = qt_config->value("pad_zr", Qt::Key_2).toInt(); - Settings::values.pad_start_key = qt_config->value("pad_start", Qt::Key_M).toInt(); - Settings::values.pad_select_key = qt_config->value("pad_select", 
Qt::Key_N).toInt(); - Settings::values.pad_home_key = qt_config->value("pad_home", Qt::Key_B).toInt(); - Settings::values.pad_dup_key = qt_config->value("pad_dup", Qt::Key_T).toInt(); - Settings::values.pad_ddown_key = qt_config->value("pad_ddown", Qt::Key_G).toInt(); - Settings::values.pad_dleft_key = qt_config->value("pad_dleft", Qt::Key_F).toInt(); - Settings::values.pad_dright_key = qt_config->value("pad_dright", Qt::Key_H).toInt(); - Settings::values.pad_sup_key = qt_config->value("pad_sup", Qt::Key_Up).toInt(); - Settings::values.pad_sdown_key = qt_config->value("pad_sdown", Qt::Key_Down).toInt(); - Settings::values.pad_sleft_key = qt_config->value("pad_sleft", Qt::Key_Left).toInt(); - Settings::values.pad_sright_key = qt_config->value("pad_sright", Qt::Key_Right).toInt(); - Settings::values.pad_cup_key = qt_config->value("pad_cup", Qt::Key_I).toInt(); - Settings::values.pad_cdown_key = qt_config->value("pad_cdown", Qt::Key_K).toInt(); - Settings::values.pad_cleft_key = qt_config->value("pad_cleft", Qt::Key_J).toInt(); - Settings::values.pad_cright_key = qt_config->value("pad_cright", Qt::Key_L).toInt(); + for (int i = 0; i < Settings::NativeInput::NUM_INPUTS; ++i) { + Settings::values.input_mappings[Settings::NativeInput::All[i]] = + qt_config->value(QString::fromStdString(Settings::NativeInput::Mapping[i]), defaults[i]).toInt(); + } qt_config->endGroup(); qt_config->beginGroup("Core"); @@ -54,6 +44,7 @@ void Config::ReadValues() { qt_config->beginGroup("Renderer"); Settings::values.use_hw_renderer = qt_config->value("use_hw_renderer", false).toBool(); + Settings::values.use_shader_jit = qt_config->value("use_shader_jit", true).toBool(); Settings::values.bg_red = qt_config->value("bg_red", 1.0).toFloat(); Settings::values.bg_green = qt_config->value("bg_green", 1.0).toFloat(); @@ -75,29 +66,10 @@ void Config::ReadValues() { void Config::SaveValues() { qt_config->beginGroup("Controls"); - qt_config->setValue("pad_a", Settings::values.pad_a_key); - 
qt_config->setValue("pad_b", Settings::values.pad_b_key); - qt_config->setValue("pad_x", Settings::values.pad_x_key); - qt_config->setValue("pad_y", Settings::values.pad_y_key); - qt_config->setValue("pad_l", Settings::values.pad_l_key); - qt_config->setValue("pad_r", Settings::values.pad_r_key); - qt_config->setValue("pad_zl", Settings::values.pad_zl_key); - qt_config->setValue("pad_zr", Settings::values.pad_zr_key); - qt_config->setValue("pad_start", Settings::values.pad_start_key); - qt_config->setValue("pad_select", Settings::values.pad_select_key); - qt_config->setValue("pad_home", Settings::values.pad_home_key); - qt_config->setValue("pad_dup", Settings::values.pad_dup_key); - qt_config->setValue("pad_ddown", Settings::values.pad_ddown_key); - qt_config->setValue("pad_dleft", Settings::values.pad_dleft_key); - qt_config->setValue("pad_dright", Settings::values.pad_dright_key); - qt_config->setValue("pad_sup", Settings::values.pad_sup_key); - qt_config->setValue("pad_sdown", Settings::values.pad_sdown_key); - qt_config->setValue("pad_sleft", Settings::values.pad_sleft_key); - qt_config->setValue("pad_sright", Settings::values.pad_sright_key); - qt_config->setValue("pad_cup", Settings::values.pad_cup_key); - qt_config->setValue("pad_cdown", Settings::values.pad_cdown_key); - qt_config->setValue("pad_cleft", Settings::values.pad_cleft_key); - qt_config->setValue("pad_cright", Settings::values.pad_cright_key); + for (int i = 0; i < Settings::NativeInput::NUM_INPUTS; ++i) { + qt_config->setValue(QString::fromStdString(Settings::NativeInput::Mapping[i]), + Settings::values.input_mappings[Settings::NativeInput::All[i]]); + } qt_config->endGroup(); qt_config->beginGroup("Core"); @@ -106,6 +78,7 @@ void Config::SaveValues() { qt_config->beginGroup("Renderer"); qt_config->setValue("use_hw_renderer", Settings::values.use_hw_renderer); + qt_config->setValue("use_shader_jit", Settings::values.use_shader_jit); // Cast to double because Qt's written float values are not 
human-readable qt_config->setValue("bg_red", (double)Settings::values.bg_red); diff --git a/src/citra_qt/debugger/callstack.cpp b/src/citra_qt/debugger/callstack.cpp index 6799ce844..e97e81b65 100644 --- a/src/citra_qt/debugger/callstack.cpp +++ b/src/citra_qt/debugger/callstack.cpp @@ -4,12 +4,14 @@ #include <QStandardItemModel> +#include "common/common_types.h" +#include "common/symbols.h" + #include "callstack.h" #include "core/core.h" #include "core/arm/arm_interface.h" #include "core/memory.h" -#include "common/symbols.h" #include "core/arm/disassembler/arm_disasm.h" CallstackWidget::CallstackWidget(QWidget* parent): QDockWidget(parent) @@ -49,8 +51,8 @@ void CallstackWidget::OnDebugModeEntered() { std::string name; // ripped from disasm - uint8_t cond = (insn >> 28) & 0xf; - uint32_t i_offset = insn & 0xffffff; + u8 cond = (insn >> 28) & 0xf; + u32 i_offset = insn & 0xffffff; // Sign-extend the 24-bit offset if ((i_offset >> 23) & 1) i_offset |= 0xff000000; diff --git a/src/citra_qt/debugger/disassembler.cpp b/src/citra_qt/debugger/disassembler.cpp index 1e5ef5299..d3629bbf6 100644 --- a/src/citra_qt/debugger/disassembler.cpp +++ b/src/citra_qt/debugger/disassembler.cpp @@ -159,7 +159,7 @@ void DisassemblerModel::SetNextInstruction(unsigned int address) { } DisassemblerWidget::DisassemblerWidget(QWidget* parent, EmuThread* emu_thread) : - QDockWidget(parent), emu_thread(emu_thread), base_addr(0) { + QDockWidget(parent), base_addr(0), emu_thread(emu_thread) { disasm_ui.setupUi(this); diff --git a/src/citra_qt/debugger/graphics_tracing.cpp b/src/citra_qt/debugger/graphics_tracing.cpp index 3f20f149d..f80cb7493 100644 --- a/src/citra_qt/debugger/graphics_tracing.cpp +++ b/src/citra_qt/debugger/graphics_tracing.cpp @@ -14,6 +14,8 @@ #include <boost/range/algorithm/copy.hpp> +#include "common/common_types.h" + #include "core/hw/gpu.h" #include "core/hw/lcd.h" @@ -66,14 +68,14 @@ void GraphicsTracingWidget::StartRecording() { // Encode floating point numbers to 
24-bit values // TODO: Drop this explicit conversion once we store float24 values bit-correctly internally. - std::array<uint32_t, 4 * 16> default_attributes; + std::array<u32, 4 * 16> default_attributes; for (unsigned i = 0; i < 16; ++i) { for (unsigned comp = 0; comp < 3; ++comp) { default_attributes[4 * i + comp] = nihstro::to_float24(Pica::g_state.vs.default_attributes[i][comp].ToFloat32()); } } - std::array<uint32_t, 4 * 96> vs_float_uniforms; + std::array<u32, 4 * 96> vs_float_uniforms; for (unsigned i = 0; i < 96; ++i) for (unsigned comp = 0; comp < 3; ++comp) vs_float_uniforms[4 * i + comp] = nihstro::to_float24(Pica::g_state.vs.uniforms.f[i][comp].ToFloat32()); diff --git a/src/citra_qt/debugger/graphics_vertex_shader.cpp b/src/citra_qt/debugger/graphics_vertex_shader.cpp index f42a2f4ce..0c17edee0 100644 --- a/src/citra_qt/debugger/graphics_vertex_shader.cpp +++ b/src/citra_qt/debugger/graphics_vertex_shader.cpp @@ -6,9 +6,16 @@ #include <sstream> #include <QBoxLayout> +#include <QFileDialog> +#include <QGroupBox> +#include <QLabel> +#include <QLineEdit> +#include <QPushButton> +#include <QSignalMapper> +#include <QSpinBox> #include <QTreeView> -#include "video_core/vertex_shader.h" +#include "video_core/shader/shader.h" #include "graphics_vertex_shader.h" @@ -17,7 +24,7 @@ using nihstro::Instruction; using nihstro::SourceRegister; using nihstro::SwizzlePattern; -GraphicsVertexShaderModel::GraphicsVertexShaderModel(QObject* parent): QAbstractItemModel(parent) { +GraphicsVertexShaderModel::GraphicsVertexShaderModel(GraphicsVertexShaderWidget* parent): QAbstractItemModel(parent), par(parent) { } @@ -34,7 +41,7 @@ int GraphicsVertexShaderModel::columnCount(const QModelIndex& parent) const { } int GraphicsVertexShaderModel::rowCount(const QModelIndex& parent) const { - return static_cast<int>(info.code.size()); + return static_cast<int>(par->info.code.size()); } QVariant GraphicsVertexShaderModel::headerData(int section, Qt::Orientation orientation, int role) 
const { @@ -62,21 +69,21 @@ QVariant GraphicsVertexShaderModel::data(const QModelIndex& index, int role) con { switch (index.column()) { case 0: - if (info.HasLabel(index.row())) - return QString::fromStdString(info.GetLabel(index.row())); + if (par->info.HasLabel(index.row())) + return QString::fromStdString(par->info.GetLabel(index.row())); return QString("%1").arg(4*index.row(), 4, 16, QLatin1Char('0')); case 1: - return QString("%1").arg(info.code[index.row()].hex, 8, 16, QLatin1Char('0')); + return QString("%1").arg(par->info.code[index.row()].hex, 8, 16, QLatin1Char('0')); case 2: { std::stringstream output; output.flags(std::ios::hex); - Instruction instr = info.code[index.row()]; - const SwizzlePattern& swizzle = info.swizzle_info[instr.common.operand_desc_id].pattern; + Instruction instr = par->info.code[index.row()]; + const SwizzlePattern& swizzle = par->info.swizzle_info[instr.common.operand_desc_id].pattern; // longest known instruction name: "setemit " output << std::setw(8) << std::left << instr.opcode.Value().GetInfo().name; @@ -130,13 +137,13 @@ QVariant GraphicsVertexShaderModel::data(const QModelIndex& index, int role) con print_input_indexed_compact(output, src1, swizzle.negate_src1, swizzle.SelectorToString(false).substr(0,1), instr.common.AddressRegisterName()); output << " " << instr.common.compare_op.ToString(instr.common.compare_op.x) << " "; - print_input(output, src2, swizzle.negate_src2, swizzle.SelectorToString(false).substr(0,1)); + print_input(output, src2, swizzle.negate_src2, swizzle.SelectorToString(true).substr(0,1)); output << ", "; print_input_indexed_compact(output, src1, swizzle.negate_src1, swizzle.SelectorToString(false).substr(1,1), instr.common.AddressRegisterName()); output << " " << instr.common.compare_op.ToString(instr.common.compare_op.y) << " "; - print_input(output, src2, swizzle.negate_src2, swizzle.SelectorToString(false).substr(1,1)); + print_input(output, src2, swizzle.negate_src2, 
swizzle.SelectorToString(true).substr(1,1)); break; } @@ -167,7 +174,7 @@ QVariant GraphicsVertexShaderModel::data(const QModelIndex& index, int role) con // TODO: In some cases, the Address Register is used as an index for SRC2 instead of SRC1 if (instr.opcode.Value().GetInfo().subtype & OpCode::Info::Src2) { SourceRegister src2 = instr.common.GetSrc2(src_is_inverted); - print_input(output, src2, swizzle.negate_src2, swizzle.SelectorToString(false)); + print_input(output, src2, swizzle.negate_src2, swizzle.SelectorToString(true)); } break; } @@ -240,6 +247,18 @@ QVariant GraphicsVertexShaderModel::data(const QModelIndex& index, int role) con case Qt::FontRole: return QFont("monospace"); + case Qt::BackgroundRole: + // Highlight instructions which have no debug data associated to them + for (const auto& record : par->debug_data.records) + if (index.row() == record.instruction_offset) + return QVariant(); + + return QBrush(QColor(255, 255, 127)); + + + // TODO: Draw arrows for each "reachable" instruction to visualize control flow + + default: break; } @@ -247,53 +266,232 @@ QVariant GraphicsVertexShaderModel::data(const QModelIndex& index, int role) con return QVariant(); } -void GraphicsVertexShaderModel::OnUpdate() -{ - beginResetModel(); - - info.Clear(); - - for (auto instr : Pica::g_state.vs.program_code) - info.code.push_back({instr}); +void GraphicsVertexShaderWidget::DumpShader() { + QString filename = QFileDialog::getSaveFileName(this, tr("Save Shader Dump"), "shader_dump.shbin", + tr("Shader Binary (*.shbin)")); - for (auto pattern : Pica::g_state.vs.swizzle_data) - info.swizzle_info.push_back({pattern}); + if (filename.isEmpty()) { + // If the user canceled the dialog, don't dump anything. 
+ return; + } - info.labels.insert({ Pica::g_state.regs.vs.main_offset, "main" }); + auto& setup = Pica::g_state.vs; + auto& config = Pica::g_state.regs.vs; - endResetModel(); + Pica::DebugUtils::DumpShader(filename.toStdString(), config, setup, Pica::g_state.regs.vs_output_attributes); } - GraphicsVertexShaderWidget::GraphicsVertexShaderWidget(std::shared_ptr< Pica::DebugContext > debug_context, QWidget* parent) : BreakPointObserverDock(debug_context, "Pica Vertex Shader", parent) { setObjectName("PicaVertexShader"); - auto binary_model = new GraphicsVertexShaderModel(this); - auto binary_list = new QTreeView; - binary_list->setModel(binary_model); + auto input_data_mapper = new QSignalMapper(this); + + // TODO: Support inputting data in hexadecimal raw format + for (unsigned i = 0; i < ARRAY_SIZE(input_data); ++i) { + input_data[i] = new QLineEdit; + input_data[i]->setValidator(new QDoubleValidator(input_data[i])); + } + + breakpoint_warning = new QLabel(tr("(data only available at VertexLoaded breakpoints)")); + + // TODO: Add some button for jumping to the shader entry point + + model = new GraphicsVertexShaderModel(this); + binary_list = new QTreeView; + binary_list->setModel(model); binary_list->setRootIsDecorated(false); binary_list->setAlternatingRowColors(true); - connect(this, SIGNAL(Update()), binary_model, SLOT(OnUpdate())); + auto dump_shader = new QPushButton(QIcon::fromTheme("document-save"), tr("Dump")); + + instruction_description = new QLabel; + + cycle_index = new QSpinBox; + + connect(this, SIGNAL(SelectCommand(const QModelIndex&, QItemSelectionModel::SelectionFlags)), + binary_list->selectionModel(), SLOT(select(const QModelIndex&, QItemSelectionModel::SelectionFlags))); + + connect(dump_shader, SIGNAL(clicked()), this, SLOT(DumpShader())); + + connect(cycle_index, SIGNAL(valueChanged(int)), this, SLOT(OnCycleIndexChanged(int))); + + for (unsigned i = 0; i < ARRAY_SIZE(input_data); ++i) { + connect(input_data[i], SIGNAL(textEdited(const 
QString&)), input_data_mapper, SLOT(map())); + input_data_mapper->setMapping(input_data[i], i); + } + connect(input_data_mapper, SIGNAL(mapped(int)), this, SLOT(OnInputAttributeChanged(int))); auto main_widget = new QWidget; auto main_layout = new QVBoxLayout; { + auto input_data_group = new QGroupBox(tr("Input Data")); + + // For each vertex attribute, add a QHBoxLayout consisting of: + // - A QLabel denoting the source attribute index + // - Four QLineEdits for showing and manipulating attribute data + // - A QLabel denoting the shader input attribute index + auto sub_layout = new QVBoxLayout; + for (unsigned i = 0; i < 16; ++i) { + // Create an HBoxLayout to store the widgets used to specify a particular attribute + // and store it in a QWidget to allow for easy hiding and unhiding. + auto row_layout = new QHBoxLayout; + row_layout->addWidget(new QLabel(tr("Attribute %1").arg(i, 2))); + for (unsigned comp = 0; comp < 4; ++comp) + row_layout->addWidget(input_data[4 * i + comp]); + + row_layout->addWidget(input_data_mapping[i] = new QLabel); + + input_data_container[i] = new QWidget; + input_data_container[i]->setLayout(row_layout); + input_data_container[i]->hide(); + + sub_layout->addWidget(input_data_container[i]); + } + + sub_layout->addWidget(breakpoint_warning); + breakpoint_warning->hide(); + + input_data_group->setLayout(sub_layout); + main_layout->addWidget(input_data_group); + } + { auto sub_layout = new QHBoxLayout; sub_layout->addWidget(binary_list); main_layout->addLayout(sub_layout); } + main_layout->addWidget(dump_shader); + { + auto sub_layout = new QHBoxLayout; + sub_layout->addWidget(new QLabel(tr("Cycle Index:"))); + sub_layout->addWidget(cycle_index); + main_layout->addLayout(sub_layout); + } + main_layout->addWidget(instruction_description); + main_layout->addStretch(); main_widget->setLayout(main_layout); setWidget(main_widget); + + widget()->setEnabled(false); } void GraphicsVertexShaderWidget::OnBreakPointHit(Pica::DebugContext::Event 
event, void* data) { - emit Update(); + auto input = static_cast<Pica::Shader::InputVertex*>(data); + if (event == Pica::DebugContext::Event::VertexLoaded) { + Reload(true, data); + } else { + // No vertex data is retrievable => invalidate currently stored vertex data + Reload(true, nullptr); + } widget()->setEnabled(true); } +void GraphicsVertexShaderWidget::Reload(bool replace_vertex_data, void* vertex_data) { + model->beginResetModel(); + + if (replace_vertex_data) { + if (vertex_data) { + memcpy(&input_vertex, vertex_data, sizeof(input_vertex)); + for (unsigned attr = 0; attr < 16; ++attr) { + for (unsigned comp = 0; comp < 4; ++comp) { + input_data[4 * attr + comp]->setText(QString("%1").arg(input_vertex.attr[attr][comp].ToFloat32())); + } + } + breakpoint_warning->hide(); + } else { + for (unsigned attr = 0; attr < 16; ++attr) { + for (unsigned comp = 0; comp < 4; ++comp) { + input_data[4 * attr + comp]->setText(QString("???")); + } + } + breakpoint_warning->show(); + } + } + + // Reload shader code + info.Clear(); + + auto& shader_setup = Pica::g_state.vs; + auto& shader_config = Pica::g_state.regs.vs; + for (auto instr : shader_setup.program_code) + info.code.push_back({instr}); + + for (auto pattern : shader_setup.swizzle_data) + info.swizzle_info.push_back({pattern}); + + u32 entry_point = Pica::g_state.regs.vs.main_offset; + info.labels.insert({ entry_point, "main" }); + + // Generate debug information + debug_data = Pica::Shader::ProduceDebugInfo(input_vertex, 1, shader_config, shader_setup); + + // Reload widget state + + // Only show input attributes which are used as input to the shader + for (unsigned int attr = 0; attr < 16; ++attr) { + input_data_container[attr]->setVisible(false); + } + for (unsigned int attr = 0; attr < Pica::g_state.regs.vertex_attributes.GetNumTotalAttributes(); ++attr) { + unsigned source_attr = shader_config.input_register_map.GetRegisterForAttribute(attr); + input_data_mapping[source_attr]->setText(QString("-> 
v%1").arg(attr)); + input_data_container[source_attr]->setVisible(true); + } + + // Initialize debug info text for current cycle count + cycle_index->setMaximum(debug_data.records.size() - 1); + OnCycleIndexChanged(cycle_index->value()); + + model->endResetModel(); +} + void GraphicsVertexShaderWidget::OnResumed() { widget()->setEnabled(false); } + +void GraphicsVertexShaderWidget::OnInputAttributeChanged(int index) { + float value = input_data[index]->text().toFloat(); + Reload(); +} + +void GraphicsVertexShaderWidget::OnCycleIndexChanged(int index) { + QString text; + + auto& record = debug_data.records[index]; + if (record.mask & Pica::Shader::DebugDataRecord::SRC1) + text += tr("SRC1: %1, %2, %3, %4\n").arg(record.src1.x.ToFloat32()).arg(record.src1.y.ToFloat32()).arg(record.src1.z.ToFloat32()).arg(record.src1.w.ToFloat32()); + if (record.mask & Pica::Shader::DebugDataRecord::SRC2) + text += tr("SRC2: %1, %2, %3, %4\n").arg(record.src2.x.ToFloat32()).arg(record.src2.y.ToFloat32()).arg(record.src2.z.ToFloat32()).arg(record.src2.w.ToFloat32()); + if (record.mask & Pica::Shader::DebugDataRecord::SRC3) + text += tr("SRC3: %1, %2, %3, %4\n").arg(record.src3.x.ToFloat32()).arg(record.src3.y.ToFloat32()).arg(record.src3.z.ToFloat32()).arg(record.src3.w.ToFloat32()); + if (record.mask & Pica::Shader::DebugDataRecord::DEST_IN) + text += tr("DEST_IN: %1, %2, %3, %4\n").arg(record.dest_in.x.ToFloat32()).arg(record.dest_in.y.ToFloat32()).arg(record.dest_in.z.ToFloat32()).arg(record.dest_in.w.ToFloat32()); + if (record.mask & Pica::Shader::DebugDataRecord::DEST_OUT) + text += tr("DEST_OUT: %1, %2, %3, %4\n").arg(record.dest_out.x.ToFloat32()).arg(record.dest_out.y.ToFloat32()).arg(record.dest_out.z.ToFloat32()).arg(record.dest_out.w.ToFloat32()); + + if (record.mask & Pica::Shader::DebugDataRecord::ADDR_REG_OUT) + text += tr("Addres Registers: %1, %2\n").arg(record.address_registers[0]).arg(record.address_registers[1]); + if (record.mask & 
Pica::Shader::DebugDataRecord::CMP_RESULT) + text += tr("Compare Result: %1, %2\n").arg(record.conditional_code[0] ? "true" : "false").arg(record.conditional_code[1] ? "true" : "false"); + + if (record.mask & Pica::Shader::DebugDataRecord::COND_BOOL_IN) + text += tr("Static Condition: %1\n").arg(record.cond_bool ? "true" : "false"); + if (record.mask & Pica::Shader::DebugDataRecord::COND_CMP_IN) + text += tr("Dynamic Conditions: %1, %2\n").arg(record.cond_cmp[0] ? "true" : "false").arg(record.cond_cmp[1] ? "true" : "false"); + if (record.mask & Pica::Shader::DebugDataRecord::LOOP_INT_IN) + text += tr("Loop Parameters: %1 (repeats), %2 (initializer), %3 (increment), %4\n").arg(record.loop_int.x).arg(record.loop_int.y).arg(record.loop_int.z).arg(record.loop_int.w); + + text += tr("Instruction offset: 0x%1").arg(4 * record.instruction_offset, 4, 16, QLatin1Char('0')); + if (record.mask & Pica::Shader::DebugDataRecord::NEXT_INSTR) { + text += tr(" -> 0x%2").arg(4 * record.next_instruction, 4, 16, QLatin1Char('0')); + } else { + text += tr(" (last instruction)"); + } + + instruction_description->setText(text); + + // Scroll to current instruction + const QModelIndex& instr_index = model->index(record.instruction_offset, 0); + emit SelectCommand(instr_index, QItemSelectionModel::ClearAndSelect | QItemSelectionModel::Rows); + binary_list->scrollTo(instr_index, QAbstractItemView::EnsureVisible); +} diff --git a/src/citra_qt/debugger/graphics_vertex_shader.h b/src/citra_qt/debugger/graphics_vertex_shader.h index 38339dc05..1b3f1f7ec 100644 --- a/src/citra_qt/debugger/graphics_vertex_shader.h +++ b/src/citra_qt/debugger/graphics_vertex_shader.h @@ -10,11 +10,18 @@ #include "nihstro/parser_shbin.h" +#include "video_core/shader/shader.h" + +class QLabel; +class QSpinBox; + +class GraphicsVertexShaderWidget; + class GraphicsVertexShaderModel : public QAbstractItemModel { Q_OBJECT public: - GraphicsVertexShaderModel(QObject* parent); + 
GraphicsVertexShaderModel(GraphicsVertexShaderWidget* parent); QModelIndex index(int row, int column, const QModelIndex& parent = QModelIndex()) const override; QModelIndex parent(const QModelIndex& child) const override; @@ -23,11 +30,10 @@ public: QVariant data(const QModelIndex& index, int role = Qt::DisplayRole) const override; QVariant headerData(int section, Qt::Orientation orientation, int role = Qt::DisplayRole) const override; -public slots: - void OnUpdate(); - private: - nihstro::ShaderInfo info; + GraphicsVertexShaderWidget* par; + + friend class GraphicsVertexShaderWidget; }; class GraphicsVertexShaderWidget : public BreakPointObserverDock { @@ -43,9 +49,42 @@ private slots: void OnBreakPointHit(Pica::DebugContext::Event event, void* data) override; void OnResumed() override; + void OnInputAttributeChanged(int index); + + void OnCycleIndexChanged(int index); + + void DumpShader(); + + /** + * Reload widget based on the current PICA200 state + * @param replace_vertex_data If true, invalidate all current vertex data + * @param vertex_data New vertex data to use, as passed to OnBreakPointHit. May be nullptr to specify that no valid vertex data can be retrieved currently. Only used if replace_vertex_data is true. 
+ */ + void Reload(bool replace_vertex_data = false, void* vertex_data = nullptr); + + signals: - void Update(); + // Call this to change the current command selection in the disassembly view + void SelectCommand(const QModelIndex&, QItemSelectionModel::SelectionFlags); private: + QLabel* instruction_description; + QTreeView* binary_list; + GraphicsVertexShaderModel* model; + + /// TODO: Move these into a single struct + std::array<QLineEdit*, 4*16> input_data; // A text box for each of the 4 components of up to 16 vertex attributes + std::array<QWidget*, 16> input_data_container; // QWidget containing the QLayout containing each vertex attribute + std::array<QLabel*, 16> input_data_mapping; // A QLabel denoting the shader input attribute which the vertex attribute maps to + + // Text to be shown when input vertex data is not retrievable + QLabel* breakpoint_warning; + + QSpinBox* cycle_index; + + nihstro::ShaderInfo info; + Pica::Shader::DebugData<true> debug_data; + Pica::Shader::InputVertex input_vertex; + friend class GraphicsVertexShaderModel; }; diff --git a/src/citra_qt/debugger/registers.cpp b/src/citra_qt/debugger/registers.cpp index 5527a2afd..4174b3945 100644 --- a/src/citra_qt/debugger/registers.cpp +++ b/src/citra_qt/debugger/registers.cpp @@ -11,32 +11,23 @@ RegistersWidget::RegistersWidget(QWidget* parent) : QDockWidget(parent) { cpu_regs_ui.setupUi(this); tree = cpu_regs_ui.treeWidget; - tree->addTopLevelItem(registers = new QTreeWidgetItem(QStringList("Registers"))); - tree->addTopLevelItem(CSPR = new QTreeWidgetItem(QStringList("CSPR"))); - - registers->setExpanded(true); - CSPR->setExpanded(true); + tree->addTopLevelItem(core_registers = new QTreeWidgetItem(QStringList(tr("Registers")))); + tree->addTopLevelItem(vfp_registers = new QTreeWidgetItem(QStringList(tr("VFP Registers")))); + tree->addTopLevelItem(vfp_system_registers = new QTreeWidgetItem(QStringList(tr("VFP System Registers")))); + tree->addTopLevelItem(cpsr = new 
QTreeWidgetItem(QStringList("CPSR"))); for (int i = 0; i < 16; ++i) { - QTreeWidgetItem* child = new QTreeWidgetItem(QStringList(QString("R[%1]").arg(i, 2, 10, QLatin1Char('0')))); - registers->addChild(child); + QTreeWidgetItem* child = new QTreeWidgetItem(QStringList(QString("R[%1]").arg(i))); + core_registers->addChild(child); + } + + for (int i = 0; i < 32; ++i) { + QTreeWidgetItem* child = new QTreeWidgetItem(QStringList(QString("S[%1]").arg(i))); + vfp_registers->addChild(child); } - CSPR->addChild(new QTreeWidgetItem(QStringList("M"))); - CSPR->addChild(new QTreeWidgetItem(QStringList("T"))); - CSPR->addChild(new QTreeWidgetItem(QStringList("F"))); - CSPR->addChild(new QTreeWidgetItem(QStringList("I"))); - CSPR->addChild(new QTreeWidgetItem(QStringList("A"))); - CSPR->addChild(new QTreeWidgetItem(QStringList("E"))); - CSPR->addChild(new QTreeWidgetItem(QStringList("IT"))); - CSPR->addChild(new QTreeWidgetItem(QStringList("GE"))); - CSPR->addChild(new QTreeWidgetItem(QStringList("DNM"))); - CSPR->addChild(new QTreeWidgetItem(QStringList("J"))); - CSPR->addChild(new QTreeWidgetItem(QStringList("Q"))); - CSPR->addChild(new QTreeWidgetItem(QStringList("V"))); - CSPR->addChild(new QTreeWidgetItem(QStringList("C"))); - CSPR->addChild(new QTreeWidgetItem(QStringList("Z"))); - CSPR->addChild(new QTreeWidgetItem(QStringList("N"))); + CreateCPSRChildren(); + CreateVFPSystemRegisterChildren(); setEnabled(false); } @@ -47,25 +38,14 @@ void RegistersWidget::OnDebugModeEntered() { if (app_core == nullptr) return; - for (int i = 0; i < 16; ++i) - registers->child(i)->setText(1, QString("0x%1").arg(app_core->GetReg(i), 8, 16, QLatin1Char('0'))); - - CSPR->setText(1, QString("0x%1").arg(app_core->GetCPSR(), 8, 16, QLatin1Char('0'))); - CSPR->child(0)->setText(1, QString("b%1").arg(app_core->GetCPSR() & 0x1F, 5, 2, QLatin1Char('0'))); // M - Mode - CSPR->child(1)->setText(1, QString("%1").arg((app_core->GetCPSR() >> 5) & 0x1)); // T - State - CSPR->child(2)->setText(1, 
QString("%1").arg((app_core->GetCPSR() >> 6) & 0x1)); // F - FIQ disable - CSPR->child(3)->setText(1, QString("%1").arg((app_core->GetCPSR() >> 7) & 0x1)); // I - IRQ disable - CSPR->child(4)->setText(1, QString("%1").arg((app_core->GetCPSR() >> 8) & 0x1)); // A - Imprecise abort - CSPR->child(5)->setText(1, QString("%1").arg((app_core->GetCPSR() >> 9) & 0x1)); // E - Data endianess - CSPR->child(6)->setText(1, QString("%1").arg((app_core->GetCPSR() >> 10) & 0x3F)); // IT - If-Then state (DNM) - CSPR->child(7)->setText(1, QString("%1").arg((app_core->GetCPSR() >> 16) & 0xF)); // GE - Greater-than-or-Equal - CSPR->child(8)->setText(1, QString("%1").arg((app_core->GetCPSR() >> 20) & 0xF)); // DNM - Do not modify - CSPR->child(9)->setText(1, QString("%1").arg((app_core->GetCPSR() >> 24) & 0x1)); // J - Java state - CSPR->child(10)->setText(1, QString("%1").arg((app_core->GetCPSR() >> 27) & 0x1)); // Q - Sticky overflow - CSPR->child(11)->setText(1, QString("%1").arg((app_core->GetCPSR() >> 28) & 0x1)); // V - Overflow - CSPR->child(12)->setText(1, QString("%1").arg((app_core->GetCPSR() >> 29) & 0x1)); // C - Carry/Borrow/Extend - CSPR->child(13)->setText(1, QString("%1").arg((app_core->GetCPSR() >> 30) & 0x1)); // Z - Zero - CSPR->child(14)->setText(1, QString("%1").arg((app_core->GetCPSR() >> 31) & 0x1)); // N - Negative/Less than + for (int i = 0; i < core_registers->childCount(); ++i) + core_registers->child(i)->setText(1, QString("0x%1").arg(app_core->GetReg(i), 8, 16, QLatin1Char('0'))); + + for (int i = 0; i < vfp_registers->childCount(); ++i) + vfp_registers->child(i)->setText(1, QString("0x%1").arg(app_core->GetVFPReg(i), 8, 16, QLatin1Char('0'))); + + UpdateCPSRValues(); + UpdateVFPSystemRegisterValues(); } void RegistersWidget::OnDebugModeLeft() { @@ -77,13 +57,153 @@ void RegistersWidget::OnEmulationStarting(EmuThread* emu_thread) { void RegistersWidget::OnEmulationStopping() { // Reset widget text - for (int i = 0; i < 16; ++i) - 
registers->child(i)->setText(1, QString("")); + for (int i = 0; i < core_registers->childCount(); ++i) + core_registers->child(i)->setText(1, QString("")); + + for (int i = 0; i < vfp_registers->childCount(); ++i) + vfp_registers->child(i)->setText(1, QString("")); + + for (int i = 0; i < cpsr->childCount(); ++i) + cpsr->child(i)->setText(1, QString("")); + + cpsr->setText(1, QString("")); + + // FPSCR + for (int i = 0; i < vfp_system_registers->child(0)->childCount(); ++i) + vfp_system_registers->child(0)->child(i)->setText(1, QString("")); - for (int i = 0; i < 15; ++i) - CSPR->child(i)->setText(1, QString("")); + // FPEXC + for (int i = 0; i < vfp_system_registers->child(1)->childCount(); ++i) + vfp_system_registers->child(1)->child(i)->setText(1, QString("")); - CSPR->setText(1, QString("")); + vfp_system_registers->child(0)->setText(1, QString("")); + vfp_system_registers->child(1)->setText(1, QString("")); + vfp_system_registers->child(2)->setText(1, QString("")); + vfp_system_registers->child(3)->setText(1, QString("")); setEnabled(false); } + +void RegistersWidget::CreateCPSRChildren() { + cpsr->addChild(new QTreeWidgetItem(QStringList("M"))); + cpsr->addChild(new QTreeWidgetItem(QStringList("T"))); + cpsr->addChild(new QTreeWidgetItem(QStringList("F"))); + cpsr->addChild(new QTreeWidgetItem(QStringList("I"))); + cpsr->addChild(new QTreeWidgetItem(QStringList("A"))); + cpsr->addChild(new QTreeWidgetItem(QStringList("E"))); + cpsr->addChild(new QTreeWidgetItem(QStringList("IT"))); + cpsr->addChild(new QTreeWidgetItem(QStringList("GE"))); + cpsr->addChild(new QTreeWidgetItem(QStringList("DNM"))); + cpsr->addChild(new QTreeWidgetItem(QStringList("J"))); + cpsr->addChild(new QTreeWidgetItem(QStringList("Q"))); + cpsr->addChild(new QTreeWidgetItem(QStringList("V"))); + cpsr->addChild(new QTreeWidgetItem(QStringList("C"))); + cpsr->addChild(new QTreeWidgetItem(QStringList("Z"))); + cpsr->addChild(new QTreeWidgetItem(QStringList("N"))); +} + +void 
RegistersWidget::UpdateCPSRValues() { + const u32 cpsr_val = Core::g_app_core->GetCPSR(); + + cpsr->setText(1, QString("0x%1").arg(cpsr_val, 8, 16, QLatin1Char('0'))); + cpsr->child(0)->setText(1, QString("b%1").arg(cpsr_val & 0x1F, 5, 2, QLatin1Char('0'))); // M - Mode + cpsr->child(1)->setText(1, QString::number((cpsr_val >> 5) & 1)); // T - State + cpsr->child(2)->setText(1, QString::number((cpsr_val >> 6) & 1)); // F - FIQ disable + cpsr->child(3)->setText(1, QString::number((cpsr_val >> 7) & 1)); // I - IRQ disable + cpsr->child(4)->setText(1, QString::number((cpsr_val >> 8) & 1)); // A - Imprecise abort + cpsr->child(5)->setText(1, QString::number((cpsr_val >> 9) & 1)); // E - Data endianess + cpsr->child(6)->setText(1, QString::number((cpsr_val >> 10) & 0x3F)); // IT - If-Then state (DNM) + cpsr->child(7)->setText(1, QString::number((cpsr_val >> 16) & 0xF)); // GE - Greater-than-or-Equal + cpsr->child(8)->setText(1, QString::number((cpsr_val >> 20) & 0xF)); // DNM - Do not modify + cpsr->child(9)->setText(1, QString::number((cpsr_val >> 24) & 1)); // J - Jazelle + cpsr->child(10)->setText(1, QString::number((cpsr_val >> 27) & 1)); // Q - Saturation + cpsr->child(11)->setText(1, QString::number((cpsr_val >> 28) & 1)); // V - Overflow + cpsr->child(12)->setText(1, QString::number((cpsr_val >> 29) & 1)); // C - Carry/Borrow/Extend + cpsr->child(13)->setText(1, QString::number((cpsr_val >> 30) & 1)); // Z - Zero + cpsr->child(14)->setText(1, QString::number((cpsr_val >> 31) & 1)); // N - Negative/Less than +} + +void RegistersWidget::CreateVFPSystemRegisterChildren() { + QTreeWidgetItem* const fpscr = new QTreeWidgetItem(QStringList("FPSCR")); + fpscr->addChild(new QTreeWidgetItem(QStringList("IOC"))); + fpscr->addChild(new QTreeWidgetItem(QStringList("DZC"))); + fpscr->addChild(new QTreeWidgetItem(QStringList("OFC"))); + fpscr->addChild(new QTreeWidgetItem(QStringList("UFC"))); + fpscr->addChild(new QTreeWidgetItem(QStringList("IXC"))); + fpscr->addChild(new 
QTreeWidgetItem(QStringList("IDC"))); + fpscr->addChild(new QTreeWidgetItem(QStringList("IOE"))); + fpscr->addChild(new QTreeWidgetItem(QStringList("DZE"))); + fpscr->addChild(new QTreeWidgetItem(QStringList("OFE"))); + fpscr->addChild(new QTreeWidgetItem(QStringList("UFE"))); + fpscr->addChild(new QTreeWidgetItem(QStringList("IXE"))); + fpscr->addChild(new QTreeWidgetItem(QStringList("IDE"))); + fpscr->addChild(new QTreeWidgetItem(QStringList(tr("Vector Length")))); + fpscr->addChild(new QTreeWidgetItem(QStringList(tr("Vector Stride")))); + fpscr->addChild(new QTreeWidgetItem(QStringList(tr("Rounding Mode")))); + fpscr->addChild(new QTreeWidgetItem(QStringList("FZ"))); + fpscr->addChild(new QTreeWidgetItem(QStringList("DN"))); + fpscr->addChild(new QTreeWidgetItem(QStringList("V"))); + fpscr->addChild(new QTreeWidgetItem(QStringList("C"))); + fpscr->addChild(new QTreeWidgetItem(QStringList("Z"))); + fpscr->addChild(new QTreeWidgetItem(QStringList("N"))); + + QTreeWidgetItem* const fpexc = new QTreeWidgetItem(QStringList("FPEXC")); + fpexc->addChild(new QTreeWidgetItem(QStringList("IOC"))); + fpexc->addChild(new QTreeWidgetItem(QStringList("OFC"))); + fpexc->addChild(new QTreeWidgetItem(QStringList("UFC"))); + fpexc->addChild(new QTreeWidgetItem(QStringList("INV"))); + fpexc->addChild(new QTreeWidgetItem(QStringList(tr("Vector Iteration Count")))); + fpexc->addChild(new QTreeWidgetItem(QStringList("FP2V"))); + fpexc->addChild(new QTreeWidgetItem(QStringList("EN"))); + fpexc->addChild(new QTreeWidgetItem(QStringList("EX"))); + + vfp_system_registers->addChild(fpscr); + vfp_system_registers->addChild(fpexc); + vfp_system_registers->addChild(new QTreeWidgetItem(QStringList("FPINST"))); + vfp_system_registers->addChild(new QTreeWidgetItem(QStringList("FPINST2"))); +} + +void RegistersWidget::UpdateVFPSystemRegisterValues() { + const u32 fpscr_val = Core::g_app_core->GetVFPSystemReg(VFP_FPSCR); + const u32 fpexc_val = Core::g_app_core->GetVFPSystemReg(VFP_FPEXC); + 
const u32 fpinst_val = Core::g_app_core->GetVFPSystemReg(VFP_FPINST); + const u32 fpinst2_val = Core::g_app_core->GetVFPSystemReg(VFP_FPINST2); + + QTreeWidgetItem* const fpscr = vfp_system_registers->child(0); + fpscr->setText(1, QString("0x%1").arg(fpscr_val, 8, 16, QLatin1Char('0'))); + fpscr->child(0)->setText(1, QString::number(fpscr_val & 1)); + fpscr->child(1)->setText(1, QString::number((fpscr_val >> 1) & 1)); + fpscr->child(2)->setText(1, QString::number((fpscr_val >> 2) & 1)); + fpscr->child(3)->setText(1, QString::number((fpscr_val >> 3) & 1)); + fpscr->child(4)->setText(1, QString::number((fpscr_val >> 4) & 1)); + fpscr->child(5)->setText(1, QString::number((fpscr_val >> 7) & 1)); + fpscr->child(6)->setText(1, QString::number((fpscr_val >> 8) & 1)); + fpscr->child(7)->setText(1, QString::number((fpscr_val >> 9) & 1)); + fpscr->child(8)->setText(1, QString::number((fpscr_val >> 10) & 1)); + fpscr->child(9)->setText(1, QString::number((fpscr_val >> 11) & 1)); + fpscr->child(10)->setText(1, QString::number((fpscr_val >> 12) & 1)); + fpscr->child(11)->setText(1, QString::number((fpscr_val >> 15) & 1)); + fpscr->child(12)->setText(1, QString("b%1").arg((fpscr_val >> 16) & 7, 3, 2, QLatin1Char('0'))); + fpscr->child(13)->setText(1, QString("b%1").arg((fpscr_val >> 20) & 3, 2, 2, QLatin1Char('0'))); + fpscr->child(14)->setText(1, QString("b%1").arg((fpscr_val >> 22) & 3, 2, 2, QLatin1Char('0'))); + fpscr->child(15)->setText(1, QString::number((fpscr_val >> 24) & 1)); + fpscr->child(16)->setText(1, QString::number((fpscr_val >> 25) & 1)); + fpscr->child(17)->setText(1, QString::number((fpscr_val >> 28) & 1)); + fpscr->child(18)->setText(1, QString::number((fpscr_val >> 29) & 1)); + fpscr->child(19)->setText(1, QString::number((fpscr_val >> 30) & 1)); + fpscr->child(20)->setText(1, QString::number((fpscr_val >> 31) & 1)); + + QTreeWidgetItem* const fpexc = vfp_system_registers->child(1); + fpexc->setText(1, QString("0x%1").arg(fpexc_val, 8, 16, 
QLatin1Char('0'))); + fpexc->child(0)->setText(1, QString::number(fpexc_val & 1)); + fpexc->child(1)->setText(1, QString::number((fpexc_val >> 2) & 1)); + fpexc->child(2)->setText(1, QString::number((fpexc_val >> 3) & 1)); + fpexc->child(3)->setText(1, QString::number((fpexc_val >> 7) & 1)); + fpexc->child(4)->setText(1, QString("b%1").arg((fpexc_val >> 8) & 7, 3, 2, QLatin1Char('0'))); + fpexc->child(5)->setText(1, QString::number((fpexc_val >> 28) & 1)); + fpexc->child(6)->setText(1, QString::number((fpexc_val >> 30) & 1)); + fpexc->child(7)->setText(1, QString::number((fpexc_val >> 31) & 1)); + + vfp_system_registers->child(2)->setText(1, QString("0x%1").arg(fpinst_val, 8, 16, QLatin1Char('0'))); + vfp_system_registers->child(3)->setText(1, QString("0x%1").arg(fpinst2_val, 8, 16, QLatin1Char('0'))); +} diff --git a/src/citra_qt/debugger/registers.h b/src/citra_qt/debugger/registers.h index 68e3fb908..09b830e80 100644 --- a/src/citra_qt/debugger/registers.h +++ b/src/citra_qt/debugger/registers.h @@ -25,10 +25,18 @@ public slots: void OnEmulationStopping(); private: + void CreateCPSRChildren(); + void UpdateCPSRValues(); + + void CreateVFPSystemRegisterChildren(); + void UpdateVFPSystemRegisterValues(); + Ui::ARMRegisters cpu_regs_ui; QTreeWidget* tree; - QTreeWidgetItem* registers; - QTreeWidgetItem* CSPR; + QTreeWidgetItem* core_registers; + QTreeWidgetItem* vfp_registers; + QTreeWidgetItem* vfp_system_registers; + QTreeWidgetItem* cpsr; }; diff --git a/src/citra_qt/main.cpp b/src/citra_qt/main.cpp index 2746de779..a1a4865bd 100644 --- a/src/citra_qt/main.cpp +++ b/src/citra_qt/main.cpp @@ -7,6 +7,7 @@ #include <QtGui> #include <QDesktopWidget> #include <QFileDialog> +#include <QMessageBox> #include "qhexedit.h" #include "main.h" @@ -122,9 +123,8 @@ GMainWindow::GMainWindow() : emu_thread(nullptr) y = (screenRect.y() + screenRect.height()) / 2 - h * 55 / 100; setGeometry(x, y, w, h); - // Restore UI state - QSettings settings(QSettings::IniFormat, 
QSettings::UserScope, "Citra team", "Citra"); + QSettings settings; restoreGeometry(settings.value("geometry").toByteArray()); restoreState(settings.value("state").toByteArray()); render_window->restoreGeometry(settings.value("geometryRenderWindow").toByteArray()); @@ -132,12 +132,25 @@ GMainWindow::GMainWindow() : emu_thread(nullptr) ui.action_Use_Hardware_Renderer->setChecked(Settings::values.use_hw_renderer); SetHardwareRendererEnabled(ui.action_Use_Hardware_Renderer->isChecked()); + ui.action_Use_Shader_JIT->setChecked(Settings::values.use_shader_jit); + SetShaderJITEnabled(ui.action_Use_Shader_JIT->isChecked()); + ui.action_Single_Window_Mode->setChecked(settings.value("singleWindowMode", true).toBool()); ToggleWindowMode(); ui.actionDisplay_widget_title_bars->setChecked(settings.value("displayTitleBars", true).toBool()); OnDisplayTitleBars(ui.actionDisplay_widget_title_bars->isChecked()); + // Prepare actions for recent files + for (int i = 0; i < max_recent_files_item; ++i) { + actions_recent_files[i] = new QAction(this); + actions_recent_files[i]->setVisible(false); + connect(actions_recent_files[i], SIGNAL(triggered()), this, SLOT(OnMenuRecentFile())); + + ui.menu_recent_files->addAction(actions_recent_files[i]); + } + UpdateRecentFiles(); + // Setup connections connect(ui.action_Load_File, SIGNAL(triggered()), this, SLOT(OnMenuLoadFile())); connect(ui.action_Load_Symbol_Map, SIGNAL(triggered()), this, SLOT(OnMenuLoadSymbolMap())); @@ -145,6 +158,7 @@ GMainWindow::GMainWindow() : emu_thread(nullptr) connect(ui.action_Pause, SIGNAL(triggered()), this, SLOT(OnPauseGame())); connect(ui.action_Stop, SIGNAL(triggered()), this, SLOT(OnStopGame())); connect(ui.action_Use_Hardware_Renderer, SIGNAL(triggered(bool)), this, SLOT(SetHardwareRendererEnabled(bool))); + connect(ui.action_Use_Shader_JIT, SIGNAL(triggered(bool)), this, SLOT(SetShaderJITEnabled(bool))); connect(ui.action_Single_Window_Mode, SIGNAL(triggered(bool)), this, SLOT(ToggleWindowMode())); 
connect(ui.action_Hotkeys, SIGNAL(triggered()), this, SLOT(OnOpenHotkeysDialog())); @@ -207,9 +221,13 @@ void GMainWindow::OnDisplayTitleBars(bool show) } } -void GMainWindow::BootGame(std::string filename) { +void GMainWindow::BootGame(const std::string& filename) { LOG_INFO(Frontend, "Citra starting...\n"); + // Shutdown previous session if the emu thread is still active... + if (emu_thread != nullptr) + ShutdownGame(); + // Initialize the core emulation System::Init(render_window); @@ -263,40 +281,102 @@ void GMainWindow::ShutdownGame() { // Update the GUI ui.action_Start->setEnabled(false); + ui.action_Start->setText(tr("Start")); ui.action_Pause->setEnabled(false); ui.action_Stop->setEnabled(false); render_window->hide(); } -void GMainWindow::OnMenuLoadFile() -{ - QString filename = QFileDialog::getOpenFileName(this, tr("Load File"), QString(), tr("3DS executable (*.3ds *.3dsx *.elf *.axf *.cci *.cxi)")); +void GMainWindow::UpdateRecentFiles() { + QSettings settings; + QStringList recent_files = settings.value("recentFiles").toStringList(); + + unsigned int num_recent_files = std::min(recent_files.size(), static_cast<int>(max_recent_files_item)); + + for (unsigned int i = 0; i < num_recent_files; i++) { + QString text = QString("&%1. 
%2").arg(i + 1).arg(QFileInfo(recent_files[i]).fileName()); + actions_recent_files[i]->setText(text); + actions_recent_files[i]->setData(recent_files[i]); + actions_recent_files[i]->setVisible(true); + } + + for (int j = num_recent_files; j < max_recent_files_item; ++j) { + actions_recent_files[j]->setVisible(false); + } + + // Grey out the recent files menu if the list is empty + if (num_recent_files == 0) { + ui.menu_recent_files->setEnabled(false); + } else { + ui.menu_recent_files->setEnabled(true); + } +} + +void GMainWindow::OnMenuLoadFile() { + QSettings settings; + QString rom_path = settings.value("romsPath", QString()).toString(); + + QString filename = QFileDialog::getOpenFileName(this, tr("Load File"), rom_path, tr("3DS executable (*.3ds *.3dsx *.elf *.axf *.cci *.cxi)")); if (filename.size()) { - // Shutdown previous session if the emu thread is still active... - if (emu_thread != nullptr) - ShutdownGame(); + settings.setValue("romsPath", QFileInfo(filename).path()); + // Update recent files list + QStringList recent_files = settings.value("recentFiles").toStringList(); + recent_files.prepend(filename); + settings.setValue("recentFiles", recent_files); + UpdateRecentFiles(); // Update UI BootGame(filename.toLatin1().data()); } } void GMainWindow::OnMenuLoadSymbolMap() { - QString filename = QFileDialog::getOpenFileName(this, tr("Load Symbol Map"), QString(), tr("Symbol map (*)")); - if (filename.size()) + QSettings settings; + QString symbol_path = settings.value("symbolsPath", QString()).toString(); + + QString filename = QFileDialog::getOpenFileName(this, tr("Load Symbol Map"), symbol_path, tr("Symbol map (*)")); + if (filename.size()) { + settings.setValue("symbolsPath", QFileInfo(filename).path()); + LoadSymbolMap(filename.toLatin1().data()); + } } -void GMainWindow::OnStartGame() -{ +void GMainWindow::OnMenuRecentFile() { + QAction* action = qobject_cast<QAction*>(sender()); + assert(action); + + QString filename = action->data().toString(); + 
QFileInfo file_info(filename); + if (file_info.exists()) { + BootGame(filename.toLatin1().data()); + } else { + // Display an error message and remove the file from the list. + QMessageBox::information(this, tr("File not found"), tr("File \"%1\" not found").arg(filename)); + + QSettings settings; + QStringList recent_files = settings.value("recentFiles").toStringList(); + recent_files.removeOne(filename); + settings.setValue("recentFiles", recent_files); + + action->setVisible(false); + // Grey out the recent files menu if the list is empty + if (ui.menu_recent_files->isEmpty()) { + ui.menu_recent_files->setEnabled(false); + } + } +} + +void GMainWindow::OnStartGame() { emu_thread->SetRunning(true); ui.action_Start->setEnabled(false); + ui.action_Start->setText(tr("Continue")); + ui.action_Pause->setEnabled(true); ui.action_Stop->setEnabled(true); } -void GMainWindow::OnPauseGame() -{ +void GMainWindow::OnPauseGame() { emu_thread->SetRunning(false); ui.action_Start->setEnabled(true); @@ -308,8 +388,7 @@ void GMainWindow::OnStopGame() { ShutdownGame(); } -void GMainWindow::OnOpenHotkeysDialog() -{ +void GMainWindow::OnOpenHotkeysDialog() { GHotkeysDialog dialog(this); dialog.exec(); } @@ -318,6 +397,10 @@ void GMainWindow::SetHardwareRendererEnabled(bool enabled) { VideoCore::g_hw_renderer_enabled = enabled; } +void GMainWindow::SetShaderJITEnabled(bool enabled) { + VideoCore::g_shader_jit_enabled = enabled; +} + void GMainWindow::ToggleWindowMode() { if (ui.action_Single_Window_Mode->isChecked()) { // Render in the main window... 
@@ -337,13 +420,11 @@ void GMainWindow::ToggleWindowMode() { } } -void GMainWindow::OnConfigure() -{ +void GMainWindow::OnConfigure() { //GControllerConfigDialog* dialog = new GControllerConfigDialog(controller_ports, this); } -void GMainWindow::closeEvent(QCloseEvent* event) -{ +void GMainWindow::closeEvent(QCloseEvent* event) { // Save window layout QSettings settings(QSettings::IniFormat, QSettings::UserScope, "Citra team", "Citra"); settings.setValue("geometry", saveGeometry()); @@ -367,11 +448,15 @@ void GMainWindow::closeEvent(QCloseEvent* event) #undef main #endif -int main(int argc, char* argv[]) -{ +int main(int argc, char* argv[]) { Log::Filter log_filter(Log::Level::Info); Log::SetFilter(&log_filter); + // Init settings params + QSettings::setDefaultFormat(QSettings::IniFormat); + QCoreApplication::setOrganizationName("Citra team"); + QCoreApplication::setApplicationName("Citra"); + QApplication::setAttribute(Qt::AA_X11InitThreads); QApplication app(argc, argv); diff --git a/src/citra_qt/main.h b/src/citra_qt/main.h index 242b08c39..4b260ae8b 100644 --- a/src/citra_qt/main.h +++ b/src/citra_qt/main.h @@ -24,6 +24,8 @@ class GMainWindow : public QMainWindow { Q_OBJECT + static const int max_recent_files_item = 10; ///< Max number of recently loaded items to keep track + // TODO: Make use of this! 
enum { UI_IDLE, @@ -55,9 +57,11 @@ signals: void EmulationStopping(); private: - void BootGame(std::string filename); + void BootGame(const std::string& filename); void ShutdownGame(); + void UpdateRecentFiles(); + void closeEvent(QCloseEvent* event) override; private slots: @@ -66,10 +70,12 @@ private slots: void OnStopGame(); void OnMenuLoadFile(); void OnMenuLoadSymbolMap(); + void OnMenuRecentFile(); void OnOpenHotkeysDialog(); void OnConfigure(); void OnDisplayTitleBars(bool); void SetHardwareRendererEnabled(bool); + void SetShaderJITEnabled(bool); void ToggleWindowMode(); private: @@ -85,6 +91,8 @@ private: CallstackWidget* callstackWidget; GPUCommandStreamWidget* graphicsWidget; GPUCommandListWidget* graphicsCommandsWidget; + + QAction* actions_recent_files[max_recent_files_item]; }; #endif // _CITRA_QT_MAIN_HXX_ diff --git a/src/citra_qt/main.ui b/src/citra_qt/main.ui index 9a809ee6c..1ba700a3a 100644 --- a/src/citra_qt/main.ui +++ b/src/citra_qt/main.ui @@ -52,9 +52,16 @@ <property name="title"> <string>&File</string> </property> + <widget class="QMenu" name="menu_recent_files"> + <property name="title"> + <string>Recent Files</string> + </property> + </widget> <addaction name="action_Load_File"/> <addaction name="action_Load_Symbol_Map"/> <addaction name="separator"/> + <addaction name="menu_recent_files"/> + <addaction name="separator"/> <addaction name="action_Exit"/> </widget> <widget class="QMenu" name="menu_Emulation"> @@ -66,6 +73,7 @@ <addaction name="action_Stop"/> <addaction name="separator"/> <addaction name="action_Use_Hardware_Renderer"/> + <addaction name="action_Use_Shader_JIT"/> <addaction name="action_Configure"/> </widget> <widget class="QMenu" name="menu_View"> @@ -153,6 +161,14 @@ <string>Use Hardware Renderer</string> </property> </action> + <action name="action_Use_Shader_JIT"> + <property name="checkable"> + <bool>true</bool> + </property> + <property name="text"> + <string>Use Shader JIT</string> + </property> + </action> <action 
name="action_Configure"> <property name="text"> <string>Configure ...</string> diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt index 4c086cd2f..e743a026d 100644 --- a/src/common/CMakeLists.txt +++ b/src/common/CMakeLists.txt @@ -5,6 +5,7 @@ set(SRCS break_points.cpp emu_window.cpp file_util.cpp + hash.cpp key_map.cpp logging/filter.cpp logging/text_formatter.cpp @@ -24,14 +25,15 @@ set(HEADERS bit_field.h break_points.h chunk_file.h + code_block.h color.h common_funcs.h common_paths.h common_types.h - cpu_detect.h debug_interface.h emu_window.h file_util.h + hash.h key_map.h linear_disk_cache.h logging/text_formatter.h @@ -56,6 +58,18 @@ set(HEADERS vector_math.h ) +if(ARCHITECTURE_x86_64) + set(SRCS ${SRCS} + x64/abi.cpp + x64/cpu_detect.cpp + x64/emitter.cpp) + + set(HEADERS ${HEADERS} + x64/abi.h + x64/cpu_detect.h + x64/emitter.h) +endif() + create_directory_groups(${SRCS} ${HEADERS}) add_library(common STATIC ${SRCS} ${HEADERS}) diff --git a/src/common/bit_field.h b/src/common/bit_field.h index f64ebdaf6..d306ce9a9 100644 --- a/src/common/bit_field.h +++ b/src/common/bit_field.h @@ -141,22 +141,22 @@ public: BitField& operator=(const BitField&) = delete; #endif - __forceinline BitField& operator=(T val) + FORCE_INLINE BitField& operator=(T val) { Assign(val); return *this; } - __forceinline operator T() const + FORCE_INLINE operator T() const { return Value(); } - __forceinline void Assign(const T& value) { + FORCE_INLINE void Assign(const T& value) { storage = (storage & ~GetMask()) | (((StorageType)value << position) & GetMask()); } - __forceinline T Value() const + FORCE_INLINE T Value() const { if (std::numeric_limits<T>::is_signed) { @@ -170,7 +170,7 @@ public: } // TODO: we may want to change this to explicit operator bool() if it's bug-free in VS2015 - __forceinline bool ToBool() const + FORCE_INLINE bool ToBool() const { return Value() != 0; } @@ -187,7 +187,7 @@ private: // Unsigned version of StorageType typedef typename 
std::make_unsigned<StorageType>::type StorageTypeU; - __forceinline StorageType GetMask() const + FORCE_INLINE StorageType GetMask() const { return (((StorageTypeU)~0) >> (8 * sizeof(T)-bits)) << position; } diff --git a/src/common/code_block.h b/src/common/code_block.h new file mode 100644 index 000000000..9ef7296d3 --- /dev/null +++ b/src/common/code_block.h @@ -0,0 +1,87 @@ +// Copyright 2013 Dolphin Emulator Project +// Licensed under GPLv2 +// Refer to the license.txt file included. + +#pragma once + +#include "common_types.h" +#include "memory_util.h" + +// Everything that needs to generate code should inherit from this. +// You get memory management for free, plus, you can use all emitter functions without +// having to prefix them with gen-> or something similar. +// Example implementation: +// class JIT : public CodeBlock<ARMXEmitter> {} +template<class T> class CodeBlock : public T, NonCopyable +{ +private: + // A privately used function to set the executable RAM space to something invalid. + // For debugging usefulness it should be used to set the RAM to a host specific breakpoint instruction + virtual void PoisonMemory() = 0; + +protected: + u8 *region; + size_t region_size; + +public: + CodeBlock() : region(nullptr), region_size(0) {} + virtual ~CodeBlock() { if (region) FreeCodeSpace(); } + + // Call this before you generate any code. + void AllocCodeSpace(int size) + { + region_size = size; + region = (u8*)AllocateExecutableMemory(region_size); + T::SetCodePtr(region); + } + + // Always clear code space with breakpoints, so that if someone accidentally executes + // uninitialized, it just breaks into the debugger. + void ClearCodeSpace() + { + PoisonMemory(); + ResetCodePtr(); + } + + // Call this when shutting down. Don't rely on the destructor, even though it'll do the job. 
+ void FreeCodeSpace() + { +#ifdef __SYMBIAN32__ + ResetExecutableMemory(region); +#else + FreeMemoryPages(region, region_size); +#endif + region = nullptr; + region_size = 0; + } + + bool IsInSpace(const u8 *ptr) + { + return (ptr >= region) && (ptr < (region + region_size)); + } + + // Cannot currently be undone. Will write protect the entire code region. + // Start over if you need to change the code (call FreeCodeSpace(), AllocCodeSpace()). + void WriteProtect() + { + WriteProtectMemory(region, region_size, true); + } + + void ResetCodePtr() + { + T::SetCodePtr(region); + } + + size_t GetSpaceLeft() const + { + return region_size - (T::GetCodePtr() - region); + } + + u8 *GetBasePtr() { + return region; + } + + size_t GetOffset(const u8 *ptr) const { + return ptr - region; + } +}; diff --git a/src/common/common_funcs.h b/src/common/common_funcs.h index 59bd16dbf..88e452a16 100644 --- a/src/common/common_funcs.h +++ b/src/common/common_funcs.h @@ -20,12 +20,13 @@ #ifdef _WIN32 // Alignment + #define FORCE_INLINE __forceinline #define MEMORY_ALIGNED16(x) __declspec(align(16)) x #define MEMORY_ALIGNED32(x) __declspec(align(32)) x #define MEMORY_ALIGNED64(x) __declspec(align(64)) x #define MEMORY_ALIGNED128(x) __declspec(align(128)) x #else - #define __forceinline inline __attribute__((always_inline)) + #define FORCE_INLINE inline __attribute__((always_inline)) #define MEMORY_ALIGNED16(x) __attribute__((aligned(16))) x #define MEMORY_ALIGNED32(x) __attribute__((aligned(32))) x #define MEMORY_ALIGNED64(x) __attribute__((aligned(64))) x @@ -34,7 +35,7 @@ #ifndef _MSC_VER -#if defined(__x86_64__) || defined(_M_X64) +#ifdef ARCHITECTURE_x86_64 #define Crash() __asm__ __volatile__("int $3") #elif defined(_M_ARM) #define Crash() __asm__ __volatile__("trap") diff --git a/src/common/cpu_detect.h b/src/common/cpu_detect.h deleted file mode 100644 index b585f9608..000000000 --- a/src/common/cpu_detect.h +++ /dev/null @@ -1,78 +0,0 @@ -// Copyright 2013 Dolphin Emulator 
Project / 2014 Citra Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - - -// Detect the cpu, so we'll know which optimizations to use -#pragma once - -#include <string> - -enum CPUVendor -{ - VENDOR_INTEL = 0, - VENDOR_AMD = 1, - VENDOR_ARM = 2, - VENDOR_OTHER = 3, -}; - -struct CPUInfo -{ - CPUVendor vendor; - - char cpu_string[0x21]; - char brand_string[0x41]; - bool OS64bit; - bool CPU64bit; - bool Mode64bit; - - bool HTT; - int num_cores; - int logical_cpu_count; - - bool bSSE; - bool bSSE2; - bool bSSE3; - bool bSSSE3; - bool bPOPCNT; - bool bSSE4_1; - bool bSSE4_2; - bool bLZCNT; - bool bSSE4A; - bool bAVX; - bool bAES; - bool bLAHFSAHF64; - bool bLongMode; - - // ARM specific CPUInfo - bool bSwp; - bool bHalf; - bool bThumb; - bool bFastMult; - bool bVFP; - bool bEDSP; - bool bThumbEE; - bool bNEON; - bool bVFPv3; - bool bTLS; - bool bVFPv4; - bool bIDIVa; - bool bIDIVt; - bool bArmV7; // enable MOVT, MOVW etc - - // ARMv8 specific - bool bFP; - bool bASIMD; - - // Call Detect() - explicit CPUInfo(); - - // Turn the cpu info into a string we can show - std::string Summarize(); - -private: - // Detects the various cpu features - void Detect(); -}; - -extern CPUInfo cpu_info; diff --git a/src/common/hash.cpp b/src/common/hash.cpp new file mode 100644 index 000000000..413e9c6f1 --- /dev/null +++ b/src/common/hash.cpp @@ -0,0 +1,126 @@ +// Copyright 2015 Citra Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#if defined(_MSC_VER) +#include <stdlib.h> +#endif + +#include "common_funcs.h" +#include "common_types.h" +#include "hash.h" + +namespace Common { + +// MurmurHash3 was written by Austin Appleby, and is placed in the public +// domain. The author hereby disclaims copyright to this source code. 
+ +// Block read - if your platform needs to do endian-swapping or can only handle aligned reads, do +// the conversion here + +static FORCE_INLINE u32 getblock32(const u32* p, int i) { + return p[i]; +} + +static FORCE_INLINE u64 getblock64(const u64* p, int i) { + return p[i]; +} + +// Finalization mix - force all bits of a hash block to avalanche + +static FORCE_INLINE u32 fmix32(u32 h) { + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + h ^= h >> 16; + + return h; +} + +static FORCE_INLINE u64 fmix64(u64 k) { + k ^= k >> 33; + k *= 0xff51afd7ed558ccdllu; + k ^= k >> 33; + k *= 0xc4ceb9fe1a85ec53llu; + k ^= k >> 33; + + return k; +} + +// This is the 128-bit variant of the MurmurHash3 hash function that is targetted for 64-bit +// platforms (MurmurHash3_x64_128). It was taken from: +// https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp +void MurmurHash3_128(const void* key, int len, u32 seed, void* out) { + const u8 * data = (const u8*)key; + const int nblocks = len / 16; + + u64 h1 = seed; + u64 h2 = seed; + + const u64 c1 = 0x87c37b91114253d5llu; + const u64 c2 = 0x4cf5ad432745937fllu; + + // Body + + const u64 * blocks = (const u64 *)(data); + + for (int i = 0; i < nblocks; i++) { + u64 k1 = getblock64(blocks,i*2+0); + u64 k2 = getblock64(blocks,i*2+1); + + k1 *= c1; k1 = _rotl64(k1,31); k1 *= c2; h1 ^= k1; + + h1 = _rotl64(h1,27); h1 += h2; h1 = h1*5+0x52dce729; + + k2 *= c2; k2 = _rotl64(k2,33); k2 *= c1; h2 ^= k2; + + h2 = _rotl64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5; + } + + // Tail + + const u8 * tail = (const u8*)(data + nblocks*16); + + u64 k1 = 0; + u64 k2 = 0; + + switch (len & 15) { + case 15: k2 ^= ((u64)tail[14]) << 48; + case 14: k2 ^= ((u64)tail[13]) << 40; + case 13: k2 ^= ((u64)tail[12]) << 32; + case 12: k2 ^= ((u64)tail[11]) << 24; + case 11: k2 ^= ((u64)tail[10]) << 16; + case 10: k2 ^= ((u64)tail[ 9]) << 8; + case 9: k2 ^= ((u64)tail[ 8]) << 0; + k2 *= c2; k2 = _rotl64(k2,33); k2 *= c1; h2 ^= k2; 
+ + case 8: k1 ^= ((u64)tail[ 7]) << 56; + case 7: k1 ^= ((u64)tail[ 6]) << 48; + case 6: k1 ^= ((u64)tail[ 5]) << 40; + case 5: k1 ^= ((u64)tail[ 4]) << 32; + case 4: k1 ^= ((u64)tail[ 3]) << 24; + case 3: k1 ^= ((u64)tail[ 2]) << 16; + case 2: k1 ^= ((u64)tail[ 1]) << 8; + case 1: k1 ^= ((u64)tail[ 0]) << 0; + k1 *= c1; k1 = _rotl64(k1,31); k1 *= c2; h1 ^= k1; + }; + + // Finalization + + h1 ^= len; h2 ^= len; + + h1 += h2; + h2 += h1; + + h1 = fmix64(h1); + h2 = fmix64(h2); + + h1 += h2; + h2 += h1; + + ((u64*)out)[0] = h1; + ((u64*)out)[1] = h2; +} + +} // namespace Common diff --git a/src/common/hash.h b/src/common/hash.h new file mode 100644 index 000000000..a3850be68 --- /dev/null +++ b/src/common/hash.h @@ -0,0 +1,25 @@ +// Copyright 2015 Citra Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include "common/common_types.h" + +namespace Common { + +void MurmurHash3_128(const void* key, int len, u32 seed, void* out); + +/** + * Computes a 64-bit hash over the specified block of data + * @param data Block of data to compute hash over + * @param len Length of data (in bytes) to compute hash over + * @returns 64-bit hash value that was computed over the data block + */ +static inline u64 ComputeHash64(const void* data, int len) { + u64 res[2]; + MurmurHash3_128(data, len, 0, res); + return res[0]; +} + +} // namespace Common diff --git a/src/common/logging/backend.cpp b/src/common/logging/backend.cpp index d85e58373..92e8e742d 100644 --- a/src/common/logging/backend.cpp +++ b/src/common/logging/backend.cpp @@ -6,6 +6,7 @@ #include <array> #include <cstdio> +#include "common/assert.h" #include "common/common_funcs.h" // snprintf compatibility define #include "common/logging/backend.h" #include "common/logging/filter.h" @@ -65,8 +66,9 @@ const char* GetLogClassName(Class log_class) { ALL_LOG_CLASSES() #undef CLS #undef SUB + case Class::Count: + UNREACHABLE(); } - return "Unknown"; } 
const char* GetLevelName(Level log_level) { @@ -78,8 +80,9 @@ const char* GetLevelName(Level log_level) { LVL(Warning); LVL(Error); LVL(Critical); + case Level::Count: + UNREACHABLE(); } - return "Unknown"; #undef LVL } diff --git a/src/common/logging/text_formatter.cpp b/src/common/logging/text_formatter.cpp index 94f3dfc1f..de195b0f7 100644 --- a/src/common/logging/text_formatter.cpp +++ b/src/common/logging/text_formatter.cpp @@ -14,6 +14,7 @@ #include "common/logging/log.h" #include "common/logging/text_formatter.h" +#include "common/assert.h" #include "common/common_funcs.h" #include "common/string_util.h" @@ -82,6 +83,8 @@ void PrintColoredMessage(const Entry& entry) { color = FOREGROUND_RED | FOREGROUND_INTENSITY; break; case Level::Critical: // Bright magenta color = FOREGROUND_RED | FOREGROUND_BLUE | FOREGROUND_INTENSITY; break; + case Level::Count: + UNREACHABLE(); } SetConsoleTextAttribute(console_handle, color); @@ -101,6 +104,8 @@ void PrintColoredMessage(const Entry& entry) { color = ESC "[1;31m"; break; case Level::Critical: // Bright magenta color = ESC "[1;35m"; break; + case Level::Count: + UNREACHABLE(); } fputs(color, stderr); diff --git a/src/common/memory_util.cpp b/src/common/memory_util.cpp index 2b3ace528..5ef784224 100644 --- a/src/common/memory_util.cpp +++ b/src/common/memory_util.cpp @@ -16,7 +16,7 @@ #include <sys/mman.h> #endif -#if !defined(_WIN32) && defined(__x86_64__) && !defined(MAP_32BIT) +#if !defined(_WIN32) && defined(ARCHITECTURE_X64) && !defined(MAP_32BIT) #include <unistd.h> #define PAGE_MASK (getpagesize() - 1) #define round_page(x) ((((unsigned long)(x)) + PAGE_MASK) & ~(PAGE_MASK)) @@ -31,7 +31,7 @@ void* AllocateExecutableMemory(size_t size, bool low) void* ptr = VirtualAlloc(0, size, MEM_COMMIT, PAGE_EXECUTE_READWRITE); #else static char *map_hint = 0; -#if defined(__x86_64__) && !defined(MAP_32BIT) +#if defined(ARCHITECTURE_X64) && !defined(MAP_32BIT) // This OS has no flag to enforce allocation below the 4 GB 
boundary, // but if we hint that we want a low address it is very likely we will // get one. @@ -43,7 +43,7 @@ void* AllocateExecutableMemory(size_t size, bool low) #endif void* ptr = mmap(map_hint, size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_ANON | MAP_PRIVATE -#if defined(__x86_64__) && defined(MAP_32BIT) +#if defined(ARCHITECTURE_X64) && defined(MAP_32BIT) | (low ? MAP_32BIT : 0) #endif , -1, 0); @@ -62,7 +62,7 @@ void* AllocateExecutableMemory(size_t size, bool low) #endif LOG_ERROR(Common_Memory, "Failed to allocate executable memory"); } -#if !defined(_WIN32) && defined(__x86_64__) && !defined(MAP_32BIT) +#if !defined(_WIN32) && defined(ARCHITECTURE_X64) && !defined(MAP_32BIT) else { if (low) diff --git a/src/common/platform.h b/src/common/platform.h index 0a912dda3..9ba4db11b 100644 --- a/src/common/platform.h +++ b/src/common/platform.h @@ -27,7 +27,7 @@ //////////////////////////////////////////////////////////////////////////////////////////////////// // Platform detection -#if defined(__x86_64__) || defined(_M_X64) || defined(__aarch64__) +#if defined(ARCHITECTURE_x86_64) || defined(__aarch64__) #define EMU_ARCH_BITS 64 #elif defined(__i386) || defined(_M_IX86) || defined(__arm__) || defined(_M_ARM) #define EMU_ARCH_BITS 32 diff --git a/src/common/string_util.cpp b/src/common/string_util.cpp index b2f7f7b1d..6d6fc591f 100644 --- a/src/common/string_util.cpp +++ b/src/common/string_util.cpp @@ -296,14 +296,28 @@ std::string ReplaceAll(std::string result, const std::string& src, const std::st std::string UTF16ToUTF8(const std::u16string& input) { +#if _MSC_VER >= 1900 + // Workaround for missing char16_t/char32_t instantiations in MSVC2015 + std::wstring_convert<std::codecvt_utf8_utf16<__int16>, __int16> convert; + std::basic_string<__int16> tmp_buffer(input.cbegin(), input.cend()); + return convert.to_bytes(tmp_buffer); +#else std::wstring_convert<std::codecvt_utf8_utf16<char16_t>, char16_t> convert; return convert.to_bytes(input); +#endif } 
std::u16string UTF8ToUTF16(const std::string& input) { +#if _MSC_VER >= 1900 + // Workaround for missing char16_t/char32_t instantiations in MSVC2015 + std::wstring_convert<std::codecvt_utf8_utf16<__int16>, __int16> convert; + auto tmp_buffer = convert.from_bytes(input); + return std::u16string(tmp_buffer.cbegin(), tmp_buffer.cend()); +#else std::wstring_convert<std::codecvt_utf8_utf16<char16_t>, char16_t> convert; return convert.from_bytes(input); +#endif } static std::string UTF16ToUTF8(const std::wstring& input) diff --git a/src/common/x64/abi.cpp b/src/common/x64/abi.cpp new file mode 100644 index 000000000..4c07a6ebe --- /dev/null +++ b/src/common/x64/abi.cpp @@ -0,0 +1,680 @@ +// Copyright (C) 2003 Dolphin Project. + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 2.0 or later versions. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License 2.0 for more details. + +// A copy of the GPL 2.0 should have been included with the program. +// If not, see http://www.gnu.org/licenses/ + +// Official SVN repository and contact information can be found at +// http://code.google.com/p/dolphin-emu/ + +#include "abi.h" +#include "emitter.h" + +using namespace Gen; + +// Shared code between Win64 and Unix64 + +// Sets up a __cdecl function. 
+void XEmitter::ABI_EmitPrologue(int maxCallParams) +{ +#ifdef _M_IX86 + // Don't really need to do anything +#elif defined(ARCHITECTURE_x86_64) +#if _WIN32 + int stacksize = ((maxCallParams + 1) & ~1) * 8 + 8; + // Set up a stack frame so that we can call functions + // TODO: use maxCallParams + SUB(64, R(RSP), Imm8(stacksize)); +#endif +#else +#error Arch not supported +#endif +} + +void XEmitter::ABI_EmitEpilogue(int maxCallParams) +{ +#ifdef _M_IX86 + RET(); +#elif defined(ARCHITECTURE_x86_64) +#ifdef _WIN32 + int stacksize = ((maxCallParams+1)&~1)*8 + 8; + ADD(64, R(RSP), Imm8(stacksize)); +#endif + RET(); +#else +#error Arch not supported + + +#endif +} + +#ifdef _M_IX86 // All32 + +// Shared code between Win32 and Unix32 +void XEmitter::ABI_CallFunction(const void *func) { + ABI_AlignStack(0); + CALL(func); + ABI_RestoreStack(0); +} + +void XEmitter::ABI_CallFunctionC16(const void *func, u16 param1) { + ABI_AlignStack(1 * 2); + PUSH(16, Imm16(param1)); + CALL(func); + ABI_RestoreStack(1 * 2); +} + +void XEmitter::ABI_CallFunctionCC16(const void *func, u32 param1, u16 param2) { + ABI_AlignStack(1 * 2 + 1 * 4); + PUSH(16, Imm16(param2)); + PUSH(32, Imm32(param1)); + CALL(func); + ABI_RestoreStack(1 * 2 + 1 * 4); +} + +void XEmitter::ABI_CallFunctionC(const void *func, u32 param1) { + ABI_AlignStack(1 * 4); + PUSH(32, Imm32(param1)); + CALL(func); + ABI_RestoreStack(1 * 4); +} + +void XEmitter::ABI_CallFunctionCC(const void *func, u32 param1, u32 param2) { + ABI_AlignStack(2 * 4); + PUSH(32, Imm32(param2)); + PUSH(32, Imm32(param1)); + CALL(func); + ABI_RestoreStack(2 * 4); +} + +void XEmitter::ABI_CallFunctionCCC(const void *func, u32 param1, u32 param2, u32 param3) { + ABI_AlignStack(3 * 4); + PUSH(32, Imm32(param3)); + PUSH(32, Imm32(param2)); + PUSH(32, Imm32(param1)); + CALL(func); + ABI_RestoreStack(3 * 4); +} + +void XEmitter::ABI_CallFunctionCCP(const void *func, u32 param1, u32 param2, void *param3) { + ABI_AlignStack(3 * 4); + PUSH(32, 
ImmPtr(param3)); + PUSH(32, Imm32(param2)); + PUSH(32, Imm32(param1)); + CALL(func); + ABI_RestoreStack(3 * 4); +} + +void XEmitter::ABI_CallFunctionCCCP(const void *func, u32 param1, u32 param2,u32 param3, void *param4) { + ABI_AlignStack(4 * 4); + PUSH(32, ImmPtr(param4)); + PUSH(32, Imm32(param3)); + PUSH(32, Imm32(param2)); + PUSH(32, Imm32(param1)); + CALL(func); + ABI_RestoreStack(4 * 4); +} + +void XEmitter::ABI_CallFunctionP(const void *func, void *param1) { + ABI_AlignStack(1 * 4); + PUSH(32, ImmPtr(param1)); + CALL(func); + ABI_RestoreStack(1 * 4); +} + +void XEmitter::ABI_CallFunctionPA(const void *func, void *param1, const Gen::OpArg &arg2) { + ABI_AlignStack(2 * 4); + PUSH(32, arg2); + PUSH(32, ImmPtr(param1)); + CALL(func); + ABI_RestoreStack(2 * 4); +} + +void XEmitter::ABI_CallFunctionPAA(const void *func, void *param1, const Gen::OpArg &arg2, const Gen::OpArg &arg3) { + ABI_AlignStack(3 * 4); + PUSH(32, arg3); + PUSH(32, arg2); + PUSH(32, ImmPtr(param1)); + CALL(func); + ABI_RestoreStack(3 * 4); +} + +void XEmitter::ABI_CallFunctionPPC(const void *func, void *param1, void *param2, u32 param3) { + ABI_AlignStack(3 * 4); + PUSH(32, Imm32(param3)); + PUSH(32, ImmPtr(param2)); + PUSH(32, ImmPtr(param1)); + CALL(func); + ABI_RestoreStack(3 * 4); +} + +// Pass a register as a parameter. +void XEmitter::ABI_CallFunctionR(const void *func, X64Reg reg1) { + ABI_AlignStack(1 * 4); + PUSH(32, R(reg1)); + CALL(func); + ABI_RestoreStack(1 * 4); +} + +// Pass two registers as parameters. 
+void XEmitter::ABI_CallFunctionRR(const void *func, Gen::X64Reg reg1, Gen::X64Reg reg2) +{ + ABI_AlignStack(2 * 4); + PUSH(32, R(reg2)); + PUSH(32, R(reg1)); + CALL(func); + ABI_RestoreStack(2 * 4); +} + +void XEmitter::ABI_CallFunctionAC(const void *func, const Gen::OpArg &arg1, u32 param2) +{ + ABI_AlignStack(2 * 4); + PUSH(32, Imm32(param2)); + PUSH(32, arg1); + CALL(func); + ABI_RestoreStack(2 * 4); +} + +void XEmitter::ABI_CallFunctionACC(const void *func, const Gen::OpArg &arg1, u32 param2, u32 param3) +{ + ABI_AlignStack(3 * 4); + PUSH(32, Imm32(param3)); + PUSH(32, Imm32(param2)); + PUSH(32, arg1); + CALL(func); + ABI_RestoreStack(3 * 4); +} + +void XEmitter::ABI_CallFunctionA(const void *func, const Gen::OpArg &arg1) +{ + ABI_AlignStack(1 * 4); + PUSH(32, arg1); + CALL(func); + ABI_RestoreStack(1 * 4); +} + +void XEmitter::ABI_CallFunctionAA(const void *func, const Gen::OpArg &arg1, const Gen::OpArg &arg2) +{ + ABI_AlignStack(2 * 4); + PUSH(32, arg2); + PUSH(32, arg1); + CALL(func); + ABI_RestoreStack(2 * 4); +} + +void XEmitter::ABI_PushAllCalleeSavedRegsAndAdjustStack() { + // Note: 4 * 4 = 16 bytes, so alignment is preserved. + PUSH(EBP); + PUSH(EBX); + PUSH(ESI); + PUSH(EDI); +} + +void XEmitter::ABI_PopAllCalleeSavedRegsAndAdjustStack() { + POP(EDI); + POP(ESI); + POP(EBX); + POP(EBP); +} + +unsigned int XEmitter::ABI_GetAlignedFrameSize(unsigned int frameSize) { + frameSize += 4; // reserve space for return address + unsigned int alignedSize = +#ifdef __GNUC__ + (frameSize + 15) & -16; +#else + (frameSize + 3) & -4; +#endif + return alignedSize; +} + + +void XEmitter::ABI_AlignStack(unsigned int frameSize) { +// Mac OS X requires the stack to be 16-byte aligned before every call. +// Linux requires the stack to be 16-byte aligned before calls that put SSE +// vectors on the stack, but since we do not keep track of which calls do that, +// it is effectively every call as well. 
+// Windows binaries compiled with MSVC do not have such a restriction*, but I +// expect that GCC on Windows acts the same as GCC on Linux in this respect. +// It would be nice if someone could verify this. +// *However, the MSVC optimizing compiler assumes a 4-byte-aligned stack at times. + unsigned int fillSize = + ABI_GetAlignedFrameSize(frameSize) - (frameSize + 4); + if (fillSize != 0) { + SUB(32, R(ESP), Imm8(fillSize)); + } +} + +void XEmitter::ABI_RestoreStack(unsigned int frameSize) { + unsigned int alignedSize = ABI_GetAlignedFrameSize(frameSize); + alignedSize -= 4; // return address is POPped at end of call + if (alignedSize != 0) { + ADD(32, R(ESP), Imm8(alignedSize)); + } +} + +#else //64bit + +// Common functions +void XEmitter::ABI_CallFunction(const void *func) { + u64 distance = u64(func) - (u64(code) + 5); + if (distance >= 0x0000000080000000ULL + && distance < 0xFFFFFFFF80000000ULL) { + // Far call + MOV(64, R(RAX), ImmPtr(func)); + CALLptr(R(RAX)); + } else { + CALL(func); + } +} + +void XEmitter::ABI_CallFunctionC16(const void *func, u16 param1) { + MOV(32, R(ABI_PARAM1), Imm32((u32)param1)); + u64 distance = u64(func) - (u64(code) + 5); + if (distance >= 0x0000000080000000ULL + && distance < 0xFFFFFFFF80000000ULL) { + // Far call + MOV(64, R(RAX), ImmPtr(func)); + CALLptr(R(RAX)); + } else { + CALL(func); + } +} + +void XEmitter::ABI_CallFunctionCC16(const void *func, u32 param1, u16 param2) { + MOV(32, R(ABI_PARAM1), Imm32(param1)); + MOV(32, R(ABI_PARAM2), Imm32((u32)param2)); + u64 distance = u64(func) - (u64(code) + 5); + if (distance >= 0x0000000080000000ULL + && distance < 0xFFFFFFFF80000000ULL) { + // Far call + MOV(64, R(RAX), ImmPtr(func)); + CALLptr(R(RAX)); + } else { + CALL(func); + } +} + +void XEmitter::ABI_CallFunctionC(const void *func, u32 param1) { + MOV(32, R(ABI_PARAM1), Imm32(param1)); + u64 distance = u64(func) - (u64(code) + 5); + if (distance >= 0x0000000080000000ULL + && distance < 0xFFFFFFFF80000000ULL) { + // Far 
call + MOV(64, R(RAX), ImmPtr(func)); + CALLptr(R(RAX)); + } else { + CALL(func); + } +} + +void XEmitter::ABI_CallFunctionCC(const void *func, u32 param1, u32 param2) { + MOV(32, R(ABI_PARAM1), Imm32(param1)); + MOV(32, R(ABI_PARAM2), Imm32(param2)); + u64 distance = u64(func) - (u64(code) + 5); + if (distance >= 0x0000000080000000ULL + && distance < 0xFFFFFFFF80000000ULL) { + // Far call + MOV(64, R(RAX), ImmPtr(func)); + CALLptr(R(RAX)); + } else { + CALL(func); + } +} + +void XEmitter::ABI_CallFunctionCCC(const void *func, u32 param1, u32 param2, u32 param3) { + MOV(32, R(ABI_PARAM1), Imm32(param1)); + MOV(32, R(ABI_PARAM2), Imm32(param2)); + MOV(32, R(ABI_PARAM3), Imm32(param3)); + u64 distance = u64(func) - (u64(code) + 5); + if (distance >= 0x0000000080000000ULL + && distance < 0xFFFFFFFF80000000ULL) { + // Far call + MOV(64, R(RAX), ImmPtr(func)); + CALLptr(R(RAX)); + } else { + CALL(func); + } +} + +void XEmitter::ABI_CallFunctionCCP(const void *func, u32 param1, u32 param2, void *param3) { + MOV(32, R(ABI_PARAM1), Imm32(param1)); + MOV(32, R(ABI_PARAM2), Imm32(param2)); + MOV(64, R(ABI_PARAM3), ImmPtr(param3)); + u64 distance = u64(func) - (u64(code) + 5); + if (distance >= 0x0000000080000000ULL + && distance < 0xFFFFFFFF80000000ULL) { + // Far call + MOV(64, R(RAX), ImmPtr(func)); + CALLptr(R(RAX)); + } else { + CALL(func); + } +} + +void XEmitter::ABI_CallFunctionCCCP(const void *func, u32 param1, u32 param2, u32 param3, void *param4) { + MOV(32, R(ABI_PARAM1), Imm32(param1)); + MOV(32, R(ABI_PARAM2), Imm32(param2)); + MOV(32, R(ABI_PARAM3), Imm32(param3)); + MOV(64, R(ABI_PARAM4), ImmPtr(param4)); + u64 distance = u64(func) - (u64(code) + 5); + if (distance >= 0x0000000080000000ULL + && distance < 0xFFFFFFFF80000000ULL) { + // Far call + MOV(64, R(RAX), ImmPtr(func)); + CALLptr(R(RAX)); + } else { + CALL(func); + } +} + +void XEmitter::ABI_CallFunctionP(const void *func, void *param1) { + MOV(64, R(ABI_PARAM1), ImmPtr(param1)); + u64 distance = 
u64(func) - (u64(code) + 5); + if (distance >= 0x0000000080000000ULL + && distance < 0xFFFFFFFF80000000ULL) { + // Far call + MOV(64, R(RAX), ImmPtr(func)); + CALLptr(R(RAX)); + } else { + CALL(func); + } +} + +void XEmitter::ABI_CallFunctionPA(const void *func, void *param1, const Gen::OpArg &arg2) { + MOV(64, R(ABI_PARAM1), ImmPtr(param1)); + if (!arg2.IsSimpleReg(ABI_PARAM2)) + MOV(32, R(ABI_PARAM2), arg2); + u64 distance = u64(func) - (u64(code) + 5); + if (distance >= 0x0000000080000000ULL + && distance < 0xFFFFFFFF80000000ULL) { + // Far call + MOV(64, R(RAX), ImmPtr(func)); + CALLptr(R(RAX)); + } else { + CALL(func); + } +} + +void XEmitter::ABI_CallFunctionPAA(const void *func, void *param1, const Gen::OpArg &arg2, const Gen::OpArg &arg3) { + MOV(64, R(ABI_PARAM1), ImmPtr(param1)); + if (!arg2.IsSimpleReg(ABI_PARAM2)) + MOV(32, R(ABI_PARAM2), arg2); + if (!arg3.IsSimpleReg(ABI_PARAM3)) + MOV(32, R(ABI_PARAM3), arg3); + u64 distance = u64(func) - (u64(code) + 5); + if (distance >= 0x0000000080000000ULL + && distance < 0xFFFFFFFF80000000ULL) { + // Far call + MOV(64, R(RAX), ImmPtr(func)); + CALLptr(R(RAX)); + } else { + CALL(func); + } +} + +void XEmitter::ABI_CallFunctionPPC(const void *func, void *param1, void *param2, u32 param3) { + MOV(64, R(ABI_PARAM1), ImmPtr(param1)); + MOV(64, R(ABI_PARAM2), ImmPtr(param2)); + MOV(32, R(ABI_PARAM3), Imm32(param3)); + u64 distance = u64(func) - (u64(code) + 5); + if (distance >= 0x0000000080000000ULL + && distance < 0xFFFFFFFF80000000ULL) { + // Far call + MOV(64, R(RAX), ImmPtr(func)); + CALLptr(R(RAX)); + } else { + CALL(func); + } +} + +// Pass a register as a parameter. 
+void XEmitter::ABI_CallFunctionR(const void *func, X64Reg reg1) { + if (reg1 != ABI_PARAM1) + MOV(32, R(ABI_PARAM1), R(reg1)); + u64 distance = u64(func) - (u64(code) + 5); + if (distance >= 0x0000000080000000ULL + && distance < 0xFFFFFFFF80000000ULL) { + // Far call + MOV(64, R(RAX), ImmPtr(func)); + CALLptr(R(RAX)); + } else { + CALL(func); + } +} + +// Pass two registers as parameters. +void XEmitter::ABI_CallFunctionRR(const void *func, X64Reg reg1, X64Reg reg2) { + if (reg2 != ABI_PARAM1) { + if (reg1 != ABI_PARAM1) + MOV(64, R(ABI_PARAM1), R(reg1)); + if (reg2 != ABI_PARAM2) + MOV(64, R(ABI_PARAM2), R(reg2)); + } else { + if (reg2 != ABI_PARAM2) + MOV(64, R(ABI_PARAM2), R(reg2)); + if (reg1 != ABI_PARAM1) + MOV(64, R(ABI_PARAM1), R(reg1)); + } + u64 distance = u64(func) - (u64(code) + 5); + if (distance >= 0x0000000080000000ULL + && distance < 0xFFFFFFFF80000000ULL) { + // Far call + MOV(64, R(RAX), ImmPtr(func)); + CALLptr(R(RAX)); + } else { + CALL(func); + } +} + +void XEmitter::ABI_CallFunctionAC(const void *func, const Gen::OpArg &arg1, u32 param2) +{ + if (!arg1.IsSimpleReg(ABI_PARAM1)) + MOV(32, R(ABI_PARAM1), arg1); + MOV(32, R(ABI_PARAM2), Imm32(param2)); + u64 distance = u64(func) - (u64(code) + 5); + if (distance >= 0x0000000080000000ULL + && distance < 0xFFFFFFFF80000000ULL) { + // Far call + MOV(64, R(RAX), ImmPtr(func)); + CALLptr(R(RAX)); + } else { + CALL(func); + } +} + +void XEmitter::ABI_CallFunctionACC(const void *func, const Gen::OpArg &arg1, u32 param2, u32 param3) +{ + if (!arg1.IsSimpleReg(ABI_PARAM1)) + MOV(32, R(ABI_PARAM1), arg1); + MOV(32, R(ABI_PARAM2), Imm32(param2)); + MOV(64, R(ABI_PARAM3), Imm64(param3)); + u64 distance = u64(func) - (u64(code) + 5); + if (distance >= 0x0000000080000000ULL + && distance < 0xFFFFFFFF80000000ULL) { + // Far call + MOV(64, R(RAX), ImmPtr(func)); + CALLptr(R(RAX)); + } else { + CALL(func); + } +} + +void XEmitter::ABI_CallFunctionA(const void *func, const Gen::OpArg &arg1) +{ + if 
(!arg1.IsSimpleReg(ABI_PARAM1)) + MOV(32, R(ABI_PARAM1), arg1); + u64 distance = u64(func) - (u64(code) + 5); + if (distance >= 0x0000000080000000ULL + && distance < 0xFFFFFFFF80000000ULL) { + // Far call + MOV(64, R(RAX), ImmPtr(func)); + CALLptr(R(RAX)); + } else { + CALL(func); + } +} + +void XEmitter::ABI_CallFunctionAA(const void *func, const Gen::OpArg &arg1, const Gen::OpArg &arg2) +{ + if (!arg1.IsSimpleReg(ABI_PARAM1)) + MOV(32, R(ABI_PARAM1), arg1); + if (!arg2.IsSimpleReg(ABI_PARAM2)) + MOV(32, R(ABI_PARAM2), arg2); + u64 distance = u64(func) - (u64(code) + 5); + if (distance >= 0x0000000080000000ULL + && distance < 0xFFFFFFFF80000000ULL) { + // Far call + MOV(64, R(RAX), ImmPtr(func)); + CALLptr(R(RAX)); + } else { + CALL(func); + } +} + +unsigned int XEmitter::ABI_GetAlignedFrameSize(unsigned int frameSize) { + return frameSize; +} + +#ifdef _WIN32 + +// The Windows x64 ABI requires XMM6 - XMM15 to be callee saved. 10 regs. +// But, not saving XMM4 and XMM5 breaks things in VS 2010, even though they are volatile regs. +// Let's just save all 16. +const int XMM_STACK_SPACE = 16 * 16; + +// Win64 Specific Code +void XEmitter::ABI_PushAllCalleeSavedRegsAndAdjustStack() { + //we only want to do this once + PUSH(RBX); + PUSH(RSI); + PUSH(RDI); + PUSH(RBP); + PUSH(R12); + PUSH(R13); + PUSH(R14); + PUSH(R15); + ABI_AlignStack(0); + + // Do this after aligning, because before it's offset by 8. 
+ SUB(64, R(RSP), Imm32(XMM_STACK_SPACE)); + for (int i = 0; i < 16; ++i) + MOVAPS(MDisp(RSP, i * 16), (X64Reg)(XMM0 + i)); +} + +void XEmitter::ABI_PopAllCalleeSavedRegsAndAdjustStack() { + for (int i = 0; i < 16; ++i) + MOVAPS((X64Reg)(XMM0 + i), MDisp(RSP, i * 16)); + ADD(64, R(RSP), Imm32(XMM_STACK_SPACE)); + + ABI_RestoreStack(0); + POP(R15); + POP(R14); + POP(R13); + POP(R12); + POP(RBP); + POP(RDI); + POP(RSI); + POP(RBX); +} + +// Win64 Specific Code +void XEmitter::ABI_PushAllCallerSavedRegsAndAdjustStack() { + PUSH(RCX); + PUSH(RDX); + PUSH(RSI); + PUSH(RDI); + PUSH(R8); + PUSH(R9); + PUSH(R10); + PUSH(R11); + // TODO: Callers preserve XMM4-5 (XMM0-3 are args.) + ABI_AlignStack(0); +} + +void XEmitter::ABI_PopAllCallerSavedRegsAndAdjustStack() { + ABI_RestoreStack(0); + POP(R11); + POP(R10); + POP(R9); + POP(R8); + POP(RDI); + POP(RSI); + POP(RDX); + POP(RCX); +} + +void XEmitter::ABI_AlignStack(unsigned int /*frameSize*/) { + SUB(64, R(RSP), Imm8(0x28)); +} + +void XEmitter::ABI_RestoreStack(unsigned int /*frameSize*/) { + ADD(64, R(RSP), Imm8(0x28)); +} + +#else +// Unix64 Specific Code +void XEmitter::ABI_PushAllCalleeSavedRegsAndAdjustStack() { + PUSH(RBX); + PUSH(RBP); + PUSH(R12); + PUSH(R13); + PUSH(R14); + PUSH(R15); + PUSH(R15); //just to align stack. duped push/pop doesn't hurt. + // TODO: XMM? 
+} + +void XEmitter::ABI_PopAllCalleeSavedRegsAndAdjustStack() { + POP(R15); + POP(R15); + POP(R14); + POP(R13); + POP(R12); + POP(RBP); + POP(RBX); +} + +void XEmitter::ABI_PushAllCallerSavedRegsAndAdjustStack() { + PUSH(RCX); + PUSH(RDX); + PUSH(RSI); + PUSH(RDI); + PUSH(R8); + PUSH(R9); + PUSH(R10); + PUSH(R11); + PUSH(R11); +} + +void XEmitter::ABI_PopAllCallerSavedRegsAndAdjustStack() { + POP(R11); + POP(R11); + POP(R10); + POP(R9); + POP(R8); + POP(RDI); + POP(RSI); + POP(RDX); + POP(RCX); +} + +void XEmitter::ABI_AlignStack(unsigned int /*frameSize*/) { + SUB(64, R(RSP), Imm8(0x08)); +} + +void XEmitter::ABI_RestoreStack(unsigned int /*frameSize*/) { + ADD(64, R(RSP), Imm8(0x08)); +} + +#endif // WIN32 + +#endif // 32bit diff --git a/src/common/x64/abi.h b/src/common/x64/abi.h new file mode 100644 index 000000000..7e9c156ae --- /dev/null +++ b/src/common/x64/abi.h @@ -0,0 +1,78 @@ +// Copyright (C) 2003 Dolphin Project. + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 2.0 or later versions. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License 2.0 for more details. + +// A copy of the GPL 2.0 should have been included with the program. +// If not, see http://www.gnu.org/licenses/ + +// Official SVN repository and contact information can be found at +// http://code.google.com/p/dolphin-emu/ + +#pragma once + +#include "common/common_types.h" + +// x86/x64 ABI:s, and helpers to help follow them when JIT-ing code. +// All convensions return values in EAX (+ possibly EDX). 
+ +// Linux 32-bit, Windows 32-bit (cdecl, System V): +// * Caller pushes left to right +// * Caller fixes stack after call +// * function subtract from stack for local storage only. +// Scratch: EAX ECX EDX +// Callee-save: EBX ESI EDI EBP +// Parameters: - + +// Windows 64-bit +// * 4-reg "fastcall" variant, very new-skool stack handling +// * Callee moves stack pointer, to make room for shadow regs for the biggest function _it itself calls_ +// * Parameters passed in RCX, RDX, ... further parameters are MOVed into the allocated stack space. +// Scratch: RAX RCX RDX R8 R9 R10 R11 +// Callee-save: RBX RSI RDI RBP R12 R13 R14 R15 +// Parameters: RCX RDX R8 R9, further MOV-ed + +// Linux 64-bit +// * 6-reg "fastcall" variant, old skool stack handling (parameters are pushed) +// Scratch: RAX RCX RDX RSI RDI R8 R9 R10 R11 +// Callee-save: RBX RBP R12 R13 R14 R15 +// Parameters: RDI RSI RDX RCX R8 R9 + +#ifdef _M_IX86 // 32 bit calling convention, shared by all + +// 32-bit don't pass parameters in regs, but these are convenient to have anyway when we have to +// choose regs to put stuff in. +#define ABI_PARAM1 RCX +#define ABI_PARAM2 RDX + +// There are no ABI_PARAM* here, since args are pushed. +// 32-bit bog standard cdecl, shared between linux and windows +// MacOSX 32-bit is same as System V with a few exceptions that we probably don't care much about. 
+ +#elif ARCHITECTURE_x86_64 // 64 bit calling convention + +#ifdef _WIN32 // 64-bit Windows - the really exotic calling convention + +#define ABI_PARAM1 RCX +#define ABI_PARAM2 RDX +#define ABI_PARAM3 R8 +#define ABI_PARAM4 R9 + +#else //64-bit Unix (hopefully MacOSX too) + +#define ABI_PARAM1 RDI +#define ABI_PARAM2 RSI +#define ABI_PARAM3 RDX +#define ABI_PARAM4 RCX +#define ABI_PARAM5 R8 +#define ABI_PARAM6 R9 + +#endif // WIN32 + +#endif // X86 diff --git a/src/common/x64/cpu_detect.cpp b/src/common/x64/cpu_detect.cpp new file mode 100644 index 000000000..d9c430c67 --- /dev/null +++ b/src/common/x64/cpu_detect.cpp @@ -0,0 +1,187 @@ +// Copyright 2013 Dolphin Emulator Project / 2015 Citra Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <cstring> +#include <string> +#include <thread> + +#include "common/common_types.h" + +#include "cpu_detect.h" + +namespace Common { + +#ifndef _MSC_VER + +#ifdef __FreeBSD__ +#include <sys/types.h> +#include <machine/cpufunc.h> +#endif + +static inline void __cpuidex(int info[4], int function_id, int subfunction_id) { +#ifdef __FreeBSD__ + // Despite the name, this is just do_cpuid() with ECX as second input. 
+ cpuid_count((u_int)function_id, (u_int)subfunction_id, (u_int*)info); +#else + info[0] = function_id; // eax + info[2] = subfunction_id; // ecx + __asm__( + "cpuid" + : "=a" (info[0]), + "=b" (info[1]), + "=c" (info[2]), + "=d" (info[3]) + : "a" (function_id), + "c" (subfunction_id) + ); +#endif +} + +static inline void __cpuid(int info[4], int function_id) { + return __cpuidex(info, function_id, 0); +} + +#define _XCR_XFEATURE_ENABLED_MASK 0 +static u64 _xgetbv(u32 index) { + u32 eax, edx; + __asm__ __volatile__("xgetbv" : "=a"(eax), "=d"(edx) : "c"(index)); + return ((u64)edx << 32) | eax; +} + +#endif // ifndef _MSC_VER + +// Detects the various CPU features +static CPUCaps Detect() { + CPUCaps caps = {}; + + caps.num_cores = std::thread::hardware_concurrency(); + + // Assumes the CPU supports the CPUID instruction. Those that don't would likely not support + // Citra at all anyway + + int cpu_id[4]; + memset(caps.brand_string, 0, sizeof(caps.brand_string)); + + // Detect CPU's CPUID capabilities and grab CPU string + __cpuid(cpu_id, 0x00000000); + u32 max_std_fn = cpu_id[0]; // EAX + + std::memcpy(&caps.brand_string[0], &cpu_id[1], sizeof(int)); + std::memcpy(&caps.brand_string[4], &cpu_id[3], sizeof(int)); + std::memcpy(&caps.brand_string[8], &cpu_id[2], sizeof(int)); + + __cpuid(cpu_id, 0x80000000); + + u32 max_ex_fn = cpu_id[0]; + if (!strcmp(caps.brand_string, "GenuineIntel")) + caps.vendor = CPUVendor::INTEL; + else if (!strcmp(caps.brand_string, "AuthenticAMD")) + caps.vendor = CPUVendor::AMD; + else + caps.vendor = CPUVendor::OTHER; + + // Set reasonable default brand string even if brand string not available + strcpy(caps.cpu_string, caps.brand_string); + + // Detect family and other miscellaneous features + if (max_std_fn >= 1) { + __cpuid(cpu_id, 0x00000001); + + if ((cpu_id[3] >> 25) & 1) caps.sse = true; + if ((cpu_id[3] >> 26) & 1) caps.sse2 = true; + if ((cpu_id[2]) & 1) caps.sse3 = true; + if ((cpu_id[2] >> 9) & 1) caps.ssse3 = true; + if 
((cpu_id[2] >> 19) & 1) caps.sse4_1 = true; + if ((cpu_id[2] >> 20) & 1) caps.sse4_2 = true; + if ((cpu_id[2] >> 22) & 1) caps.movbe = true; + if ((cpu_id[2] >> 25) & 1) caps.aes = true; + + if ((cpu_id[3] >> 24) & 1) { + caps.fxsave_fxrstor = true; + } + + // AVX support requires 3 separate checks: + // - Is the AVX bit set in CPUID? + // - Is the XSAVE bit set in CPUID? + // - XGETBV result has the XCR bit set. + if (((cpu_id[2] >> 28) & 1) && ((cpu_id[2] >> 27) & 1)) { + if ((_xgetbv(_XCR_XFEATURE_ENABLED_MASK) & 0x6) == 0x6) { + caps.avx = true; + if ((cpu_id[2] >> 12) & 1) + caps.fma = true; + } + } + + if (max_std_fn >= 7) { + __cpuidex(cpu_id, 0x00000007, 0x00000000); + // Can't enable AVX2 unless the XSAVE/XGETBV checks above passed + if ((cpu_id[1] >> 5) & 1) + caps.avx2 = caps.avx; + if ((cpu_id[1] >> 3) & 1) + caps.bmi1 = true; + if ((cpu_id[1] >> 8) & 1) + caps.bmi2 = true; + } + } + + caps.flush_to_zero = caps.sse; + + if (max_ex_fn >= 0x80000004) { + // Extract CPU model string + __cpuid(cpu_id, 0x80000002); + std::memcpy(caps.cpu_string, cpu_id, sizeof(cpu_id)); + __cpuid(cpu_id, 0x80000003); + std::memcpy(caps.cpu_string + 16, cpu_id, sizeof(cpu_id)); + __cpuid(cpu_id, 0x80000004); + std::memcpy(caps.cpu_string + 32, cpu_id, sizeof(cpu_id)); + } + + if (max_ex_fn >= 0x80000001) { + // Check for more features + __cpuid(cpu_id, 0x80000001); + if (cpu_id[2] & 1) caps.lahf_sahf_64 = true; + if ((cpu_id[2] >> 5) & 1) caps.lzcnt = true; + if ((cpu_id[2] >> 16) & 1) caps.fma4 = true; + if ((cpu_id[3] >> 29) & 1) caps.long_mode = true; + } + + return caps; +} + +const CPUCaps& GetCPUCaps() { + static CPUCaps caps = Detect(); + return caps; +} + +std::string GetCPUCapsString() { + auto caps = GetCPUCaps(); + + std::string sum(caps.cpu_string); + sum += " ("; + sum += caps.brand_string; + sum += ")"; + + if (caps.sse) sum += ", SSE"; + if (caps.sse2) { + sum += ", SSE2"; + if (!caps.flush_to_zero) sum += " (without DAZ)"; + } + + if (caps.sse3) sum += ", 
SSE3"; + if (caps.ssse3) sum += ", SSSE3"; + if (caps.sse4_1) sum += ", SSE4.1"; + if (caps.sse4_2) sum += ", SSE4.2"; + if (caps.avx) sum += ", AVX"; + if (caps.avx2) sum += ", AVX2"; + if (caps.bmi1) sum += ", BMI1"; + if (caps.bmi2) sum += ", BMI2"; + if (caps.fma) sum += ", FMA"; + if (caps.aes) sum += ", AES"; + if (caps.movbe) sum += ", MOVBE"; + if (caps.long_mode) sum += ", 64-bit support"; + + return sum; +} + +} // namespace Common diff --git a/src/common/x64/cpu_detect.h b/src/common/x64/cpu_detect.h new file mode 100644 index 000000000..0af3a8adb --- /dev/null +++ b/src/common/x64/cpu_detect.h @@ -0,0 +1,66 @@ +// Copyright 2013 Dolphin Emulator Project / 2015 Citra Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <string> + +namespace Common { + +/// x86/x64 CPU vendors that may be detected by this module +enum class CPUVendor { + INTEL, + AMD, + OTHER, +}; + +/// x86/x64 CPU capabilities that may be detected by this module +struct CPUCaps { + CPUVendor vendor; + char cpu_string[0x21]; + char brand_string[0x41]; + int num_cores; + bool sse; + bool sse2; + bool sse3; + bool ssse3; + bool sse4_1; + bool sse4_2; + bool lzcnt; + bool avx; + bool avx2; + bool bmi1; + bool bmi2; + bool fma; + bool fma4; + bool aes; + + // Support for the FXSAVE and FXRSTOR instructions + bool fxsave_fxrstor; + + bool movbe; + + // This flag indicates that the hardware supports some mode in which denormal inputs and outputs + // are automatically set to (signed) zero. 
+ bool flush_to_zero; + + // Support for LAHF and SAHF instructions in 64-bit mode + bool lahf_sahf_64; + + bool long_mode; +}; + +/** + * Gets the supported capabilities of the host CPU + * @return Reference to a CPUCaps struct with the detected host CPU capabilities + */ +const CPUCaps& GetCPUCaps(); + +/** + * Gets a string summary of the name and supported capabilities of the host CPU + * @return String summary + */ +std::string GetCPUCapsString(); + +} // namespace Common diff --git a/src/common/x64/emitter.cpp b/src/common/x64/emitter.cpp new file mode 100644 index 000000000..4b79acd1f --- /dev/null +++ b/src/common/x64/emitter.cpp @@ -0,0 +1,1989 @@ +// Copyright (C) 2003 Dolphin Project. + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 2.0 or later versions. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License 2.0 for more details. + +// A copy of the GPL 2.0 should have been included with the program. 
+// If not, see http://www.gnu.org/licenses/ + +// Official SVN repository and contact information can be found at +// http://code.google.com/p/dolphin-emu/ + +#include <cstring> + +#include "common/assert.h" +#include "common/logging/log.h" +#include "common/memory_util.h" + +#include "abi.h" +#include "cpu_detect.h" +#include "emitter.h" + +#define PRIx64 "llx" + +// Minimize the diff against Dolphin +#define DYNA_REC JIT + +namespace Gen +{ + +struct NormalOpDef +{ + u8 toRm8, toRm32, fromRm8, fromRm32, imm8, imm32, simm8, eaximm8, eaximm32, ext; +}; + +// 0xCC is code for invalid combination of immediates +static const NormalOpDef normalops[11] = +{ + {0x00, 0x01, 0x02, 0x03, 0x80, 0x81, 0x83, 0x04, 0x05, 0}, //ADD + {0x10, 0x11, 0x12, 0x13, 0x80, 0x81, 0x83, 0x14, 0x15, 2}, //ADC + + {0x28, 0x29, 0x2A, 0x2B, 0x80, 0x81, 0x83, 0x2C, 0x2D, 5}, //SUB + {0x18, 0x19, 0x1A, 0x1B, 0x80, 0x81, 0x83, 0x1C, 0x1D, 3}, //SBB + + {0x20, 0x21, 0x22, 0x23, 0x80, 0x81, 0x83, 0x24, 0x25, 4}, //AND + {0x08, 0x09, 0x0A, 0x0B, 0x80, 0x81, 0x83, 0x0C, 0x0D, 1}, //OR + + {0x30, 0x31, 0x32, 0x33, 0x80, 0x81, 0x83, 0x34, 0x35, 6}, //XOR + {0x88, 0x89, 0x8A, 0x8B, 0xC6, 0xC7, 0xCC, 0xCC, 0xCC, 0}, //MOV + + {0x84, 0x85, 0x84, 0x85, 0xF6, 0xF7, 0xCC, 0xA8, 0xA9, 0}, //TEST (to == from) + {0x38, 0x39, 0x3A, 0x3B, 0x80, 0x81, 0x83, 0x3C, 0x3D, 7}, //CMP + + {0x86, 0x87, 0x86, 0x87, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 7}, //XCHG +}; + +enum NormalSSEOps +{ + sseCMP = 0xC2, + sseADD = 0x58, //ADD + sseSUB = 0x5C, //SUB + sseAND = 0x54, //AND + sseANDN = 0x55, //ANDN + sseOR = 0x56, + sseXOR = 0x57, + sseMUL = 0x59, //MUL + sseDIV = 0x5E, //DIV + sseMIN = 0x5D, //MIN + sseMAX = 0x5F, //MAX + sseCOMIS = 0x2F, //COMIS + sseUCOMIS = 0x2E, //UCOMIS + sseSQRT = 0x51, //SQRT + sseRSQRT = 0x52, //RSQRT (NO DOUBLE PRECISION!!!) 
+ sseRCP = 0x53, //RCP + sseMOVAPfromRM = 0x28, //MOVAP from RM + sseMOVAPtoRM = 0x29, //MOVAP to RM + sseMOVUPfromRM = 0x10, //MOVUP from RM + sseMOVUPtoRM = 0x11, //MOVUP to RM + sseMOVLPfromRM= 0x12, + sseMOVLPtoRM = 0x13, + sseMOVHPfromRM= 0x16, + sseMOVHPtoRM = 0x17, + sseMOVHLPS = 0x12, + sseMOVLHPS = 0x16, + sseMOVDQfromRM = 0x6F, + sseMOVDQtoRM = 0x7F, + sseMASKMOVDQU = 0xF7, + sseLDDQU = 0xF0, + sseSHUF = 0xC6, + sseMOVNTDQ = 0xE7, + sseMOVNTP = 0x2B, + sseHADD = 0x7C, +}; + + +void XEmitter::SetCodePtr(u8 *ptr) +{ + code = ptr; +} + +const u8 *XEmitter::GetCodePtr() const +{ + return code; +} + +u8 *XEmitter::GetWritableCodePtr() +{ + return code; +} + +void XEmitter::ReserveCodeSpace(int bytes) +{ + for (int i = 0; i < bytes; i++) + *code++ = 0xCC; +} + +const u8 *XEmitter::AlignCode4() +{ + int c = int((u64)code & 3); + if (c) + ReserveCodeSpace(4-c); + return code; +} + +const u8 *XEmitter::AlignCode16() +{ + int c = int((u64)code & 15); + if (c) + ReserveCodeSpace(16-c); + return code; +} + +const u8 *XEmitter::AlignCodePage() +{ + int c = int((u64)code & 4095); + if (c) + ReserveCodeSpace(4096-c); + return code; +} + +// This operation modifies flags; check to see the flags are locked. +// If the flags are locked, we should immediately and loudly fail before +// causing a subtle JIT bug. +void XEmitter::CheckFlags() +{ + ASSERT_MSG(!flags_locked, "Attempt to modify flags while flags locked!"); +} + +void XEmitter::WriteModRM(int mod, int reg, int rm) +{ + Write8((u8)((mod << 6) | ((reg & 7) << 3) | (rm & 7))); +} + +void XEmitter::WriteSIB(int scale, int index, int base) +{ + Write8((u8)((scale << 6) | ((index & 7) << 3) | (base & 7))); +} + +void OpArg::WriteRex(XEmitter *emit, int opBits, int bits, int customOp) const +{ + if (customOp == -1) customOp = operandReg; +#ifdef ARCHITECTURE_x86_64 + u8 op = 0x40; + // REX.W (whether operation is a 64-bit operation) + if (opBits == 64) op |= 8; + // REX.R (whether ModR/M reg field refers to R8-R15. 
+ if (customOp & 8) op |= 4; + // REX.X (whether ModR/M SIB index field refers to R8-R15) + if (indexReg & 8) op |= 2; + // REX.B (whether ModR/M rm or SIB base or opcode reg field refers to R8-R15) + if (offsetOrBaseReg & 8) op |= 1; + // Write REX if wr have REX bits to write, or if the operation accesses + // SIL, DIL, BPL, or SPL. + if (op != 0x40 || + (scale == SCALE_NONE && bits == 8 && (offsetOrBaseReg & 0x10c) == 4) || + (opBits == 8 && (customOp & 0x10c) == 4)) + { + emit->Write8(op); + // Check the operation doesn't access AH, BH, CH, or DH. + DEBUG_ASSERT((offsetOrBaseReg & 0x100) == 0); + DEBUG_ASSERT((customOp & 0x100) == 0); + } +#else + DEBUG_ASSERT(opBits != 64); + DEBUG_ASSERT((customOp & 8) == 0 || customOp == -1); + DEBUG_ASSERT((indexReg & 8) == 0); + DEBUG_ASSERT((offsetOrBaseReg & 8) == 0); + DEBUG_ASSERT(opBits != 8 || (customOp & 0x10c) != 4 || customOp == -1); + DEBUG_ASSERT(scale == SCALE_ATREG || bits != 8 || (offsetOrBaseReg & 0x10c) != 4); +#endif +} + +void OpArg::WriteVex(XEmitter* emit, X64Reg regOp1, X64Reg regOp2, int L, int pp, int mmmmm, int W) const +{ + int R = !(regOp1 & 8); + int X = !(indexReg & 8); + int B = !(offsetOrBaseReg & 8); + + int vvvv = (regOp2 == X64Reg::INVALID_REG) ? 0xf : (regOp2 ^ 0xf); + + // do we need any VEX fields that only appear in the three-byte form? 
+ if (X == 1 && B == 1 && W == 0 && mmmmm == 1) + { + u8 RvvvvLpp = (R << 7) | (vvvv << 3) | (L << 1) | pp; + emit->Write8(0xC5); + emit->Write8(RvvvvLpp); + } + else + { + u8 RXBmmmmm = (R << 7) | (X << 6) | (B << 5) | mmmmm; + u8 WvvvvLpp = (W << 7) | (vvvv << 3) | (L << 1) | pp; + emit->Write8(0xC4); + emit->Write8(RXBmmmmm); + emit->Write8(WvvvvLpp); + } +} + +void OpArg::WriteRest(XEmitter *emit, int extraBytes, X64Reg _operandReg, + bool warn_64bit_offset) const +{ + if (_operandReg == INVALID_REG) + _operandReg = (X64Reg)this->operandReg; + int mod = 0; + int ireg = indexReg; + bool SIB = false; + int _offsetOrBaseReg = this->offsetOrBaseReg; + + if (scale == SCALE_RIP) //Also, on 32-bit, just an immediate address + { + // Oh, RIP addressing. + _offsetOrBaseReg = 5; + emit->WriteModRM(0, _operandReg, _offsetOrBaseReg); + //TODO : add some checks +#ifdef ARCHITECTURE_x86_64 + u64 ripAddr = (u64)emit->GetCodePtr() + 4 + extraBytes; + s64 distance = (s64)offset - (s64)ripAddr; + ASSERT_MSG( + (distance < 0x80000000LL && + distance >= -0x80000000LL) || + !warn_64bit_offset, + "WriteRest: op out of range (0x%" PRIx64 " uses 0x%" PRIx64 ")", + ripAddr, offset); + s32 offs = (s32)distance; + emit->Write32((u32)offs); +#else + emit->Write32((u32)offset); +#endif + return; + } + + if (scale == 0) + { + // Oh, no memory, Just a reg. + mod = 3; //11 + } + else if (scale >= 1) + { + //Ah good, no scaling. + if (scale == SCALE_ATREG && !((_offsetOrBaseReg & 7) == 4 || (_offsetOrBaseReg & 7) == 5)) + { + //Okay, we're good. No SIB necessary. 
+ int ioff = (int)offset; + if (ioff == 0) + { + mod = 0; + } + else if (ioff<-128 || ioff>127) + { + mod = 2; //32-bit displacement + } + else + { + mod = 1; //8-bit displacement + } + } + else if (scale >= SCALE_NOBASE_2 && scale <= SCALE_NOBASE_8) + { + SIB = true; + mod = 0; + _offsetOrBaseReg = 5; + } + else //if (scale != SCALE_ATREG) + { + if ((_offsetOrBaseReg & 7) == 4) //this would occupy the SIB encoding :( + { + //So we have to fake it with SIB encoding :( + SIB = true; + } + + if (scale >= SCALE_1 && scale < SCALE_ATREG) + { + SIB = true; + } + + if (scale == SCALE_ATREG && ((_offsetOrBaseReg & 7) == 4)) + { + SIB = true; + ireg = _offsetOrBaseReg; + } + + //Okay, we're fine. Just disp encoding. + //We need displacement. Which size? + int ioff = (int)(s64)offset; + if (ioff < -128 || ioff > 127) + { + mod = 2; //32-bit displacement + } + else + { + mod = 1; //8-bit displacement + } + } + } + + // Okay. Time to do the actual writing + // ModRM byte: + int oreg = _offsetOrBaseReg; + if (SIB) + oreg = 4; + + // TODO(ector): WTF is this if about? 
I don't remember writing it :-) + //if (RIP) + // oreg = 5; + + emit->WriteModRM(mod, _operandReg&7, oreg&7); + + if (SIB) + { + //SIB byte + int ss; + switch (scale) + { + case SCALE_NONE: _offsetOrBaseReg = 4; ss = 0; break; //RSP + case SCALE_1: ss = 0; break; + case SCALE_2: ss = 1; break; + case SCALE_4: ss = 2; break; + case SCALE_8: ss = 3; break; + case SCALE_NOBASE_2: ss = 1; break; + case SCALE_NOBASE_4: ss = 2; break; + case SCALE_NOBASE_8: ss = 3; break; + case SCALE_ATREG: ss = 0; break; + default: ASSERT_MSG(0, "Invalid scale for SIB byte"); ss = 0; break; + } + emit->Write8((u8)((ss << 6) | ((ireg&7)<<3) | (_offsetOrBaseReg&7))); + } + + if (mod == 1) //8-bit disp + { + emit->Write8((u8)(s8)(s32)offset); + } + else if (mod == 2 || (scale >= SCALE_NOBASE_2 && scale <= SCALE_NOBASE_8)) //32-bit disp + { + emit->Write32((u32)offset); + } +} + +// W = operand extended width (1 if 64-bit) +// R = register# upper bit +// X = scale amnt upper bit +// B = base register# upper bit +void XEmitter::Rex(int w, int r, int x, int b) +{ + w = w ? 1 : 0; + r = r ? 1 : 0; + x = x ? 1 : 0; + b = b ? 
1 : 0; + u8 rx = (u8)(0x40 | (w << 3) | (r << 2) | (x << 1) | (b)); + if (rx != 0x40) + Write8(rx); +} + +void XEmitter::JMP(const u8 *addr, bool force5Bytes) +{ + u64 fn = (u64)addr; + if (!force5Bytes) + { + s64 distance = (s64)(fn - ((u64)code + 2)); + ASSERT_MSG(distance >= -0x80 && distance < 0x80, + "Jump target too far away, needs force5Bytes = true"); + //8 bits will do + Write8(0xEB); + Write8((u8)(s8)distance); + } + else + { + s64 distance = (s64)(fn - ((u64)code + 5)); + + ASSERT_MSG( + distance >= -0x80000000LL && distance < 0x80000000LL, + "Jump target too far away, needs indirect register"); + Write8(0xE9); + Write32((u32)(s32)distance); + } +} + +void XEmitter::JMPptr(const OpArg &arg2) +{ + OpArg arg = arg2; + if (arg.IsImm()) ASSERT_MSG(0, "JMPptr - Imm argument"); + arg.operandReg = 4; + arg.WriteRex(this, 0, 0); + Write8(0xFF); + arg.WriteRest(this); +} + +//Can be used to trap other processors, before overwriting their code +// not used in dolphin +void XEmitter::JMPself() +{ + Write8(0xEB); + Write8(0xFE); +} + +void XEmitter::CALLptr(OpArg arg) +{ + if (arg.IsImm()) ASSERT_MSG(0, "CALLptr - Imm argument"); + arg.operandReg = 2; + arg.WriteRex(this, 0, 0); + Write8(0xFF); + arg.WriteRest(this); +} + +void XEmitter::CALL(const void *fnptr) +{ + u64 distance = u64(fnptr) - (u64(code) + 5); + ASSERT_MSG( + distance < 0x0000000080000000ULL || + distance >= 0xFFFFFFFF80000000ULL, + "CALL out of range (%p calls %p)", code, fnptr); + Write8(0xE8); + Write32(u32(distance)); +} + +FixupBranch XEmitter::J(bool force5bytes) +{ + FixupBranch branch; + branch.type = force5bytes ? 1 : 0; + branch.ptr = code + (force5bytes ? 5 : 2); + if (!force5bytes) + { + //8 bits will do + Write8(0xEB); + Write8(0); + } + else + { + Write8(0xE9); + Write32(0); + } + return branch; +} + +FixupBranch XEmitter::J_CC(CCFlags conditionCode, bool force5bytes) +{ + FixupBranch branch; + branch.type = force5bytes ? 1 : 0; + branch.ptr = code + (force5bytes ? 
6 : 2); + if (!force5bytes) + { + //8 bits will do + Write8(0x70 + conditionCode); + Write8(0); + } + else + { + Write8(0x0F); + Write8(0x80 + conditionCode); + Write32(0); + } + return branch; +} + +void XEmitter::J_CC(CCFlags conditionCode, const u8* addr, bool force5bytes) +{ + u64 fn = (u64)addr; + s64 distance = (s64)(fn - ((u64)code + 2)); + if (distance < -0x80 || distance >= 0x80 || force5bytes) + { + distance = (s64)(fn - ((u64)code + 6)); + ASSERT_MSG( + distance >= -0x80000000LL && distance < 0x80000000LL, + "Jump target too far away, needs indirect register"); + Write8(0x0F); + Write8(0x80 + conditionCode); + Write32((u32)(s32)distance); + } + else + { + Write8(0x70 + conditionCode); + Write8((u8)(s8)distance); + } +} + +void XEmitter::SetJumpTarget(const FixupBranch &branch) +{ + if (branch.type == 0) + { + s64 distance = (s64)(code - branch.ptr); + ASSERT_MSG(distance >= -0x80 && distance < 0x80, "Jump target too far away, needs force5Bytes = true"); + branch.ptr[-1] = (u8)(s8)distance; + } + else if (branch.type == 1) + { + s64 distance = (s64)(code - branch.ptr); + ASSERT_MSG(distance >= -0x80000000LL && distance < 0x80000000LL, "Jump target too far away, needs indirect register"); + ((s32*)branch.ptr)[-1] = (s32)distance; + } +} + +// INC/DEC considered harmful on newer CPUs due to partial flag set. +// Use ADD, SUB instead. + +/* +void XEmitter::INC(int bits, OpArg arg) +{ + if (arg.IsImm()) ASSERT_MSG(0, "INC - Imm argument"); + arg.operandReg = 0; + if (bits == 16) {Write8(0x66);} + arg.WriteRex(this, bits, bits); + Write8(bits == 8 ? 0xFE : 0xFF); + arg.WriteRest(this); +} +void XEmitter::DEC(int bits, OpArg arg) +{ + if (arg.IsImm()) ASSERT_MSG(0, "DEC - Imm argument"); + arg.operandReg = 1; + if (bits == 16) {Write8(0x66);} + arg.WriteRex(this, bits, bits); + Write8(bits == 8 ? 0xFE : 0xFF); + arg.WriteRest(this); +} +*/ + +//Single byte opcodes +//There is no PUSHAD/POPAD in 64-bit mode. 
+void XEmitter::INT3() {Write8(0xCC);} +void XEmitter::RET() {Write8(0xC3);} +void XEmitter::RET_FAST() {Write8(0xF3); Write8(0xC3);} //two-byte return (rep ret) - recommended by AMD optimization manual for the case of jumping to a ret + +// The first sign of decadence: optimized NOPs. +void XEmitter::NOP(size_t size) +{ + DEBUG_ASSERT((int)size > 0); + while (true) + { + switch (size) + { + case 0: + return; + case 1: + Write8(0x90); + return; + case 2: + Write8(0x66); Write8(0x90); + return; + case 3: + Write8(0x0F); Write8(0x1F); Write8(0x00); + return; + case 4: + Write8(0x0F); Write8(0x1F); Write8(0x40); Write8(0x00); + return; + case 5: + Write8(0x0F); Write8(0x1F); Write8(0x44); Write8(0x00); + Write8(0x00); + return; + case 6: + Write8(0x66); Write8(0x0F); Write8(0x1F); Write8(0x44); + Write8(0x00); Write8(0x00); + return; + case 7: + Write8(0x0F); Write8(0x1F); Write8(0x80); Write8(0x00); + Write8(0x00); Write8(0x00); Write8(0x00); + return; + case 8: + Write8(0x0F); Write8(0x1F); Write8(0x84); Write8(0x00); + Write8(0x00); Write8(0x00); Write8(0x00); Write8(0x00); + return; + case 9: + Write8(0x66); Write8(0x0F); Write8(0x1F); Write8(0x84); + Write8(0x00); Write8(0x00); Write8(0x00); Write8(0x00); + Write8(0x00); + return; + case 10: + Write8(0x66); Write8(0x66); Write8(0x0F); Write8(0x1F); + Write8(0x84); Write8(0x00); Write8(0x00); Write8(0x00); + Write8(0x00); Write8(0x00); + return; + default: + // Even though x86 instructions are allowed to be up to 15 bytes long, + // AMD advises against using NOPs longer than 11 bytes because they + // carry a performance penalty on CPUs older than AMD family 16h. 
+ Write8(0x66); Write8(0x66); Write8(0x66); Write8(0x0F); + Write8(0x1F); Write8(0x84); Write8(0x00); Write8(0x00); + Write8(0x00); Write8(0x00); Write8(0x00); + size -= 11; + continue; + } + } +} + +void XEmitter::PAUSE() {Write8(0xF3); NOP();} //use in tight spinloops for energy saving on some cpu +void XEmitter::CLC() {CheckFlags(); Write8(0xF8);} //clear carry +void XEmitter::CMC() {CheckFlags(); Write8(0xF5);} //flip carry +void XEmitter::STC() {CheckFlags(); Write8(0xF9);} //set carry + +//TODO: xchg ah, al ??? +void XEmitter::XCHG_AHAL() +{ + Write8(0x86); + Write8(0xe0); + // alt. 86 c4 +} + +//These two can not be executed on early Intel 64-bit CPU:s, only on AMD! +void XEmitter::LAHF() {Write8(0x9F);} +void XEmitter::SAHF() {CheckFlags(); Write8(0x9E);} + +void XEmitter::PUSHF() {Write8(0x9C);} +void XEmitter::POPF() {CheckFlags(); Write8(0x9D);} + +void XEmitter::LFENCE() {Write8(0x0F); Write8(0xAE); Write8(0xE8);} +void XEmitter::MFENCE() {Write8(0x0F); Write8(0xAE); Write8(0xF0);} +void XEmitter::SFENCE() {Write8(0x0F); Write8(0xAE); Write8(0xF8);} + +void XEmitter::WriteSimple1Byte(int bits, u8 byte, X64Reg reg) +{ + if (bits == 16) + Write8(0x66); + Rex(bits == 64, 0, 0, (int)reg >> 3); + Write8(byte + ((int)reg & 7)); +} + +void XEmitter::WriteSimple2Byte(int bits, u8 byte1, u8 byte2, X64Reg reg) +{ + if (bits == 16) + Write8(0x66); + Rex(bits==64, 0, 0, (int)reg >> 3); + Write8(byte1); + Write8(byte2 + ((int)reg & 7)); +} + +void XEmitter::CWD(int bits) +{ + if (bits == 16) + Write8(0x66); + Rex(bits == 64, 0, 0, 0); + Write8(0x99); +} + +void XEmitter::CBW(int bits) +{ + if (bits == 8) + Write8(0x66); + Rex(bits == 32, 0, 0, 0); + Write8(0x98); +} + +//Simple opcodes + + +//push/pop do not need wide to be 64-bit +void XEmitter::PUSH(X64Reg reg) {WriteSimple1Byte(32, 0x50, reg);} +void XEmitter::POP(X64Reg reg) {WriteSimple1Byte(32, 0x58, reg);} + +void XEmitter::PUSH(int bits, const OpArg ®) +{ + if (reg.IsSimpleReg()) + PUSH(reg.GetSimpleReg()); 
+ else if (reg.IsImm()) + { + switch (reg.GetImmBits()) + { + case 8: + Write8(0x6A); + Write8((u8)(s8)reg.offset); + break; + case 16: + Write8(0x66); + Write8(0x68); + Write16((u16)(s16)(s32)reg.offset); + break; + case 32: + Write8(0x68); + Write32((u32)reg.offset); + break; + default: + ASSERT_MSG(0, "PUSH - Bad imm bits"); + break; + } + } + else + { + if (bits == 16) + Write8(0x66); + reg.WriteRex(this, bits, bits); + Write8(0xFF); + reg.WriteRest(this, 0, (X64Reg)6); + } +} + +void XEmitter::POP(int /*bits*/, const OpArg ®) +{ + if (reg.IsSimpleReg()) + POP(reg.GetSimpleReg()); + else + ASSERT_MSG(0, "POP - Unsupported encoding"); +} + +void XEmitter::BSWAP(int bits, X64Reg reg) +{ + if (bits >= 32) + { + WriteSimple2Byte(bits, 0x0F, 0xC8, reg); + } + else if (bits == 16) + { + ROL(16, R(reg), Imm8(8)); + } + else if (bits == 8) + { + // Do nothing - can't bswap a single byte... + } + else + { + ASSERT_MSG(0, "BSWAP - Wrong number of bits"); + } +} + +// Undefined opcode - reserved +// If we ever need a way to always cause a non-breakpoint hard exception... 
+void XEmitter::UD2() +{ + Write8(0x0F); + Write8(0x0B); +} + +void XEmitter::PREFETCH(PrefetchLevel level, OpArg arg) +{ + ASSERT_MSG(!arg.IsImm(), "PREFETCH - Imm argument"); + arg.operandReg = (u8)level; + arg.WriteRex(this, 0, 0); + Write8(0x0F); + Write8(0x18); + arg.WriteRest(this); +} + +void XEmitter::SETcc(CCFlags flag, OpArg dest) +{ + ASSERT_MSG(!dest.IsImm(), "SETcc - Imm argument"); + dest.operandReg = 0; + dest.WriteRex(this, 0, 8); + Write8(0x0F); + Write8(0x90 + (u8)flag); + dest.WriteRest(this); +} + +void XEmitter::CMOVcc(int bits, X64Reg dest, OpArg src, CCFlags flag) +{ + ASSERT_MSG(!src.IsImm(), "CMOVcc - Imm argument"); + ASSERT_MSG(bits != 8, "CMOVcc - 8 bits unsupported"); + if (bits == 16) + Write8(0x66); + src.operandReg = dest; + src.WriteRex(this, bits, bits); + Write8(0x0F); + Write8(0x40 + (u8)flag); + src.WriteRest(this); +} + +void XEmitter::WriteMulDivType(int bits, OpArg src, int ext) +{ + ASSERT_MSG(!src.IsImm(), "WriteMulDivType - Imm argument"); + CheckFlags(); + src.operandReg = ext; + if (bits == 16) + Write8(0x66); + src.WriteRex(this, bits, bits, 0); + if (bits == 8) + { + Write8(0xF6); + } + else + { + Write8(0xF7); + } + src.WriteRest(this); +} + +void XEmitter::MUL(int bits, OpArg src) {WriteMulDivType(bits, src, 4);} +void XEmitter::DIV(int bits, OpArg src) {WriteMulDivType(bits, src, 6);} +void XEmitter::IMUL(int bits, OpArg src) {WriteMulDivType(bits, src, 5);} +void XEmitter::IDIV(int bits, OpArg src) {WriteMulDivType(bits, src, 7);} +void XEmitter::NEG(int bits, OpArg src) {WriteMulDivType(bits, src, 3);} +void XEmitter::NOT(int bits, OpArg src) {WriteMulDivType(bits, src, 2);} + +void XEmitter::WriteBitSearchType(int bits, X64Reg dest, OpArg src, u8 byte2, bool rep) +{ + ASSERT_MSG(!src.IsImm(), "WriteBitSearchType - Imm argument"); + CheckFlags(); + src.operandReg = (u8)dest; + if (bits == 16) + Write8(0x66); + if (rep) + Write8(0xF3); + src.WriteRex(this, bits, bits); + Write8(0x0F); + Write8(byte2); + 
src.WriteRest(this); +} + +void XEmitter::MOVNTI(int bits, OpArg dest, X64Reg src) +{ + if (bits <= 16) + ASSERT_MSG(0, "MOVNTI - bits<=16"); + WriteBitSearchType(bits, src, dest, 0xC3); +} + +void XEmitter::BSF(int bits, X64Reg dest, OpArg src) {WriteBitSearchType(bits,dest,src,0xBC);} //bottom bit to top bit +void XEmitter::BSR(int bits, X64Reg dest, OpArg src) {WriteBitSearchType(bits,dest,src,0xBD);} //top bit to bottom bit + +void XEmitter::TZCNT(int bits, X64Reg dest, OpArg src) +{ + CheckFlags(); + if (!Common::GetCPUCaps().bmi1) + ASSERT_MSG(0, "Trying to use BMI1 on a system that doesn't support it. Bad programmer."); + WriteBitSearchType(bits, dest, src, 0xBC, true); +} +void XEmitter::LZCNT(int bits, X64Reg dest, OpArg src) +{ + CheckFlags(); + if (!Common::GetCPUCaps().lzcnt) + ASSERT_MSG(0, "Trying to use LZCNT on a system that doesn't support it. Bad programmer."); + WriteBitSearchType(bits, dest, src, 0xBD, true); +} + +void XEmitter::MOVSX(int dbits, int sbits, X64Reg dest, OpArg src) +{ + ASSERT_MSG(!src.IsImm(), "MOVSX - Imm argument"); + if (dbits == sbits) + { + MOV(dbits, R(dest), src); + return; + } + src.operandReg = (u8)dest; + if (dbits == 16) + Write8(0x66); + src.WriteRex(this, dbits, sbits); + if (sbits == 8) + { + Write8(0x0F); + Write8(0xBE); + } + else if (sbits == 16) + { + Write8(0x0F); + Write8(0xBF); + } + else if (sbits == 32 && dbits == 64) + { + Write8(0x63); + } + else + { + Crash(); + } + src.WriteRest(this); +} + +void XEmitter::MOVZX(int dbits, int sbits, X64Reg dest, OpArg src) +{ + ASSERT_MSG(!src.IsImm(), "MOVZX - Imm argument"); + if (dbits == sbits) + { + MOV(dbits, R(dest), src); + return; + } + src.operandReg = (u8)dest; + if (dbits == 16) + Write8(0x66); + //the 32bit result is automatically zero extended to 64bit + src.WriteRex(this, dbits == 64 ? 
32 : dbits, sbits); + if (sbits == 8) + { + Write8(0x0F); + Write8(0xB6); + } + else if (sbits == 16) + { + Write8(0x0F); + Write8(0xB7); + } + else if (sbits == 32 && dbits == 64) + { + Write8(0x8B); + } + else + { + ASSERT_MSG(0, "MOVZX - Invalid size"); + } + src.WriteRest(this); +} + +void XEmitter::MOVBE(int bits, const OpArg& dest, const OpArg& src) +{ + ASSERT_MSG(Common::GetCPUCaps().movbe, "Generating MOVBE on a system that does not support it."); + if (bits == 8) + { + MOV(bits, dest, src); + return; + } + + if (bits == 16) + Write8(0x66); + + if (dest.IsSimpleReg()) + { + ASSERT_MSG(!src.IsSimpleReg() && !src.IsImm(), "MOVBE: Loading from !mem"); + src.WriteRex(this, bits, bits, dest.GetSimpleReg()); + Write8(0x0F); Write8(0x38); Write8(0xF0); + src.WriteRest(this, 0, dest.GetSimpleReg()); + } + else if (src.IsSimpleReg()) + { + ASSERT_MSG(!dest.IsSimpleReg() && !dest.IsImm(), "MOVBE: Storing to !mem"); + dest.WriteRex(this, bits, bits, src.GetSimpleReg()); + Write8(0x0F); Write8(0x38); Write8(0xF1); + dest.WriteRest(this, 0, src.GetSimpleReg()); + } + else + { + ASSERT_MSG(0, "MOVBE: Not loading or storing to mem"); + } +} + + +void XEmitter::LEA(int bits, X64Reg dest, OpArg src) +{ + ASSERT_MSG(!src.IsImm(), "LEA - Imm argument"); + src.operandReg = (u8)dest; + if (bits == 16) + Write8(0x66); //TODO: performance warning + src.WriteRex(this, bits, bits); + Write8(0x8D); + src.WriteRest(this, 0, INVALID_REG, bits == 64); +} + +//shift can be either imm8 or cl +void XEmitter::WriteShift(int bits, OpArg dest, OpArg &shift, int ext) +{ + CheckFlags(); + bool writeImm = false; + if (dest.IsImm()) + { + ASSERT_MSG(0, "WriteShift - can't shift imms"); + } + if ((shift.IsSimpleReg() && shift.GetSimpleReg() != ECX) || (shift.IsImm() && shift.GetImmBits() != 8)) + { + ASSERT_MSG(0, "WriteShift - illegal argument"); + } + dest.operandReg = ext; + if (bits == 16) + Write8(0x66); + dest.WriteRex(this, bits, bits, 0); + if (shift.GetImmBits() == 8) + { + //ok an imm 
+ u8 imm = (u8)shift.offset; + if (imm == 1) + { + Write8(bits == 8 ? 0xD0 : 0xD1); + } + else + { + writeImm = true; + Write8(bits == 8 ? 0xC0 : 0xC1); + } + } + else + { + Write8(bits == 8 ? 0xD2 : 0xD3); + } + dest.WriteRest(this, writeImm ? 1 : 0); + if (writeImm) + Write8((u8)shift.offset); +} + +// large rotates and shift are slower on intel than amd +// intel likes to rotate by 1, and the op is smaller too +void XEmitter::ROL(int bits, OpArg dest, OpArg shift) {WriteShift(bits, dest, shift, 0);} +void XEmitter::ROR(int bits, OpArg dest, OpArg shift) {WriteShift(bits, dest, shift, 1);} +void XEmitter::RCL(int bits, OpArg dest, OpArg shift) {WriteShift(bits, dest, shift, 2);} +void XEmitter::RCR(int bits, OpArg dest, OpArg shift) {WriteShift(bits, dest, shift, 3);} +void XEmitter::SHL(int bits, OpArg dest, OpArg shift) {WriteShift(bits, dest, shift, 4);} +void XEmitter::SHR(int bits, OpArg dest, OpArg shift) {WriteShift(bits, dest, shift, 5);} +void XEmitter::SAR(int bits, OpArg dest, OpArg shift) {WriteShift(bits, dest, shift, 7);} + +// index can be either imm8 or register, don't use memory destination because it's slow +void XEmitter::WriteBitTest(int bits, OpArg &dest, OpArg &index, int ext) +{ + CheckFlags(); + if (dest.IsImm()) + { + ASSERT_MSG(0, "WriteBitTest - can't test imms"); + } + if ((index.IsImm() && index.GetImmBits() != 8)) + { + ASSERT_MSG(0, "WriteBitTest - illegal argument"); + } + if (bits == 16) + Write8(0x66); + if (index.IsImm()) + { + dest.WriteRex(this, bits, bits); + Write8(0x0F); Write8(0xBA); + dest.WriteRest(this, 1, (X64Reg)ext); + Write8((u8)index.offset); + } + else + { + X64Reg operand = index.GetSimpleReg(); + dest.WriteRex(this, bits, bits, operand); + Write8(0x0F); Write8(0x83 + 8*ext); + dest.WriteRest(this, 1, operand); + } +} + +void XEmitter::BT(int bits, OpArg dest, OpArg index) {WriteBitTest(bits, dest, index, 4);} +void XEmitter::BTS(int bits, OpArg dest, OpArg index) {WriteBitTest(bits, dest, index, 5);} +void 
XEmitter::BTR(int bits, OpArg dest, OpArg index) {WriteBitTest(bits, dest, index, 6);} +void XEmitter::BTC(int bits, OpArg dest, OpArg index) {WriteBitTest(bits, dest, index, 7);} + +//shift can be either imm8 or cl +void XEmitter::SHRD(int bits, OpArg dest, OpArg src, OpArg shift) +{ + CheckFlags(); + if (dest.IsImm()) + { + ASSERT_MSG(0, "SHRD - can't use imms as destination"); + } + if (!src.IsSimpleReg()) + { + ASSERT_MSG(0, "SHRD - must use simple register as source"); + } + if ((shift.IsSimpleReg() && shift.GetSimpleReg() != ECX) || (shift.IsImm() && shift.GetImmBits() != 8)) + { + ASSERT_MSG(0, "SHRD - illegal shift"); + } + if (bits == 16) + Write8(0x66); + X64Reg operand = src.GetSimpleReg(); + dest.WriteRex(this, bits, bits, operand); + if (shift.GetImmBits() == 8) + { + Write8(0x0F); Write8(0xAC); + dest.WriteRest(this, 1, operand); + Write8((u8)shift.offset); + } + else + { + Write8(0x0F); Write8(0xAD); + dest.WriteRest(this, 0, operand); + } +} + +void XEmitter::SHLD(int bits, OpArg dest, OpArg src, OpArg shift) +{ + CheckFlags(); + if (dest.IsImm()) + { + ASSERT_MSG(0, "SHLD - can't use imms as destination"); + } + if (!src.IsSimpleReg()) + { + ASSERT_MSG(0, "SHLD - must use simple register as source"); + } + if ((shift.IsSimpleReg() && shift.GetSimpleReg() != ECX) || (shift.IsImm() && shift.GetImmBits() != 8)) + { + ASSERT_MSG(0, "SHLD - illegal shift"); + } + if (bits == 16) + Write8(0x66); + X64Reg operand = src.GetSimpleReg(); + dest.WriteRex(this, bits, bits, operand); + if (shift.GetImmBits() == 8) + { + Write8(0x0F); Write8(0xA4); + dest.WriteRest(this, 1, operand); + Write8((u8)shift.offset); + } + else + { + Write8(0x0F); Write8(0xA5); + dest.WriteRest(this, 0, operand); + } +} + +void OpArg::WriteSingleByteOp(XEmitter *emit, u8 op, X64Reg _operandReg, int bits) +{ + if (bits == 16) + emit->Write8(0x66); + + this->operandReg = (u8)_operandReg; + WriteRex(emit, bits, bits); + emit->Write8(op); + WriteRest(emit); +} + +//operand can either be 
immediate or register +void OpArg::WriteNormalOp(XEmitter *emit, bool toRM, NormalOp op, const OpArg &operand, int bits) const +{ + X64Reg _operandReg; + if (IsImm()) + { + ASSERT_MSG(0, "WriteNormalOp - Imm argument, wrong order"); + } + + if (bits == 16) + emit->Write8(0x66); + + int immToWrite = 0; + + if (operand.IsImm()) + { + WriteRex(emit, bits, bits); + + if (!toRM) + { + ASSERT_MSG(0, "WriteNormalOp - Writing to Imm (!toRM)"); + } + + if (operand.scale == SCALE_IMM8 && bits == 8) + { + // op al, imm8 + if (!scale && offsetOrBaseReg == AL && normalops[op].eaximm8 != 0xCC) + { + emit->Write8(normalops[op].eaximm8); + emit->Write8((u8)operand.offset); + return; + } + // mov reg, imm8 + if (!scale && op == nrmMOV) + { + emit->Write8(0xB0 + (offsetOrBaseReg & 7)); + emit->Write8((u8)operand.offset); + return; + } + // op r/m8, imm8 + emit->Write8(normalops[op].imm8); + immToWrite = 8; + } + else if ((operand.scale == SCALE_IMM16 && bits == 16) || + (operand.scale == SCALE_IMM32 && bits == 32) || + (operand.scale == SCALE_IMM32 && bits == 64)) + { + // Try to save immediate size if we can, but first check to see + // if the instruction supports simm8. 
+ // op r/m, imm8 + if (normalops[op].simm8 != 0xCC && + ((operand.scale == SCALE_IMM16 && (s16)operand.offset == (s8)operand.offset) || + (operand.scale == SCALE_IMM32 && (s32)operand.offset == (s8)operand.offset))) + { + emit->Write8(normalops[op].simm8); + immToWrite = 8; + } + else + { + // mov reg, imm + if (!scale && op == nrmMOV && bits != 64) + { + emit->Write8(0xB8 + (offsetOrBaseReg & 7)); + if (bits == 16) + emit->Write16((u16)operand.offset); + else + emit->Write32((u32)operand.offset); + return; + } + // op eax, imm + if (!scale && offsetOrBaseReg == EAX && normalops[op].eaximm32 != 0xCC) + { + emit->Write8(normalops[op].eaximm32); + if (bits == 16) + emit->Write16((u16)operand.offset); + else + emit->Write32((u32)operand.offset); + return; + } + // op r/m, imm + emit->Write8(normalops[op].imm32); + immToWrite = bits == 16 ? 16 : 32; + } + } + else if ((operand.scale == SCALE_IMM8 && bits == 16) || + (operand.scale == SCALE_IMM8 && bits == 32) || + (operand.scale == SCALE_IMM8 && bits == 64)) + { + // op r/m, imm8 + emit->Write8(normalops[op].simm8); + immToWrite = 8; + } + else if (operand.scale == SCALE_IMM64 && bits == 64) + { + if (scale) + { + ASSERT_MSG(0, "WriteNormalOp - MOV with 64-bit imm requres register destination"); + } + // mov reg64, imm64 + else if (op == nrmMOV) + { + emit->Write8(0xB8 + (offsetOrBaseReg & 7)); + emit->Write64((u64)operand.offset); + return; + } + ASSERT_MSG(0, "WriteNormalOp - Only MOV can take 64-bit imm"); + } + else + { + ASSERT_MSG(0, "WriteNormalOp - Unhandled case"); + } + _operandReg = (X64Reg)normalops[op].ext; //pass extension in REG of ModRM + } + else + { + _operandReg = (X64Reg)operand.offsetOrBaseReg; + WriteRex(emit, bits, bits, _operandReg); + // op r/m, reg + if (toRM) + { + emit->Write8(bits == 8 ? normalops[op].toRm8 : normalops[op].toRm32); + } + // op reg, r/m + else + { + emit->Write8(bits == 8 ? 
normalops[op].fromRm8 : normalops[op].fromRm32); + } + } + WriteRest(emit, immToWrite >> 3, _operandReg); + switch (immToWrite) + { + case 0: + break; + case 8: + emit->Write8((u8)operand.offset); + break; + case 16: + emit->Write16((u16)operand.offset); + break; + case 32: + emit->Write32((u32)operand.offset); + break; + default: + ASSERT_MSG(0, "WriteNormalOp - Unhandled case"); + } +} + +void XEmitter::WriteNormalOp(XEmitter *emit, int bits, NormalOp op, const OpArg &a1, const OpArg &a2) +{ + if (a1.IsImm()) + { + //Booh! Can't write to an imm + ASSERT_MSG(0, "WriteNormalOp - a1 cannot be imm"); + return; + } + if (a2.IsImm()) + { + a1.WriteNormalOp(emit, true, op, a2, bits); + } + else + { + if (a1.IsSimpleReg()) + { + a2.WriteNormalOp(emit, false, op, a1, bits); + } + else + { + ASSERT_MSG(a2.IsSimpleReg() || a2.IsImm(), "WriteNormalOp - a1 and a2 cannot both be memory"); + a1.WriteNormalOp(emit, true, op, a2, bits); + } + } +} + +void XEmitter::ADD (int bits, const OpArg &a1, const OpArg &a2) {CheckFlags(); WriteNormalOp(this, bits, nrmADD, a1, a2);} +void XEmitter::ADC (int bits, const OpArg &a1, const OpArg &a2) {CheckFlags(); WriteNormalOp(this, bits, nrmADC, a1, a2);} +void XEmitter::SUB (int bits, const OpArg &a1, const OpArg &a2) {CheckFlags(); WriteNormalOp(this, bits, nrmSUB, a1, a2);} +void XEmitter::SBB (int bits, const OpArg &a1, const OpArg &a2) {CheckFlags(); WriteNormalOp(this, bits, nrmSBB, a1, a2);} +void XEmitter::AND (int bits, const OpArg &a1, const OpArg &a2) {CheckFlags(); WriteNormalOp(this, bits, nrmAND, a1, a2);} +void XEmitter::OR (int bits, const OpArg &a1, const OpArg &a2) {CheckFlags(); WriteNormalOp(this, bits, nrmOR , a1, a2);} +void XEmitter::XOR (int bits, const OpArg &a1, const OpArg &a2) {CheckFlags(); WriteNormalOp(this, bits, nrmXOR, a1, a2);} +void XEmitter::MOV (int bits, const OpArg &a1, const OpArg &a2) +{ + if (a1.IsSimpleReg() && a2.IsSimpleReg() && a1.GetSimpleReg() == a2.GetSimpleReg()) + LOG_ERROR(Common, 
"Redundant MOV @ %p - bug in JIT?", code); + WriteNormalOp(this, bits, nrmMOV, a1, a2); +} +void XEmitter::TEST(int bits, const OpArg &a1, const OpArg &a2) {CheckFlags(); WriteNormalOp(this, bits, nrmTEST, a1, a2);} +void XEmitter::CMP (int bits, const OpArg &a1, const OpArg &a2) {CheckFlags(); WriteNormalOp(this, bits, nrmCMP, a1, a2);} +void XEmitter::XCHG(int bits, const OpArg &a1, const OpArg &a2) {WriteNormalOp(this, bits, nrmXCHG, a1, a2);} + +void XEmitter::IMUL(int bits, X64Reg regOp, OpArg a1, OpArg a2) +{ + CheckFlags(); + if (bits == 8) + { + ASSERT_MSG(0, "IMUL - illegal bit size!"); + return; + } + + if (a1.IsImm()) + { + ASSERT_MSG(0, "IMUL - second arg cannot be imm!"); + return; + } + + if (!a2.IsImm()) + { + ASSERT_MSG(0, "IMUL - third arg must be imm!"); + return; + } + + if (bits == 16) + Write8(0x66); + a1.WriteRex(this, bits, bits, regOp); + + if (a2.GetImmBits() == 8 || + (a2.GetImmBits() == 16 && (s8)a2.offset == (s16)a2.offset) || + (a2.GetImmBits() == 32 && (s8)a2.offset == (s32)a2.offset)) + { + Write8(0x6B); + a1.WriteRest(this, 1, regOp); + Write8((u8)a2.offset); + } + else + { + Write8(0x69); + if (a2.GetImmBits() == 16 && bits == 16) + { + a1.WriteRest(this, 2, regOp); + Write16((u16)a2.offset); + } + else if (a2.GetImmBits() == 32 && (bits == 32 || bits == 64)) + { + a1.WriteRest(this, 4, regOp); + Write32((u32)a2.offset); + } + else + { + ASSERT_MSG(0, "IMUL - unhandled case!"); + } + } +} + +void XEmitter::IMUL(int bits, X64Reg regOp, OpArg a) +{ + CheckFlags(); + if (bits == 8) + { + ASSERT_MSG(0, "IMUL - illegal bit size!"); + return; + } + + if (a.IsImm()) + { + IMUL(bits, regOp, R(regOp), a) ; + return; + } + + if (bits == 16) + Write8(0x66); + a.WriteRex(this, bits, bits, regOp); + Write8(0x0F); + Write8(0xAF); + a.WriteRest(this, 0, regOp); +} + + +void XEmitter::WriteSSEOp(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes) +{ + if (opPrefix) + Write8(opPrefix); + arg.operandReg = regOp; + arg.WriteRex(this, 0, 0); 
+ Write8(0x0F); + if (op > 0xFF) + Write8((op >> 8) & 0xFF); + Write8(op & 0xFF); + arg.WriteRest(this, extrabytes); +} + +void XEmitter::WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes) +{ + WriteAVXOp(opPrefix, op, regOp, INVALID_REG, arg, extrabytes); +} + +static int GetVEXmmmmm(u16 op) +{ + // Currently, only 0x38 and 0x3A are used as secondary escape byte. + if ((op >> 8) == 0x3A) + return 3; + else if ((op >> 8) == 0x38) + return 2; + else + return 1; +} + +static int GetVEXpp(u8 opPrefix) +{ + if (opPrefix == 0x66) + return 1; + else if (opPrefix == 0xF3) + return 2; + else if (opPrefix == 0xF2) + return 3; + else + return 0; +} + +void XEmitter::WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes) +{ + if (!Common::GetCPUCaps().avx) + ASSERT_MSG(0, "Trying to use AVX on a system that doesn't support it. Bad programmer."); + int mmmmm = GetVEXmmmmm(op); + int pp = GetVEXpp(opPrefix); + // FIXME: we currently don't support 256-bit instructions, and "size" is not the vector size here + arg.WriteVex(this, regOp1, regOp2, 0, pp, mmmmm); + Write8(op & 0xFF); + arg.WriteRest(this, extrabytes, regOp1); +} + +// Like the above, but more general; covers GPR-based VEX operations, like BMI1/2 +void XEmitter::WriteVEXOp(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes) +{ + if (size != 32 && size != 64) + ASSERT_MSG(0, "VEX GPR instructions only support 32-bit and 64-bit modes!"); + int mmmmm = GetVEXmmmmm(op); + int pp = GetVEXpp(opPrefix); + arg.WriteVex(this, regOp1, regOp2, 0, pp, mmmmm, size == 64); + Write8(op & 0xFF); + arg.WriteRest(this, extrabytes, regOp1); +} + +void XEmitter::WriteBMI1Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes) +{ + CheckFlags(); + if (!Common::GetCPUCaps().bmi1) + ASSERT_MSG(0, "Trying to use BMI1 on a system that doesn't support it. 
Bad programmer."); + WriteVEXOp(size, opPrefix, op, regOp1, regOp2, arg, extrabytes); +} + +void XEmitter::WriteBMI2Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes) +{ + CheckFlags(); + if (!Common::GetCPUCaps().bmi2) + ASSERT_MSG(0, "Trying to use BMI2 on a system that doesn't support it. Bad programmer."); + WriteVEXOp(size, opPrefix, op, regOp1, regOp2, arg, extrabytes); +} + +void XEmitter::MOVD_xmm(X64Reg dest, const OpArg &arg) {WriteSSEOp(0x66, 0x6E, dest, arg, 0);} +void XEmitter::MOVD_xmm(const OpArg &arg, X64Reg src) {WriteSSEOp(0x66, 0x7E, src, arg, 0);} + +void XEmitter::MOVQ_xmm(X64Reg dest, OpArg arg) +{ +#ifdef ARCHITECTURE_x86_64 + // Alternate encoding + // This does not display correctly in MSVC's debugger, it thinks it's a MOVD + arg.operandReg = dest; + Write8(0x66); + arg.WriteRex(this, 64, 0); + Write8(0x0f); + Write8(0x6E); + arg.WriteRest(this, 0); +#else + arg.operandReg = dest; + Write8(0xF3); + Write8(0x0f); + Write8(0x7E); + arg.WriteRest(this, 0); +#endif +} + +void XEmitter::MOVQ_xmm(OpArg arg, X64Reg src) +{ + if (src > 7 || arg.IsSimpleReg()) + { + // Alternate encoding + // This does not display correctly in MSVC's debugger, it thinks it's a MOVD + arg.operandReg = src; + Write8(0x66); + arg.WriteRex(this, 64, 0); + Write8(0x0f); + Write8(0x7E); + arg.WriteRest(this, 0); + } + else + { + arg.operandReg = src; + arg.WriteRex(this, 0, 0); + Write8(0x66); + Write8(0x0f); + Write8(0xD6); + arg.WriteRest(this, 0); + } +} + +void XEmitter::WriteMXCSR(OpArg arg, int ext) +{ + if (arg.IsImm() || arg.IsSimpleReg()) + ASSERT_MSG(0, "MXCSR - invalid operand"); + + arg.operandReg = ext; + arg.WriteRex(this, 0, 0); + Write8(0x0F); + Write8(0xAE); + arg.WriteRest(this); +} + +void XEmitter::STMXCSR(OpArg memloc) {WriteMXCSR(memloc, 3);} +void XEmitter::LDMXCSR(OpArg memloc) {WriteMXCSR(memloc, 2);} + +void XEmitter::MOVNTDQ(OpArg arg, X64Reg regOp) {WriteSSEOp(0x66, sseMOVNTDQ, regOp, arg);} +void 
XEmitter::MOVNTPS(OpArg arg, X64Reg regOp) {WriteSSEOp(0x00, sseMOVNTP, regOp, arg);} +void XEmitter::MOVNTPD(OpArg arg, X64Reg regOp) {WriteSSEOp(0x66, sseMOVNTP, regOp, arg);} + +void XEmitter::ADDSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseADD, regOp, arg);} +void XEmitter::ADDSD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseADD, regOp, arg);} +void XEmitter::SUBSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseSUB, regOp, arg);} +void XEmitter::SUBSD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseSUB, regOp, arg);} +void XEmitter::CMPSS(X64Reg regOp, OpArg arg, u8 compare) {WriteSSEOp(0xF3, sseCMP, regOp, arg, 1); Write8(compare);} +void XEmitter::CMPSD(X64Reg regOp, OpArg arg, u8 compare) {WriteSSEOp(0xF2, sseCMP, regOp, arg, 1); Write8(compare);} +void XEmitter::MULSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseMUL, regOp, arg);} +void XEmitter::MULSD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseMUL, regOp, arg);} +void XEmitter::DIVSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseDIV, regOp, arg);} +void XEmitter::DIVSD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseDIV, regOp, arg);} +void XEmitter::MINSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseMIN, regOp, arg);} +void XEmitter::MINSD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseMIN, regOp, arg);} +void XEmitter::MAXSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseMAX, regOp, arg);} +void XEmitter::MAXSD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseMAX, regOp, arg);} +void XEmitter::SQRTSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseSQRT, regOp, arg);} +void XEmitter::SQRTSD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseSQRT, regOp, arg);} +void XEmitter::RSQRTSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseRSQRT, regOp, arg);} + +void XEmitter::ADDPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseADD, regOp, arg);} +void XEmitter::ADDPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseADD, regOp, arg);} +void XEmitter::SUBPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseSUB, regOp, arg);} +void 
XEmitter::SUBPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseSUB, regOp, arg);} +void XEmitter::CMPPS(X64Reg regOp, OpArg arg, u8 compare) {WriteSSEOp(0x00, sseCMP, regOp, arg, 1); Write8(compare);} +void XEmitter::CMPPD(X64Reg regOp, OpArg arg, u8 compare) {WriteSSEOp(0x66, sseCMP, regOp, arg, 1); Write8(compare);} +void XEmitter::ANDPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseAND, regOp, arg);} +void XEmitter::ANDPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseAND, regOp, arg);} +void XEmitter::ANDNPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseANDN, regOp, arg);} +void XEmitter::ANDNPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseANDN, regOp, arg);} +void XEmitter::ORPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseOR, regOp, arg);} +void XEmitter::ORPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseOR, regOp, arg);} +void XEmitter::XORPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseXOR, regOp, arg);} +void XEmitter::XORPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseXOR, regOp, arg);} +void XEmitter::MULPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseMUL, regOp, arg);} +void XEmitter::MULPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseMUL, regOp, arg);} +void XEmitter::DIVPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseDIV, regOp, arg);} +void XEmitter::DIVPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseDIV, regOp, arg);} +void XEmitter::MINPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseMIN, regOp, arg);} +void XEmitter::MINPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseMIN, regOp, arg);} +void XEmitter::MAXPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseMAX, regOp, arg);} +void XEmitter::MAXPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseMAX, regOp, arg);} +void XEmitter::SQRTPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseSQRT, regOp, arg);} +void XEmitter::SQRTPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseSQRT, regOp, arg);} +void XEmitter::RCPPS(X64Reg regOp, OpArg arg) { WriteSSEOp(0x00, sseRCP, regOp, arg); } +void 
XEmitter::RSQRTPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseRSQRT, regOp, arg);} +void XEmitter::SHUFPS(X64Reg regOp, OpArg arg, u8 shuffle) {WriteSSEOp(0x00, sseSHUF, regOp, arg,1); Write8(shuffle);} +void XEmitter::SHUFPD(X64Reg regOp, OpArg arg, u8 shuffle) {WriteSSEOp(0x66, sseSHUF, regOp, arg,1); Write8(shuffle);} + +void XEmitter::HADDPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseHADD, regOp, arg);} + +void XEmitter::COMISS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseCOMIS, regOp, arg);} //weird that these should be packed +void XEmitter::COMISD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseCOMIS, regOp, arg);} //ordered +void XEmitter::UCOMISS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseUCOMIS, regOp, arg);} //unordered +void XEmitter::UCOMISD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseUCOMIS, regOp, arg);} + +void XEmitter::MOVAPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseMOVAPfromRM, regOp, arg);} +void XEmitter::MOVAPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseMOVAPfromRM, regOp, arg);} +void XEmitter::MOVAPS(OpArg arg, X64Reg regOp) {WriteSSEOp(0x00, sseMOVAPtoRM, regOp, arg);} +void XEmitter::MOVAPD(OpArg arg, X64Reg regOp) {WriteSSEOp(0x66, sseMOVAPtoRM, regOp, arg);} + +void XEmitter::MOVUPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseMOVUPfromRM, regOp, arg);} +void XEmitter::MOVUPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseMOVUPfromRM, regOp, arg);} +void XEmitter::MOVUPS(OpArg arg, X64Reg regOp) {WriteSSEOp(0x00, sseMOVUPtoRM, regOp, arg);} +void XEmitter::MOVUPD(OpArg arg, X64Reg regOp) {WriteSSEOp(0x66, sseMOVUPtoRM, regOp, arg);} + +void XEmitter::MOVDQA(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseMOVDQfromRM, regOp, arg);} +void XEmitter::MOVDQA(OpArg arg, X64Reg regOp) {WriteSSEOp(0x66, sseMOVDQtoRM, regOp, arg);} +void XEmitter::MOVDQU(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseMOVDQfromRM, regOp, arg);} +void XEmitter::MOVDQU(OpArg arg, X64Reg regOp) {WriteSSEOp(0xF3, sseMOVDQtoRM, regOp, arg);} + +void 
XEmitter::MOVSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseMOVUPfromRM, regOp, arg);} +void XEmitter::MOVSD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseMOVUPfromRM, regOp, arg);} +void XEmitter::MOVSS(OpArg arg, X64Reg regOp) {WriteSSEOp(0xF3, sseMOVUPtoRM, regOp, arg);} +void XEmitter::MOVSD(OpArg arg, X64Reg regOp) {WriteSSEOp(0xF2, sseMOVUPtoRM, regOp, arg);} + +void XEmitter::MOVLPS(X64Reg regOp, OpArg arg) { WriteSSEOp(0x00, sseMOVLPfromRM, regOp, arg); } +void XEmitter::MOVLPD(X64Reg regOp, OpArg arg) { WriteSSEOp(0x66, sseMOVLPfromRM, regOp, arg); } +void XEmitter::MOVLPS(OpArg arg, X64Reg regOp) { WriteSSEOp(0x00, sseMOVLPtoRM, regOp, arg); } +void XEmitter::MOVLPD(OpArg arg, X64Reg regOp) { WriteSSEOp(0x66, sseMOVLPtoRM, regOp, arg); } + +void XEmitter::MOVHPS(X64Reg regOp, OpArg arg) { WriteSSEOp(0x00, sseMOVHPfromRM, regOp, arg); } +void XEmitter::MOVHPD(X64Reg regOp, OpArg arg) { WriteSSEOp(0x66, sseMOVHPfromRM, regOp, arg); } +void XEmitter::MOVHPS(OpArg arg, X64Reg regOp) { WriteSSEOp(0x00, sseMOVHPtoRM, regOp, arg); } +void XEmitter::MOVHPD(OpArg arg, X64Reg regOp) { WriteSSEOp(0x66, sseMOVHPtoRM, regOp, arg); } + +void XEmitter::MOVHLPS(X64Reg regOp1, X64Reg regOp2) {WriteSSEOp(0x00, sseMOVHLPS, regOp1, R(regOp2));} +void XEmitter::MOVLHPS(X64Reg regOp1, X64Reg regOp2) {WriteSSEOp(0x00, sseMOVLHPS, regOp1, R(regOp2));} + +void XEmitter::CVTPS2PD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, 0x5A, regOp, arg);} +void XEmitter::CVTPD2PS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, 0x5A, regOp, arg);} + +void XEmitter::CVTSD2SS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, 0x5A, regOp, arg);} +void XEmitter::CVTSS2SD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, 0x5A, regOp, arg);} +void XEmitter::CVTSD2SI(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, 0x2D, regOp, arg);} +void XEmitter::CVTSS2SI(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, 0x2D, regOp, arg);} +void XEmitter::CVTSI2SD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, 0x2A, regOp, arg);} +void 
XEmitter::CVTSI2SS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, 0x2A, regOp, arg);} + +void XEmitter::CVTDQ2PD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, 0xE6, regOp, arg);} +void XEmitter::CVTDQ2PS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, 0x5B, regOp, arg);} +void XEmitter::CVTPD2DQ(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, 0xE6, regOp, arg);} +void XEmitter::CVTPS2DQ(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, 0x5B, regOp, arg);} + +void XEmitter::CVTTSD2SI(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, 0x2C, regOp, arg);} +void XEmitter::CVTTSS2SI(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, 0x2C, regOp, arg);} +void XEmitter::CVTTPS2DQ(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, 0x5B, regOp, arg);} +void XEmitter::CVTTPD2DQ(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, 0xE6, regOp, arg);} + +void XEmitter::MASKMOVDQU(X64Reg dest, X64Reg src) {WriteSSEOp(0x66, sseMASKMOVDQU, dest, R(src));} + +void XEmitter::MOVMSKPS(X64Reg dest, OpArg arg) {WriteSSEOp(0x00, 0x50, dest, arg);} +void XEmitter::MOVMSKPD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x50, dest, arg);} + +void XEmitter::LDDQU(X64Reg dest, OpArg arg) {WriteSSEOp(0xF2, sseLDDQU, dest, arg);} // For integer data only + +// THESE TWO ARE UNTESTED. 
+void XEmitter::UNPCKLPS(X64Reg dest, OpArg arg) {WriteSSEOp(0x00, 0x14, dest, arg);} +void XEmitter::UNPCKHPS(X64Reg dest, OpArg arg) {WriteSSEOp(0x00, 0x15, dest, arg);} + +void XEmitter::UNPCKLPD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x14, dest, arg);} +void XEmitter::UNPCKHPD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x15, dest, arg);} + +void XEmitter::MOVDDUP(X64Reg regOp, OpArg arg) +{ + if (Common::GetCPUCaps().sse3) + { + WriteSSEOp(0xF2, 0x12, regOp, arg); //SSE3 movddup + } + else + { + // Simulate this instruction with SSE2 instructions + if (!arg.IsSimpleReg(regOp)) + MOVSD(regOp, arg); + UNPCKLPD(regOp, R(regOp)); + } +} + +//There are a few more left + +// Also some integer instructions are missing +void XEmitter::PACKSSDW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x6B, dest, arg);} +void XEmitter::PACKSSWB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x63, dest, arg);} +void XEmitter::PACKUSWB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x67, dest, arg);} + +void XEmitter::PUNPCKLBW(X64Reg dest, const OpArg &arg) {WriteSSEOp(0x66, 0x60, dest, arg);} +void XEmitter::PUNPCKLWD(X64Reg dest, const OpArg &arg) {WriteSSEOp(0x66, 0x61, dest, arg);} +void XEmitter::PUNPCKLDQ(X64Reg dest, const OpArg &arg) {WriteSSEOp(0x66, 0x62, dest, arg);} +void XEmitter::PUNPCKLQDQ(X64Reg dest, const OpArg &arg) {WriteSSEOp(0x66, 0x6C, dest, arg);} + +void XEmitter::PSRLW(X64Reg reg, int shift) +{ + WriteSSEOp(0x66, 0x71, (X64Reg)2, R(reg)); + Write8(shift); +} + +void XEmitter::PSRLD(X64Reg reg, int shift) +{ + WriteSSEOp(0x66, 0x72, (X64Reg)2, R(reg)); + Write8(shift); +} + +void XEmitter::PSRLQ(X64Reg reg, int shift) +{ + WriteSSEOp(0x66, 0x73, (X64Reg)2, R(reg)); + Write8(shift); +} + +void XEmitter::PSRLQ(X64Reg reg, OpArg arg) +{ + WriteSSEOp(0x66, 0xd3, reg, arg); +} + +void XEmitter::PSRLDQ(X64Reg reg, int shift) { + WriteSSEOp(0x66, 0x73, (X64Reg)3, R(reg)); + Write8(shift); +} + +void XEmitter::PSLLW(X64Reg reg, int shift) +{ + WriteSSEOp(0x66, 0x71, (X64Reg)6, 
R(reg)); + Write8(shift); +} + +void XEmitter::PSLLD(X64Reg reg, int shift) +{ + WriteSSEOp(0x66, 0x72, (X64Reg)6, R(reg)); + Write8(shift); +} + +void XEmitter::PSLLQ(X64Reg reg, int shift) +{ + WriteSSEOp(0x66, 0x73, (X64Reg)6, R(reg)); + Write8(shift); +} + +void XEmitter::PSLLDQ(X64Reg reg, int shift) { + WriteSSEOp(0x66, 0x73, (X64Reg)7, R(reg)); + Write8(shift); +} + +void XEmitter::PSRAW(X64Reg reg, int shift) +{ + WriteSSEOp(0x66, 0x71, (X64Reg)4, R(reg)); + Write8(shift); +} + +void XEmitter::PSRAD(X64Reg reg, int shift) +{ + WriteSSEOp(0x66, 0x72, (X64Reg)4, R(reg)); + Write8(shift); +} + +void XEmitter::WriteSSSE3Op(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes) +{ + if (!Common::GetCPUCaps().ssse3) + ASSERT_MSG(0, "Trying to use SSSE3 on a system that doesn't support it. Bad programmer."); + WriteSSEOp(opPrefix, op, regOp, arg, extrabytes); +} + +void XEmitter::WriteSSE41Op(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes) +{ + if (!Common::GetCPUCaps().sse4_1) + ASSERT_MSG(0, "Trying to use SSE4.1 on a system that doesn't support it. 
Bad programmer."); + WriteSSEOp(opPrefix, op, regOp, arg, extrabytes); +} + +void XEmitter::PSHUFB(X64Reg dest, OpArg arg) {WriteSSSE3Op(0x66, 0x3800, dest, arg);} +void XEmitter::PTEST(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3817, dest, arg);} +void XEmitter::PACKUSDW(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x382b, dest, arg);} +void XEmitter::DPPS(X64Reg dest, OpArg arg, u8 mask) {WriteSSE41Op(0x66, 0x3A40, dest, arg, 1); Write8(mask);} + +void XEmitter::PMINSB(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3838, dest, arg);} +void XEmitter::PMINSD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3839, dest, arg);} +void XEmitter::PMINUW(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x383a, dest, arg);} +void XEmitter::PMINUD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x383b, dest, arg);} +void XEmitter::PMAXSB(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x383c, dest, arg);} +void XEmitter::PMAXSD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x383d, dest, arg);} +void XEmitter::PMAXUW(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x383e, dest, arg);} +void XEmitter::PMAXUD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x383f, dest, arg);} + +void XEmitter::PMOVSXBW(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3820, dest, arg);} +void XEmitter::PMOVSXBD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3821, dest, arg);} +void XEmitter::PMOVSXBQ(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3822, dest, arg);} +void XEmitter::PMOVSXWD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3823, dest, arg);} +void XEmitter::PMOVSXWQ(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3824, dest, arg);} +void XEmitter::PMOVSXDQ(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3825, dest, arg);} +void XEmitter::PMOVZXBW(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3830, dest, arg);} +void XEmitter::PMOVZXBD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3831, dest, arg);} +void XEmitter::PMOVZXBQ(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3832, dest, arg);} +void 
XEmitter::PMOVZXWD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3833, dest, arg);} +void XEmitter::PMOVZXWQ(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3834, dest, arg);} +void XEmitter::PMOVZXDQ(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3835, dest, arg);} + +void XEmitter::PBLENDVB(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3810, dest, arg);} +void XEmitter::BLENDVPS(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3814, dest, arg);} +void XEmitter::BLENDVPD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3815, dest, arg);} +void XEmitter::BLENDPS(X64Reg dest, const OpArg& arg, u8 blend) { WriteSSE41Op(0x66, 0x3A0C, dest, arg, 1); Write8(blend); } +void XEmitter::BLENDPD(X64Reg dest, const OpArg& arg, u8 blend) { WriteSSE41Op(0x66, 0x3A0D, dest, arg, 1); Write8(blend); } + +void XEmitter::ROUNDSS(X64Reg dest, OpArg arg, u8 mode) {WriteSSE41Op(0x66, 0x3A0A, dest, arg, 1); Write8(mode);} +void XEmitter::ROUNDSD(X64Reg dest, OpArg arg, u8 mode) {WriteSSE41Op(0x66, 0x3A0B, dest, arg, 1); Write8(mode);} +void XEmitter::ROUNDPS(X64Reg dest, OpArg arg, u8 mode) {WriteSSE41Op(0x66, 0x3A08, dest, arg, 1); Write8(mode);} +void XEmitter::ROUNDPD(X64Reg dest, OpArg arg, u8 mode) {WriteSSE41Op(0x66, 0x3A09, dest, arg, 1); Write8(mode);} + +void XEmitter::PAND(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xDB, dest, arg);} +void XEmitter::PANDN(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xDF, dest, arg);} +void XEmitter::PXOR(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xEF, dest, arg);} +void XEmitter::POR(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xEB, dest, arg);} + +void XEmitter::PADDB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xFC, dest, arg);} +void XEmitter::PADDW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xFD, dest, arg);} +void XEmitter::PADDD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xFE, dest, arg);} +void XEmitter::PADDQ(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xD4, dest, arg);} + +void XEmitter::PADDSB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xEC, dest, arg);} 
+void XEmitter::PADDSW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xED, dest, arg);} +void XEmitter::PADDUSB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xDC, dest, arg);} +void XEmitter::PADDUSW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xDD, dest, arg);} + +void XEmitter::PSUBB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xF8, dest, arg);} +void XEmitter::PSUBW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xF9, dest, arg);} +void XEmitter::PSUBD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xFA, dest, arg);} +void XEmitter::PSUBQ(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xFB, dest, arg);} + +void XEmitter::PSUBSB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xE8, dest, arg);} +void XEmitter::PSUBSW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xE9, dest, arg);} +void XEmitter::PSUBUSB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xD8, dest, arg);} +void XEmitter::PSUBUSW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xD9, dest, arg);} + +void XEmitter::PAVGB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xE0, dest, arg);} +void XEmitter::PAVGW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xE3, dest, arg);} + +void XEmitter::PCMPEQB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x74, dest, arg);} +void XEmitter::PCMPEQW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x75, dest, arg);} +void XEmitter::PCMPEQD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x76, dest, arg);} + +void XEmitter::PCMPGTB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x64, dest, arg);} +void XEmitter::PCMPGTW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x65, dest, arg);} +void XEmitter::PCMPGTD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x66, dest, arg);} + +void XEmitter::PEXTRW(X64Reg dest, OpArg arg, u8 subreg) {WriteSSEOp(0x66, 0xC5, dest, arg, 1); Write8(subreg);} +void XEmitter::PINSRW(X64Reg dest, OpArg arg, u8 subreg) {WriteSSEOp(0x66, 0xC4, dest, arg, 1); Write8(subreg);} + +void XEmitter::PMADDWD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xF5, dest, arg); } +void XEmitter::PSADBW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xF6, 
dest, arg);} + +void XEmitter::PMAXSW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xEE, dest, arg); } +void XEmitter::PMAXUB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xDE, dest, arg); } +void XEmitter::PMINSW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xEA, dest, arg); } +void XEmitter::PMINUB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xDA, dest, arg); } + +void XEmitter::PMOVMSKB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xD7, dest, arg); } +void XEmitter::PSHUFD(X64Reg regOp, OpArg arg, u8 shuffle) {WriteSSEOp(0x66, 0x70, regOp, arg, 1); Write8(shuffle);} +void XEmitter::PSHUFLW(X64Reg regOp, OpArg arg, u8 shuffle) {WriteSSEOp(0xF2, 0x70, regOp, arg, 1); Write8(shuffle);} +void XEmitter::PSHUFHW(X64Reg regOp, OpArg arg, u8 shuffle) {WriteSSEOp(0xF3, 0x70, regOp, arg, 1); Write8(shuffle);} + +// VEX +void XEmitter::VADDSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0xF2, sseADD, regOp1, regOp2, arg);} +void XEmitter::VSUBSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0xF2, sseSUB, regOp1, regOp2, arg);} +void XEmitter::VMULSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0xF2, sseMUL, regOp1, regOp2, arg);} +void XEmitter::VDIVSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0xF2, sseDIV, regOp1, regOp2, arg);} +void XEmitter::VADDPD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, sseADD, regOp1, regOp2, arg);} +void XEmitter::VSUBPD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, sseSUB, regOp1, regOp2, arg);} +void XEmitter::VMULPD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, sseMUL, regOp1, regOp2, arg);} +void XEmitter::VDIVPD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, sseDIV, regOp1, regOp2, arg);} +void XEmitter::VSQRTSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0xF2, sseSQRT, regOp1, regOp2, arg);} +void XEmitter::VSHUFPD(X64Reg regOp1, X64Reg regOp2, OpArg arg, u8 shuffle) {WriteAVXOp(0x66, sseSHUF, regOp1, regOp2, arg, 1); Write8(shuffle);} +void 
XEmitter::VUNPCKLPD(X64Reg regOp1, X64Reg regOp2, OpArg arg){WriteAVXOp(0x66, 0x14, regOp1, regOp2, arg);} +void XEmitter::VUNPCKHPD(X64Reg regOp1, X64Reg regOp2, OpArg arg){WriteAVXOp(0x66, 0x15, regOp1, regOp2, arg);} + +void XEmitter::VANDPS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x00, sseAND, regOp1, regOp2, arg); } +void XEmitter::VANDPD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, sseAND, regOp1, regOp2, arg); } +void XEmitter::VANDNPS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x00, sseANDN, regOp1, regOp2, arg); } +void XEmitter::VANDNPD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, sseANDN, regOp1, regOp2, arg); } +void XEmitter::VORPS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x00, sseOR, regOp1, regOp2, arg); } +void XEmitter::VORPD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, sseOR, regOp1, regOp2, arg); } +void XEmitter::VXORPS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x00, sseXOR, regOp1, regOp2, arg); } +void XEmitter::VXORPD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, sseXOR, regOp1, regOp2, arg); } + +void XEmitter::VPAND(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0xDB, regOp1, regOp2, arg); } +void XEmitter::VPANDN(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0xDF, regOp1, regOp2, arg); } +void XEmitter::VPOR(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0xEB, regOp1, regOp2, arg); } +void XEmitter::VPXOR(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0xEF, regOp1, regOp2, arg); } + +void XEmitter::VFMADD132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x3898, regOp1, regOp2, arg); } +void XEmitter::VFMADD213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38A8, regOp1, regOp2, arg); } +void XEmitter::VFMADD231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38B8, regOp1, regOp2, arg); } +void XEmitter::VFMADD132PD(X64Reg regOp1, 
X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x3898, regOp1, regOp2, arg, 1); } +void XEmitter::VFMADD213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38A8, regOp1, regOp2, arg, 1); } +void XEmitter::VFMADD231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38B8, regOp1, regOp2, arg, 1); } +void XEmitter::VFMADD132SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x3899, regOp1, regOp2, arg); } +void XEmitter::VFMADD213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38A9, regOp1, regOp2, arg); } +void XEmitter::VFMADD231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38B9, regOp1, regOp2, arg); } +void XEmitter::VFMADD132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x3899, regOp1, regOp2, arg, 1); } +void XEmitter::VFMADD213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38A9, regOp1, regOp2, arg, 1); } +void XEmitter::VFMADD231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38B9, regOp1, regOp2, arg, 1); } +void XEmitter::VFMSUB132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x389A, regOp1, regOp2, arg); } +void XEmitter::VFMSUB213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38AA, regOp1, regOp2, arg); } +void XEmitter::VFMSUB231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38BA, regOp1, regOp2, arg); } +void XEmitter::VFMSUB132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x389A, regOp1, regOp2, arg, 1); } +void XEmitter::VFMSUB213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38AA, regOp1, regOp2, arg, 1); } +void XEmitter::VFMSUB231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38BA, regOp1, regOp2, arg, 1); } +void XEmitter::VFMSUB132SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x389B, regOp1, regOp2, arg); } +void XEmitter::VFMSUB213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38AB, 
regOp1, regOp2, arg); } +void XEmitter::VFMSUB231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38BB, regOp1, regOp2, arg); } +void XEmitter::VFMSUB132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x389B, regOp1, regOp2, arg, 1); } +void XEmitter::VFMSUB213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38AB, regOp1, regOp2, arg, 1); } +void XEmitter::VFMSUB231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38BB, regOp1, regOp2, arg, 1); } +void XEmitter::VFNMADD132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x389C, regOp1, regOp2, arg); } +void XEmitter::VFNMADD213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38AC, regOp1, regOp2, arg); } +void XEmitter::VFNMADD231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38BC, regOp1, regOp2, arg); } +void XEmitter::VFNMADD132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x389C, regOp1, regOp2, arg, 1); } +void XEmitter::VFNMADD213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38AC, regOp1, regOp2, arg, 1); } +void XEmitter::VFNMADD231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38BC, regOp1, regOp2, arg, 1); } +void XEmitter::VFNMADD132SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x389D, regOp1, regOp2, arg); } +void XEmitter::VFNMADD213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38AD, regOp1, regOp2, arg); } +void XEmitter::VFNMADD231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38BD, regOp1, regOp2, arg); } +void XEmitter::VFNMADD132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x389D, regOp1, regOp2, arg, 1); } +void XEmitter::VFNMADD213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38AD, regOp1, regOp2, arg, 1); } +void XEmitter::VFNMADD231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38BD, regOp1, regOp2, arg, 1); } +void 
XEmitter::VFNMSUB132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x389E, regOp1, regOp2, arg); } +void XEmitter::VFNMSUB213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38AE, regOp1, regOp2, arg); } +void XEmitter::VFNMSUB231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38BE, regOp1, regOp2, arg); } +void XEmitter::VFNMSUB132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x389E, regOp1, regOp2, arg, 1); } +void XEmitter::VFNMSUB213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38AE, regOp1, regOp2, arg, 1); } +void XEmitter::VFNMSUB231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38BE, regOp1, regOp2, arg, 1); } +void XEmitter::VFNMSUB132SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x389F, regOp1, regOp2, arg); } +void XEmitter::VFNMSUB213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38AF, regOp1, regOp2, arg); } +void XEmitter::VFNMSUB231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38BF, regOp1, regOp2, arg); } +void XEmitter::VFNMSUB132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x389F, regOp1, regOp2, arg, 1); } +void XEmitter::VFNMSUB213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38AF, regOp1, regOp2, arg, 1); } +void XEmitter::VFNMSUB231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38BF, regOp1, regOp2, arg, 1); } +void XEmitter::VFMADDSUB132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x3896, regOp1, regOp2, arg); } +void XEmitter::VFMADDSUB213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38A6, regOp1, regOp2, arg); } +void XEmitter::VFMADDSUB231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38B6, regOp1, regOp2, arg); } +void XEmitter::VFMADDSUB132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x3896, regOp1, regOp2, arg, 1); } +void XEmitter::VFMADDSUB213PD(X64Reg 
regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38A6, regOp1, regOp2, arg, 1); } +void XEmitter::VFMADDSUB231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38B6, regOp1, regOp2, arg, 1); } +void XEmitter::VFMSUBADD132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x3897, regOp1, regOp2, arg); } +void XEmitter::VFMSUBADD213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38A7, regOp1, regOp2, arg); } +void XEmitter::VFMSUBADD231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38B7, regOp1, regOp2, arg); } +void XEmitter::VFMSUBADD132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x3897, regOp1, regOp2, arg, 1); } +void XEmitter::VFMSUBADD213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38A7, regOp1, regOp2, arg, 1); } +void XEmitter::VFMSUBADD231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38B7, regOp1, regOp2, arg, 1); } + +void XEmitter::SARX(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2) {WriteBMI2Op(bits, 0xF3, 0x38F7, regOp1, regOp2, arg);} +void XEmitter::SHLX(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2) {WriteBMI2Op(bits, 0x66, 0x38F7, regOp1, regOp2, arg);} +void XEmitter::SHRX(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2) {WriteBMI2Op(bits, 0xF2, 0x38F7, regOp1, regOp2, arg);} +void XEmitter::RORX(int bits, X64Reg regOp, OpArg arg, u8 rotate) {WriteBMI2Op(bits, 0xF2, 0x3AF0, regOp, INVALID_REG, arg, 1); Write8(rotate);} +void XEmitter::PEXT(int bits, X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteBMI2Op(bits, 0xF3, 0x38F5, regOp1, regOp2, arg);} +void XEmitter::PDEP(int bits, X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteBMI2Op(bits, 0xF2, 0x38F5, regOp1, regOp2, arg);} +void XEmitter::MULX(int bits, X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteBMI2Op(bits, 0xF2, 0x38F6, regOp2, regOp1, arg);} +void XEmitter::BZHI(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2) {WriteBMI2Op(bits, 0x00, 0x38F5, regOp1, regOp2, 
arg);} +void XEmitter::BLSR(int bits, X64Reg regOp, OpArg arg) {WriteBMI1Op(bits, 0x00, 0x38F3, (X64Reg)0x1, regOp, arg);} +void XEmitter::BLSMSK(int bits, X64Reg regOp, OpArg arg) {WriteBMI1Op(bits, 0x00, 0x38F3, (X64Reg)0x2, regOp, arg);} +void XEmitter::BLSI(int bits, X64Reg regOp, OpArg arg) {WriteBMI1Op(bits, 0x00, 0x38F3, (X64Reg)0x3, regOp, arg);} +void XEmitter::BEXTR(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2){WriteBMI1Op(bits, 0x00, 0x38F7, regOp1, regOp2, arg);} +void XEmitter::ANDN(int bits, X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteBMI1Op(bits, 0x00, 0x38F2, regOp1, regOp2, arg);} + +// Prefixes + +void XEmitter::LOCK() { Write8(0xF0); } +void XEmitter::REP() { Write8(0xF3); } +void XEmitter::REPNE() { Write8(0xF2); } +void XEmitter::FSOverride() { Write8(0x64); } +void XEmitter::GSOverride() { Write8(0x65); } + +void XEmitter::FWAIT() +{ + Write8(0x9B); +} + +// TODO: make this more generic +void XEmitter::WriteFloatLoadStore(int bits, FloatOp op, FloatOp op_80b, OpArg arg) +{ + int mf = 0; + ASSERT_MSG(!(bits == 80 && op_80b == floatINVALID), "WriteFloatLoadStore: 80 bits not supported for this instruction"); + switch (bits) + { + case 32: mf = 0; break; + case 64: mf = 4; break; + case 80: mf = 2; break; + default: ASSERT_MSG(0, "WriteFloatLoadStore: invalid bits (should be 32/64/80)"); + } + Write8(0xd9 | mf); + // x87 instructions use the reg field of the ModR/M byte as opcode: + if (bits == 80) + op = op_80b; + arg.WriteRest(this, 0, (X64Reg) op); +} + +void XEmitter::FLD(int bits, OpArg src) {WriteFloatLoadStore(bits, floatLD, floatLD80, src);} +void XEmitter::FST(int bits, OpArg dest) {WriteFloatLoadStore(bits, floatST, floatINVALID, dest);} +void XEmitter::FSTP(int bits, OpArg dest) {WriteFloatLoadStore(bits, floatSTP, floatSTP80, dest);} +void XEmitter::FNSTSW_AX() { Write8(0xDF); Write8(0xE0); } + +void XEmitter::RDTSC() { Write8(0x0F); Write8(0x31); } + +void XCodeBlock::PoisonMemory() { + // x86/64: 0xCC = breakpoint + 
memset(region, 0xCC, region_size); +} + +} diff --git a/src/common/x64/emitter.h b/src/common/x64/emitter.h new file mode 100644 index 000000000..e9c924126 --- /dev/null +++ b/src/common/x64/emitter.h @@ -0,0 +1,1067 @@ +// Copyright (C) 2003 Dolphin Project. + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 2.0 or later versions. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License 2.0 for more details. + +// A copy of the GPL 2.0 should have been included with the program. +// If not, see http://www.gnu.org/licenses/ + +// Official SVN repository and contact information can be found at +// http://code.google.com/p/dolphin-emu/ + +#pragma once + +#include "common/assert.h" +#include "common/common_types.h" +#include "common/code_block.h" + +#if defined(ARCHITECTURE_x86_64) && !defined(_ARCH_64) +#define _ARCH_64 +#endif + +#ifdef _ARCH_64 +#define PTRBITS 64 +#else +#define PTRBITS 32 +#endif + +namespace Gen +{ + +enum X64Reg +{ + EAX = 0, EBX = 3, ECX = 1, EDX = 2, + ESI = 6, EDI = 7, EBP = 5, ESP = 4, + + RAX = 0, RBX = 3, RCX = 1, RDX = 2, + RSI = 6, RDI = 7, RBP = 5, RSP = 4, + R8 = 8, R9 = 9, R10 = 10,R11 = 11, + R12 = 12,R13 = 13,R14 = 14,R15 = 15, + + AL = 0, BL = 3, CL = 1, DL = 2, + SIL = 6, DIL = 7, BPL = 5, SPL = 4, + AH = 0x104, BH = 0x107, CH = 0x105, DH = 0x106, + + AX = 0, BX = 3, CX = 1, DX = 2, + SI = 6, DI = 7, BP = 5, SP = 4, + + XMM0=0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, + XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, + + YMM0=0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7, + YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15, + + INVALID_REG = 0xFFFFFFFF +}; + +enum CCFlags +{ + CC_O = 0, + CC_NO = 1, + CC_B = 2, 
CC_C = 2, CC_NAE = 2, + CC_NB = 3, CC_NC = 3, CC_AE = 3, + CC_Z = 4, CC_E = 4, + CC_NZ = 5, CC_NE = 5, + CC_BE = 6, CC_NA = 6, + CC_NBE = 7, CC_A = 7, + CC_S = 8, + CC_NS = 9, + CC_P = 0xA, CC_PE = 0xA, + CC_NP = 0xB, CC_PO = 0xB, + CC_L = 0xC, CC_NGE = 0xC, + CC_NL = 0xD, CC_GE = 0xD, + CC_LE = 0xE, CC_NG = 0xE, + CC_NLE = 0xF, CC_G = 0xF +}; + +enum +{ + NUMGPRs = 16, + NUMXMMs = 16, +}; + +enum +{ + SCALE_NONE = 0, + SCALE_1 = 1, + SCALE_2 = 2, + SCALE_4 = 4, + SCALE_8 = 8, + SCALE_ATREG = 16, + //SCALE_NOBASE_1 is not supported and can be replaced with SCALE_ATREG + SCALE_NOBASE_2 = 34, + SCALE_NOBASE_4 = 36, + SCALE_NOBASE_8 = 40, + SCALE_RIP = 0xFF, + SCALE_IMM8 = 0xF0, + SCALE_IMM16 = 0xF1, + SCALE_IMM32 = 0xF2, + SCALE_IMM64 = 0xF3, +}; + +enum NormalOp { + nrmADD, + nrmADC, + nrmSUB, + nrmSBB, + nrmAND, + nrmOR , + nrmXOR, + nrmMOV, + nrmTEST, + nrmCMP, + nrmXCHG, +}; + +enum { + CMP_EQ = 0, + CMP_LT = 1, + CMP_LE = 2, + CMP_UNORD = 3, + CMP_NEQ = 4, + CMP_NLT = 5, + CMP_NLE = 6, + CMP_ORD = 7, +}; + +enum FloatOp { + floatLD = 0, + floatST = 2, + floatSTP = 3, + floatLD80 = 5, + floatSTP80 = 7, + + floatINVALID = -1, +}; + +enum FloatRound { + FROUND_NEAREST = 0, + FROUND_FLOOR = 1, + FROUND_CEIL = 2, + FROUND_ZERO = 3, + FROUND_MXCSR = 4, + + FROUND_RAISE_PRECISION = 0, + FROUND_IGNORE_PRECISION = 8, +}; + +class XEmitter; + +// RIP addressing does not benefit from micro op fusion on Core arch +struct OpArg +{ + OpArg() {} // dummy op arg, used for storage + OpArg(u64 _offset, int _scale, X64Reg rmReg = RAX, X64Reg scaledReg = RAX) + { + operandReg = 0; + scale = (u8)_scale; + offsetOrBaseReg = (u16)rmReg; + indexReg = (u16)scaledReg; + //if scale == 0 never mind offsetting + offset = _offset; + } + bool operator==(const OpArg &b) const + { + return operandReg == b.operandReg && scale == b.scale && offsetOrBaseReg == b.offsetOrBaseReg && + indexReg == b.indexReg && offset == b.offset; + } + void WriteRex(XEmitter *emit, int opBits, int bits, int customOp 
= -1) const; + void WriteVex(XEmitter* emit, X64Reg regOp1, X64Reg regOp2, int L, int pp, int mmmmm, int W = 0) const; + void WriteRest(XEmitter *emit, int extraBytes=0, X64Reg operandReg=INVALID_REG, bool warn_64bit_offset = true) const; + void WriteFloatModRM(XEmitter *emit, FloatOp op); + void WriteSingleByteOp(XEmitter *emit, u8 op, X64Reg operandReg, int bits); + // This one is public - must be written to + u64 offset; // use RIP-relative as much as possible - 64-bit immediates are not available. + u16 operandReg; + + void WriteNormalOp(XEmitter *emit, bool toRM, NormalOp op, const OpArg &operand, int bits) const; + bool IsImm() const {return scale == SCALE_IMM8 || scale == SCALE_IMM16 || scale == SCALE_IMM32 || scale == SCALE_IMM64;} + bool IsSimpleReg() const {return scale == SCALE_NONE;} + bool IsSimpleReg(X64Reg reg) const + { + if (!IsSimpleReg()) + return false; + return GetSimpleReg() == reg; + } + + bool CanDoOpWith(const OpArg &other) const + { + if (IsSimpleReg()) return true; + if (!IsSimpleReg() && !other.IsSimpleReg() && !other.IsImm()) return false; + return true; + } + + int GetImmBits() const + { + switch (scale) + { + case SCALE_IMM8: return 8; + case SCALE_IMM16: return 16; + case SCALE_IMM32: return 32; + case SCALE_IMM64: return 64; + default: return -1; + } + } + + void SetImmBits(int bits) { + switch (bits) + { + case 8: scale = SCALE_IMM8; break; + case 16: scale = SCALE_IMM16; break; + case 32: scale = SCALE_IMM32; break; + case 64: scale = SCALE_IMM64; break; + } + } + + X64Reg GetSimpleReg() const + { + if (scale == SCALE_NONE) + return (X64Reg)offsetOrBaseReg; + else + return INVALID_REG; + } + + u32 GetImmValue() const { + return (u32)offset; + } + + // For loops. 
+ void IncreaseOffset(int sz) { + offset += sz; + } + +private: + u8 scale; + u16 offsetOrBaseReg; + u16 indexReg; +}; + +inline OpArg M(const void *ptr) {return OpArg((u64)ptr, (int)SCALE_RIP);} +template <typename T> +inline OpArg M(const T *ptr) {return OpArg((u64)(const void *)ptr, (int)SCALE_RIP);} +inline OpArg R(X64Reg value) {return OpArg(0, SCALE_NONE, value);} +inline OpArg MatR(X64Reg value) {return OpArg(0, SCALE_ATREG, value);} + +inline OpArg MDisp(X64Reg value, int offset) +{ + return OpArg((u32)offset, SCALE_ATREG, value); +} + +inline OpArg MComplex(X64Reg base, X64Reg scaled, int scale, int offset) +{ + return OpArg(offset, scale, base, scaled); +} + +inline OpArg MScaled(X64Reg scaled, int scale, int offset) +{ + if (scale == SCALE_1) + return OpArg(offset, SCALE_ATREG, scaled); + else + return OpArg(offset, scale | 0x20, RAX, scaled); +} + +inline OpArg MRegSum(X64Reg base, X64Reg offset) +{ + return MComplex(base, offset, 1, 0); +} + +inline OpArg Imm8 (u8 imm) {return OpArg(imm, SCALE_IMM8);} +inline OpArg Imm16(u16 imm) {return OpArg(imm, SCALE_IMM16);} //rarely used +inline OpArg Imm32(u32 imm) {return OpArg(imm, SCALE_IMM32);} +inline OpArg Imm64(u64 imm) {return OpArg(imm, SCALE_IMM64);} +inline OpArg UImmAuto(u32 imm) { + return OpArg(imm, imm >= 128 ? SCALE_IMM32 : SCALE_IMM8); +} +inline OpArg SImmAuto(s32 imm) { + return OpArg(imm, (imm >= 128 || imm < -128) ? 
SCALE_IMM32 : SCALE_IMM8); +} + +#ifdef _ARCH_64 +inline OpArg ImmPtr(const void* imm) {return Imm64((u64)imm);} +#else +inline OpArg ImmPtr(const void* imm) {return Imm32((u32)imm);} +#endif + +inline u32 PtrOffset(const void* ptr, const void* base) +{ +#ifdef _ARCH_64 + s64 distance = (s64)ptr-(s64)base; + if (distance >= 0x80000000LL || + distance < -0x80000000LL) + { + ASSERT_MSG(0, "pointer offset out of range"); + return 0; + } + + return (u32)distance; +#else + return (u32)ptr-(u32)base; +#endif +} + +//usage: int a[]; ARRAY_OFFSET(a,10) +#define ARRAY_OFFSET(array,index) ((u32)((u64)&(array)[index]-(u64)&(array)[0])) +//usage: struct {int e;} s; STRUCT_OFFSET(s,e) +#define STRUCT_OFFSET(str,elem) ((u32)((u64)&(str).elem-(u64)&(str))) + +struct FixupBranch +{ + u8 *ptr; + int type; //0 = 8bit 1 = 32bit +}; + +enum SSECompare +{ + EQ = 0, + LT, + LE, + UNORD, + NEQ, + NLT, + NLE, + ORD, +}; + +typedef const u8* JumpTarget; + +class XEmitter +{ + friend struct OpArg; // for Write8 etc +private: + u8 *code; + bool flags_locked; + + void CheckFlags(); + + void Rex(int w, int r, int x, int b); + void WriteSimple1Byte(int bits, u8 byte, X64Reg reg); + void WriteSimple2Byte(int bits, u8 byte1, u8 byte2, X64Reg reg); + void WriteMulDivType(int bits, OpArg src, int ext); + void WriteBitSearchType(int bits, X64Reg dest, OpArg src, u8 byte2, bool rep = false); + void WriteShift(int bits, OpArg dest, OpArg &shift, int ext); + void WriteBitTest(int bits, OpArg &dest, OpArg &index, int ext); + void WriteMXCSR(OpArg arg, int ext); + void WriteSSEOp(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes = 0); + void WriteSSSE3Op(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes = 0); + void WriteSSE41Op(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes = 0); + void WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes = 0); + void WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes = 0); + 
void WriteVEXOp(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes = 0); + void WriteBMI1Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes = 0); + void WriteBMI2Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes = 0); + void WriteFloatLoadStore(int bits, FloatOp op, FloatOp op_80b, OpArg arg); + void WriteNormalOp(XEmitter *emit, int bits, NormalOp op, const OpArg &a1, const OpArg &a2); + + void ABI_CalculateFrameSize(u32 mask, size_t rsp_alignment, size_t needed_frame_size, size_t* shadowp, size_t* subtractionp, size_t* xmm_offsetp); + +protected: + inline void Write8(u8 value) {*code++ = value;} + inline void Write16(u16 value) {*(u16*)code = (value); code += 2;} + inline void Write32(u32 value) {*(u32*)code = (value); code += 4;} + inline void Write64(u64 value) {*(u64*)code = (value); code += 8;} + +public: + XEmitter() { code = nullptr; flags_locked = false; } + XEmitter(u8 *code_ptr) { code = code_ptr; flags_locked = false; } + virtual ~XEmitter() {} + + void WriteModRM(int mod, int rm, int reg); + void WriteSIB(int scale, int index, int base); + + void SetCodePtr(u8 *ptr); + void ReserveCodeSpace(int bytes); + const u8 *AlignCode4(); + const u8 *AlignCode16(); + const u8 *AlignCodePage(); + const u8 *GetCodePtr() const; + u8 *GetWritableCodePtr(); + + void LockFlags() { flags_locked = true; } + void UnlockFlags() { flags_locked = false; } + + // Looking for one of these? It's BANNED!! Some instructions are slow on modern CPU + // INC, DEC, LOOP, LOOPNE, LOOPE, ENTER, LEAVE, XCHG, XLAT, REP MOVSB/MOVSD, REP SCASD + other string instr., + // INC and DEC are slow on Intel Core, but not on AMD. They create a + // false flag dependency because they only update a subset of the flags. + // XCHG is SLOW and should be avoided. 
+ + // Debug breakpoint + void INT3(); + + // Do nothing + void NOP(size_t count = 1); + + // Save energy in wait-loops on P4 only. Probably not too useful. + void PAUSE(); + + // Flag control + void STC(); + void CLC(); + void CMC(); + + // These two can not be executed in 64-bit mode on early Intel 64-bit CPU:s, only on Core2 and AMD! + void LAHF(); // 3 cycle vector path + void SAHF(); // direct path fast + + + // Stack control + void PUSH(X64Reg reg); + void POP(X64Reg reg); + void PUSH(int bits, const OpArg &reg); + void POP(int bits, const OpArg &reg); + void PUSHF(); + void POPF(); + + // Flow control + void RET(); + void RET_FAST(); + void UD2(); + FixupBranch J(bool force5bytes = false); + + void JMP(const u8 * addr, bool force5Bytes = false); + void JMP(OpArg arg); + void JMPptr(const OpArg &arg); + void JMPself(); //infinite loop! +#ifdef CALL +#undef CALL +#endif + void CALL(const void *fnptr); + void CALLptr(OpArg arg); + + FixupBranch J_CC(CCFlags conditionCode, bool force5bytes = false); + //void J_CC(CCFlags conditionCode, JumpTarget target); + void J_CC(CCFlags conditionCode, const u8 * addr, bool force5Bytes = false); + + void SetJumpTarget(const FixupBranch &branch); + + void SETcc(CCFlags flag, OpArg dest); + // Note: CMOV brings small if any benefit on current cpus. 
+ void CMOVcc(int bits, X64Reg dest, OpArg src, CCFlags flag); + + // Fences + void LFENCE(); + void MFENCE(); + void SFENCE(); + + // Bit scan + void BSF(int bits, X64Reg dest, OpArg src); //bottom bit to top bit + void BSR(int bits, X64Reg dest, OpArg src); //top bit to bottom bit + + // Cache control + enum PrefetchLevel + { + PF_NTA, //Non-temporal (data used once and only once) + PF_T0, //All cache levels + PF_T1, //Levels 2+ (aliased to T0 on AMD) + PF_T2, //Levels 3+ (aliased to T0 on AMD) + }; + void PREFETCH(PrefetchLevel level, OpArg arg); + void MOVNTI(int bits, OpArg dest, X64Reg src); + void MOVNTDQ(OpArg arg, X64Reg regOp); + void MOVNTPS(OpArg arg, X64Reg regOp); + void MOVNTPD(OpArg arg, X64Reg regOp); + + // Multiplication / division + void MUL(int bits, OpArg src); //UNSIGNED + void IMUL(int bits, OpArg src); //SIGNED + void IMUL(int bits, X64Reg regOp, OpArg src); + void IMUL(int bits, X64Reg regOp, OpArg src, OpArg imm); + void DIV(int bits, OpArg src); + void IDIV(int bits, OpArg src); + + // Shift + void ROL(int bits, OpArg dest, OpArg shift); + void ROR(int bits, OpArg dest, OpArg shift); + void RCL(int bits, OpArg dest, OpArg shift); + void RCR(int bits, OpArg dest, OpArg shift); + void SHL(int bits, OpArg dest, OpArg shift); + void SHR(int bits, OpArg dest, OpArg shift); + void SAR(int bits, OpArg dest, OpArg shift); + + // Bit Test + void BT(int bits, OpArg dest, OpArg index); + void BTS(int bits, OpArg dest, OpArg index); + void BTR(int bits, OpArg dest, OpArg index); + void BTC(int bits, OpArg dest, OpArg index); + + // Double-Precision Shift + void SHRD(int bits, OpArg dest, OpArg src, OpArg shift); + void SHLD(int bits, OpArg dest, OpArg src, OpArg shift); + + // Extend EAX into EDX in various ways + void CWD(int bits = 16); + inline void CDQ() {CWD(32);} + inline void CQO() {CWD(64);} + void CBW(int bits = 8); + inline void CWDE() {CBW(16);} + inline void CDQE() {CBW(32);} + + // Load effective address + void LEA(int bits, X64Reg 
dest, OpArg src); + + // Integer arithmetic + void NEG (int bits, OpArg src); + void ADD (int bits, const OpArg &a1, const OpArg &a2); + void ADC (int bits, const OpArg &a1, const OpArg &a2); + void SUB (int bits, const OpArg &a1, const OpArg &a2); + void SBB (int bits, const OpArg &a1, const OpArg &a2); + void AND (int bits, const OpArg &a1, const OpArg &a2); + void CMP (int bits, const OpArg &a1, const OpArg &a2); + + // Bit operations + void NOT (int bits, OpArg src); + void OR (int bits, const OpArg &a1, const OpArg &a2); + void XOR (int bits, const OpArg &a1, const OpArg &a2); + void MOV (int bits, const OpArg &a1, const OpArg &a2); + void TEST(int bits, const OpArg &a1, const OpArg &a2); + + // Are these useful at all? Consider removing. + void XCHG(int bits, const OpArg &a1, const OpArg &a2); + void XCHG_AHAL(); + + // Byte swapping (32 and 64-bit only). + void BSWAP(int bits, X64Reg reg); + + // Sign/zero extension + void MOVSX(int dbits, int sbits, X64Reg dest, OpArg src); //automatically uses MOVSXD if necessary + void MOVZX(int dbits, int sbits, X64Reg dest, OpArg src); + + // Available only on Atom or >= Haswell so far. Test with GetCPUCaps().movbe. + void MOVBE(int dbits, const OpArg& dest, const OpArg& src); + + // Available only on AMD >= Phenom or Intel >= Haswell + void LZCNT(int bits, X64Reg dest, OpArg src); + // Note: this one is actually part of BMI1 + void TZCNT(int bits, X64Reg dest, OpArg src); + + // WARNING - These two take 11-13 cycles and are VectorPath! 
(AMD64) + void STMXCSR(OpArg memloc); + void LDMXCSR(OpArg memloc); + + // Prefixes + void LOCK(); + void REP(); + void REPNE(); + void FSOverride(); + void GSOverride(); + + // x87 + enum x87StatusWordBits { + x87_InvalidOperation = 0x1, + x87_DenormalizedOperand = 0x2, + x87_DivisionByZero = 0x4, + x87_Overflow = 0x8, + x87_Underflow = 0x10, + x87_Precision = 0x20, + x87_StackFault = 0x40, + x87_ErrorSummary = 0x80, + x87_C0 = 0x100, + x87_C1 = 0x200, + x87_C2 = 0x400, + x87_TopOfStack = 0x2000 | 0x1000 | 0x800, + x87_C3 = 0x4000, + x87_FPUBusy = 0x8000, + }; + + void FLD(int bits, OpArg src); + void FST(int bits, OpArg dest); + void FSTP(int bits, OpArg dest); + void FNSTSW_AX(); + void FWAIT(); + + // SSE/SSE2: Floating point arithmetic + void ADDSS(X64Reg regOp, OpArg arg); + void ADDSD(X64Reg regOp, OpArg arg); + void SUBSS(X64Reg regOp, OpArg arg); + void SUBSD(X64Reg regOp, OpArg arg); + void MULSS(X64Reg regOp, OpArg arg); + void MULSD(X64Reg regOp, OpArg arg); + void DIVSS(X64Reg regOp, OpArg arg); + void DIVSD(X64Reg regOp, OpArg arg); + void MINSS(X64Reg regOp, OpArg arg); + void MINSD(X64Reg regOp, OpArg arg); + void MAXSS(X64Reg regOp, OpArg arg); + void MAXSD(X64Reg regOp, OpArg arg); + void SQRTSS(X64Reg regOp, OpArg arg); + void SQRTSD(X64Reg regOp, OpArg arg); + void RSQRTSS(X64Reg regOp, OpArg arg); + + // SSE/SSE2: Floating point bitwise (yes) + void CMPSS(X64Reg regOp, OpArg arg, u8 compare); + void CMPSD(X64Reg regOp, OpArg arg, u8 compare); + + inline void CMPEQSS(X64Reg regOp, OpArg arg) { CMPSS(regOp, arg, CMP_EQ); } + inline void CMPLTSS(X64Reg regOp, OpArg arg) { CMPSS(regOp, arg, CMP_LT); } + inline void CMPLESS(X64Reg regOp, OpArg arg) { CMPSS(regOp, arg, CMP_LE); } + inline void CMPUNORDSS(X64Reg regOp, OpArg arg) { CMPSS(regOp, arg, CMP_UNORD); } + inline void CMPNEQSS(X64Reg regOp, OpArg arg) { CMPSS(regOp, arg, CMP_NEQ); } + inline void CMPNLTSS(X64Reg regOp, OpArg arg) { CMPSS(regOp, arg, CMP_NLT); } + inline void CMPORDSS(X64Reg 
regOp, OpArg arg) { CMPSS(regOp, arg, CMP_ORD); } + + // SSE/SSE2: Floating point packed arithmetic (x4 for float, x2 for double) + void ADDPS(X64Reg regOp, OpArg arg); + void ADDPD(X64Reg regOp, OpArg arg); + void SUBPS(X64Reg regOp, OpArg arg); + void SUBPD(X64Reg regOp, OpArg arg); + void CMPPS(X64Reg regOp, OpArg arg, u8 compare); + void CMPPD(X64Reg regOp, OpArg arg, u8 compare); + void MULPS(X64Reg regOp, OpArg arg); + void MULPD(X64Reg regOp, OpArg arg); + void DIVPS(X64Reg regOp, OpArg arg); + void DIVPD(X64Reg regOp, OpArg arg); + void MINPS(X64Reg regOp, OpArg arg); + void MINPD(X64Reg regOp, OpArg arg); + void MAXPS(X64Reg regOp, OpArg arg); + void MAXPD(X64Reg regOp, OpArg arg); + void SQRTPS(X64Reg regOp, OpArg arg); + void SQRTPD(X64Reg regOp, OpArg arg); + void RCPPS(X64Reg regOp, OpArg arg); + void RSQRTPS(X64Reg regOp, OpArg arg); + + // SSE/SSE2: Floating point packed bitwise (x4 for float, x2 for double) + void ANDPS(X64Reg regOp, OpArg arg); + void ANDPD(X64Reg regOp, OpArg arg); + void ANDNPS(X64Reg regOp, OpArg arg); + void ANDNPD(X64Reg regOp, OpArg arg); + void ORPS(X64Reg regOp, OpArg arg); + void ORPD(X64Reg regOp, OpArg arg); + void XORPS(X64Reg regOp, OpArg arg); + void XORPD(X64Reg regOp, OpArg arg); + + // SSE/SSE2: Shuffle components. These are tricky - see Intel documentation. + void SHUFPS(X64Reg regOp, OpArg arg, u8 shuffle); + void SHUFPD(X64Reg regOp, OpArg arg, u8 shuffle); + + // SSE/SSE2: Useful alternative to shuffle in some cases. + void MOVDDUP(X64Reg regOp, OpArg arg); + + // TODO: Actually implement +#if 0 + // SSE3: Horizontal operations in SIMD registers. Could be useful for various VFPU things like dot products... + void ADDSUBPS(X64Reg dest, OpArg src); + void ADDSUBPD(X64Reg dest, OpArg src); + void HADDPD(X64Reg dest, OpArg src); + void HSUBPS(X64Reg dest, OpArg src); + void HSUBPD(X64Reg dest, OpArg src); + + // SSE4: Further horizontal operations - dot products. 
These are weirdly flexible, the arg contains both a read mask and a write "mask". + void DPPD(X64Reg dest, OpArg src, u8 arg); + + // These are probably useful for VFPU emulation. + void INSERTPS(X64Reg dest, OpArg src, u8 arg); + void EXTRACTPS(OpArg dest, X64Reg src, u8 arg); +#endif + + // SSE3: Horizontal operations in SIMD registers. Very slow! shufps-based code beats it handily on Ivy. + void HADDPS(X64Reg dest, OpArg src); + + // SSE4: Further horizontal operations - dot products. These are weirdly flexible, the arg contains both a read mask and a write "mask". + void DPPS(X64Reg dest, OpArg src, u8 arg); + + void UNPCKLPS(X64Reg dest, OpArg src); + void UNPCKHPS(X64Reg dest, OpArg src); + void UNPCKLPD(X64Reg dest, OpArg src); + void UNPCKHPD(X64Reg dest, OpArg src); + + // SSE/SSE2: Compares. + void COMISS(X64Reg regOp, OpArg arg); + void COMISD(X64Reg regOp, OpArg arg); + void UCOMISS(X64Reg regOp, OpArg arg); + void UCOMISD(X64Reg regOp, OpArg arg); + + // SSE/SSE2: Moves. Use the right data type for your data, in most cases. 
+ void MOVAPS(X64Reg regOp, OpArg arg); + void MOVAPD(X64Reg regOp, OpArg arg); + void MOVAPS(OpArg arg, X64Reg regOp); + void MOVAPD(OpArg arg, X64Reg regOp); + + void MOVUPS(X64Reg regOp, OpArg arg); + void MOVUPD(X64Reg regOp, OpArg arg); + void MOVUPS(OpArg arg, X64Reg regOp); + void MOVUPD(OpArg arg, X64Reg regOp); + + void MOVDQA(X64Reg regOp, OpArg arg); + void MOVDQA(OpArg arg, X64Reg regOp); + void MOVDQU(X64Reg regOp, OpArg arg); + void MOVDQU(OpArg arg, X64Reg regOp); + + void MOVSS(X64Reg regOp, OpArg arg); + void MOVSD(X64Reg regOp, OpArg arg); + void MOVSS(OpArg arg, X64Reg regOp); + void MOVSD(OpArg arg, X64Reg regOp); + + void MOVLPS(X64Reg regOp, OpArg arg); + void MOVLPD(X64Reg regOp, OpArg arg); + void MOVLPS(OpArg arg, X64Reg regOp); + void MOVLPD(OpArg arg, X64Reg regOp); + + void MOVHPS(X64Reg regOp, OpArg arg); + void MOVHPD(X64Reg regOp, OpArg arg); + void MOVHPS(OpArg arg, X64Reg regOp); + void MOVHPD(OpArg arg, X64Reg regOp); + + void MOVHLPS(X64Reg regOp1, X64Reg regOp2); + void MOVLHPS(X64Reg regOp1, X64Reg regOp2); + + void MOVD_xmm(X64Reg dest, const OpArg &arg); + void MOVQ_xmm(X64Reg dest, OpArg arg); + void MOVD_xmm(const OpArg &arg, X64Reg src); + void MOVQ_xmm(OpArg arg, X64Reg src); + + // SSE/SSE2: Generates a mask from the high bits of the components of the packed register in question. + void MOVMSKPS(X64Reg dest, OpArg arg); + void MOVMSKPD(X64Reg dest, OpArg arg); + + // SSE2: Selective byte store, mask in src register. EDI/RDI specifies store address. This is a weird one. + void MASKMOVDQU(X64Reg dest, X64Reg src); + void LDDQU(X64Reg dest, OpArg src); + + // SSE/SSE2: Data type conversions. 
+ void CVTPS2PD(X64Reg dest, OpArg src); + void CVTPD2PS(X64Reg dest, OpArg src); + void CVTSS2SD(X64Reg dest, OpArg src); + void CVTSI2SS(X64Reg dest, OpArg src); + void CVTSD2SS(X64Reg dest, OpArg src); + void CVTSI2SD(X64Reg dest, OpArg src); + void CVTDQ2PD(X64Reg regOp, OpArg arg); + void CVTPD2DQ(X64Reg regOp, OpArg arg); + void CVTDQ2PS(X64Reg regOp, OpArg arg); + void CVTPS2DQ(X64Reg regOp, OpArg arg); + + void CVTTPS2DQ(X64Reg regOp, OpArg arg); + void CVTTPD2DQ(X64Reg regOp, OpArg arg); + + // Destinations are X64 regs (rax, rbx, ...) for these instructions. + void CVTSS2SI(X64Reg xregdest, OpArg src); + void CVTSD2SI(X64Reg xregdest, OpArg src); + void CVTTSS2SI(X64Reg xregdest, OpArg arg); + void CVTTSD2SI(X64Reg xregdest, OpArg arg); + + // SSE2: Packed integer instructions + void PACKSSDW(X64Reg dest, OpArg arg); + void PACKSSWB(X64Reg dest, OpArg arg); + void PACKUSDW(X64Reg dest, OpArg arg); + void PACKUSWB(X64Reg dest, OpArg arg); + + void PUNPCKLBW(X64Reg dest, const OpArg &arg); + void PUNPCKLWD(X64Reg dest, const OpArg &arg); + void PUNPCKLDQ(X64Reg dest, const OpArg &arg); + void PUNPCKLQDQ(X64Reg dest, const OpArg &arg); + + void PTEST(X64Reg dest, OpArg arg); + void PAND(X64Reg dest, OpArg arg); + void PANDN(X64Reg dest, OpArg arg); + void PXOR(X64Reg dest, OpArg arg); + void POR(X64Reg dest, OpArg arg); + + void PADDB(X64Reg dest, OpArg arg); + void PADDW(X64Reg dest, OpArg arg); + void PADDD(X64Reg dest, OpArg arg); + void PADDQ(X64Reg dest, OpArg arg); + + void PADDSB(X64Reg dest, OpArg arg); + void PADDSW(X64Reg dest, OpArg arg); + void PADDUSB(X64Reg dest, OpArg arg); + void PADDUSW(X64Reg dest, OpArg arg); + + void PSUBB(X64Reg dest, OpArg arg); + void PSUBW(X64Reg dest, OpArg arg); + void PSUBD(X64Reg dest, OpArg arg); + void PSUBQ(X64Reg dest, OpArg arg); + + void PSUBSB(X64Reg dest, OpArg arg); + void PSUBSW(X64Reg dest, OpArg arg); + void PSUBUSB(X64Reg dest, OpArg arg); + void PSUBUSW(X64Reg dest, OpArg arg); + + void PAVGB(X64Reg 
dest, OpArg arg); + void PAVGW(X64Reg dest, OpArg arg); + + void PCMPEQB(X64Reg dest, OpArg arg); + void PCMPEQW(X64Reg dest, OpArg arg); + void PCMPEQD(X64Reg dest, OpArg arg); + + void PCMPGTB(X64Reg dest, OpArg arg); + void PCMPGTW(X64Reg dest, OpArg arg); + void PCMPGTD(X64Reg dest, OpArg arg); + + void PEXTRW(X64Reg dest, OpArg arg, u8 subreg); + void PINSRW(X64Reg dest, OpArg arg, u8 subreg); + + void PMADDWD(X64Reg dest, OpArg arg); + void PSADBW(X64Reg dest, OpArg arg); + + void PMAXSW(X64Reg dest, OpArg arg); + void PMAXUB(X64Reg dest, OpArg arg); + void PMINSW(X64Reg dest, OpArg arg); + void PMINUB(X64Reg dest, OpArg arg); + // SSE4: More MAX/MIN instructions. + void PMINSB(X64Reg dest, OpArg arg); + void PMINSD(X64Reg dest, OpArg arg); + void PMINUW(X64Reg dest, OpArg arg); + void PMINUD(X64Reg dest, OpArg arg); + void PMAXSB(X64Reg dest, OpArg arg); + void PMAXSD(X64Reg dest, OpArg arg); + void PMAXUW(X64Reg dest, OpArg arg); + void PMAXUD(X64Reg dest, OpArg arg); + + void PMOVMSKB(X64Reg dest, OpArg arg); + void PSHUFD(X64Reg dest, OpArg arg, u8 shuffle); + void PSHUFB(X64Reg dest, OpArg arg); + + void PSHUFLW(X64Reg dest, OpArg arg, u8 shuffle); + void PSHUFHW(X64Reg dest, OpArg arg, u8 shuffle); + + void PSRLW(X64Reg reg, int shift); + void PSRLD(X64Reg reg, int shift); + void PSRLQ(X64Reg reg, int shift); + void PSRLQ(X64Reg reg, OpArg arg); + void PSRLDQ(X64Reg reg, int shift); + + void PSLLW(X64Reg reg, int shift); + void PSLLD(X64Reg reg, int shift); + void PSLLQ(X64Reg reg, int shift); + void PSLLDQ(X64Reg reg, int shift); + + void PSRAW(X64Reg reg, int shift); + void PSRAD(X64Reg reg, int shift); + + // SSE4: data type conversions + void PMOVSXBW(X64Reg dest, OpArg arg); + void PMOVSXBD(X64Reg dest, OpArg arg); + void PMOVSXBQ(X64Reg dest, OpArg arg); + void PMOVSXWD(X64Reg dest, OpArg arg); + void PMOVSXWQ(X64Reg dest, OpArg arg); + void PMOVSXDQ(X64Reg dest, OpArg arg); + void PMOVZXBW(X64Reg dest, OpArg arg); + void PMOVZXBD(X64Reg dest, 
OpArg arg); + void PMOVZXBQ(X64Reg dest, OpArg arg); + void PMOVZXWD(X64Reg dest, OpArg arg); + void PMOVZXWQ(X64Reg dest, OpArg arg); + void PMOVZXDQ(X64Reg dest, OpArg arg); + + // SSE4: variable blend instructions (xmm0 implicit argument) + void PBLENDVB(X64Reg dest, OpArg arg); + void BLENDVPS(X64Reg dest, OpArg arg); + void BLENDVPD(X64Reg dest, OpArg arg); + void BLENDPS(X64Reg dest, const OpArg& arg, u8 blend); + void BLENDPD(X64Reg dest, const OpArg& arg, u8 blend); + + // SSE4: rounding (see FloatRound for mode or use ROUNDNEARSS, etc. helpers.) + void ROUNDSS(X64Reg dest, OpArg arg, u8 mode); + void ROUNDSD(X64Reg dest, OpArg arg, u8 mode); + void ROUNDPS(X64Reg dest, OpArg arg, u8 mode); + void ROUNDPD(X64Reg dest, OpArg arg, u8 mode); + + inline void ROUNDNEARSS(X64Reg dest, OpArg arg) { ROUNDSS(dest, arg, FROUND_NEAREST); } + inline void ROUNDFLOORSS(X64Reg dest, OpArg arg) { ROUNDSS(dest, arg, FROUND_FLOOR); } + inline void ROUNDCEILSS(X64Reg dest, OpArg arg) { ROUNDSS(dest, arg, FROUND_CEIL); } + inline void ROUNDZEROSS(X64Reg dest, OpArg arg) { ROUNDSS(dest, arg, FROUND_ZERO); } + + inline void ROUNDNEARSD(X64Reg dest, OpArg arg) { ROUNDSD(dest, arg, FROUND_NEAREST); } + inline void ROUNDFLOORSD(X64Reg dest, OpArg arg) { ROUNDSD(dest, arg, FROUND_FLOOR); } + inline void ROUNDCEILSD(X64Reg dest, OpArg arg) { ROUNDSD(dest, arg, FROUND_CEIL); } + inline void ROUNDZEROSD(X64Reg dest, OpArg arg) { ROUNDSD(dest, arg, FROUND_ZERO); } + + inline void ROUNDNEARPS(X64Reg dest, OpArg arg) { ROUNDPS(dest, arg, FROUND_NEAREST); } + inline void ROUNDFLOORPS(X64Reg dest, OpArg arg) { ROUNDPS(dest, arg, FROUND_FLOOR); } + inline void ROUNDCEILPS(X64Reg dest, OpArg arg) { ROUNDPS(dest, arg, FROUND_CEIL); } + inline void ROUNDZEROPS(X64Reg dest, OpArg arg) { ROUNDPS(dest, arg, FROUND_ZERO); } + + inline void ROUNDNEARPD(X64Reg dest, OpArg arg) { ROUNDPD(dest, arg, FROUND_NEAREST); } + inline void ROUNDFLOORPD(X64Reg dest, OpArg arg) { ROUNDPD(dest, arg, 
FROUND_FLOOR); } + inline void ROUNDCEILPD(X64Reg dest, OpArg arg) { ROUNDPD(dest, arg, FROUND_CEIL); } + inline void ROUNDZEROPD(X64Reg dest, OpArg arg) { ROUNDPD(dest, arg, FROUND_ZERO); } + + // AVX + void VADDSD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VSUBSD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VMULSD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VDIVSD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VADDPD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VSUBPD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VMULPD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VDIVPD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VSQRTSD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VSHUFPD(X64Reg regOp1, X64Reg regOp2, OpArg arg, u8 shuffle); + void VUNPCKLPD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VUNPCKHPD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + + void VANDPS(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VANDPD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VANDNPS(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VANDNPD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VORPS(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VORPD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VXORPS(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VXORPD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + + void VPAND(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VPANDN(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VPOR(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VPXOR(X64Reg regOp1, X64Reg regOp2, OpArg arg); + + // FMA3 + void VFMADD132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFMADD213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFMADD231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFMADD132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFMADD213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFMADD231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFMADD132SS(X64Reg regOp1, X64Reg regOp2, OpArg 
arg); + void VFMADD213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFMADD231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFMADD132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFMADD213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFMADD231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFMSUB132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFMSUB213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFMSUB231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFMSUB132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFMSUB213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFMSUB231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFMSUB132SS(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFMSUB213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFMSUB231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFMSUB132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFMSUB213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFMSUB231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFNMADD132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFNMADD213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFNMADD231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFNMADD132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFNMADD213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFNMADD231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFNMADD132SS(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFNMADD213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFNMADD231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFNMADD132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFNMADD213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFNMADD231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFNMSUB132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFNMSUB213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFNMSUB231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFNMSUB132PD(X64Reg 
regOp1, X64Reg regOp2, OpArg arg); + void VFNMSUB213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFNMSUB231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFNMSUB132SS(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFNMSUB213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFNMSUB231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFNMSUB132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFNMSUB213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFNMSUB231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFMADDSUB132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFMADDSUB213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFMADDSUB231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFMADDSUB132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFMADDSUB213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFMADDSUB231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFMSUBADD132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFMSUBADD213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFMSUBADD231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFMSUBADD132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFMSUBADD213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFMSUBADD231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + + // VEX GPR instructions + void SARX(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2); + void SHLX(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2); + void SHRX(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2); + void RORX(int bits, X64Reg regOp, OpArg arg, u8 rotate); + void PEXT(int bits, X64Reg regOp1, X64Reg regOp2, OpArg arg); + void PDEP(int bits, X64Reg regOp1, X64Reg regOp2, OpArg arg); + void MULX(int bits, X64Reg regOp1, X64Reg regOp2, OpArg arg); + void BZHI(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2); + void BLSR(int bits, X64Reg regOp, OpArg arg); + void BLSMSK(int bits, X64Reg regOp, OpArg arg); + void BLSI(int bits, X64Reg regOp, OpArg arg); + void BEXTR(int 
bits, X64Reg regOp1, OpArg arg, X64Reg regOp2); + void ANDN(int bits, X64Reg regOp1, X64Reg regOp2, OpArg arg); + + void RDTSC(); + + // Utility functions + // The difference between this and CALL is that this aligns the stack + // where appropriate. + void ABI_CallFunction(const void *func); + template <typename T> + void ABI_CallFunction(T (*func)()) { + ABI_CallFunction((const void *)func); + } + + void ABI_CallFunction(const u8 *func) { + ABI_CallFunction((const void *)func); + } + void ABI_CallFunctionC16(const void *func, u16 param1); + void ABI_CallFunctionCC16(const void *func, u32 param1, u16 param2); + + + // These only support u32 parameters, but that's enough for a lot of uses. + // These will destroy the 1 or 2 first "parameter regs". + void ABI_CallFunctionC(const void *func, u32 param1); + void ABI_CallFunctionCC(const void *func, u32 param1, u32 param2); + void ABI_CallFunctionCCC(const void *func, u32 param1, u32 param2, u32 param3); + void ABI_CallFunctionCCP(const void *func, u32 param1, u32 param2, void *param3); + void ABI_CallFunctionCCCP(const void *func, u32 param1, u32 param2, u32 param3, void *param4); + void ABI_CallFunctionP(const void *func, void *param1); + void ABI_CallFunctionPA(const void *func, void *param1, const Gen::OpArg &arg2); + void ABI_CallFunctionPAA(const void *func, void *param1, const Gen::OpArg &arg2, const Gen::OpArg &arg3); + void ABI_CallFunctionPPC(const void *func, void *param1, void *param2, u32 param3); + void ABI_CallFunctionAC(const void *func, const Gen::OpArg &arg1, u32 param2); + void ABI_CallFunctionACC(const void *func, const Gen::OpArg &arg1, u32 param2, u32 param3); + void ABI_CallFunctionA(const void *func, const Gen::OpArg &arg1); + void ABI_CallFunctionAA(const void *func, const Gen::OpArg &arg1, const Gen::OpArg &arg2); + + // Pass a register as a parameter. 
+ void ABI_CallFunctionR(const void *func, X64Reg reg1); + void ABI_CallFunctionRR(const void *func, X64Reg reg1, X64Reg reg2); + + template <typename Tr, typename T1> + void ABI_CallFunctionC(Tr (*func)(T1), u32 param1) { + ABI_CallFunctionC((const void *)func, param1); + } + + // A function that doesn't have any control over what it will do to regs, + // such as the dispatcher, should be surrounded by these. + void ABI_PushAllCalleeSavedRegsAndAdjustStack(); + void ABI_PopAllCalleeSavedRegsAndAdjustStack(); + + // A function that doesn't know anything about its surroundings, should + // be surrounded by these to establish a safe environment, where it can roam free. + // An example is a backpatch injected function. + void ABI_PushAllCallerSavedRegsAndAdjustStack(); + void ABI_PopAllCallerSavedRegsAndAdjustStack(); + + unsigned int ABI_GetAlignedFrameSize(unsigned int frameSize); + void ABI_AlignStack(unsigned int frameSize); + void ABI_RestoreStack(unsigned int frameSize); + + // Sets up a __cdecl function. + // Only x64 really needs the parameter count. + void ABI_EmitPrologue(int maxCallParams); + void ABI_EmitEpilogue(int maxCallParams); + + #ifdef _M_IX86 + inline int ABI_GetNumXMMRegs() { return 8; } + #else + inline int ABI_GetNumXMMRegs() { return 16; } + #endif +}; // class XEmitter + + +// Everything that needs to generate X86 code should inherit from this. +// You get memory management for free, plus, you can use all the MOV etc functions without +// having to prefix them with gen-> or something similar. 
+ +class XCodeBlock : public CodeBlock<XEmitter> { +public: + void PoisonMemory() override; +}; + +} // namespace diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt index ba9af2a1f..6cc60fd58 100644 --- a/src/core/CMakeLists.txt +++ b/src/core/CMakeLists.txt @@ -4,9 +4,8 @@ set(SRCS arm/dyncom/arm_dyncom.cpp arm/dyncom/arm_dyncom_dec.cpp arm/dyncom/arm_dyncom_interpreter.cpp - arm/dyncom/arm_dyncom_run.cpp arm/dyncom/arm_dyncom_thumb.cpp - arm/skyeye_common/arminit.cpp + arm/skyeye_common/armstate.cpp arm/skyeye_common/armsupp.cpp arm/skyeye_common/vfp/vfp.cpp arm/skyeye_common/vfp/vfpdouble.cpp @@ -133,7 +132,6 @@ set(HEADERS arm/dyncom/arm_dyncom_thumb.h arm/skyeye_common/arm_regformat.h arm/skyeye_common/armstate.h - arm/skyeye_common/armmmu.h arm/skyeye_common/armsupp.h arm/skyeye_common/vfp/asm_vfp.h arm/skyeye_common/vfp/vfp.h diff --git a/src/core/arm/arm_interface.h b/src/core/arm/arm_interface.h index 85ed2c698..5cffe513c 100644 --- a/src/core/arm/arm_interface.h +++ b/src/core/arm/arm_interface.h @@ -62,6 +62,34 @@ public: virtual void SetReg(int index, u32 value) = 0; /** + * Gets the value of a VFP register + * @param index Register index (0-31) + * @return Returns the value in the register + */ + virtual u32 GetVFPReg(int index) const = 0; + + /** + * Sets a VFP register to the given value + * @param index Register index (0-31) + * @param value Value to set register to + */ + virtual void SetVFPReg(int index, u32 value) = 0; + + /** + * Gets the current value within a given VFP system register + * @param reg The VFP system register + * @return The value within the VFP system register + */ + virtual u32 GetVFPSystemReg(VFPSystemRegister reg) const = 0; + + /** + * Sets the VFP system register to the given value + * @param reg The VFP system register + * @param value Value to set the VFP system register to + */ + virtual void SetVFPSystemReg(VFPSystemRegister reg, u32 value) = 0; + + /** * Get the current CPSR register * @return Returns the 
value of the CPSR register */ diff --git a/src/core/arm/disassembler/arm_disasm.cpp b/src/core/arm/disassembler/arm_disasm.cpp index f6d44d85a..77af10b54 100644 --- a/src/core/arm/disassembler/arm_disasm.cpp +++ b/src/core/arm/disassembler/arm_disasm.cpp @@ -1,9 +1,13 @@ // Copyright 2006 The Android Open Source Project #include <string> +#include <unordered_set> +#include "common/common_types.h" #include "common/string_util.h" + #include "core/arm/disassembler/arm_disasm.h" +#include "core/arm/skyeye_common/armsupp.h" static const char *cond_names[] = { "eq", @@ -37,6 +41,7 @@ static const char *opcode_names[] = { "blx", "bx", "cdp", + "clrex", "clz", "cmn", "cmp", @@ -46,6 +51,10 @@ static const char *opcode_names[] = { "ldr", "ldrb", "ldrbt", + "ldrex", + "ldrexb", + "ldrexd", + "ldrexh", "ldrh", "ldrsb", "ldrsh", @@ -58,28 +67,105 @@ static const char *opcode_names[] = { "msr", "mul", "mvn", + "nop", "orr", + "pkh", "pld", + "qadd16", + "qadd8", + "qasx", + "qsax", + "qsub16", + "qsub8", + "rev", + "rev16", + "revsh", "rsb", "rsc", + "sadd16", + "sadd8", + "sasx", "sbc", + "sel", + "sev", + "shadd16", + "shadd8", + "shasx", + "shsax", + "shsub16", + "shsub8", + "smlad", "smlal", + "smlald", + "smlsd", + "smlsld", + "smmla", + "smmls", + "smmul", + "smuad", "smull", + "smusd", + "ssat", + "ssat16", + "ssax", + "ssub16", + "ssub8", "stc", "stm", "str", "strb", "strbt", + "strex", + "strexb", + "strexd", + "strexh", "strh", "strt", "sub", "swi", "swp", "swpb", + "sxtab", + "sxtab16", + "sxtah", + "sxtb", + "sxtb16", + "sxth", "teq", "tst", + "uadd16", + "uadd8", + "uasx", + "uhadd16", + "uhadd8", + "uhasx", + "uhsax", + "uhsub16", + "uhsub8", "umlal", "umull", + "uqadd16", + "uqadd8", + "uqasx", + "uqsax", + "uqsub16", + "uqsub8", + "usad8", + "usada8", + "usat", + "usat16", + "usax", + "usub16", + "usub8", + "uxtab", + "uxtab16", + "uxtah", + "uxtb", + "uxtb16", + "uxth", + "wfe", + "wfi", + "yield", "undefined", "adc", @@ -131,11 +217,11 @@ static const char 
*shift_names[] = { "ROR" }; -static const char* cond_to_str(uint32_t cond) { +static const char* cond_to_str(u32 cond) { return cond_names[cond]; } -std::string ARM_Disasm::Disassemble(uint32_t addr, uint32_t insn) +std::string ARM_Disasm::Disassemble(u32 addr, u32 insn) { Opcode opcode = Decode(insn); switch (opcode) { @@ -172,6 +258,8 @@ std::string ARM_Disasm::Disassemble(uint32_t addr, uint32_t insn) return DisassembleBX(insn); case OP_CDP: return "cdp"; + case OP_CLREX: + return "clrex"; case OP_CLZ: return DisassembleCLZ(insn); case OP_LDC: @@ -188,6 +276,15 @@ std::string ARM_Disasm::Disassemble(uint32_t addr, uint32_t insn) case OP_STRBT: case OP_STRT: return DisassembleMem(insn); + case OP_LDREX: + case OP_LDREXB: + case OP_LDREXD: + case OP_LDREXH: + case OP_STREX: + case OP_STREXB: + case OP_STREXD: + case OP_STREXH: + return DisassembleREX(opcode, insn); case OP_LDRH: case OP_LDRSB: case OP_LDRSH: @@ -204,8 +301,76 @@ std::string ARM_Disasm::Disassemble(uint32_t addr, uint32_t insn) return DisassembleMSR(insn); case OP_MUL: return DisassembleMUL(opcode, insn); + case OP_NOP: + case OP_SEV: + case OP_WFE: + case OP_WFI: + case OP_YIELD: + return DisassembleNoOperands(opcode, insn); + case OP_PKH: + return DisassemblePKH(insn); case OP_PLD: return DisassemblePLD(insn); + case OP_QADD16: + case OP_QADD8: + case OP_QASX: + case OP_QSAX: + case OP_QSUB16: + case OP_QSUB8: + case OP_SADD16: + case OP_SADD8: + case OP_SASX: + case OP_SHADD16: + case OP_SHADD8: + case OP_SHASX: + case OP_SHSAX: + case OP_SHSUB16: + case OP_SHSUB8: + case OP_SSAX: + case OP_SSUB16: + case OP_SSUB8: + case OP_UADD16: + case OP_UADD8: + case OP_UASX: + case OP_UHADD16: + case OP_UHADD8: + case OP_UHASX: + case OP_UHSAX: + case OP_UHSUB16: + case OP_UHSUB8: + case OP_UQADD16: + case OP_UQADD8: + case OP_UQASX: + case OP_UQSAX: + case OP_UQSUB16: + case OP_UQSUB8: + case OP_USAX: + case OP_USUB16: + case OP_USUB8: + return DisassembleParallelAddSub(opcode, insn); + case OP_REV: + 
case OP_REV16: + case OP_REVSH: + return DisassembleREV(opcode, insn); + case OP_SEL: + return DisassembleSEL(insn); + case OP_SMLAD: + case OP_SMLALD: + case OP_SMLSD: + case OP_SMLSLD: + case OP_SMMLA: + case OP_SMMLS: + case OP_SMMUL: + case OP_SMUAD: + case OP_SMUSD: + case OP_USAD8: + case OP_USADA8: + return DisassembleMediaMulDiv(opcode, insn); + case OP_SSAT: + case OP_SSAT16: + case OP_USAT: + case OP_USAT16: + return DisassembleSAT(opcode, insn); case OP_STC: return "stc"; case OP_SWI: @@ -213,6 +378,19 @@ std::string ARM_Disasm::Disassemble(uint32_t addr, uint32_t insn) case OP_SWP: case OP_SWPB: return DisassembleSWP(opcode, insn); + case OP_SXTAB: + case OP_SXTAB16: + case OP_SXTAH: + case OP_SXTB: + case OP_SXTB16: + case OP_SXTH: + case OP_UXTAB: + case OP_UXTAB16: + case OP_UXTAH: + case OP_UXTB: + case OP_UXTB16: + case OP_UXTH: + return DisassembleXT(opcode, insn); case OP_UMLAL: case OP_UMULL: case OP_SMLAL: @@ -224,22 +402,22 @@ std::string ARM_Disasm::Disassemble(uint32_t addr, uint32_t insn) return NULL; } -std::string ARM_Disasm::DisassembleALU(Opcode opcode, uint32_t insn) +std::string ARM_Disasm::DisassembleALU(Opcode opcode, u32 insn) { - static const uint8_t kNoOperand1 = 1; - static const uint8_t kNoDest = 2; - static const uint8_t kNoSbit = 4; + static const u8 kNoOperand1 = 1; + static const u8 kNoDest = 2; + static const u8 kNoSbit = 4; std::string rn_str; std::string rd_str; - uint8_t flags = 0; - uint8_t cond = (insn >> 28) & 0xf; - uint8_t is_immed = (insn >> 25) & 0x1; - uint8_t bit_s = (insn >> 20) & 1; - uint8_t rn = (insn >> 16) & 0xf; - uint8_t rd = (insn >> 12) & 0xf; - uint8_t immed = insn & 0xff; + u8 flags = 0; + u8 cond = (insn >> 28) & 0xf; + u8 is_immed = (insn >> 25) & 0x1; + u8 bit_s = (insn >> 20) & 1; + u8 rn = (insn >> 16) & 0xf; + u8 rd = (insn >> 12) & 0xf; + u8 immed = insn & 0xff; const char* opname = opcode_names[opcode]; switch (opcode) { @@ -279,14 +457,14 @@ std::string ARM_Disasm::DisassembleALU(Opcode 
opcode, uint32_t insn) opname, cond_to_str(cond), sbit_str, rd_str.c_str(), rn_str.c_str(), immed, immed); } - uint8_t shift_is_reg = (insn >> 4) & 1; - uint8_t rotate = (insn >> 8) & 0xf; - uint8_t rm = insn & 0xf; - uint8_t shift_type = (insn >> 5) & 0x3; - uint8_t rs = (insn >> 8) & 0xf; - uint8_t shift_amount = (insn >> 7) & 0x1f; - uint32_t rotated_val = immed; - uint8_t rotate2 = rotate << 1; + u8 shift_is_reg = (insn >> 4) & 1; + u8 rotate = (insn >> 8) & 0xf; + u8 rm = insn & 0xf; + u8 shift_type = (insn >> 5) & 0x3; + u8 rs = (insn >> 8) & 0xf; + u8 shift_amount = (insn >> 7) & 0x1f; + u32 rotated_val = immed; + u8 rotate2 = rotate << 1; rotated_val = (rotated_val >> rotate2) | (rotated_val << (32 - rotate2)); if (!shift_is_reg && shift_type == 0 && shift_amount == 0) { @@ -312,10 +490,10 @@ std::string ARM_Disasm::DisassembleALU(Opcode opcode, uint32_t insn) shift_name, shift_amount); } -std::string ARM_Disasm::DisassembleBranch(uint32_t addr, Opcode opcode, uint32_t insn) +std::string ARM_Disasm::DisassembleBranch(u32 addr, Opcode opcode, u32 insn) { - uint8_t cond = (insn >> 28) & 0xf; - uint32_t offset = insn & 0xffffff; + u8 cond = (insn >> 28) & 0xf; + u32 offset = insn & 0xffffff; // Sign-extend the 24-bit offset if ((offset >> 23) & 1) offset |= 0xff000000; @@ -328,39 +506,71 @@ std::string ARM_Disasm::DisassembleBranch(uint32_t addr, Opcode opcode, uint32_t return Common::StringFromFormat("%s%s\t0x%x", opname, cond_to_str(cond), addr); } -std::string ARM_Disasm::DisassembleBX(uint32_t insn) +std::string ARM_Disasm::DisassembleBX(u32 insn) { - uint8_t cond = (insn >> 28) & 0xf; - uint8_t rn = insn & 0xf; + u8 cond = (insn >> 28) & 0xf; + u8 rn = insn & 0xf; return Common::StringFromFormat("bx%s\tr%d", cond_to_str(cond), rn); } -std::string ARM_Disasm::DisassembleBKPT(uint32_t insn) +std::string ARM_Disasm::DisassembleBKPT(u32 insn) { - uint8_t cond = (insn >> 28) & 0xf; - uint32_t immed = (((insn >> 8) & 0xfff) << 4) | (insn & 0xf); + u8 cond = 
(insn >> 28) & 0xf; + u32 immed = (((insn >> 8) & 0xfff) << 4) | (insn & 0xf); return Common::StringFromFormat("bkpt%s\t#%d", cond_to_str(cond), immed); } -std::string ARM_Disasm::DisassembleCLZ(uint32_t insn) +std::string ARM_Disasm::DisassembleCLZ(u32 insn) { - uint8_t cond = (insn >> 28) & 0xf; - uint8_t rd = (insn >> 12) & 0xf; - uint8_t rm = insn & 0xf; + u8 cond = (insn >> 28) & 0xf; + u8 rd = (insn >> 12) & 0xf; + u8 rm = insn & 0xf; return Common::StringFromFormat("clz%s\tr%d, r%d", cond_to_str(cond), rd, rm); } -std::string ARM_Disasm::DisassembleMemblock(Opcode opcode, uint32_t insn) +std::string ARM_Disasm::DisassembleMediaMulDiv(Opcode opcode, u32 insn) { + u32 cond = BITS(insn, 28, 31); + u32 rd = BITS(insn, 16, 19); + u32 ra = BITS(insn, 12, 15); + u32 rm = BITS(insn, 8, 11); + u32 m = BIT(insn, 5); + u32 rn = BITS(insn, 0, 3); + + std::string cross = ""; + if (m) { + if (opcode == OP_SMMLA || opcode == OP_SMMUL || opcode == OP_SMMLS) + cross = "r"; + else + cross = "x"; + } + + std::string ext_reg = ""; + std::unordered_set<Opcode, std::hash<int>> with_ext_reg = { + OP_SMLAD, OP_SMLSD, OP_SMMLA, OP_SMMLS, OP_USADA8 + }; + if (with_ext_reg.find(opcode) != with_ext_reg.end()) + ext_reg = Common::StringFromFormat(", r%u", ra); + + std::string rd_low = ""; + if (opcode == OP_SMLALD || opcode == OP_SMLSLD) + rd_low = Common::StringFromFormat("r%u, ", ra); + + return Common::StringFromFormat("%s%s%s\t%sr%u, r%u, r%u%s", opcode_names[opcode], + cross.c_str(), cond_to_str(cond), rd_low.c_str(), rd, rn, rm, + ext_reg.c_str()); +} + +std::string ARM_Disasm::DisassembleMemblock(Opcode opcode, u32 insn) { std::string tmp_list; - uint8_t cond = (insn >> 28) & 0xf; - uint8_t write_back = (insn >> 21) & 0x1; - uint8_t bit_s = (insn >> 22) & 0x1; - uint8_t is_up = (insn >> 23) & 0x1; - uint8_t is_pre = (insn >> 24) & 0x1; - uint8_t rn = (insn >> 16) & 0xf; - uint16_t reg_list = insn & 0xffff; + u8 cond = (insn >> 28) & 0xf; + u8 write_back = (insn >> 21) & 0x1; + u8 
bit_s = (insn >> 22) & 0x1; + u8 is_up = (insn >> 23) & 0x1; + u8 is_pre = (insn >> 24) & 0x1; + u8 rn = (insn >> 16) & 0xf; + u16 reg_list = insn & 0xffff; const char *opname = opcode_names[opcode]; @@ -400,18 +610,18 @@ std::string ARM_Disasm::DisassembleMemblock(Opcode opcode, uint32_t insn) opname, cond_to_str(cond), addr_mode, rn, bang, tmp_list.c_str(), carret); } -std::string ARM_Disasm::DisassembleMem(uint32_t insn) +std::string ARM_Disasm::DisassembleMem(u32 insn) { - uint8_t cond = (insn >> 28) & 0xf; - uint8_t is_reg = (insn >> 25) & 0x1; - uint8_t is_load = (insn >> 20) & 0x1; - uint8_t write_back = (insn >> 21) & 0x1; - uint8_t is_byte = (insn >> 22) & 0x1; - uint8_t is_up = (insn >> 23) & 0x1; - uint8_t is_pre = (insn >> 24) & 0x1; - uint8_t rn = (insn >> 16) & 0xf; - uint8_t rd = (insn >> 12) & 0xf; - uint16_t offset = insn & 0xfff; + u8 cond = (insn >> 28) & 0xf; + u8 is_reg = (insn >> 25) & 0x1; + u8 is_load = (insn >> 20) & 0x1; + u8 write_back = (insn >> 21) & 0x1; + u8 is_byte = (insn >> 22) & 0x1; + u8 is_up = (insn >> 23) & 0x1; + u8 is_pre = (insn >> 24) & 0x1; + u8 rn = (insn >> 16) & 0xf; + u8 rd = (insn >> 12) & 0xf; + u16 offset = insn & 0xfff; const char *opname = "ldr"; if (!is_load) @@ -448,9 +658,9 @@ std::string ARM_Disasm::DisassembleMem(uint32_t insn) } } - uint8_t rm = insn & 0xf; - uint8_t shift_type = (insn >> 5) & 0x3; - uint8_t shift_amount = (insn >> 7) & 0x1f; + u8 rm = insn & 0xf; + u8 shift_type = (insn >> 5) & 0x3; + u8 shift_amount = (insn >> 7) & 0x1f; const char *shift_name = shift_names[shift_type]; @@ -492,19 +702,19 @@ std::string ARM_Disasm::DisassembleMem(uint32_t insn) shift_name, shift_amount); } -std::string ARM_Disasm::DisassembleMemHalf(uint32_t insn) +std::string ARM_Disasm::DisassembleMemHalf(u32 insn) { - uint8_t cond = (insn >> 28) & 0xf; - uint8_t is_load = (insn >> 20) & 0x1; - uint8_t write_back = (insn >> 21) & 0x1; - uint8_t is_immed = (insn >> 22) & 0x1; - uint8_t is_up = (insn >> 23) & 0x1; - 
uint8_t is_pre = (insn >> 24) & 0x1; - uint8_t rn = (insn >> 16) & 0xf; - uint8_t rd = (insn >> 12) & 0xf; - uint8_t bits_65 = (insn >> 5) & 0x3; - uint8_t rm = insn & 0xf; - uint8_t offset = (((insn >> 8) & 0xf) << 4) | (insn & 0xf); + u8 cond = (insn >> 28) & 0xf; + u8 is_load = (insn >> 20) & 0x1; + u8 write_back = (insn >> 21) & 0x1; + u8 is_immed = (insn >> 22) & 0x1; + u8 is_up = (insn >> 23) & 0x1; + u8 is_pre = (insn >> 24) & 0x1; + u8 rn = (insn >> 16) & 0xf; + u8 rd = (insn >> 12) & 0xf; + u8 bits_65 = (insn >> 5) & 0x3; + u8 rm = insn & 0xf; + u8 offset = (((insn >> 8) & 0xf) << 4) | (insn & 0xf); const char *opname = "ldr"; if (is_load == 0) @@ -548,78 +758,78 @@ std::string ARM_Disasm::DisassembleMemHalf(uint32_t insn) } } -std::string ARM_Disasm::DisassembleMCR(Opcode opcode, uint32_t insn) +std::string ARM_Disasm::DisassembleMCR(Opcode opcode, u32 insn) { - uint8_t cond = (insn >> 28) & 0xf; - uint8_t crn = (insn >> 16) & 0xf; - uint8_t crd = (insn >> 12) & 0xf; - uint8_t cpnum = (insn >> 8) & 0xf; - uint8_t opcode2 = (insn >> 5) & 0x7; - uint8_t crm = insn & 0xf; + u8 cond = (insn >> 28) & 0xf; + u8 crn = (insn >> 16) & 0xf; + u8 crd = (insn >> 12) & 0xf; + u8 cpnum = (insn >> 8) & 0xf; + u8 opcode2 = (insn >> 5) & 0x7; + u8 crm = insn & 0xf; const char *opname = opcode_names[opcode]; return Common::StringFromFormat("%s%s\t%d, 0, r%d, cr%d, cr%d, {%d}", opname, cond_to_str(cond), cpnum, crd, crn, crm, opcode2); } -std::string ARM_Disasm::DisassembleMLA(Opcode opcode, uint32_t insn) +std::string ARM_Disasm::DisassembleMLA(Opcode opcode, u32 insn) { - uint8_t cond = (insn >> 28) & 0xf; - uint8_t rd = (insn >> 16) & 0xf; - uint8_t rn = (insn >> 12) & 0xf; - uint8_t rs = (insn >> 8) & 0xf; - uint8_t rm = insn & 0xf; - uint8_t bit_s = (insn >> 20) & 1; + u8 cond = (insn >> 28) & 0xf; + u8 rd = (insn >> 16) & 0xf; + u8 rn = (insn >> 12) & 0xf; + u8 rs = (insn >> 8) & 0xf; + u8 rm = insn & 0xf; + u8 bit_s = (insn >> 20) & 1; const char *opname = 
opcode_names[opcode]; return Common::StringFromFormat("%s%s%s\tr%d, r%d, r%d, r%d", opname, cond_to_str(cond), bit_s ? "s" : "", rd, rm, rs, rn); } -std::string ARM_Disasm::DisassembleUMLAL(Opcode opcode, uint32_t insn) +std::string ARM_Disasm::DisassembleUMLAL(Opcode opcode, u32 insn) { - uint8_t cond = (insn >> 28) & 0xf; - uint8_t rdhi = (insn >> 16) & 0xf; - uint8_t rdlo = (insn >> 12) & 0xf; - uint8_t rs = (insn >> 8) & 0xf; - uint8_t rm = insn & 0xf; - uint8_t bit_s = (insn >> 20) & 1; + u8 cond = (insn >> 28) & 0xf; + u8 rdhi = (insn >> 16) & 0xf; + u8 rdlo = (insn >> 12) & 0xf; + u8 rs = (insn >> 8) & 0xf; + u8 rm = insn & 0xf; + u8 bit_s = (insn >> 20) & 1; const char *opname = opcode_names[opcode]; return Common::StringFromFormat("%s%s%s\tr%d, r%d, r%d, r%d", opname, cond_to_str(cond), bit_s ? "s" : "", rdlo, rdhi, rm, rs); } -std::string ARM_Disasm::DisassembleMUL(Opcode opcode, uint32_t insn) +std::string ARM_Disasm::DisassembleMUL(Opcode opcode, u32 insn) { - uint8_t cond = (insn >> 28) & 0xf; - uint8_t rd = (insn >> 16) & 0xf; - uint8_t rs = (insn >> 8) & 0xf; - uint8_t rm = insn & 0xf; - uint8_t bit_s = (insn >> 20) & 1; + u8 cond = (insn >> 28) & 0xf; + u8 rd = (insn >> 16) & 0xf; + u8 rs = (insn >> 8) & 0xf; + u8 rm = insn & 0xf; + u8 bit_s = (insn >> 20) & 1; const char *opname = opcode_names[opcode]; return Common::StringFromFormat("%s%s%s\tr%d, r%d, r%d", opname, cond_to_str(cond), bit_s ? "s" : "", rd, rm, rs); } -std::string ARM_Disasm::DisassembleMRS(uint32_t insn) +std::string ARM_Disasm::DisassembleMRS(u32 insn) { - uint8_t cond = (insn >> 28) & 0xf; - uint8_t rd = (insn >> 12) & 0xf; - uint8_t ps = (insn >> 22) & 1; + u8 cond = (insn >> 28) & 0xf; + u8 rd = (insn >> 12) & 0xf; + u8 ps = (insn >> 22) & 1; return Common::StringFromFormat("mrs%s\tr%d, %s", cond_to_str(cond), rd, ps ? 
"spsr" : "cpsr"); } -std::string ARM_Disasm::DisassembleMSR(uint32_t insn) +std::string ARM_Disasm::DisassembleMSR(u32 insn) { char flags[8]; int flag_index = 0; - uint8_t cond = (insn >> 28) & 0xf; - uint8_t is_immed = (insn >> 25) & 0x1; - uint8_t pd = (insn >> 22) & 1; - uint8_t mask = (insn >> 16) & 0xf; + u8 cond = (insn >> 28) & 0xf; + u8 is_immed = (insn >> 25) & 0x1; + u8 pd = (insn >> 22) & 1; + u8 mask = (insn >> 16) & 0xf; if (mask & 1) flags[flag_index++] = 'c'; @@ -632,36 +842,76 @@ std::string ARM_Disasm::DisassembleMSR(uint32_t insn) flags[flag_index] = 0; if (is_immed) { - uint32_t immed = insn & 0xff; - uint8_t rotate = (insn >> 8) & 0xf; - uint8_t rotate2 = rotate << 1; - uint32_t rotated_val = (immed >> rotate2) | (immed << (32 - rotate2)); + u32 immed = insn & 0xff; + u8 rotate = (insn >> 8) & 0xf; + u8 rotate2 = rotate << 1; + u32 rotated_val = (immed >> rotate2) | (immed << (32 - rotate2)); return Common::StringFromFormat("msr%s\t%s_%s, #0x%x", cond_to_str(cond), pd ? "spsr" : "cpsr", flags, rotated_val); } - uint8_t rm = insn & 0xf; + u8 rm = insn & 0xf; return Common::StringFromFormat("msr%s\t%s_%s, r%d", cond_to_str(cond), pd ? 
"spsr" : "cpsr", flags, rm); } -std::string ARM_Disasm::DisassemblePLD(uint32_t insn) +std::string ARM_Disasm::DisassembleNoOperands(Opcode opcode, u32 insn) +{ + u32 cond = BITS(insn, 28, 31); + return Common::StringFromFormat("%s%s", opcode_names[opcode], cond_to_str(cond)); +} + +std::string ARM_Disasm::DisassembleParallelAddSub(Opcode opcode, u32 insn) { + u32 cond = BITS(insn, 28, 31); + u32 rn = BITS(insn, 16, 19); + u32 rd = BITS(insn, 12, 15); + u32 rm = BITS(insn, 0, 3); + + return Common::StringFromFormat("%s%s\tr%u, r%u, r%u", opcode_names[opcode], cond_to_str(cond), + rd, rn, rm); +} + +std::string ARM_Disasm::DisassemblePKH(u32 insn) +{ + u32 cond = BITS(insn, 28, 31); + u32 rn = BITS(insn, 16, 19); + u32 rd = BITS(insn, 12, 15); + u32 imm5 = BITS(insn, 7, 11); + u32 tb = BIT(insn, 6); + u32 rm = BITS(insn, 0, 3); + + std::string suffix = tb ? "tb" : "bt"; + std::string shift = ""; + + if (tb && imm5 == 0) + imm5 = 32; + + if (imm5 > 0) { + shift = tb ? ", ASR" : ", LSL"; + shift += " #" + std::to_string(imm5); + } + + return Common::StringFromFormat("pkh%s%s\tr%u, r%u, r%u%s", suffix.c_str(), cond_to_str(cond), + rd, rn, rm, shift.c_str()); +} + +std::string ARM_Disasm::DisassemblePLD(u32 insn) { - uint8_t is_reg = (insn >> 25) & 0x1; - uint8_t is_up = (insn >> 23) & 0x1; - uint8_t rn = (insn >> 16) & 0xf; + u8 is_reg = (insn >> 25) & 0x1; + u8 is_up = (insn >> 23) & 0x1; + u8 rn = (insn >> 16) & 0xf; const char *minus = ""; if (is_up == 0) minus = "-"; if (is_reg) { - uint8_t rm = insn & 0xf; + u8 rm = insn & 0xf; return Common::StringFromFormat("pld\t[r%d, %sr%d]", rn, minus, rm); } - uint16_t offset = insn & 0xfff; + u16 offset = insn & 0xfff; if (offset == 0) { return Common::StringFromFormat("pld\t[r%d]", rn); } else { @@ -669,27 +919,128 @@ std::string ARM_Disasm::DisassemblePLD(uint32_t insn) } } -std::string ARM_Disasm::DisassembleSWI(uint32_t insn) +std::string ARM_Disasm::DisassembleREV(Opcode opcode, u32 insn) { + u32 cond = BITS(insn, 28, 
31); + u32 rd = BITS(insn, 12, 15); + u32 rm = BITS(insn, 0, 3); + + return Common::StringFromFormat("%s%s\tr%u, r%u", opcode_names[opcode], cond_to_str(cond), + rd, rm); +} + +std::string ARM_Disasm::DisassembleREX(Opcode opcode, u32 insn) { + u32 rn = BITS(insn, 16, 19); + u32 rd = BITS(insn, 12, 15); + u32 rt = BITS(insn, 0, 3); + u32 cond = BITS(insn, 28, 31); + + switch (opcode) { + case OP_STREX: + case OP_STREXB: + case OP_STREXH: + return Common::StringFromFormat("%s%s\tr%d, r%d, [r%d]", opcode_names[opcode], + cond_to_str(cond), rd, rt, rn); + case OP_STREXD: + return Common::StringFromFormat("%s%s\tr%d, r%d, r%d, [r%d]", opcode_names[opcode], + cond_to_str(cond), rd, rt, rt + 1, rn); + + // for LDREX instructions, rd corresponds to Rt from reference manual + case OP_LDREX: + case OP_LDREXB: + case OP_LDREXH: + return Common::StringFromFormat("%s%s\tr%d, [r%d]", opcode_names[opcode], + cond_to_str(cond), rd, rn); + case OP_LDREXD: + return Common::StringFromFormat("%s%s\tr%d, r%d, [r%d]", opcode_names[opcode], + cond_to_str(cond), rd, rd + 1, rn); + default: + return opcode_names[OP_UNDEFINED]; + } +} + +std::string ARM_Disasm::DisassembleSAT(Opcode opcode, u32 insn) { + u32 cond = BITS(insn, 28, 31); + u32 sat_imm = BITS(insn, 16, 20); + u32 rd = BITS(insn, 12, 15); + u32 imm5 = BITS(insn, 7, 11); + u32 sh = BIT(insn, 6); + u32 rn = BITS(insn, 0, 3); + + std::string shift_part = ""; + bool opcode_has_shift = (opcode == OP_SSAT) || (opcode == OP_USAT); + if (opcode_has_shift && !(sh == 0 && imm5 == 0)) { + if (sh == 0) + shift_part += ", LSL #"; + else + shift_part += ", ASR #"; + + if (imm5 == 0) + imm5 = 32; + shift_part += std::to_string(imm5); + } + + if (opcode == OP_SSAT || opcode == OP_SSAT16) + sat_imm++; + + return Common::StringFromFormat("%s%s\tr%u, #%u, r%u%s", opcode_names[opcode], cond_to_str(cond), rd, + sat_imm, rn, shift_part.c_str()); +} + +std::string ARM_Disasm::DisassembleSEL(u32 insn) { + u32 cond = BITS(insn, 28, 31); + u32 rn = 
BITS(insn, 16, 19); + u32 rd = BITS(insn, 12, 15); + u32 rm = BITS(insn, 0, 3); + + return Common::StringFromFormat("%s%s\tr%u, r%u, r%u", opcode_names[OP_SEL], cond_to_str(cond), + rd, rn, rm); +} + +std::string ARM_Disasm::DisassembleSWI(u32 insn) { - uint8_t cond = (insn >> 28) & 0xf; - uint32_t sysnum = insn & 0x00ffffff; + u8 cond = (insn >> 28) & 0xf; + u32 sysnum = insn & 0x00ffffff; return Common::StringFromFormat("swi%s 0x%x", cond_to_str(cond), sysnum); } -std::string ARM_Disasm::DisassembleSWP(Opcode opcode, uint32_t insn) +std::string ARM_Disasm::DisassembleSWP(Opcode opcode, u32 insn) { - uint8_t cond = (insn >> 28) & 0xf; - uint8_t rn = (insn >> 16) & 0xf; - uint8_t rd = (insn >> 12) & 0xf; - uint8_t rm = insn & 0xf; + u8 cond = (insn >> 28) & 0xf; + u8 rn = (insn >> 16) & 0xf; + u8 rd = (insn >> 12) & 0xf; + u8 rm = insn & 0xf; const char *opname = opcode_names[opcode]; return Common::StringFromFormat("%s%s\tr%d, r%d, [r%d]", opname, cond_to_str(cond), rd, rm, rn); } -Opcode ARM_Disasm::Decode(uint32_t insn) { - uint32_t bits27_26 = (insn >> 26) & 0x3; +std::string ARM_Disasm::DisassembleXT(Opcode opcode, u32 insn) +{ + u32 cond = BITS(insn, 28, 31); + u32 rn = BITS(insn, 16, 19); + u32 rd = BITS(insn, 12, 15); + u32 rotate = BITS(insn, 10, 11); + u32 rm = BITS(insn, 0, 3); + + std::string rn_part = ""; + static std::unordered_set<Opcode, std::hash<int>> extend_with_add = { + OP_SXTAB, OP_SXTAB16, OP_SXTAH, + OP_UXTAB, OP_UXTAB16, OP_UXTAH + }; + if (extend_with_add.find(opcode) != extend_with_add.end()) + rn_part = ", r" + std::to_string(rn); + + std::string rotate_part = ""; + if (rotate != 0) + rotate_part = ", ROR #" + std::to_string(rotate << 3); + + return Common::StringFromFormat("%s%s\tr%u%s, r%u%s", opcode_names[opcode], cond_to_str(cond), + rd, rn_part.c_str(), rm, rotate_part.c_str()); +} + +Opcode ARM_Disasm::Decode(u32 insn) { + u32 bits27_26 = (insn >> 26) & 0x3; switch (bits27_26) { case 0x0: return Decode00(insn); @@ -703,9 +1054,9 @@ 
Opcode ARM_Disasm::Decode(uint32_t insn) { return OP_INVALID; } -Opcode ARM_Disasm::Decode00(uint32_t insn) { - uint8_t bit25 = (insn >> 25) & 0x1; - uint8_t bit4 = (insn >> 4) & 0x1; +Opcode ARM_Disasm::Decode00(u32 insn) { + u8 bit25 = (insn >> 25) & 0x1; + u8 bit4 = (insn >> 4) & 0x1; if (bit25 == 0 && bit4 == 1) { if ((insn & 0x0ffffff0) == 0x012fff10) { // Bx instruction @@ -719,41 +1070,48 @@ Opcode ARM_Disasm::Decode00(uint32_t insn) { // Bkpt instruction return OP_BKPT; } - uint32_t bits7_4 = (insn >> 4) & 0xf; + u32 bits7_4 = (insn >> 4) & 0xf; if (bits7_4 == 0x9) { - if ((insn & 0x0ff00ff0) == 0x01000090) { - // Swp instruction - uint8_t bit22 = (insn >> 22) & 0x1; - if (bit22) - return OP_SWPB; - return OP_SWP; + u32 bit24 = BIT(insn, 24); + if (bit24) { + return DecodeSyncPrimitive(insn); } // One of the multiply instructions return DecodeMUL(insn); } - uint8_t bit7 = (insn >> 7) & 0x1; + u8 bit7 = (insn >> 7) & 0x1; if (bit7 == 1) { // One of the load/store halfword/byte instructions return DecodeLDRH(insn); } } + u32 op1 = BITS(insn, 20, 24); + if (bit25 && (op1 == 0x12 || op1 == 0x16)) { + // One of the MSR (immediate) and hints instructions + return DecodeMSRImmAndHints(insn); + } + // One of the data processing instructions return DecodeALU(insn); } -Opcode ARM_Disasm::Decode01(uint32_t insn) { - uint8_t is_reg = (insn >> 25) & 0x1; - uint8_t bit4 = (insn >> 4) & 0x1; +Opcode ARM_Disasm::Decode01(u32 insn) { + u8 is_reg = (insn >> 25) & 0x1; + u8 bit4 = (insn >> 4) & 0x1; if (is_reg == 1 && bit4 == 1) - return OP_UNDEFINED; - uint8_t is_load = (insn >> 20) & 0x1; - uint8_t is_byte = (insn >> 22) & 0x1; + return DecodeMedia(insn); + u8 is_load = (insn >> 20) & 0x1; + u8 is_byte = (insn >> 22) & 0x1; if ((insn & 0xfd70f000) == 0xf550f000) { // Pre-load return OP_PLD; } + if (insn == 0xf57ff01f) { + // Clear-Exclusive + return OP_CLREX; + } if (is_load) { if (is_byte) { // Load byte @@ -770,36 +1128,28 @@ Opcode ARM_Disasm::Decode01(uint32_t insn) { 
return OP_STR; } -Opcode ARM_Disasm::Decode10(uint32_t insn) { - uint8_t bit25 = (insn >> 25) & 0x1; +Opcode ARM_Disasm::Decode10(u32 insn) { + u8 bit25 = (insn >> 25) & 0x1; if (bit25 == 0) { // LDM/STM - uint8_t is_load = (insn >> 20) & 0x1; + u8 is_load = (insn >> 20) & 0x1; if (is_load) return OP_LDM; return OP_STM; } - // Branch or Branch with link - uint8_t is_link = (insn >> 24) & 1; - uint32_t offset = insn & 0xffffff; - // Sign-extend the 24-bit offset - if ((offset >> 23) & 1) - offset |= 0xff000000; + // Branch with link + if ((insn >> 24) & 1) + return OP_BL; - // Pre-compute the left-shift and the prefetch offset - offset <<= 2; - offset += 8; - if (is_link == 0) - return OP_B; - return OP_BL; + return OP_B; } -Opcode ARM_Disasm::Decode11(uint32_t insn) { - uint8_t bit25 = (insn >> 25) & 0x1; +Opcode ARM_Disasm::Decode11(u32 insn) { + u8 bit25 = (insn >> 25) & 0x1; if (bit25 == 0) { // LDC, SDC - uint8_t is_load = (insn >> 20) & 0x1; + u8 is_load = (insn >> 20) & 0x1; if (is_load) { // LDC return OP_LDC; @@ -808,18 +1158,18 @@ Opcode ARM_Disasm::Decode11(uint32_t insn) { return OP_STC; } - uint8_t bit24 = (insn >> 24) & 0x1; + u8 bit24 = (insn >> 24) & 0x1; if (bit24 == 0x1) { // SWI return OP_SWI; } - uint8_t bit4 = (insn >> 4) & 0x1; - uint8_t cpnum = (insn >> 8) & 0xf; + u8 bit4 = (insn >> 4) & 0x1; + u8 cpnum = (insn >> 8) & 0xf; if (cpnum == 15) { // Special case for coprocessor 15 - uint8_t opcode = (insn >> 21) & 0x7; + u8 opcode = (insn >> 21) & 0x7; if (bit4 == 0 || opcode != 0) { // This is an unexpected bit pattern. Create an undefined // instruction in case this is ever executed. 
@@ -827,7 +1177,7 @@ Opcode ARM_Disasm::Decode11(uint32_t insn) { } // MRC, MCR - uint8_t is_mrc = (insn >> 20) & 0x1; + u8 is_mrc = (insn >> 20) & 0x1; if (is_mrc) return OP_MRC; return OP_MCR; @@ -838,22 +1188,165 @@ Opcode ARM_Disasm::Decode11(uint32_t insn) { return OP_CDP; } // MRC, MCR - uint8_t is_mrc = (insn >> 20) & 0x1; + u8 is_mrc = (insn >> 20) & 0x1; if (is_mrc) return OP_MRC; return OP_MCR; } -Opcode ARM_Disasm::DecodeMUL(uint32_t insn) { - uint8_t bit24 = (insn >> 24) & 0x1; +Opcode ARM_Disasm::DecodeSyncPrimitive(u32 insn) { + u32 op = BITS(insn, 20, 23); + u32 bit22 = BIT(insn, 22); + switch (op) { + case 0x0: + if (bit22) + return OP_SWPB; + return OP_SWP; + case 0x8: + return OP_STREX; + case 0x9: + return OP_LDREX; + case 0xA: + return OP_STREXD; + case 0xB: + return OP_LDREXD; + case 0xC: + return OP_STREXB; + case 0xD: + return OP_LDREXB; + case 0xE: + return OP_STREXH; + case 0xF: + return OP_LDREXH; + default: + return OP_UNDEFINED; + } +} + +Opcode ARM_Disasm::DecodeParallelAddSub(u32 insn) { + u32 op1 = BITS(insn, 20, 21); + u32 op2 = BITS(insn, 5, 7); + u32 is_unsigned = BIT(insn, 22); + + if (op1 == 0x0 || op2 == 0x5 || op2 == 0x6) + return OP_UNDEFINED; + + // change op1 range from [1, 3] to range [0, 2] + op1--; + + // change op2 range from [0, 4] U {7} to range [0, 5] + if (op2 == 0x7) + op2 = 0x5; + + static std::vector<Opcode> opcodes = { + // op1 = 0 + OP_SADD16, OP_UADD16, + OP_SASX, OP_UASX, + OP_SSAX, OP_USAX, + OP_SSUB16, OP_USUB16, + OP_SADD8, OP_UADD8, + OP_SSUB8, OP_USUB8, + // op1 = 1 + OP_QADD16, OP_UQADD16, + OP_QASX, OP_UQASX, + OP_QSAX, OP_UQSAX, + OP_QSUB16, OP_UQSUB16, + OP_QADD8, OP_UQADD8, + OP_QSUB8, OP_UQSUB8, + // op1 = 2 + OP_SHADD16, OP_UHADD16, + OP_SHASX, OP_UHASX, + OP_SHSAX, OP_UHSAX, + OP_SHSUB16, OP_UHSUB16, + OP_SHADD8, OP_UHADD8, + OP_SHSUB8, OP_UHSUB8 + }; + + u32 opcode_index = op1 * 12 + op2 * 2 + is_unsigned; + return opcodes[opcode_index]; +} + +Opcode 
ARM_Disasm::DecodePackingSaturationReversal(u32 insn) { + u32 op1 = BITS(insn, 20, 22); + u32 a = BITS(insn, 16, 19); + u32 op2 = BITS(insn, 5, 7); + + switch (op1) { + case 0x0: + if (BIT(op2, 0) == 0) + return OP_PKH; + if (op2 == 0x3 && a != 0xf) + return OP_SXTAB16; + if (op2 == 0x3 && a == 0xf) + return OP_SXTB16; + if (op2 == 0x5) + return OP_SEL; + break; + case 0x2: + if (BIT(op2, 0) == 0) + return OP_SSAT; + if (op2 == 0x1) + return OP_SSAT16; + if (op2 == 0x3 && a != 0xf) + return OP_SXTAB; + if (op2 == 0x3 && a == 0xf) + return OP_SXTB; + break; + case 0x3: + if (op2 == 0x1) + return OP_REV; + if (BIT(op2, 0) == 0) + return OP_SSAT; + if (op2 == 0x3 && a != 0xf) + return OP_SXTAH; + if (op2 == 0x3 && a == 0xf) + return OP_SXTH; + if (op2 == 0x5) + return OP_REV16; + break; + case 0x4: + if (op2 == 0x3 && a != 0xf) + return OP_UXTAB16; + if (op2 == 0x3 && a == 0xf) + return OP_UXTB16; + break; + case 0x6: + if (BIT(op2, 0) == 0) + return OP_USAT; + if (op2 == 0x1) + return OP_USAT16; + if (op2 == 0x3 && a != 0xf) + return OP_UXTAB; + if (op2 == 0x3 && a == 0xf) + return OP_UXTB; + break; + case 0x7: + if (BIT(op2, 0) == 0) + return OP_USAT; + if (op2 == 0x3 && a != 0xf) + return OP_UXTAH; + if (op2 == 0x3 && a == 0xf) + return OP_UXTH; + if (op2 == 0x5) + return OP_REVSH; + break; + default: + break; + } + + return OP_UNDEFINED; +} + +Opcode ARM_Disasm::DecodeMUL(u32 insn) { + u8 bit24 = (insn >> 24) & 0x1; if (bit24 != 0) { // This is an unexpected bit pattern. Create an undefined // instruction in case this is ever executed. 
return OP_UNDEFINED; } - uint8_t bit23 = (insn >> 23) & 0x1; - uint8_t bit22_U = (insn >> 22) & 0x1; - uint8_t bit21_A = (insn >> 21) & 0x1; + u8 bit23 = (insn >> 23) & 0x1; + u8 bit22_U = (insn >> 22) & 0x1; + u8 bit21_A = (insn >> 21) & 0x1; if (bit23 == 0) { // 32-bit multiply if (bit22_U != 0) { @@ -878,9 +1371,104 @@ Opcode ARM_Disasm::DecodeMUL(uint32_t insn) { return OP_SMLAL; } -Opcode ARM_Disasm::DecodeLDRH(uint32_t insn) { - uint8_t is_load = (insn >> 20) & 0x1; - uint8_t bits_65 = (insn >> 5) & 0x3; +Opcode ARM_Disasm::DecodeMSRImmAndHints(u32 insn) { + u32 op = BIT(insn, 22); + u32 op1 = BITS(insn, 16, 19); + u32 op2 = BITS(insn, 0, 7); + + if (op == 0 && op1 == 0) { + switch (op2) { + case 0x0: + return OP_NOP; + case 0x1: + return OP_YIELD; + case 0x2: + return OP_WFE; + case 0x3: + return OP_WFI; + case 0x4: + return OP_SEV; + default: + return OP_UNDEFINED; + } + } + + return OP_MSR; +} + +Opcode ARM_Disasm::DecodeMediaMulDiv(u32 insn) { + u32 op1 = BITS(insn, 20, 22); + u32 op2_h = BITS(insn, 6, 7); + u32 a = BITS(insn, 12, 15); + + switch (op1) { + case 0x0: + if (op2_h == 0x0) { + if (a != 0xf) + return OP_SMLAD; + else + return OP_SMUAD; + } else if (op2_h == 0x1) { + if (a != 0xf) + return OP_SMLSD; + else + return OP_SMUSD; + } + break; + case 0x4: + if (op2_h == 0x0) + return OP_SMLALD; + else if (op2_h == 0x1) + return OP_SMLSLD; + break; + case 0x5: + if (op2_h == 0x0) { + if (a != 0xf) + return OP_SMMLA; + else + return OP_SMMUL; + } else if (op2_h == 0x3) { + return OP_SMMLS; + } + break; + default: + break; + } + + return OP_UNDEFINED; +} + +Opcode ARM_Disasm::DecodeMedia(u32 insn) { + u32 op1 = BITS(insn, 20, 24); + u32 rd = BITS(insn, 12, 15); + u32 op2 = BITS(insn, 5, 7); + + switch (BITS(op1, 3, 4)) { + case 0x0: + // unsigned and signed parallel addition and subtraction + return DecodeParallelAddSub(insn); + case 0x1: + // Packing, unpacking, saturation, and reversal + return DecodePackingSaturationReversal(insn); + case 0x2: + // 
Signed multiply, signed and unsigned divide + return DecodeMediaMulDiv(insn); + case 0x3: + if (op2 == 0 && rd == 0xf) + return OP_USAD8; + if (op2 == 0 && rd != 0xf) + return OP_USADA8; + break; + default: + break; + } + + return OP_UNDEFINED; +} + +Opcode ARM_Disasm::DecodeLDRH(u32 insn) { + u8 is_load = (insn >> 20) & 0x1; + u8 bits_65 = (insn >> 5) & 0x3; if (is_load) { if (bits_65 == 0x1) { // Load unsigned halfword @@ -908,12 +1496,12 @@ Opcode ARM_Disasm::DecodeLDRH(uint32_t insn) { return OP_STRH; } -Opcode ARM_Disasm::DecodeALU(uint32_t insn) { - uint8_t is_immed = (insn >> 25) & 0x1; - uint8_t opcode = (insn >> 21) & 0xf; - uint8_t bit_s = (insn >> 20) & 1; - uint8_t shift_is_reg = (insn >> 4) & 1; - uint8_t bit7 = (insn >> 7) & 1; +Opcode ARM_Disasm::DecodeALU(u32 insn) { + u8 is_immed = (insn >> 25) & 0x1; + u8 opcode = (insn >> 21) & 0xf; + u8 bit_s = (insn >> 20) & 1; + u8 shift_is_reg = (insn >> 4) & 1; + u8 bit7 = (insn >> 7) & 1; if (!is_immed && shift_is_reg && (bit7 != 0)) { // This is an unexpected bit pattern. Create an undefined // instruction in case this is ever executed. diff --git a/src/core/arm/disassembler/arm_disasm.h b/src/core/arm/disassembler/arm_disasm.h index f94bd4669..53d9c6a74 100644 --- a/src/core/arm/disassembler/arm_disasm.h +++ b/src/core/arm/disassembler/arm_disasm.h @@ -2,9 +2,10 @@ #pragma once -#include <cstdint> #include <string> +#include "common/common_types.h" + // Note: this list of opcodes must match the list used to initialize // the opflags[] array in opcode.cpp. 
enum Opcode { @@ -20,6 +21,7 @@ enum Opcode { OP_BLX, OP_BX, OP_CDP, + OP_CLREX, OP_CLZ, OP_CMN, OP_CMP, @@ -29,6 +31,10 @@ enum Opcode { OP_LDR, OP_LDRB, OP_LDRBT, + OP_LDREX, + OP_LDREXB, + OP_LDREXD, + OP_LDREXH, OP_LDRH, OP_LDRSB, OP_LDRSH, @@ -41,28 +47,105 @@ enum Opcode { OP_MSR, OP_MUL, OP_MVN, + OP_NOP, OP_ORR, + OP_PKH, OP_PLD, + OP_QADD16, + OP_QADD8, + OP_QASX, + OP_QSAX, + OP_QSUB16, + OP_QSUB8, + OP_REV, + OP_REV16, + OP_REVSH, OP_RSB, OP_RSC, + OP_SADD16, + OP_SADD8, + OP_SASX, OP_SBC, + OP_SEL, + OP_SEV, + OP_SHADD16, + OP_SHADD8, + OP_SHASX, + OP_SHSAX, + OP_SHSUB16, + OP_SHSUB8, + OP_SMLAD, OP_SMLAL, + OP_SMLALD, + OP_SMLSD, + OP_SMLSLD, + OP_SMMLA, + OP_SMMLS, + OP_SMMUL, + OP_SMUAD, OP_SMULL, + OP_SMUSD, + OP_SSAT, + OP_SSAT16, + OP_SSAX, + OP_SSUB16, + OP_SSUB8, OP_STC, OP_STM, OP_STR, OP_STRB, OP_STRBT, + OP_STREX, + OP_STREXB, + OP_STREXD, + OP_STREXH, OP_STRH, OP_STRT, OP_SUB, OP_SWI, OP_SWP, OP_SWPB, + OP_SXTAB, + OP_SXTAB16, + OP_SXTAH, + OP_SXTB, + OP_SXTB16, + OP_SXTH, OP_TEQ, OP_TST, + OP_UADD16, + OP_UADD8, + OP_UASX, + OP_UHADD16, + OP_UHADD8, + OP_UHASX, + OP_UHSAX, + OP_UHSUB16, + OP_UHSUB8, OP_UMLAL, OP_UMULL, + OP_UQADD16, + OP_UQADD8, + OP_UQASX, + OP_UQSAX, + OP_UQSUB16, + OP_UQSUB8, + OP_USAD8, + OP_USADA8, + OP_USAT, + OP_USAT16, + OP_USAX, + OP_USUB16, + OP_USUB8, + OP_UXTAB, + OP_UXTAB16, + OP_UXTAH, + OP_UXTB, + OP_UXTB16, + OP_UXTH, + OP_WFE, + OP_WFI, + OP_YIELD, // Define thumb opcodes OP_THUMB_UNDEFINED, @@ -109,33 +192,48 @@ enum Opcode { class ARM_Disasm { public: - static std::string Disassemble(uint32_t addr, uint32_t insn); - static Opcode Decode(uint32_t insn); + static std::string Disassemble(u32 addr, u32 insn); + static Opcode Decode(u32 insn); private: - static Opcode Decode00(uint32_t insn); - static Opcode Decode01(uint32_t insn); - static Opcode Decode10(uint32_t insn); - static Opcode Decode11(uint32_t insn); - static Opcode DecodeMUL(uint32_t insn); - static Opcode DecodeLDRH(uint32_t insn); - static 
Opcode DecodeALU(uint32_t insn); + static Opcode Decode00(u32 insn); + static Opcode Decode01(u32 insn); + static Opcode Decode10(u32 insn); + static Opcode Decode11(u32 insn); + static Opcode DecodeSyncPrimitive(u32 insn); + static Opcode DecodeParallelAddSub(u32 insn); + static Opcode DecodePackingSaturationReversal(u32 insn); + static Opcode DecodeMUL(u32 insn); + static Opcode DecodeMSRImmAndHints(u32 insn); + static Opcode DecodeMediaMulDiv(u32 insn); + static Opcode DecodeMedia(u32 insn); + static Opcode DecodeLDRH(u32 insn); + static Opcode DecodeALU(u32 insn); - static std::string DisassembleALU(Opcode opcode, uint32_t insn); - static std::string DisassembleBranch(uint32_t addr, Opcode opcode, uint32_t insn); - static std::string DisassembleBX(uint32_t insn); - static std::string DisassembleBKPT(uint32_t insn); - static std::string DisassembleCLZ(uint32_t insn); - static std::string DisassembleMemblock(Opcode opcode, uint32_t insn); - static std::string DisassembleMem(uint32_t insn); - static std::string DisassembleMemHalf(uint32_t insn); - static std::string DisassembleMCR(Opcode opcode, uint32_t insn); - static std::string DisassembleMLA(Opcode opcode, uint32_t insn); - static std::string DisassembleUMLAL(Opcode opcode, uint32_t insn); - static std::string DisassembleMUL(Opcode opcode, uint32_t insn); - static std::string DisassembleMRS(uint32_t insn); - static std::string DisassembleMSR(uint32_t insn); - static std::string DisassemblePLD(uint32_t insn); - static std::string DisassembleSWI(uint32_t insn); - static std::string DisassembleSWP(Opcode opcode, uint32_t insn); + static std::string DisassembleALU(Opcode opcode, u32 insn); + static std::string DisassembleBranch(u32 addr, Opcode opcode, u32 insn); + static std::string DisassembleBX(u32 insn); + static std::string DisassembleBKPT(u32 insn); + static std::string DisassembleCLZ(u32 insn); + static std::string DisassembleMediaMulDiv(Opcode opcode, u32 insn); + static std::string 
DisassembleMemblock(Opcode opcode, u32 insn); + static std::string DisassembleMem(u32 insn); + static std::string DisassembleMemHalf(u32 insn); + static std::string DisassembleMCR(Opcode opcode, u32 insn); + static std::string DisassembleMLA(Opcode opcode, u32 insn); + static std::string DisassembleUMLAL(Opcode opcode, u32 insn); + static std::string DisassembleMUL(Opcode opcode, u32 insn); + static std::string DisassembleMRS(u32 insn); + static std::string DisassembleMSR(u32 insn); + static std::string DisassembleNoOperands(Opcode opcode, u32 insn); + static std::string DisassembleParallelAddSub(Opcode opcode, u32 insn); + static std::string DisassemblePKH(u32 insn); + static std::string DisassemblePLD(u32 insn); + static std::string DisassembleREV(Opcode opcode, u32 insn); + static std::string DisassembleREX(Opcode opcode, u32 insn); + static std::string DisassembleSAT(Opcode opcode, u32 insn); + static std::string DisassembleSEL(u32 insn); + static std::string DisassembleSWI(u32 insn); + static std::string DisassembleSWP(Opcode opcode, u32 insn); + static std::string DisassembleXT(Opcode opcode, u32 insn); }; diff --git a/src/core/arm/dyncom/arm_dyncom.cpp b/src/core/arm/dyncom/arm_dyncom.cpp index a51a3acf8..f3be2c857 100644 --- a/src/core/arm/dyncom/arm_dyncom.cpp +++ b/src/core/arm/dyncom/arm_dyncom.cpp @@ -18,16 +18,7 @@ #include "core/core_timing.h" ARM_DynCom::ARM_DynCom(PrivilegeMode initial_mode) { - state = Common::make_unique<ARMul_State>(); - - // Reset the core to initial state - ARMul_Reset(state.get()); - - // Switch to the desired privilege mode. 
- switch_mode(state.get(), initial_mode); - - state->Reg[13] = 0x10000000; // Set stack pointer to the top of the stack - state->Reg[15] = 0x00000000; + state = Common::make_unique<ARMul_State>(initial_mode); } ARM_DynCom::~ARM_DynCom() { @@ -49,6 +40,22 @@ void ARM_DynCom::SetReg(int index, u32 value) { state->Reg[index] = value; } +u32 ARM_DynCom::GetVFPReg(int index) const { + return state->ExtReg[index]; +} + +void ARM_DynCom::SetVFPReg(int index, u32 value) { + state->ExtReg[index] = value; +} + +u32 ARM_DynCom::GetVFPSystemReg(VFPSystemRegister reg) const { + return state->VFP[reg]; +} + +void ARM_DynCom::SetVFPSystemReg(VFPSystemRegister reg, u32 value) { + state->VFP[reg] = value; +} + u32 ARM_DynCom::GetCPSR() const { return state->Cpsr; } @@ -91,8 +98,8 @@ void ARM_DynCom::ResetContext(Core::ThreadContext& context, u32 stack_top, u32 e } void ARM_DynCom::SaveContext(Core::ThreadContext& ctx) { - memcpy(ctx.cpu_registers, state->Reg, sizeof(ctx.cpu_registers)); - memcpy(ctx.fpu_registers, state->ExtReg, sizeof(ctx.fpu_registers)); + memcpy(ctx.cpu_registers, state->Reg.data(), sizeof(ctx.cpu_registers)); + memcpy(ctx.fpu_registers, state->ExtReg.data(), sizeof(ctx.fpu_registers)); ctx.sp = state->Reg[13]; ctx.lr = state->Reg[14]; @@ -104,8 +111,8 @@ void ARM_DynCom::SaveContext(Core::ThreadContext& ctx) { } void ARM_DynCom::LoadContext(const Core::ThreadContext& ctx) { - memcpy(state->Reg, ctx.cpu_registers, sizeof(ctx.cpu_registers)); - memcpy(state->ExtReg, ctx.fpu_registers, sizeof(ctx.fpu_registers)); + memcpy(state->Reg.data(), ctx.cpu_registers, sizeof(ctx.cpu_registers)); + memcpy(state->ExtReg.data(), ctx.fpu_registers, sizeof(ctx.fpu_registers)); state->Reg[13] = ctx.sp; state->Reg[14] = ctx.lr; diff --git a/src/core/arm/dyncom/arm_dyncom.h b/src/core/arm/dyncom/arm_dyncom.h index 87ab6908a..3664fd728 100644 --- a/src/core/arm/dyncom/arm_dyncom.h +++ b/src/core/arm/dyncom/arm_dyncom.h @@ -25,6 +25,10 @@ public: u32 GetPC() const override; u32 
GetReg(int index) const override; void SetReg(int index, u32 value) override; + u32 GetVFPReg(int index) const override; + void SetVFPReg(int index, u32 value) override; + u32 GetVFPSystemReg(VFPSystemRegister reg) const override; + void SetVFPSystemReg(VFPSystemRegister reg, u32 value) override; u32 GetCPSR() const override; void SetCPSR(u32 cpsr) override; u32 GetCP15Register(CP15Register reg) override; diff --git a/src/core/arm/dyncom/arm_dyncom_dec.cpp b/src/core/arm/dyncom/arm_dyncom_dec.cpp index 3ab9f2c17..ee4288314 100644 --- a/src/core/arm/dyncom/arm_dyncom_dec.cpp +++ b/src/core/arm/dyncom/arm_dyncom_dec.cpp @@ -5,7 +5,7 @@ #include "core/arm/dyncom/arm_dyncom_dec.h" #include "core/arm/skyeye_common/armsupp.h" -const ISEITEM arm_instruction[] = { +const InstructionSetEncodingItem arm_instruction[] = { { "vmla", 4, ARMVFP2, { 23, 27, 0x1C, 20, 21, 0x0, 9, 11, 0x5, 4, 4, 0 }}, { "vmls", 7, ARMVFP2, { 28, 31, 0xF, 25, 27, 0x1, 23, 23, 1, 11, 11, 0, 8, 9, 0x2, 6, 6, 1, 4, 4, 0 }}, { "vnmla", 4, ARMVFP2, { 23, 27, 0x1C, 20, 21, 0x1, 9, 11, 0x5, 4, 4, 0 }}, @@ -207,7 +207,7 @@ const ISEITEM arm_instruction[] = { { "bbl", 1, 0, { 25, 27, 0x00000005 }}, }; -const ISEITEM arm_exclusion_code[] = { +const InstructionSetEncodingItem arm_exclusion_code[] = { { "vmla", 0, ARMVFP2, { 0 }}, { "vmls", 0, ARMVFP2, { 0 }}, { "vnmla", 0, ARMVFP2, { 0 }}, @@ -414,14 +414,13 @@ const ISEITEM arm_exclusion_code[] = { { "invalid", 0, INVALID, { 0 }} }; -int decode_arm_instr(u32 instr, s32* idx) { +ARMDecodeStatus DecodeARMInstruction(u32 instr, s32* idx) { int n = 0; int base = 0; - int ret = DECODE_FAILURE; - int i = 0; - int instr_slots = sizeof(arm_instruction) / sizeof(ISEITEM); + int instr_slots = sizeof(arm_instruction) / sizeof(InstructionSetEncodingItem); + ARMDecodeStatus ret = ARMDecodeStatus::FAILURE; - for (i = 0; i < instr_slots; i++) { + for (int i = 0; i < instr_slots; i++) { n = arm_instruction[i].attribute_value; base = 0; @@ -438,11 +437,11 @@ int 
decode_arm_instr(u32 instr, s32* idx) { n--; } - // All conditions is satisfied. + // All conditions are satisfied. if (n == 0) - ret = DECODE_SUCCESS; + ret = ARMDecodeStatus::SUCCESS; - if (ret == DECODE_SUCCESS) { + if (ret == ARMDecodeStatus::SUCCESS) { n = arm_exclusion_code[i].attribute_value; if (n != 0) { base = 0; @@ -454,13 +453,13 @@ int decode_arm_instr(u32 instr, s32* idx) { n--; } - // All conditions is satisfied. + // All conditions are satisfied. if (n == 0) - ret = DECODE_FAILURE; + ret = ARMDecodeStatus::FAILURE; } } - if (ret == DECODE_SUCCESS) { + if (ret == ARMDecodeStatus::SUCCESS) { *idx = i; return ret; } diff --git a/src/core/arm/dyncom/arm_dyncom_dec.h b/src/core/arm/dyncom/arm_dyncom_dec.h index 5f6279627..d7170e0fc 100644 --- a/src/core/arm/dyncom/arm_dyncom_dec.h +++ b/src/core/arm/dyncom/arm_dyncom_dec.h @@ -6,22 +6,20 @@ #include "common/common_types.h" -int decode_arm_instr(u32 instr, s32* idx); - -enum DECODE_STATUS { - DECODE_SUCCESS, - DECODE_FAILURE +enum class ARMDecodeStatus { + SUCCESS, + FAILURE }; -struct instruction_set_encoding_item { +ARMDecodeStatus DecodeARMInstruction(u32 instr, s32* idx); + +struct InstructionSetEncodingItem { const char *name; int attribute_value; int version; u32 content[21]; }; -typedef struct instruction_set_encoding_item ISEITEM; - // ARM versions enum { INVALID = 0, @@ -38,4 +36,4 @@ enum { ARMV6K, }; -extern const ISEITEM arm_instruction[]; +extern const InstructionSetEncodingItem arm_instruction[]; diff --git a/src/core/arm/dyncom/arm_dyncom_interpreter.cpp b/src/core/arm/dyncom/arm_dyncom_interpreter.cpp index fd5e13295..422e80b50 100644 --- a/src/core/arm/dyncom/arm_dyncom_interpreter.cpp +++ b/src/core/arm/dyncom/arm_dyncom_interpreter.cpp @@ -7,6 +7,7 @@ #include <algorithm> #include <cstdio> +#include "common/common_types.h" #include "common/logging/log.h" #include "common/profiler.h" @@ -17,7 +18,6 @@ #include "core/arm/dyncom/arm_dyncom_interpreter.h" #include 
"core/arm/dyncom/arm_dyncom_thumb.h" #include "core/arm/dyncom/arm_dyncom_run.h" -#include "core/arm/skyeye_common/armmmu.h" #include "core/arm/skyeye_common/armstate.h" #include "core/arm/skyeye_common/armsupp.h" #include "core/arm/skyeye_common/vfp/vfp.h" @@ -48,27 +48,6 @@ enum { typedef unsigned int (*shtop_fp_t)(ARMul_State* cpu, unsigned int sht_oper); -// Defines a reservation granule of 2 words, which protects the first 2 words starting at the tag. -// This is the smallest granule allowed by the v7 spec, and is coincidentally just large enough to -// support LDR/STREXD. -static const u32 RESERVATION_GRANULE_MASK = 0xFFFFFFF8; - -// Exclusive memory access -static int exclusive_detect(ARMul_State* state, u32 addr) { - if(state->exclusive_tag == (addr & RESERVATION_GRANULE_MASK)) - return 0; - else - return -1; -} - -static void add_exclusive_addr(ARMul_State* state, u32 addr){ - state->exclusive_tag = addr & RESERVATION_GRANULE_MASK; -} - -static void remove_exclusive(ARMul_State* state, u32 addr){ - state->exclusive_tag = 0xFFFFFFFF; -} - static int CondPassed(ARMul_State* cpu, unsigned int cond) { const u32 NFLAG = cpu->NFlag; const u32 ZFLAG = cpu->ZFlag; @@ -781,8 +760,8 @@ struct bx_inst { struct blx_inst { union { - int32_t signed_immed_24; - uint32_t Rm; + s32 signed_immed_24; + u32 Rm; } val; unsigned int inst; }; @@ -3490,21 +3469,15 @@ enum { FETCH_FAILURE }; -static tdstate decode_thumb_instr(u32 inst, u32 addr, u32* arm_inst, u32* inst_size, ARM_INST_PTR* ptr_inst_base) { +static ThumbDecodeStatus DecodeThumbInstruction(u32 inst, u32 addr, u32* arm_inst, u32* inst_size, ARM_INST_PTR* ptr_inst_base) { // Check if in Thumb mode - tdstate ret = thumb_translate (addr, inst, arm_inst, inst_size); - if(ret == t_branch){ - // TODO: FIXME, endian should be judged - u32 tinstr; - if((addr & 0x3) != 0) - tinstr = inst >> 16; - else - tinstr = inst & 0xFFFF; - + ThumbDecodeStatus ret = TranslateThumbInstruction (addr, inst, arm_inst, inst_size); + if (ret 
== ThumbDecodeStatus::BRANCH) { int inst_index; int table_length = sizeof(arm_instruction_trans) / sizeof(transop_fp_t); + u32 tinstr = GetThumbInstruction(inst, addr); - switch((tinstr & 0xF800) >> 11){ + switch ((tinstr & 0xF800) >> 11) { case 26: case 27: if (((tinstr & 0x0F00) != 0x0E00) && ((tinstr & 0x0F00) != 0x0F00)){ @@ -3537,7 +3510,7 @@ static tdstate decode_thumb_instr(u32 inst, u32 addr, u32* arm_inst, u32* inst_s *ptr_inst_base = arm_instruction_trans[inst_index](tinstr, inst_index); break; default: - ret = t_undefined; + ret = ThumbDecodeStatus::UNDEFINED; break; } } @@ -3549,10 +3522,6 @@ enum { FETCH_EXCEPTION }; -typedef struct instruction_set_encoding_item ISEITEM; - -extern const ISEITEM arm_instruction[]; - static int InterpreterTranslate(ARMul_State* cpu, int& bb_start, u32 addr) { Common::Profiling::ScopeTimer timer_decode(profile_decode); @@ -3574,20 +3543,19 @@ static int InterpreterTranslate(ARMul_State* cpu, int& bb_start, u32 addr) { inst = Memory::Read32(phys_addr & 0xFFFFFFFC); size++; - // If we are in thumb instruction, we will translate one thumb to one corresponding arm instruction + // If we are in Thumb mode, we'll translate one Thumb instruction to the corresponding ARM instruction if (cpu->TFlag) { - uint32_t arm_inst; - tdstate state = decode_thumb_instr(inst, phys_addr, &arm_inst, &inst_size, &inst_base); + u32 arm_inst; + ThumbDecodeStatus state = DecodeThumbInstruction(inst, phys_addr, &arm_inst, &inst_size, &inst_base); - // We have translated the branch instruction of thumb in thumb decoder - if(state == t_branch){ + // We have translated the Thumb branch instruction in the Thumb decoder + if (state == ThumbDecodeStatus::BRANCH) { goto translated; } inst = arm_inst; } - ret = decode_arm_instr(inst, &idx); - if (ret == DECODE_FAILURE) { + if (DecodeARMInstruction(inst, &idx) == ARMDecodeStatus::FAILURE) { std::string disasm = ARM_Disasm::Disassemble(phys_addr, inst); LOG_ERROR(Core_ARM11, "Decode failure.\tPC : 
[0x%x]\tInstruction : %s [%x]", phys_addr, disasm.c_str(), inst); LOG_ERROR(Core_ARM11, "cpsr=0x%x, cpu->TFlag=%d, r15=0x%x", cpu->Cpsr, cpu->TFlag, cpu->Reg[15]); @@ -3919,7 +3887,6 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { #endif arm_inst* inst_base; unsigned int addr; - unsigned int phys_addr; unsigned int num_instrs = 0; int ptr; @@ -3938,8 +3905,6 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { else cpu->Reg[15] &= 0xfffffffc; - phys_addr = cpu->Reg[15]; - // Find the cached instruction cream, otherwise translate it... auto itr = cpu->instruction_cache.find(cpu->Reg[15]); if (itr != cpu->instruction_cache.end()) { @@ -3957,14 +3922,18 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { if (inst_base->cond == 0xE || CondPassed(cpu, inst_base->cond)) { adc_inst* const inst_cream = (adc_inst*)inst_base->component; + u32 rn_val = RN; + if (inst_cream->Rn == 15) + rn_val += 2 * cpu->GetInstructionSize(); + bool carry; bool overflow; - RD = AddWithCarry(RN, SHIFTER_OPERAND, cpu->CFlag, &carry, &overflow); + RD = AddWithCarry(rn_val, SHIFTER_OPERAND, cpu->CFlag, &carry, &overflow); if (inst_cream->S && (inst_cream->Rd == 15)) { if (CurrentModeHasSPSR) { cpu->Cpsr = cpu->Spsr_copy; - switch_mode(cpu, cpu->Spsr_copy & 0x1f); + cpu->ChangePrivilegeMode(cpu->Spsr_copy & 0x1F); LOAD_NZCVT; } } else if (inst_cream->S) { @@ -3978,7 +3947,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { goto DISPATCH; } } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(adc_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -3990,7 +3959,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { u32 rn_val = RN; if (inst_cream->Rn == 15) - rn_val += 2 * GET_INST_SIZE(cpu); + rn_val += 2 * cpu->GetInstructionSize(); bool carry; bool overflow; @@ -3999,7 +3968,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { if (inst_cream->S && (inst_cream->Rd == 15)) { if (CurrentModeHasSPSR) { cpu->Cpsr = cpu->Spsr_copy; - switch_mode(cpu, cpu->Cpsr & 
0x1f); + cpu->ChangePrivilegeMode(cpu->Cpsr & 0x1F); LOAD_NZCVT; } } else if (inst_cream->S) { @@ -4013,22 +3982,28 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { goto DISPATCH; } } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(add_inst)); FETCH_INST; GOTO_NEXT_INST; } AND_INST: { - and_inst *inst_cream = (and_inst *)inst_base->component; - if ((inst_base->cond == 0xe) || CondPassed(cpu, inst_base->cond)) { + if (inst_base->cond == 0xE || CondPassed(cpu, inst_base->cond)) { + and_inst* const inst_cream = (and_inst*)inst_base->component; + u32 lop = RN; u32 rop = SHIFTER_OPERAND; + + if (inst_cream->Rn == 15) + lop += 2 * cpu->GetInstructionSize(); + RD = lop & rop; + if (inst_cream->S && (inst_cream->Rd == 15)) { if (CurrentModeHasSPSR) { cpu->Cpsr = cpu->Spsr_copy; - switch_mode(cpu, cpu->Cpsr & 0x1f); + cpu->ChangePrivilegeMode(cpu->Cpsr & 0x1F); LOAD_NZCVT; } } else if (inst_cream->S) { @@ -4041,7 +4016,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { goto DISPATCH; } } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(and_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -4057,7 +4032,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { INC_PC(sizeof(bbl_inst)); goto DISPATCH; } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(bbl_inst)); goto DISPATCH; } @@ -4067,14 +4042,14 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { if ((inst_base->cond == 0xe) || CondPassed(cpu, inst_base->cond)) { u32 lop = RN; if (inst_cream->Rn == 15) { - lop += 2 * GET_INST_SIZE(cpu); + lop += 2 * cpu->GetInstructionSize(); } u32 rop = SHIFTER_OPERAND; RD = lop & (~rop); if ((inst_cream->S) && (inst_cream->Rd == 15)) { if (CurrentModeHasSPSR) { cpu->Cpsr = cpu->Spsr_copy; - switch_mode(cpu, cpu->Spsr_copy & 0x1f); + cpu->ChangePrivilegeMode(cpu->Spsr_copy & 0x1F); LOAD_NZCVT; } } else if (inst_cream->S) { @@ -4087,7 +4062,7 @@ unsigned 
InterpreterMainLoop(ARMul_State* cpu) { goto DISPATCH; } } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(bic_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -4098,7 +4073,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { bkpt_inst* const inst_cream = (bkpt_inst*)inst_base->component; LOG_DEBUG(Core_ARM11, "Breakpoint instruction hit. Immediate: 0x%08X", inst_cream->imm); } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(bkpt_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -4109,13 +4084,13 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { if ((inst_base->cond == 0xe) || CondPassed(cpu, inst_base->cond)) { unsigned int inst = inst_cream->inst; if (BITS(inst, 20, 27) == 0x12 && BITS(inst, 4, 7) == 0x3) { - cpu->Reg[14] = (cpu->Reg[15] + GET_INST_SIZE(cpu)); + cpu->Reg[14] = (cpu->Reg[15] + cpu->GetInstructionSize()); if(cpu->TFlag) cpu->Reg[14] |= 0x1; cpu->Reg[15] = cpu->Reg[inst_cream->val.Rm] & 0xfffffffe; cpu->TFlag = cpu->Reg[inst_cream->val.Rm] & 0x1; } else { - cpu->Reg[14] = (cpu->Reg[15] + GET_INST_SIZE(cpu)); + cpu->Reg[14] = (cpu->Reg[15] + cpu->GetInstructionSize()); cpu->TFlag = 0x1; int signed_int = inst_cream->val.signed_immed_24; signed_int = (signed_int & 0x800000) ? 
(0x3F000000 | signed_int) : signed_int; @@ -4125,7 +4100,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { INC_PC(sizeof(blx_inst)); goto DISPATCH; } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(blx_inst)); goto DISPATCH; } @@ -4147,7 +4122,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { u32 address = RM; if (inst_cream->Rm == 15) - address += 2 * GET_INST_SIZE(cpu); + address += 2 * cpu->GetInstructionSize(); cpu->TFlag = address & 1; cpu->Reg[15] = address & 0xfffffffe; @@ -4155,7 +4130,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { goto DISPATCH; } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(bx_inst)); goto DISPATCH; } @@ -4167,7 +4142,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { cpu->NumInstrsToExecute = 0; return num_instrs; } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(cdp_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -4175,10 +4150,8 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { CLREX_INST: { - remove_exclusive(cpu, 0); - cpu->exclusive_state = 0; - - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->UnsetExclusiveMemoryAddress(); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(clrex_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -4189,7 +4162,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { clz_inst* inst_cream = (clz_inst*)inst_base->component; RD = clz(RM); } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(clz_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -4199,16 +4172,20 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { if (inst_base->cond == 0xE || CondPassed(cpu, inst_base->cond)) { cmn_inst* const inst_cream = (cmn_inst*)inst_base->component; + u32 rn_val = RN; + if (inst_cream->Rn == 15) + rn_val += 2 * cpu->GetInstructionSize(); + bool carry; bool overflow; - u32 result = AddWithCarry(RN, SHIFTER_OPERAND, 0, &carry, 
&overflow); + u32 result = AddWithCarry(rn_val, SHIFTER_OPERAND, 0, &carry, &overflow); UPDATE_NFLAG(result); UPDATE_ZFLAG(result); cpu->CFlag = carry; cpu->VFlag = overflow; } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(cmn_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -4220,7 +4197,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { u32 rn_val = RN; if (inst_cream->Rn == 15) - rn_val += 2 * GET_INST_SIZE(cpu); + rn_val += 2 * cpu->GetInstructionSize(); bool carry; bool overflow; @@ -4231,7 +4208,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { cpu->CFlag = carry; cpu->VFlag = overflow; } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(cmp_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -4239,9 +4216,9 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { CPS_INST: { cps_inst *inst_cream = (cps_inst *)inst_base->component; - uint32_t aif_val = 0; - uint32_t aif_mask = 0; - if (InAPrivilegedMode(cpu)) { + u32 aif_val = 0; + u32 aif_mask = 0; + if (cpu->InAPrivilegedMode()) { if (inst_cream->imod1) { if (inst_cream->A) { aif_val |= (inst_cream->imod0 << 8); @@ -4260,10 +4237,10 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { } if (inst_cream->mmod) { cpu->Cpsr = (cpu->Cpsr & 0xffffffe0) | inst_cream->mode; - switch_mode(cpu, inst_cream->mode); + cpu->ChangePrivilegeMode(inst_cream->mode); } } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(cps_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -4279,7 +4256,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { goto DISPATCH; } } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(mov_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -4291,14 +4268,14 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { u32 lop = RN; if (inst_cream->Rn == 15) { - lop += 2 * GET_INST_SIZE(cpu); + lop += 2 * cpu->GetInstructionSize(); } u32 rop = SHIFTER_OPERAND; RD = lop ^ 
rop; if (inst_cream->S && (inst_cream->Rd == 15)) { if (CurrentModeHasSPSR) { cpu->Cpsr = cpu->Spsr_copy; - switch_mode(cpu, cpu->Spsr_copy & 0x1f); + cpu->ChangePrivilegeMode(cpu->Spsr_copy & 0x1F); LOAD_NZCVT; } } else if (inst_cream->S) { @@ -4311,7 +4288,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { goto DISPATCH; } } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(eor_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -4320,7 +4297,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { { // Instruction not implemented //LOG_CRITICAL(Core_ARM11, "unimplemented instruction"); - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(ldc_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -4335,30 +4312,30 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { if (BIT(inst, 22) && !BIT(inst, 15)) { for (int i = 0; i < 13; i++) { if(BIT(inst, i)) { - cpu->Reg[i] = ReadMemory32(cpu, addr); + cpu->Reg[i] = cpu->ReadMemory32(addr); addr += 4; } } if (BIT(inst, 13)) { if (cpu->Mode == USER32MODE) - cpu->Reg[13] = ReadMemory32(cpu, addr); + cpu->Reg[13] = cpu->ReadMemory32(addr); else - cpu->Reg_usr[0] = ReadMemory32(cpu, addr); + cpu->Reg_usr[0] = cpu->ReadMemory32(addr); addr += 4; } if (BIT(inst, 14)) { if (cpu->Mode == USER32MODE) - cpu->Reg[14] = ReadMemory32(cpu, addr); + cpu->Reg[14] = cpu->ReadMemory32(addr); else - cpu->Reg_usr[1] = ReadMemory32(cpu, addr); + cpu->Reg_usr[1] = cpu->ReadMemory32(addr); addr += 4; } } else if (!BIT(inst, 22)) { for(int i = 0; i < 16; i++ ){ if(BIT(inst, i)){ - unsigned int ret = ReadMemory32(cpu, addr); + unsigned int ret = cpu->ReadMemory32(addr); // For armv5t, should enter thumb when bits[0] is non-zero. 
if(i == 15){ @@ -4373,18 +4350,18 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { } else if (BIT(inst, 22) && BIT(inst, 15)) { for(int i = 0; i < 15; i++ ){ if(BIT(inst, i)){ - cpu->Reg[i] = ReadMemory32(cpu, addr); + cpu->Reg[i] = cpu->ReadMemory32(addr); addr += 4; } } if (CurrentModeHasSPSR) { cpu->Cpsr = cpu->Spsr_copy; - switch_mode(cpu, cpu->Cpsr & 0x1f); + cpu->ChangePrivilegeMode(cpu->Cpsr & 0x1F); LOAD_NZCVT; } - cpu->Reg[15] = ReadMemory32(cpu, addr); + cpu->Reg[15] = cpu->ReadMemory32(addr); } if (BIT(inst, 15)) { @@ -4392,7 +4369,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { goto DISPATCH; } } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(ldst_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -4410,7 +4387,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { } RD = operand2; } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(sxth_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -4420,7 +4397,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { ldst_inst *inst_cream = (ldst_inst *)inst_base->component; inst_cream->get_addr(cpu, inst_cream->inst, addr); - unsigned int value = ReadMemory32(cpu, addr); + unsigned int value = cpu->ReadMemory32(addr); cpu->Reg[BITS(inst_cream->inst, 12, 15)] = value; if (BITS(inst_cream->inst, 12, 15) == 15) { @@ -4431,7 +4408,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { goto DISPATCH; } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(ldst_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -4442,7 +4419,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { ldst_inst *inst_cream = (ldst_inst *)inst_base->component; inst_cream->get_addr(cpu, inst_cream->inst, addr); - unsigned int value = ReadMemory32(cpu, addr); + unsigned int value = cpu->ReadMemory32(addr); cpu->Reg[BITS(inst_cream->inst, 12, 15)] = value; if (BITS(inst_cream->inst, 12, 15) == 15) { @@ -4453,7 +4430,7 @@ unsigned 
InterpreterMainLoop(ARMul_State* cpu) { goto DISPATCH; } } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(ldst_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -4464,7 +4441,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { uxth_inst* inst_cream = (uxth_inst*)inst_base->component; RD = ROTATE_RIGHT_32(RM, 8 * inst_cream->rotate) & 0xffff; } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(uxth_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -4477,7 +4454,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { RD = RN + operand2; } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(uxtah_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -4495,7 +4472,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { goto DISPATCH; } } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(ldst_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -4513,7 +4490,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { goto DISPATCH; } } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(ldst_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -4527,8 +4504,8 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { // The 3DS doesn't have LPAE (Large Physical Access Extension), so it // wouldn't do this as a single read. 
- cpu->Reg[BITS(inst_cream->inst, 12, 15) + 0] = ReadMemory32(cpu, addr); - cpu->Reg[BITS(inst_cream->inst, 12, 15) + 1] = ReadMemory32(cpu, addr + 4); + cpu->Reg[BITS(inst_cream->inst, 12, 15) + 0] = cpu->ReadMemory32(addr); + cpu->Reg[BITS(inst_cream->inst, 12, 15) + 1] = cpu->ReadMemory32(addr + 4); // No dispatch since this operation should not modify R15 } @@ -4544,16 +4521,15 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { generic_arm_inst* inst_cream = (generic_arm_inst*)inst_base->component; unsigned int read_addr = RN; - add_exclusive_addr(cpu, read_addr); - cpu->exclusive_state = 1; + cpu->SetExclusiveMemoryAddress(read_addr); - RD = ReadMemory32(cpu, read_addr); + RD = cpu->ReadMemory32(read_addr); if (inst_cream->Rd == 15) { INC_PC(sizeof(generic_arm_inst)); goto DISPATCH; } } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(generic_arm_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -4564,8 +4540,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { generic_arm_inst* inst_cream = (generic_arm_inst*)inst_base->component; unsigned int read_addr = RN; - add_exclusive_addr(cpu, read_addr); - cpu->exclusive_state = 1; + cpu->SetExclusiveMemoryAddress(read_addr); RD = Memory::Read8(read_addr); if (inst_cream->Rd == 15) { @@ -4573,7 +4548,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { goto DISPATCH; } } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(generic_arm_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -4584,16 +4559,15 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { generic_arm_inst* inst_cream = (generic_arm_inst*)inst_base->component; unsigned int read_addr = RN; - add_exclusive_addr(cpu, read_addr); - cpu->exclusive_state = 1; + cpu->SetExclusiveMemoryAddress(read_addr); - RD = ReadMemory16(cpu, read_addr); + RD = cpu->ReadMemory16(read_addr); if (inst_cream->Rd == 15) { INC_PC(sizeof(generic_arm_inst)); goto DISPATCH; } } - cpu->Reg[15] += 
GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(generic_arm_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -4604,18 +4578,17 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { generic_arm_inst* inst_cream = (generic_arm_inst*)inst_base->component; unsigned int read_addr = RN; - add_exclusive_addr(cpu, read_addr); - cpu->exclusive_state = 1; + cpu->SetExclusiveMemoryAddress(read_addr); - RD = ReadMemory32(cpu, read_addr); - RD2 = ReadMemory32(cpu, read_addr + 4); + RD = cpu->ReadMemory32(read_addr); + RD2 = cpu->ReadMemory32(read_addr + 4); if (inst_cream->Rd == 15) { INC_PC(sizeof(generic_arm_inst)); goto DISPATCH; } } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(generic_arm_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -4626,13 +4599,13 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { ldst_inst* inst_cream = (ldst_inst*)inst_base->component; inst_cream->get_addr(cpu, inst_cream->inst, addr); - cpu->Reg[BITS(inst_cream->inst, 12, 15)] = ReadMemory16(cpu, addr); + cpu->Reg[BITS(inst_cream->inst, 12, 15)] = cpu->ReadMemory16(addr); if (BITS(inst_cream->inst, 12, 15) == 15) { INC_PC(sizeof(ldst_inst)); goto DISPATCH; } } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(ldst_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -4652,7 +4625,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { goto DISPATCH; } } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(ldst_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -4663,7 +4636,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { ldst_inst* inst_cream = (ldst_inst*)inst_base->component; inst_cream->get_addr(cpu, inst_cream->inst, addr); - unsigned int value = ReadMemory16(cpu, addr); + unsigned int value = cpu->ReadMemory16(addr); if (BIT(value, 15)) { value |= 0xffff0000; } @@ -4673,7 +4646,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { goto DISPATCH; } } - cpu->Reg[15] 
+= GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(ldst_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -4684,7 +4657,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { ldst_inst* inst_cream = (ldst_inst*)inst_base->component; inst_cream->get_addr(cpu, inst_cream->inst, addr); - unsigned int value = ReadMemory32(cpu, addr); + unsigned int value = cpu->ReadMemory32(addr); cpu->Reg[BITS(inst_cream->inst, 12, 15)] = value; if (BITS(inst_cream->inst, 12, 15) == 15) { @@ -4692,7 +4665,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { goto DISPATCH; } } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(ldst_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -4707,10 +4680,10 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { DEBUG_MSG; } else { if (inst_cream->cp_num == 15) - WriteCP15Register(cpu, RD, CRn, OPCODE_1, CRm, OPCODE_2); + cpu->WriteCP15Register(RD, CRn, OPCODE_1, CRm, OPCODE_2); } } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(mcr_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -4727,7 +4700,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { inst_cream->cp_num, inst_cream->crm, inst_cream->opcode_1, inst_cream->rt, inst_cream->rt2); } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(mcrr_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -4738,11 +4711,11 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { if (inst_base->cond == 0xE || CondPassed(cpu, inst_base->cond)) { mla_inst* inst_cream = (mla_inst*)inst_base->component; - uint64_t rm = RM; - uint64_t rs = RS; - uint64_t rn = RN; + u64 rm = RM; + u64 rs = RS; + u64 rn = RN; - RD = static_cast<uint32_t>((rm * rs + rn) & 0xffffffff); + RD = static_cast<u32>((rm * rs + rn) & 0xffffffff); if (inst_cream->S) { UPDATE_NFLAG(RD); UPDATE_ZFLAG(RD); @@ -4752,7 +4725,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { goto DISPATCH; } } - cpu->Reg[15] += 
GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(mla_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -4766,7 +4739,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { if (inst_cream->S && (inst_cream->Rd == 15)) { if (CurrentModeHasSPSR) { cpu->Cpsr = cpu->Spsr_copy; - switch_mode(cpu, cpu->Spsr_copy & 0x1f); + cpu->ChangePrivilegeMode(cpu->Spsr_copy & 0x1F); LOAD_NZCVT; } } else if (inst_cream->S) { @@ -4779,7 +4752,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { goto DISPATCH; } } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(mov_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -4800,10 +4773,10 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { goto END; } else { if (inst_cream->cp_num == 15) - RD = ReadCP15Register(cpu, CRn, OPCODE_1, CRm, OPCODE_2); + RD = cpu->ReadCP15Register(CRn, OPCODE_1, CRm, OPCODE_2); } } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(mrc_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -4820,7 +4793,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { inst_cream->cp_num, inst_cream->crm, inst_cream->opcode_1, inst_cream->rt, inst_cream->rt2); } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(mcrr_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -4838,7 +4811,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { RD = cpu->Cpsr; } } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(mrs_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -4847,7 +4820,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { { if (inst_base->cond == 0xE || CondPassed(cpu, inst_base->cond)) { msr_inst* inst_cream = (msr_inst*)inst_base->component; - const uint32_t UserMask = 0xf80f0200, PrivMask = 0x000001df, StateMask = 0x01000020; + const u32 UserMask = 0xf80f0200, PrivMask = 0x000001df, StateMask = 0x01000020; unsigned int inst = inst_cream->inst; unsigned int operand; @@ 
-4857,11 +4830,11 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { } else { operand = cpu->Reg[BITS(inst, 0, 3)]; } - uint32_t byte_mask = (BIT(inst, 16) ? 0xff : 0) | (BIT(inst, 17) ? 0xff00 : 0) + u32 byte_mask = (BIT(inst, 16) ? 0xff : 0) | (BIT(inst, 17) ? 0xff00 : 0) | (BIT(inst, 18) ? 0xff0000 : 0) | (BIT(inst, 19) ? 0xff000000 : 0); - uint32_t mask = 0; + u32 mask = 0; if (!inst_cream->R) { - if (InAPrivilegedMode(cpu)) { + if (cpu->InAPrivilegedMode()) { if ((operand & StateMask) != 0) { /// UNPREDICTABLE DEBUG_MSG; @@ -4873,7 +4846,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { SAVE_NZCVT; cpu->Cpsr = (cpu->Cpsr & ~mask) | (operand & mask); - switch_mode(cpu, cpu->Cpsr & 0x1f); + cpu->ChangePrivilegeMode(cpu->Cpsr & 0x1F); LOAD_NZCVT; } else { if (CurrentModeHasSPSR) { @@ -4882,7 +4855,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { } } } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(msr_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -4892,9 +4865,9 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { if (inst_base->cond == 0xE || CondPassed(cpu, inst_base->cond)) { mul_inst* inst_cream = (mul_inst*)inst_base->component; - uint64_t rm = RM; - uint64_t rs = RS; - RD = static_cast<uint32_t>((rm * rs) & 0xffffffff); + u64 rm = RM; + u64 rs = RS; + RD = static_cast<u32>((rm * rs) & 0xffffffff); if (inst_cream->S) { UPDATE_NFLAG(RD); UPDATE_ZFLAG(RD); @@ -4904,7 +4877,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { goto DISPATCH; } } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(mul_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -4919,7 +4892,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { if (inst_cream->S && (inst_cream->Rd == 15)) { if (CurrentModeHasSPSR) { cpu->Cpsr = cpu->Spsr_copy; - switch_mode(cpu, cpu->Spsr_copy & 0x1f); + cpu->ChangePrivilegeMode(cpu->Spsr_copy & 0x1F); LOAD_NZCVT; } } else if (inst_cream->S) { @@ -4932,7 +4905,7 @@ 
unsigned InterpreterMainLoop(ARMul_State* cpu) { goto DISPATCH; } } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(mvn_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -4944,12 +4917,16 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { u32 lop = RN; u32 rop = SHIFTER_OPERAND; + + if (inst_cream->Rn == 15) + lop += 2 * cpu->GetInstructionSize(); + RD = lop | rop; if (inst_cream->S && (inst_cream->Rd == 15)) { if (CurrentModeHasSPSR) { cpu->Cpsr = cpu->Spsr_copy; - switch_mode(cpu, cpu->Spsr_copy & 0x1f); + cpu->ChangePrivilegeMode(cpu->Spsr_copy & 0x1F); LOAD_NZCVT; } } else if (inst_cream->S) { @@ -4962,7 +4939,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { goto DISPATCH; } } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(orr_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -4970,7 +4947,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { NOP_INST: { - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC_STUB; FETCH_INST; GOTO_NEXT_INST; @@ -4982,7 +4959,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { pkh_inst *inst_cream = (pkh_inst *)inst_base->component; RD = (RN & 0xFFFF) | ((RM << inst_cream->imm) & 0xFFFF0000); } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(pkh_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -4995,7 +4972,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { int shift_imm = inst_cream->imm ? inst_cream->imm : 31; RD = ((static_cast<s32>(RM) >> shift_imm) & 0xFFFF) | (RN & 0xFFFF0000); } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(pkh_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -5005,7 +4982,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { { // Not implemented. PLD is a hint instruction, so it's optional. 
- cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(pld_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -5078,7 +5055,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { RD = result; } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(generic_arm_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -5140,7 +5117,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { RD = (lo_result & 0xFFFF) | ((hi_result & 0xFFFF) << 16); } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(generic_arm_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -5173,7 +5150,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { } } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(rev_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -5187,8 +5164,8 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { u32 address = 0; inst_cream->get_addr(cpu, inst_cream->inst, address); - cpu->Cpsr = ReadMemory32(cpu, address); - cpu->Reg[15] = ReadMemory32(cpu, address + 4); + cpu->Cpsr = cpu->ReadMemory32(address); + cpu->Reg[15] = cpu->ReadMemory32(address + 4); INC_PC(sizeof(ldst_inst)); goto DISPATCH; @@ -5201,7 +5178,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { u32 rn_val = RN; if (inst_cream->Rn == 15) - rn_val += 2 * GET_INST_SIZE(cpu); + rn_val += 2 * cpu->GetInstructionSize(); bool carry; bool overflow; @@ -5210,7 +5187,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { if (inst_cream->S && (inst_cream->Rd == 15)) { if (CurrentModeHasSPSR) { cpu->Cpsr = cpu->Spsr_copy; - switch_mode(cpu, cpu->Spsr_copy & 0x1f); + cpu->ChangePrivilegeMode(cpu->Spsr_copy & 0x1F); LOAD_NZCVT; } } else if (inst_cream->S) { @@ -5224,7 +5201,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { goto DISPATCH; } } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(rsb_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -5234,14 +5211,18 
@@ unsigned InterpreterMainLoop(ARMul_State* cpu) { if (inst_base->cond == 0xE || CondPassed(cpu, inst_base->cond)) { rsc_inst* const inst_cream = (rsc_inst*)inst_base->component; + u32 rn_val = RN; + if (inst_cream->Rn == 15) + rn_val += 2 * cpu->GetInstructionSize(); + bool carry; bool overflow; - RD = AddWithCarry(~RN, SHIFTER_OPERAND, cpu->CFlag, &carry, &overflow); + RD = AddWithCarry(~rn_val, SHIFTER_OPERAND, cpu->CFlag, &carry, &overflow); if (inst_cream->S && (inst_cream->Rd == 15)) { if (CurrentModeHasSPSR) { cpu->Cpsr = cpu->Spsr_copy; - switch_mode(cpu, cpu->Spsr_copy & 0x1f); + cpu->ChangePrivilegeMode(cpu->Spsr_copy & 0x1F); LOAD_NZCVT; } } else if (inst_cream->S) { @@ -5255,7 +5236,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { goto DISPATCH; } } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(rsc_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -5363,7 +5344,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { } } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(generic_arm_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -5374,14 +5355,18 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { if (inst_base->cond == 0xE || CondPassed(cpu, inst_base->cond)) { sbc_inst* const inst_cream = (sbc_inst*)inst_base->component; + u32 rn_val = RN; + if (inst_cream->Rn == 15) + rn_val += 2 * cpu->GetInstructionSize(); + bool carry; bool overflow; - RD = AddWithCarry(RN, ~SHIFTER_OPERAND, cpu->CFlag, &carry, &overflow); + RD = AddWithCarry(rn_val, ~SHIFTER_OPERAND, cpu->CFlag, &carry, &overflow); if (inst_cream->S && (inst_cream->Rd == 15)) { if (CurrentModeHasSPSR) { cpu->Cpsr = cpu->Spsr_copy; - switch_mode(cpu, cpu->Spsr_copy & 0x1f); + cpu->ChangePrivilegeMode(cpu->Spsr_copy & 0x1F); LOAD_NZCVT; } } else if (inst_cream->S) { @@ -5395,7 +5380,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { goto DISPATCH; } } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += 
cpu->GetInstructionSize(); INC_PC(sizeof(sbc_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -5434,7 +5419,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { RD = result; } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(generic_arm_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -5453,7 +5438,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { LOG_WARNING(Core_ARM11, "SETEND %s executed", big_endian ? "BE" : "LE"); - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(setend_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -5466,7 +5451,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { LOG_TRACE(Core_ARM11, "SEV executed."); } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC_STUB; FETCH_INST; GOTO_NEXT_INST; @@ -5538,7 +5523,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { } } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(generic_arm_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -5548,7 +5533,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { { if (inst_base->cond == 0xE || CondPassed(cpu, inst_base->cond)) { smla_inst* inst_cream = (smla_inst*)inst_base->component; - int32_t operand1, operand2; + s32 operand1, operand2; if (inst_cream->x == 0) operand1 = (BIT(RM, 15)) ? 
(BITS(RM, 0, 15) | 0xffff0000) : BITS(RM, 0, 15); else @@ -5563,7 +5548,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { if (AddOverflow(operand1 * operand2, RN, RD)) cpu->Cpsr |= (1 << 27); } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(smla_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -5619,7 +5604,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { } } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(smlad_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -5648,7 +5633,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { cpu->ZFlag = (RDHI == 0 && RDLO == 0); } } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(umlal_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -5678,7 +5663,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { RDHI = ((dest >> 32) & 0xFFFFFFFF); } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(smlalxy_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -5703,7 +5688,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { cpu->Cpsr |= (1 << 27); } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(smlad_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -5741,7 +5726,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { RDHI = ((result >> 32) & 0xFFFFFFFF); } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(smlald_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -5777,7 +5762,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { RD = ((result >> 32) & 0xFFFFFFFF); } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(smlad_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -5787,7 +5772,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { { if (inst_base->cond == 0xE || CondPassed(cpu, inst_base->cond)) { smul_inst* inst_cream = (smul_inst*)inst_base->component; - uint32_t 
operand1, operand2; + u32 operand1, operand2; if (inst_cream->x == 0) operand1 = (BIT(RM, 15)) ? (BITS(RM, 0, 15) | 0xffff0000) : BITS(RM, 0, 15); else @@ -5799,7 +5784,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { operand2 = (BIT(RS, 31)) ? (BITS(RS, 16, 31) | 0xffff0000) : BITS(RS, 16, 31); RD = operand1 * operand2; } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(smul_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -5808,15 +5793,15 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { { if (inst_base->cond == 0xE || CondPassed(cpu, inst_base->cond)) { umull_inst* inst_cream = (umull_inst*)inst_base->component; - int64_t rm = RM; - int64_t rs = RS; + s64 rm = RM; + s64 rs = RS; if (BIT(rm, 31)) { rm |= 0xffffffff00000000LL; } if (BIT(rs, 31)) { rs |= 0xffffffff00000000LL; } - int64_t rst = rm * rs; + s64 rst = rm * rs; RDHI = BITS(rst, 32, 63); RDLO = BITS(rst, 0, 31); @@ -5825,7 +5810,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { cpu->ZFlag = (RDHI == 0 && RDLO == 0); } } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(umull_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -5841,7 +5826,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { s64 result = (s64)rm * (s64)(s32)RN; RD = BITS(result, 16, 47); } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(smlad_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -5855,10 +5840,10 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { u32 address = 0; inst_cream->get_addr(cpu, inst_cream->inst, address); - WriteMemory32(cpu, address + 0, cpu->Reg[14]); - WriteMemory32(cpu, address + 4, cpu->Spsr_copy); + cpu->WriteMemory32(address + 0, cpu->Reg[14]); + cpu->WriteMemory32(address + 4, cpu->Spsr_copy); - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(ldst_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -5891,7 +5876,7 @@ unsigned 
InterpreterMainLoop(ARMul_State* cpu) { RD = rn_val; } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(ssat_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -5913,7 +5898,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { cpu->Cpsr |= (1 << 27); } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(ssat_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -5923,7 +5908,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { { // Instruction not implemented //LOG_CRITICAL(Core_ARM11, "unimplemented instruction"); - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(stc_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -5941,36 +5926,36 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { if (BIT(inst_cream->inst, 22) == 1) { for (int i = 0; i < 13; i++) { if (BIT(inst_cream->inst, i)) { - WriteMemory32(cpu, addr, cpu->Reg[i]); + cpu->WriteMemory32(addr, cpu->Reg[i]); addr += 4; } } if (BIT(inst_cream->inst, 13)) { if (cpu->Mode == USER32MODE) - WriteMemory32(cpu, addr, cpu->Reg[13]); + cpu->WriteMemory32(addr, cpu->Reg[13]); else - WriteMemory32(cpu, addr, cpu->Reg_usr[0]); + cpu->WriteMemory32(addr, cpu->Reg_usr[0]); addr += 4; } if (BIT(inst_cream->inst, 14)) { if (cpu->Mode == USER32MODE) - WriteMemory32(cpu, addr, cpu->Reg[14]); + cpu->WriteMemory32(addr, cpu->Reg[14]); else - WriteMemory32(cpu, addr, cpu->Reg_usr[1]); + cpu->WriteMemory32(addr, cpu->Reg_usr[1]); addr += 4; } if (BIT(inst_cream->inst, 15)) { - WriteMemory32(cpu, addr, cpu->Reg_usr[1] + 8); + cpu->WriteMemory32(addr, cpu->Reg_usr[1] + 8); } } else { for (int i = 0; i < 15; i++) { if (BIT(inst_cream->inst, i)) { if (i == Rn) - WriteMemory32(cpu, addr, old_RN); + cpu->WriteMemory32(addr, old_RN); else - WriteMemory32(cpu, addr, cpu->Reg[i]); + cpu->WriteMemory32(addr, cpu->Reg[i]); addr += 4; } @@ -5978,10 +5963,10 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { // Check PC reg if 
(BIT(inst_cream->inst, 15)) - WriteMemory32(cpu, addr, cpu->Reg_usr[1] + 8); + cpu->WriteMemory32(addr, cpu->Reg_usr[1] + 8); } } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(ldst_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -5999,7 +5984,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { } RD = operand2; } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(sxtb_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -6010,10 +5995,15 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { ldst_inst* inst_cream = (ldst_inst*)inst_base->component; inst_cream->get_addr(cpu, inst_cream->inst, addr); - unsigned int value = cpu->Reg[BITS(inst_cream->inst, 12, 15)]; - WriteMemory32(cpu, addr, value); + unsigned int reg = BITS(inst_cream->inst, 12, 15); + unsigned int value = cpu->Reg[reg]; + + if (reg == 15) + value += 2 * cpu->GetInstructionSize(); + + cpu->WriteMemory32(addr, value); } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(ldst_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -6024,7 +6014,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { uxtb_inst* inst_cream = (uxtb_inst*)inst_base->component; RD = ROTATE_RIGHT_32(RM, 8 * inst_cream->rotate) & 0xff; } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(uxtb_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -6037,7 +6027,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { unsigned int operand2 = ROTATE_RIGHT_32(RM, 8 * inst_cream->rotate) & 0xff; RD = RN + operand2; } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(uxtab_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -6050,7 +6040,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { unsigned int value = cpu->Reg[BITS(inst_cream->inst, 12, 15)] & 0xff; Memory::Write8(addr, value); } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); 
INC_PC(sizeof(ldst_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -6063,7 +6053,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { unsigned int value = cpu->Reg[BITS(inst_cream->inst, 12, 15)] & 0xff; Memory::Write8(addr, value); } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(ldst_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -6076,10 +6066,10 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { // The 3DS doesn't have the Large Physical Access Extension (LPAE) // so STRD wouldn't store these as a single write. - WriteMemory32(cpu, addr + 0, cpu->Reg[BITS(inst_cream->inst, 12, 15)]); - WriteMemory32(cpu, addr + 4, cpu->Reg[BITS(inst_cream->inst, 12, 15) + 1]); + cpu->WriteMemory32(addr + 0, cpu->Reg[BITS(inst_cream->inst, 12, 15)]); + cpu->WriteMemory32(addr + 4, cpu->Reg[BITS(inst_cream->inst, 12, 15) + 1]); } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(ldst_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -6090,18 +6080,16 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { generic_arm_inst* inst_cream = (generic_arm_inst*)inst_base->component; unsigned int write_addr = cpu->Reg[inst_cream->Rn]; - if ((exclusive_detect(cpu, write_addr) == 0) && (cpu->exclusive_state == 1)) { - remove_exclusive(cpu, write_addr); - cpu->exclusive_state = 0; - - WriteMemory32(cpu, write_addr, RM); + if (cpu->IsExclusiveMemoryAccess(write_addr)) { + cpu->UnsetExclusiveMemoryAddress(); + cpu->WriteMemory32(write_addr, RM); RD = 0; } else { // Failed to write due to mutex access RD = 1; } } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(generic_arm_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -6112,10 +6100,8 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { generic_arm_inst* inst_cream = (generic_arm_inst*)inst_base->component; unsigned int write_addr = cpu->Reg[inst_cream->Rn]; - if ((exclusive_detect(cpu, write_addr) == 0) && (cpu->exclusive_state == 1)) { 
- remove_exclusive(cpu, write_addr); - cpu->exclusive_state = 0; - + if (cpu->IsExclusiveMemoryAccess(write_addr)) { + cpu->UnsetExclusiveMemoryAddress(); Memory::Write8(write_addr, cpu->Reg[inst_cream->Rm]); RD = 0; } else { @@ -6123,7 +6109,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { RD = 1; } } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(generic_arm_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -6134,20 +6120,19 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { generic_arm_inst* inst_cream = (generic_arm_inst*)inst_base->component; unsigned int write_addr = cpu->Reg[inst_cream->Rn]; - if ((exclusive_detect(cpu, write_addr) == 0) && (cpu->exclusive_state == 1)) { - remove_exclusive(cpu, write_addr); - cpu->exclusive_state = 0; + if (cpu->IsExclusiveMemoryAccess(write_addr)) { + cpu->UnsetExclusiveMemoryAddress(); const u32 rt = cpu->Reg[inst_cream->Rm + 0]; const u32 rt2 = cpu->Reg[inst_cream->Rm + 1]; u64 value; - if (InBigEndianMode(cpu)) + if (cpu->InBigEndianMode()) value = (((u64)rt << 32) | rt2); else value = (((u64)rt2 << 32) | rt); - WriteMemory64(cpu, write_addr, value); + cpu->WriteMemory64(write_addr, value); RD = 0; } else { @@ -6155,7 +6140,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { RD = 1; } } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(generic_arm_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -6166,18 +6151,16 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { generic_arm_inst* inst_cream = (generic_arm_inst*)inst_base->component; unsigned int write_addr = cpu->Reg[inst_cream->Rn]; - if ((exclusive_detect(cpu, write_addr) == 0) && (cpu->exclusive_state == 1)) { - remove_exclusive(cpu, write_addr); - cpu->exclusive_state = 0; - - WriteMemory16(cpu, write_addr, RM); + if (cpu->IsExclusiveMemoryAccess(write_addr)) { + cpu->UnsetExclusiveMemoryAddress(); + cpu->WriteMemory16(write_addr, RM); RD = 0; } else { // Failed to write due to 
mutex access RD = 1; } } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(generic_arm_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -6189,9 +6172,9 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { inst_cream->get_addr(cpu, inst_cream->inst, addr); unsigned int value = cpu->Reg[BITS(inst_cream->inst, 12, 15)] & 0xffff; - WriteMemory16(cpu, addr, value); + cpu->WriteMemory16(addr, value); } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(ldst_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -6203,9 +6186,9 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { inst_cream->get_addr(cpu, inst_cream->inst, addr); unsigned int value = cpu->Reg[BITS(inst_cream->inst, 12, 15)]; - WriteMemory32(cpu, addr, value); + cpu->WriteMemory32(addr, value); } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(ldst_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -6217,7 +6200,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { u32 rn_val = RN; if (inst_cream->Rn == 15) - rn_val += 8; + rn_val += 2 * cpu->GetInstructionSize(); bool carry; bool overflow; @@ -6226,7 +6209,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { if (inst_cream->S && (inst_cream->Rd == 15)) { if (CurrentModeHasSPSR) { cpu->Cpsr = cpu->Spsr_copy; - switch_mode(cpu, cpu->Spsr_copy & 0x1f); + cpu->ChangePrivilegeMode(cpu->Spsr_copy & 0x1F); LOAD_NZCVT; } } else if (inst_cream->S) { @@ -6240,7 +6223,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { goto DISPATCH; } } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(sub_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -6252,7 +6235,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { SVC::CallSVC(inst_cream->num & 0xFFFF); } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(swi_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -6263,12 +6246,12 @@ unsigned 
InterpreterMainLoop(ARMul_State* cpu) { swp_inst* inst_cream = (swp_inst*)inst_base->component; addr = RN; - unsigned int value = ReadMemory32(cpu, addr); - WriteMemory32(cpu, addr, RM); + unsigned int value = cpu->ReadMemory32(addr); + cpu->WriteMemory32(addr, RM); RD = value; } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(swp_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -6282,7 +6265,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { Memory::Write8(addr, (RM & 0xFF)); RD = value; } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(swp_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -6298,7 +6281,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { operand2 = (0x80 & operand2)? (0xFFFFFF00 | operand2):operand2; RD = RN + operand2; } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(uxtab_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -6331,7 +6314,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { } } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(sxtab_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -6347,7 +6330,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { operand2 = (0x8000 & operand2) ? 
(0xFFFF0000 | operand2) : operand2; RD = RN + operand2; } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(sxtah_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -6362,7 +6345,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { u32 rop = SHIFTER_OPERAND; if (inst_cream->Rn == 15) - lop += GET_INST_SIZE(cpu) * 2; + lop += cpu->GetInstructionSize() * 2; u32 result = lop ^ rop; @@ -6370,7 +6353,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { UPDATE_ZFLAG(result); UPDATE_CFLAG_WITH_SC; } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(teq_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -6384,7 +6367,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { u32 rop = SHIFTER_OPERAND; if (inst_cream->Rn == 15) - lop += GET_INST_SIZE(cpu) * 2; + lop += cpu->GetInstructionSize() * 2; u32 result = lop & rop; @@ -6392,7 +6375,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { UPDATE_ZFLAG(result); UPDATE_CFLAG_WITH_SC; } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(tst_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -6563,7 +6546,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { RD = (lo_result & 0xFFFF) | ((hi_result & 0xFFFF) << 16); } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(generic_arm_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -6643,7 +6626,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { } } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(generic_arm_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -6662,7 +6645,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { RDLO = (result & 0xFFFFFFFF); RDHI = ((result >> 32) & 0xFFFFFFFF); } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(umaal_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -6685,7 +6668,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { 
cpu->ZFlag = (RDHI == 0 && RDLO == 0); } } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(umlal_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -6705,7 +6688,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { cpu->ZFlag = (RDHI == 0 && RDLO == 0); } } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(umull_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -6733,7 +6716,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { { bl_1_thumb* inst_cream = (bl_1_thumb*)inst_base->component; cpu->Reg[14] = cpu->Reg[15] + 4 + inst_cream->imm; - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(bl_1_thumb)); FETCH_INST; GOTO_NEXT_INST; @@ -6814,7 +6797,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { RD = ((lo_val & 0xFFFF) | hi_val << 16); } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(generic_arm_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -6844,7 +6827,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { RD = finalDif; } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(generic_arm_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -6877,7 +6860,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { RD = rn_val; } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(ssat_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -6899,7 +6882,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { cpu->Cpsr |= (1 << 27); } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(ssat_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -6930,7 +6913,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { } } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(uxtab_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -6943,7 +6926,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { 
LOG_TRACE(Core_ARM11, "WFE executed."); } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC_STUB; FETCH_INST; GOTO_NEXT_INST; @@ -6956,7 +6939,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { LOG_TRACE(Core_ARM11, "WFI executed."); } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC_STUB; FETCH_INST; GOTO_NEXT_INST; @@ -6969,7 +6952,7 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) { LOG_TRACE(Core_ARM11, "YIELD executed."); } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC_STUB; FETCH_INST; GOTO_NEXT_INST; diff --git a/src/core/arm/dyncom/arm_dyncom_run.cpp b/src/core/arm/dyncom/arm_dyncom_run.cpp deleted file mode 100644 index 4c6acba98..000000000 --- a/src/core/arm/dyncom/arm_dyncom_run.cpp +++ /dev/null @@ -1,93 +0,0 @@ -// Copyright 2012 Michael Kang, 2014 Citra Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. 
- -#include "core/arm/dyncom/arm_dyncom_run.h" -#include "core/arm/skyeye_common/armstate.h" - -void switch_mode(ARMul_State* core, uint32_t mode) { - if (core->Mode == mode) - return; - - if (mode != USERBANK) { - switch (core->Mode) { - case SYSTEM32MODE: // Shares registers with user mode - case USER32MODE: - core->Reg_usr[0] = core->Reg[13]; - core->Reg_usr[1] = core->Reg[14]; - break; - case IRQ32MODE: - core->Reg_irq[0] = core->Reg[13]; - core->Reg_irq[1] = core->Reg[14]; - core->Spsr[IRQBANK] = core->Spsr_copy; - break; - case SVC32MODE: - core->Reg_svc[0] = core->Reg[13]; - core->Reg_svc[1] = core->Reg[14]; - core->Spsr[SVCBANK] = core->Spsr_copy; - break; - case ABORT32MODE: - core->Reg_abort[0] = core->Reg[13]; - core->Reg_abort[1] = core->Reg[14]; - core->Spsr[ABORTBANK] = core->Spsr_copy; - break; - case UNDEF32MODE: - core->Reg_undef[0] = core->Reg[13]; - core->Reg_undef[1] = core->Reg[14]; - core->Spsr[UNDEFBANK] = core->Spsr_copy; - break; - case FIQ32MODE: - core->Reg_firq[0] = core->Reg[13]; - core->Reg_firq[1] = core->Reg[14]; - core->Spsr[FIQBANK] = core->Spsr_copy; - break; - } - - switch (mode) { - case USER32MODE: - core->Reg[13] = core->Reg_usr[0]; - core->Reg[14] = core->Reg_usr[1]; - core->Bank = USERBANK; - break; - case IRQ32MODE: - core->Reg[13] = core->Reg_irq[0]; - core->Reg[14] = core->Reg_irq[1]; - core->Spsr_copy = core->Spsr[IRQBANK]; - core->Bank = IRQBANK; - break; - case SVC32MODE: - core->Reg[13] = core->Reg_svc[0]; - core->Reg[14] = core->Reg_svc[1]; - core->Spsr_copy = core->Spsr[SVCBANK]; - core->Bank = SVCBANK; - break; - case ABORT32MODE: - core->Reg[13] = core->Reg_abort[0]; - core->Reg[14] = core->Reg_abort[1]; - core->Spsr_copy = core->Spsr[ABORTBANK]; - core->Bank = ABORTBANK; - break; - case UNDEF32MODE: - core->Reg[13] = core->Reg_undef[0]; - core->Reg[14] = core->Reg_undef[1]; - core->Spsr_copy = core->Spsr[UNDEFBANK]; - core->Bank = UNDEFBANK; - break; - case FIQ32MODE: - core->Reg[13] = core->Reg_firq[0]; - 
core->Reg[14] = core->Reg_firq[1]; - core->Spsr_copy = core->Spsr[FIQBANK]; - core->Bank = FIQBANK; - break; - case SYSTEM32MODE: // Shares registers with user mode. - core->Reg[13] = core->Reg_usr[0]; - core->Reg[14] = core->Reg_usr[1]; - core->Bank = SYSTEMBANK; - break; - } - - // Set the mode bits in the APSR - core->Cpsr = (core->Cpsr & ~core->Mode) | mode; - core->Mode = mode; - } -} diff --git a/src/core/arm/dyncom/arm_dyncom_run.h b/src/core/arm/dyncom/arm_dyncom_run.h index ef18455bc..13bef17fc 100644 --- a/src/core/arm/dyncom/arm_dyncom_run.h +++ b/src/core/arm/dyncom/arm_dyncom_run.h @@ -20,38 +20,29 @@ #include "core/arm/skyeye_common/armstate.h" -void switch_mode(ARMul_State* core, uint32_t mode); - -// Note that for the 3DS, a Thumb instruction will only ever be -// two bytes in size. Thus we don't need to worry about ThumbEE -// or Thumb-2 where instructions can be 4 bytes in length. -static inline u32 GET_INST_SIZE(ARMul_State* core) { - return core->TFlag? 2 : 4; -} - /** * Checks if the PC is being read, and if so, word-aligns it. * Used with address calculations. * - * @param core The ARM CPU state instance. + * @param cpu The ARM CPU state instance. * @param Rn The register being read. * * @return If the PC is being read, then the word-aligned PC value is returned. * If the PC is not being read, then the value stored in the register is returned. */ -static inline u32 CHECK_READ_REG15_WA(ARMul_State* core, int Rn) { - return (Rn == 15) ? ((core->Reg[15] & ~0x3) + GET_INST_SIZE(core) * 2) : core->Reg[Rn]; +static inline u32 CHECK_READ_REG15_WA(ARMul_State* cpu, int Rn) { + return (Rn == 15) ? ((cpu->Reg[15] & ~0x3) + cpu->GetInstructionSize() * 2) : cpu->Reg[Rn]; } /** * Reads the PC. Used for data processing operations that use the PC. * - * @param core The ARM CPU state instance. + * @param cpu The ARM CPU state instance. * @param Rn The register being read. * * @return If the PC is being read, then the incremented PC value is returned. 
* If the PC is not being read, then the values stored in the register is returned. */ -static inline u32 CHECK_READ_REG15(ARMul_State* core, int Rn) { - return (Rn == 15) ? ((core->Reg[15] & ~0x1) + GET_INST_SIZE(core) * 2) : core->Reg[Rn]; +static inline u32 CHECK_READ_REG15(ARMul_State* cpu, int Rn) { + return (Rn == 15) ? ((cpu->Reg[15] & ~0x1) + cpu->GetInstructionSize() * 2) : cpu->Reg[Rn]; } diff --git a/src/core/arm/dyncom/arm_dyncom_thumb.cpp b/src/core/arm/dyncom/arm_dyncom_thumb.cpp index 2860af376..29272fd5d 100644 --- a/src/core/arm/dyncom/arm_dyncom_thumb.cpp +++ b/src/core/arm/dyncom/arm_dyncom_thumb.cpp @@ -12,15 +12,9 @@ // with the following Thumb instruction held in the high 16-bits. Passing in two Thumb instructions // allows easier simulation of the special dual BL instruction. -tdstate thumb_translate(u32 addr, u32 instr, u32* ainstr, u32* inst_size) { - tdstate valid = t_uninitialized; - u32 tinstr = instr; - - // The endian should be judge here - if((addr & 0x3) != 0) - tinstr = instr >> 16; - else - tinstr &= 0xFFFF; +ThumbDecodeStatus TranslateThumbInstruction(u32 addr, u32 instr, u32* ainstr, u32* inst_size) { + ThumbDecodeStatus valid = ThumbDecodeStatus::UNINITIALIZED; + u32 tinstr = GetThumbInstruction(instr, addr); *ainstr = 0xDEADC0DE; // Debugging to catch non updates @@ -357,21 +351,21 @@ tdstate thumb_translate(u32 addr, u32 instr, u32* ainstr, u32* inst_size) { else *ainstr |= (tinstr & 0x00FF); } else if ((tinstr & 0x0F00) != 0x0E00) - valid = t_branch; + valid = ThumbDecodeStatus::BRANCH; else // UNDEFINED : cc=1110(AL) uses different format - valid = t_undefined; + valid = ThumbDecodeStatus::UNDEFINED; break; case 28: // B - valid = t_branch; + valid = ThumbDecodeStatus::BRANCH; break; case 29: - if(tinstr & 0x1) - valid = t_undefined; + if (tinstr & 0x1) + valid = ThumbDecodeStatus::UNDEFINED; else - valid = t_branch; + valid = ThumbDecodeStatus::BRANCH; break; case 30: // BL instruction 1 @@ -380,7 +374,7 @@ tdstate 
thumb_translate(u32 addr, u32 instr, u32* ainstr, u32* inst_size) { // simulation simple (from the user perspective) we check if the following instruction is // the second half of this BL, and if it is we simulate it immediately - valid = t_branch; + valid = ThumbDecodeStatus::BRANCH; break; case 31: // BL instruction 2 @@ -389,7 +383,7 @@ tdstate thumb_translate(u32 addr, u32 instr, u32* ainstr, u32* inst_size) { // ever be matched with the fmt19 "BL instruction 1" instruction. However, we do allow the // simulation of it on its own, with undefined results if r14 is not suitably initialised. - valid = t_branch; + valid = ThumbDecodeStatus::BRANCH; break; } diff --git a/src/core/arm/dyncom/arm_dyncom_thumb.h b/src/core/arm/dyncom/arm_dyncom_thumb.h index c06f09580..447974363 100644 --- a/src/core/arm/dyncom/arm_dyncom_thumb.h +++ b/src/core/arm/dyncom/arm_dyncom_thumb.h @@ -28,20 +28,22 @@ #include "common/common_types.h" -enum tdstate { - t_undefined, // Undefined Thumb instruction - t_decoded, // Instruction decoded to ARM equivalent - t_branch, // Thumb branch (already processed) - t_uninitialized, +enum class ThumbDecodeStatus { + UNDEFINED, // Undefined Thumb instruction + DECODED, // Instruction decoded to ARM equivalent + BRANCH, // Thumb branch (already processed) + UNINITIALIZED, }; -tdstate thumb_translate(u32 addr, u32 instr, u32* ainstr, u32* inst_size); +// Translates a Thumb mode instruction into its ARM equivalent. +ThumbDecodeStatus TranslateThumbInstruction(u32 addr, u32 instr, u32* ainstr, u32* inst_size); -static inline u32 get_thumb_instr(u32 instr, u32 pc) { - u32 tinstr; - if ((pc & 0x3) != 0) - tinstr = instr >> 16; - else - tinstr = instr & 0xFFFF; - return tinstr; +static inline u32 GetThumbInstruction(u32 instr, u32 address) { + // Normally you would need to handle instruction endianness, + // however, it is fixed to little-endian on the MPCore, so + // there's no need to check for this beforehand. 
+ if ((address & 0x3) != 0) + return instr >> 16; + + return instr & 0xFFFF; } diff --git a/src/core/arm/skyeye_common/arm_regformat.h b/src/core/arm/skyeye_common/arm_regformat.h index d1c721809..38fa97ab9 100644 --- a/src/core/arm/skyeye_common/arm_regformat.h +++ b/src/core/arm/skyeye_common/arm_regformat.h @@ -55,7 +55,7 @@ enum { }; // VFP system registers -enum { +enum VFPSystemRegister { VFP_FPSID, VFP_FPSCR, VFP_FPEXC, diff --git a/src/core/arm/skyeye_common/arminit.cpp b/src/core/arm/skyeye_common/arminit.cpp deleted file mode 100644 index b7c508d75..000000000 --- a/src/core/arm/skyeye_common/arminit.cpp +++ /dev/null @@ -1,100 +0,0 @@ -/* arminit.c -- ARMulator initialization: ARM6 Instruction Emulator. - Copyright (C) 1994 Advanced RISC Machines Ltd. - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ - -#include <cstring> -#include "core/arm/skyeye_common/armstate.h" -#include "core/arm/skyeye_common/vfp/vfp.h" - -// Resets certain MPCore CP15 values to their ARM-defined reset values. 
-static void ResetMPCoreCP15Registers(ARMul_State* cpu) -{ - // c0 - cpu->CP15[CP15_MAIN_ID] = 0x410FB024; - cpu->CP15[CP15_TLB_TYPE] = 0x00000800; - cpu->CP15[CP15_PROCESSOR_FEATURE_0] = 0x00000111; - cpu->CP15[CP15_PROCESSOR_FEATURE_1] = 0x00000001; - cpu->CP15[CP15_DEBUG_FEATURE_0] = 0x00000002; - cpu->CP15[CP15_MEMORY_MODEL_FEATURE_0] = 0x01100103; - cpu->CP15[CP15_MEMORY_MODEL_FEATURE_1] = 0x10020302; - cpu->CP15[CP15_MEMORY_MODEL_FEATURE_2] = 0x01222000; - cpu->CP15[CP15_MEMORY_MODEL_FEATURE_3] = 0x00000000; - cpu->CP15[CP15_ISA_FEATURE_0] = 0x00100011; - cpu->CP15[CP15_ISA_FEATURE_1] = 0x12002111; - cpu->CP15[CP15_ISA_FEATURE_2] = 0x11221011; - cpu->CP15[CP15_ISA_FEATURE_3] = 0x01102131; - cpu->CP15[CP15_ISA_FEATURE_4] = 0x00000141; - - // c1 - cpu->CP15[CP15_CONTROL] = 0x00054078; - cpu->CP15[CP15_AUXILIARY_CONTROL] = 0x0000000F; - cpu->CP15[CP15_COPROCESSOR_ACCESS_CONTROL] = 0x00000000; - - // c2 - cpu->CP15[CP15_TRANSLATION_BASE_TABLE_0] = 0x00000000; - cpu->CP15[CP15_TRANSLATION_BASE_TABLE_1] = 0x00000000; - cpu->CP15[CP15_TRANSLATION_BASE_CONTROL] = 0x00000000; - - // c3 - cpu->CP15[CP15_DOMAIN_ACCESS_CONTROL] = 0x00000000; - - // c7 - cpu->CP15[CP15_PHYS_ADDRESS] = 0x00000000; - - // c9 - cpu->CP15[CP15_DATA_CACHE_LOCKDOWN] = 0xFFFFFFF0; - - // c10 - cpu->CP15[CP15_TLB_LOCKDOWN] = 0x00000000; - cpu->CP15[CP15_PRIMARY_REGION_REMAP] = 0x00098AA4; - cpu->CP15[CP15_NORMAL_REGION_REMAP] = 0x44E048E0; - - // c13 - cpu->CP15[CP15_PID] = 0x00000000; - cpu->CP15[CP15_CONTEXT_ID] = 0x00000000; - cpu->CP15[CP15_THREAD_UPRW] = 0x00000000; - cpu->CP15[CP15_THREAD_URO] = 0x00000000; - cpu->CP15[CP15_THREAD_PRW] = 0x00000000; - - // c15 - cpu->CP15[CP15_PERFORMANCE_MONITOR_CONTROL] = 0x00000000; - cpu->CP15[CP15_MAIN_TLB_LOCKDOWN_VIRT_ADDRESS] = 0x00000000; - cpu->CP15[CP15_MAIN_TLB_LOCKDOWN_PHYS_ADDRESS] = 0x00000000; - cpu->CP15[CP15_MAIN_TLB_LOCKDOWN_ATTRIBUTE] = 0x00000000; - cpu->CP15[CP15_TLB_DEBUG_CONTROL] = 0x00000000; -} - -// Performs a reset -void 
ARMul_Reset(ARMul_State* state) -{ - VFPInit(state); - - state->Reg[15] = 0; - state->Cpsr = INTBITS | SVC32MODE; - state->Mode = SVC32MODE; - state->Bank = SVCBANK; - - ResetMPCoreCP15Registers(state); - - state->NresetSig = HIGH; - state->NfiqSig = HIGH; - state->NirqSig = HIGH; - state->NtransSig = (state->Mode & 3) ? HIGH : LOW; - state->abortSig = LOW; - - state->NumInstrs = 0; - state->Emulate = RUN; -} diff --git a/src/core/arm/skyeye_common/armmmu.h b/src/core/arm/skyeye_common/armmmu.h deleted file mode 100644 index 5423588c0..000000000 --- a/src/core/arm/skyeye_common/armmmu.h +++ /dev/null @@ -1,104 +0,0 @@ -/* - armmmu.c - Memory Management Unit emulation. - ARMulator extensions for the ARM7100 family. - Copyright (C) 1999 Ben Williamson - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. 
- - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -*/ - -#pragma once - -#include "common/swap.h" - -#include "core/memory.h" -#include "core/arm/skyeye_common/armstate.h" -#include "core/arm/skyeye_common/armsupp.h" - -// Register numbers in the MMU -enum -{ - MMU_ID = 0, - MMU_CONTROL = 1, - MMU_TRANSLATION_TABLE_BASE = 2, - MMU_DOMAIN_ACCESS_CONTROL = 3, - MMU_FAULT_STATUS = 5, - MMU_FAULT_ADDRESS = 6, - MMU_CACHE_OPS = 7, - MMU_TLB_OPS = 8, - MMU_CACHE_LOCKDOWN = 9, - MMU_TLB_LOCKDOWN = 10, - MMU_PID = 13, - - // MMU_V4 - MMU_V4_CACHE_OPS = 7, - MMU_V4_TLB_OPS = 8, - - // MMU_V3 - MMU_V3_FLUSH_TLB = 5, - MMU_V3_FLUSH_TLB_ENTRY = 6, - MMU_V3_FLUSH_CACHE = 7, -}; - -// Reads data in big/little endian format based on the -// state of the E (endian) bit in the emulated CPU's APSR. -inline u16 ReadMemory16(ARMul_State* cpu, u32 address) { - u16 data = Memory::Read16(address); - - if (InBigEndianMode(cpu)) - data = Common::swap16(data); - - return data; -} - -inline u32 ReadMemory32(ARMul_State* cpu, u32 address) { - u32 data = Memory::Read32(address); - - if (InBigEndianMode(cpu)) - data = Common::swap32(data); - - return data; -} - -inline u64 ReadMemory64(ARMul_State* cpu, u32 address) { - u64 data = Memory::Read64(address); - - if (InBigEndianMode(cpu)) - data = Common::swap64(data); - - return data; -} - -// Writes data in big/little endian format based on the -// state of the E (endian) bit in the emulated CPU's APSR. 
-inline void WriteMemory16(ARMul_State* cpu, u32 address, u16 data) { - if (InBigEndianMode(cpu)) - data = Common::swap16(data); - - Memory::Write16(address, data); -} - -inline void WriteMemory32(ARMul_State* cpu, u32 address, u32 data) { - if (InBigEndianMode(cpu)) - data = Common::swap32(data); - - Memory::Write32(address, data); -} - -inline void WriteMemory64(ARMul_State* cpu, u32 address, u64 data) { - if (InBigEndianMode(cpu)) - data = Common::swap64(data); - - Memory::Write64(address, data); -} diff --git a/src/core/arm/skyeye_common/armstate.cpp b/src/core/arm/skyeye_common/armstate.cpp new file mode 100644 index 000000000..ccb2eb0eb --- /dev/null +++ b/src/core/arm/skyeye_common/armstate.cpp @@ -0,0 +1,657 @@ +// Copyright 2015 Citra Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include "common/swap.h" +#include "common/logging/log.h" +#include "core/mem_map.h" +#include "core/memory.h" +#include "core/arm/skyeye_common/armstate.h" +#include "core/arm/skyeye_common/vfp/vfp.h" + +ARMul_State::ARMul_State(PrivilegeMode initial_mode) +{ + Reset(); + ChangePrivilegeMode(initial_mode); +} + +void ARMul_State::ChangePrivilegeMode(u32 new_mode) +{ + if (Mode == new_mode) + return; + + if (new_mode != USERBANK) { + switch (Mode) { + case SYSTEM32MODE: // Shares registers with user mode + case USER32MODE: + Reg_usr[0] = Reg[13]; + Reg_usr[1] = Reg[14]; + break; + case IRQ32MODE: + Reg_irq[0] = Reg[13]; + Reg_irq[1] = Reg[14]; + Spsr[IRQBANK] = Spsr_copy; + break; + case SVC32MODE: + Reg_svc[0] = Reg[13]; + Reg_svc[1] = Reg[14]; + Spsr[SVCBANK] = Spsr_copy; + break; + case ABORT32MODE: + Reg_abort[0] = Reg[13]; + Reg_abort[1] = Reg[14]; + Spsr[ABORTBANK] = Spsr_copy; + break; + case UNDEF32MODE: + Reg_undef[0] = Reg[13]; + Reg_undef[1] = Reg[14]; + Spsr[UNDEFBANK] = Spsr_copy; + break; + case FIQ32MODE: + Reg_firq[0] = Reg[13]; + Reg_firq[1] = Reg[14]; + Spsr[FIQBANK] = Spsr_copy; + break; + } + + 
switch (new_mode) { + case USER32MODE: + Reg[13] = Reg_usr[0]; + Reg[14] = Reg_usr[1]; + Bank = USERBANK; + break; + case IRQ32MODE: + Reg[13] = Reg_irq[0]; + Reg[14] = Reg_irq[1]; + Spsr_copy = Spsr[IRQBANK]; + Bank = IRQBANK; + break; + case SVC32MODE: + Reg[13] = Reg_svc[0]; + Reg[14] = Reg_svc[1]; + Spsr_copy = Spsr[SVCBANK]; + Bank = SVCBANK; + break; + case ABORT32MODE: + Reg[13] = Reg_abort[0]; + Reg[14] = Reg_abort[1]; + Spsr_copy = Spsr[ABORTBANK]; + Bank = ABORTBANK; + break; + case UNDEF32MODE: + Reg[13] = Reg_undef[0]; + Reg[14] = Reg_undef[1]; + Spsr_copy = Spsr[UNDEFBANK]; + Bank = UNDEFBANK; + break; + case FIQ32MODE: + Reg[13] = Reg_firq[0]; + Reg[14] = Reg_firq[1]; + Spsr_copy = Spsr[FIQBANK]; + Bank = FIQBANK; + break; + case SYSTEM32MODE: // Shares registers with user mode. + Reg[13] = Reg_usr[0]; + Reg[14] = Reg_usr[1]; + Bank = SYSTEMBANK; + break; + } + + // Set the mode bits in the APSR + Cpsr = (Cpsr & ~Mode) | new_mode; + Mode = new_mode; + } +} + +// Performs a reset +void ARMul_State::Reset() +{ + VFPInit(this); + + // Set stack pointer to the top of the stack + Reg[13] = 0x10000000; + Reg[15] = 0; + + Cpsr = INTBITS | SVC32MODE; + Mode = SVC32MODE; + Bank = SVCBANK; + + ResetMPCoreCP15Registers(); + + NresetSig = HIGH; + NfiqSig = HIGH; + NirqSig = HIGH; + NtransSig = (Mode & 3) ? HIGH : LOW; + abortSig = LOW; + + NumInstrs = 0; + Emulate = RUN; +} + +// Resets certain MPCore CP15 values to their ARM-defined reset values. 
+void ARMul_State::ResetMPCoreCP15Registers() +{ + // c0 + CP15[CP15_MAIN_ID] = 0x410FB024; + CP15[CP15_TLB_TYPE] = 0x00000800; + CP15[CP15_PROCESSOR_FEATURE_0] = 0x00000111; + CP15[CP15_PROCESSOR_FEATURE_1] = 0x00000001; + CP15[CP15_DEBUG_FEATURE_0] = 0x00000002; + CP15[CP15_MEMORY_MODEL_FEATURE_0] = 0x01100103; + CP15[CP15_MEMORY_MODEL_FEATURE_1] = 0x10020302; + CP15[CP15_MEMORY_MODEL_FEATURE_2] = 0x01222000; + CP15[CP15_MEMORY_MODEL_FEATURE_3] = 0x00000000; + CP15[CP15_ISA_FEATURE_0] = 0x00100011; + CP15[CP15_ISA_FEATURE_1] = 0x12002111; + CP15[CP15_ISA_FEATURE_2] = 0x11221011; + CP15[CP15_ISA_FEATURE_3] = 0x01102131; + CP15[CP15_ISA_FEATURE_4] = 0x00000141; + + // c1 + CP15[CP15_CONTROL] = 0x00054078; + CP15[CP15_AUXILIARY_CONTROL] = 0x0000000F; + CP15[CP15_COPROCESSOR_ACCESS_CONTROL] = 0x00000000; + + // c2 + CP15[CP15_TRANSLATION_BASE_TABLE_0] = 0x00000000; + CP15[CP15_TRANSLATION_BASE_TABLE_1] = 0x00000000; + CP15[CP15_TRANSLATION_BASE_CONTROL] = 0x00000000; + + // c3 + CP15[CP15_DOMAIN_ACCESS_CONTROL] = 0x00000000; + + // c7 + CP15[CP15_PHYS_ADDRESS] = 0x00000000; + + // c9 + CP15[CP15_DATA_CACHE_LOCKDOWN] = 0xFFFFFFF0; + + // c10 + CP15[CP15_TLB_LOCKDOWN] = 0x00000000; + CP15[CP15_PRIMARY_REGION_REMAP] = 0x00098AA4; + CP15[CP15_NORMAL_REGION_REMAP] = 0x44E048E0; + + // c13 + CP15[CP15_PID] = 0x00000000; + CP15[CP15_CONTEXT_ID] = 0x00000000; + CP15[CP15_THREAD_UPRW] = 0x00000000; + CP15[CP15_THREAD_URO] = 0x00000000; + CP15[CP15_THREAD_PRW] = 0x00000000; + + // c15 + CP15[CP15_PERFORMANCE_MONITOR_CONTROL] = 0x00000000; + CP15[CP15_MAIN_TLB_LOCKDOWN_VIRT_ADDRESS] = 0x00000000; + CP15[CP15_MAIN_TLB_LOCKDOWN_PHYS_ADDRESS] = 0x00000000; + CP15[CP15_MAIN_TLB_LOCKDOWN_ATTRIBUTE] = 0x00000000; + CP15[CP15_TLB_DEBUG_CONTROL] = 0x00000000; +} + +u16 ARMul_State::ReadMemory16(u32 address) const +{ + u16 data = Memory::Read16(address); + + if (InBigEndianMode()) + data = Common::swap16(data); + + return data; +} + +u32 ARMul_State::ReadMemory32(u32 address) const +{ + 
u32 data = Memory::Read32(address); + + if (InBigEndianMode()) + data = Common::swap32(data); + + return data; +} + +u64 ARMul_State::ReadMemory64(u32 address) const +{ + u64 data = Memory::Read64(address); + + if (InBigEndianMode()) + data = Common::swap64(data); + + return data; +} + +void ARMul_State::WriteMemory16(u32 address, u16 data) +{ + if (InBigEndianMode()) + data = Common::swap16(data); + + Memory::Write16(address, data); +} + +void ARMul_State::WriteMemory32(u32 address, u32 data) +{ + if (InBigEndianMode()) + data = Common::swap32(data); + + Memory::Write32(address, data); +} + +void ARMul_State::WriteMemory64(u32 address, u64 data) +{ + if (InBigEndianMode()) + data = Common::swap64(data); + + Memory::Write64(address, data); +} + + +// Reads from the CP15 registers. Used with implementation of the MRC instruction. +// Note that since the 3DS does not have the hypervisor extensions, these registers +// are not implemented. +u32 ARMul_State::ReadCP15Register(u32 crn, u32 opcode_1, u32 crm, u32 opcode_2) const +{ + // Unprivileged registers + if (crn == 13 && opcode_1 == 0 && crm == 0) + { + if (opcode_2 == 2) + return CP15[CP15_THREAD_UPRW]; + + if (opcode_2 == 3) + return CP15[CP15_THREAD_URO]; + } + + if (InAPrivilegedMode()) + { + if (crn == 0 && opcode_1 == 0) + { + if (crm == 0) + { + if (opcode_2 == 0) + return CP15[CP15_MAIN_ID]; + + if (opcode_2 == 1) + return CP15[CP15_CACHE_TYPE]; + + if (opcode_2 == 3) + return CP15[CP15_TLB_TYPE]; + + if (opcode_2 == 5) + return CP15[CP15_CPU_ID]; + } + else if (crm == 1) + { + if (opcode_2 == 0) + return CP15[CP15_PROCESSOR_FEATURE_0]; + + if (opcode_2 == 1) + return CP15[CP15_PROCESSOR_FEATURE_1]; + + if (opcode_2 == 2) + return CP15[CP15_DEBUG_FEATURE_0]; + + if (opcode_2 == 4) + return CP15[CP15_MEMORY_MODEL_FEATURE_0]; + + if (opcode_2 == 5) + return CP15[CP15_MEMORY_MODEL_FEATURE_1]; + + if (opcode_2 == 6) + return CP15[CP15_MEMORY_MODEL_FEATURE_2]; + + if (opcode_2 == 7) + return 
CP15[CP15_MEMORY_MODEL_FEATURE_3]; + } + else if (crm == 2) + { + if (opcode_2 == 0) + return CP15[CP15_ISA_FEATURE_0]; + + if (opcode_2 == 1) + return CP15[CP15_ISA_FEATURE_1]; + + if (opcode_2 == 2) + return CP15[CP15_ISA_FEATURE_2]; + + if (opcode_2 == 3) + return CP15[CP15_ISA_FEATURE_3]; + + if (opcode_2 == 4) + return CP15[CP15_ISA_FEATURE_4]; + } + } + + if (crn == 1 && opcode_1 == 0 && crm == 0) + { + if (opcode_2 == 0) + return CP15[CP15_CONTROL]; + + if (opcode_2 == 1) + return CP15[CP15_AUXILIARY_CONTROL]; + + if (opcode_2 == 2) + return CP15[CP15_COPROCESSOR_ACCESS_CONTROL]; + } + + if (crn == 2 && opcode_1 == 0 && crm == 0) + { + if (opcode_2 == 0) + return CP15[CP15_TRANSLATION_BASE_TABLE_0]; + + if (opcode_2 == 1) + return CP15[CP15_TRANSLATION_BASE_TABLE_1]; + + if (opcode_2 == 2) + return CP15[CP15_TRANSLATION_BASE_CONTROL]; + } + + if (crn == 3 && opcode_1 == 0 && crm == 0 && opcode_2 == 0) + return CP15[CP15_DOMAIN_ACCESS_CONTROL]; + + if (crn == 5 && opcode_1 == 0 && crm == 0) + { + if (opcode_2 == 0) + return CP15[CP15_FAULT_STATUS]; + + if (opcode_2 == 1) + return CP15[CP15_INSTR_FAULT_STATUS]; + } + + if (crn == 6 && opcode_1 == 0 && crm == 0) + { + if (opcode_2 == 0) + return CP15[CP15_FAULT_ADDRESS]; + + if (opcode_2 == 1) + return CP15[CP15_WFAR]; + } + + if (crn == 7 && opcode_1 == 0 && crm == 4 && opcode_2 == 0) + return CP15[CP15_PHYS_ADDRESS]; + + if (crn == 9 && opcode_1 == 0 && crm == 0 && opcode_2 == 0) + return CP15[CP15_DATA_CACHE_LOCKDOWN]; + + if (crn == 10 && opcode_1 == 0) + { + if (crm == 0 && opcode_2 == 0) + return CP15[CP15_TLB_LOCKDOWN]; + + if (crm == 2) + { + if (opcode_2 == 0) + return CP15[CP15_PRIMARY_REGION_REMAP]; + + if (opcode_2 == 1) + return CP15[CP15_NORMAL_REGION_REMAP]; + } + } + + if (crn == 13 && crm == 0) + { + if (opcode_2 == 0) + return CP15[CP15_PID]; + + if (opcode_2 == 1) + return CP15[CP15_CONTEXT_ID]; + + if (opcode_2 == 4) + return CP15[CP15_THREAD_PRW]; + } + + if (crn == 15) + { + if (opcode_1 
== 0 && crm == 12) + { + if (opcode_2 == 0) + return CP15[CP15_PERFORMANCE_MONITOR_CONTROL]; + + if (opcode_2 == 1) + return CP15[CP15_CYCLE_COUNTER]; + + if (opcode_2 == 2) + return CP15[CP15_COUNT_0]; + + if (opcode_2 == 3) + return CP15[CP15_COUNT_1]; + } + + if (opcode_1 == 5 && opcode_2 == 2) + { + if (crm == 5) + return CP15[CP15_MAIN_TLB_LOCKDOWN_VIRT_ADDRESS]; + + if (crm == 6) + return CP15[CP15_MAIN_TLB_LOCKDOWN_PHYS_ADDRESS]; + + if (crm == 7) + return CP15[CP15_MAIN_TLB_LOCKDOWN_ATTRIBUTE]; + } + + if (opcode_1 == 7 && crm == 1 && opcode_2 == 0) + return CP15[CP15_TLB_DEBUG_CONTROL]; + } + } + + LOG_ERROR(Core_ARM11, "MRC CRn=%u, CRm=%u, OP1=%u OP2=%u is not implemented. Returning zero.", crn, crm, opcode_1, opcode_2); + return 0; +} + +// Write to the CP15 registers. Used with implementation of the MCR instruction. +// Note that since the 3DS does not have the hypervisor extensions, these registers +// are not implemented. +void ARMul_State::WriteCP15Register(u32 value, u32 crn, u32 opcode_1, u32 crm, u32 opcode_2) +{ + if (InAPrivilegedMode()) + { + if (crn == 1 && opcode_1 == 0 && crm == 0) + { + if (opcode_2 == 0) + CP15[CP15_CONTROL] = value; + else if (opcode_2 == 1) + CP15[CP15_AUXILIARY_CONTROL] = value; + else if (opcode_2 == 2) + CP15[CP15_COPROCESSOR_ACCESS_CONTROL] = value; + } + else if (crn == 2 && opcode_1 == 0 && crm == 0) + { + if (opcode_2 == 0) + CP15[CP15_TRANSLATION_BASE_TABLE_0] = value; + else if (opcode_2 == 1) + CP15[CP15_TRANSLATION_BASE_TABLE_1] = value; + else if (opcode_2 == 2) + CP15[CP15_TRANSLATION_BASE_CONTROL] = value; + } + else if (crn == 3 && opcode_1 == 0 && crm == 0 && opcode_2 == 0) + { + CP15[CP15_DOMAIN_ACCESS_CONTROL] = value; + } + else if (crn == 5 && opcode_1 == 0 && crm == 0) + { + if (opcode_2 == 0) + CP15[CP15_FAULT_STATUS] = value; + else if (opcode_2 == 1) + CP15[CP15_INSTR_FAULT_STATUS] = value; + } + else if (crn == 6 && opcode_1 == 0 && crm == 0) + { + if (opcode_2 == 0) + CP15[CP15_FAULT_ADDRESS] = 
value; + else if (opcode_2 == 1) + CP15[CP15_WFAR] = value; + } + else if (crn == 7 && opcode_1 == 0) + { + if (crm == 0 && opcode_2 == 4) + { + CP15[CP15_WAIT_FOR_INTERRUPT] = value; + } + else if (crm == 4 && opcode_2 == 0) + { + // NOTE: Not entirely accurate. This should do permission checks. + CP15[CP15_PHYS_ADDRESS] = Memory::VirtualToPhysicalAddress(value); + } + else if (crm == 5) + { + if (opcode_2 == 0) + CP15[CP15_INVALIDATE_INSTR_CACHE] = value; + else if (opcode_2 == 1) + CP15[CP15_INVALIDATE_INSTR_CACHE_USING_MVA] = value; + else if (opcode_2 == 2) + CP15[CP15_INVALIDATE_INSTR_CACHE_USING_INDEX] = value; + else if (opcode_2 == 6) + CP15[CP15_FLUSH_BRANCH_TARGET_CACHE] = value; + else if (opcode_2 == 7) + CP15[CP15_FLUSH_BRANCH_TARGET_CACHE_ENTRY] = value; + } + else if (crm == 6) + { + if (opcode_2 == 0) + CP15[CP15_INVALIDATE_DATA_CACHE] = value; + else if (opcode_2 == 1) + CP15[CP15_INVALIDATE_DATA_CACHE_LINE_USING_MVA] = value; + else if (opcode_2 == 2) + CP15[CP15_INVALIDATE_DATA_CACHE_LINE_USING_INDEX] = value; + } + else if (crm == 7 && opcode_2 == 0) + { + CP15[CP15_INVALIDATE_DATA_AND_INSTR_CACHE] = value; + } + else if (crm == 10) + { + if (opcode_2 == 0) + CP15[CP15_CLEAN_DATA_CACHE] = value; + else if (opcode_2 == 1) + CP15[CP15_CLEAN_DATA_CACHE_LINE_USING_MVA] = value; + else if (opcode_2 == 2) + CP15[CP15_CLEAN_DATA_CACHE_LINE_USING_INDEX] = value; + } + else if (crm == 14) + { + if (opcode_2 == 0) + CP15[CP15_CLEAN_AND_INVALIDATE_DATA_CACHE] = value; + else if (opcode_2 == 1) + CP15[CP15_CLEAN_AND_INVALIDATE_DATA_CACHE_LINE_USING_MVA] = value; + else if (opcode_2 == 2) + CP15[CP15_CLEAN_AND_INVALIDATE_DATA_CACHE_LINE_USING_INDEX] = value; + } + } + else if (crn == 8 && opcode_1 == 0) + { + if (crm == 5) + { + if (opcode_2 == 0) + CP15[CP15_INVALIDATE_ITLB] = value; + else if (opcode_2 == 1) + CP15[CP15_INVALIDATE_ITLB_SINGLE_ENTRY] = value; + else if (opcode_2 == 2) + CP15[CP15_INVALIDATE_ITLB_ENTRY_ON_ASID_MATCH] = value; + else if 
(opcode_2 == 3) + CP15[CP15_INVALIDATE_ITLB_ENTRY_ON_MVA] = value; + } + else if (crm == 6) + { + if (opcode_2 == 0) + CP15[CP15_INVALIDATE_DTLB] = value; + else if (opcode_2 == 1) + CP15[CP15_INVALIDATE_DTLB_SINGLE_ENTRY] = value; + else if (opcode_2 == 2) + CP15[CP15_INVALIDATE_DTLB_ENTRY_ON_ASID_MATCH] = value; + else if (opcode_2 == 3) + CP15[CP15_INVALIDATE_DTLB_ENTRY_ON_MVA] = value; + } + else if (crm == 7) + { + if (opcode_2 == 0) + CP15[CP15_INVALIDATE_UTLB] = value; + else if (opcode_2 == 1) + CP15[CP15_INVALIDATE_UTLB_SINGLE_ENTRY] = value; + else if (opcode_2 == 2) + CP15[CP15_INVALIDATE_UTLB_ENTRY_ON_ASID_MATCH] = value; + else if (opcode_2 == 3) + CP15[CP15_INVALIDATE_UTLB_ENTRY_ON_MVA] = value; + } + } + else if (crn == 9 && opcode_1 == 0 && crm == 0 && opcode_2 == 0) + { + CP15[CP15_DATA_CACHE_LOCKDOWN] = value; + } + else if (crn == 10 && opcode_1 == 0) + { + if (crm == 0 && opcode_2 == 0) + { + CP15[CP15_TLB_LOCKDOWN] = value; + } + else if (crm == 2) + { + if (opcode_2 == 0) + CP15[CP15_PRIMARY_REGION_REMAP] = value; + else if (opcode_2 == 1) + CP15[CP15_NORMAL_REGION_REMAP] = value; + } + } + else if (crn == 13 && opcode_1 == 0 && crm == 0) + { + if (opcode_2 == 0) + CP15[CP15_PID] = value; + else if (opcode_2 == 1) + CP15[CP15_CONTEXT_ID] = value; + else if (opcode_2 == 3) + CP15[CP15_THREAD_URO] = value; + else if (opcode_2 == 4) + CP15[CP15_THREAD_PRW] = value; + } + else if (crn == 15) + { + if (opcode_1 == 0 && crm == 12) + { + if (opcode_2 == 0) + CP15[CP15_PERFORMANCE_MONITOR_CONTROL] = value; + else if (opcode_2 == 1) + CP15[CP15_CYCLE_COUNTER] = value; + else if (opcode_2 == 2) + CP15[CP15_COUNT_0] = value; + else if (opcode_2 == 3) + CP15[CP15_COUNT_1] = value; + } + else if (opcode_1 == 5) + { + if (crm == 4) + { + if (opcode_2 == 2) + CP15[CP15_READ_MAIN_TLB_LOCKDOWN_ENTRY] = value; + else if (opcode_2 == 4) + CP15[CP15_WRITE_MAIN_TLB_LOCKDOWN_ENTRY] = value; + } + else if (crm == 5 && opcode_2 == 2) + { + 
CP15[CP15_MAIN_TLB_LOCKDOWN_VIRT_ADDRESS] = value; + } + else if (crm == 6 && opcode_2 == 2) + { + CP15[CP15_MAIN_TLB_LOCKDOWN_PHYS_ADDRESS] = value; + } + else if (crm == 7 && opcode_2 == 2) + { + CP15[CP15_MAIN_TLB_LOCKDOWN_ATTRIBUTE] = value; + } + } + else if (opcode_1 == 7 && crm == 1 && opcode_2 == 0) + { + CP15[CP15_TLB_DEBUG_CONTROL] = value; + } + } + } + + // Unprivileged registers + if (crn == 7 && opcode_1 == 0 && crm == 5 && opcode_2 == 4) + { + CP15[CP15_FLUSH_PREFETCH_BUFFER] = value; + } + else if (crn == 7 && opcode_1 == 0 && crm == 10) + { + if (opcode_2 == 4) + CP15[CP15_DATA_SYNC_BARRIER] = value; + else if (opcode_2 == 5) + CP15[CP15_DATA_MEMORY_BARRIER] = value; + } + else if (crn == 13 && opcode_1 == 0 && crm == 0 && opcode_2 == 2) + { + CP15[CP15_THREAD_UPRW] = value; + } +} diff --git a/src/core/arm/skyeye_common/armstate.h b/src/core/arm/skyeye_common/armstate.h index 3ba0ba5cd..b364e2621 100644 --- a/src/core/arm/skyeye_common/armstate.h +++ b/src/core/arm/skyeye_common/armstate.h @@ -17,6 +17,7 @@ #pragma once +#include <array> #include <unordered_map> #include "common/common_types.h" @@ -37,67 +38,30 @@ enum { INSTCACHE = 2, }; -#define VFP_REG_NUM 64 -struct ARMul_State -{ - u32 Emulate; // To start and stop emulation - - // Order of the following register should not be modified - u32 Reg[16]; // The current register file - u32 Cpsr; // The current PSR - u32 Spsr_copy; - u32 phys_pc; - u32 Reg_usr[2]; - u32 Reg_svc[2]; // R13_SVC R14_SVC - u32 Reg_abort[2]; // R13_ABORT R14_ABORT - u32 Reg_undef[2]; // R13 UNDEF R14 UNDEF - u32 Reg_irq[2]; // R13_IRQ R14_IRQ - u32 Reg_firq[7]; // R8---R14 FIRQ - u32 Spsr[7]; // The exception psr's - u32 Mode; // The current mode - u32 Bank; // The current register bank - u32 exclusive_tag; // The address for which the local monitor is in exclusive access mode - u32 exclusive_state; - u32 exclusive_result; - u32 CP15[CP15_REGISTER_COUNT]; - - // FPSID, FPSCR, and FPEXC - u32 
VFP[VFP_SYSTEM_REGISTER_COUNT]; - // VFPv2 and VFPv3-D16 has 16 doubleword registers (D0-D16 or S0-S31). - // VFPv3-D32/ASIMD may have up to 32 doubleword registers (D0-D31), - // and only 32 singleword registers are accessible (S0-S31). - u32 ExtReg[VFP_REG_NUM]; - /* ---- End of the ordered registers ---- */ - - u32 NFlag, ZFlag, CFlag, VFlag, IFFlags; // Dummy flags for speed - unsigned int shifter_carry_out; - - // Add armv6 flags dyf:2010-08-09 - u32 GEFlag, EFlag, AFlag, QFlag; - - u32 TFlag; // Thumb state - - unsigned long long NumInstrs; // The number of instructions executed - unsigned NumInstrsToExecute; - - unsigned NresetSig; // Reset the processor - unsigned NfiqSig; - unsigned NirqSig; - - unsigned abortSig; - unsigned NtransSig; - unsigned bigendSig; - unsigned syscallSig; - - // TODO(bunnei): Move this cache to a better place - it should be per codeset (likely per - // process for our purposes), not per ARMul_State (which tracks CPU core state). - std::unordered_map<u32, int> instruction_cache; +// ARM privilege modes +enum PrivilegeMode { + USER32MODE = 16, + FIQ32MODE = 17, + IRQ32MODE = 18, + SVC32MODE = 19, + ABORT32MODE = 23, + UNDEF32MODE = 27, + SYSTEM32MODE = 31 }; -/***************************************************************************\ -* The hardware vector addresses * -\***************************************************************************/ +// ARM privilege mode register banks +enum { + USERBANK = 0, + FIQBANK = 1, + IRQBANK = 2, + SVCBANK = 3, + ABORTBANK = 4, + UNDEFBANK = 5, + DUMMYBANK = 6, + SYSTEMBANK = 7 +}; +// Hardware vector addresses enum { ARMResetV = 0, ARMUndefinedInstrV = 4, @@ -119,40 +83,7 @@ enum { ARMul_FIQV = ARMFIQV }; -/***************************************************************************\ -* Mode and Bank Constants * -\***************************************************************************/ - -enum PrivilegeMode { - USER32MODE = 16, - FIQ32MODE = 17, - IRQ32MODE = 18, - SVC32MODE = 19, - 
ABORT32MODE = 23, - UNDEF32MODE = 27, - SYSTEM32MODE = 31 -}; - -enum { - USERBANK = 0, - FIQBANK = 1, - IRQBANK = 2, - SVCBANK = 3, - ABORTBANK = 4, - UNDEFBANK = 5, - DUMMYBANK = 6, - SYSTEMBANK = 7 -}; - -/***************************************************************************\ -* Definitions of things in the emulator * -\***************************************************************************/ -void ARMul_Reset(ARMul_State* state); - -/***************************************************************************\ -* Definitions of things in the co-processor interface * -\***************************************************************************/ - +// Coprocessor status values enum { ARMul_FIRST = 0, ARMul_TRANSFER = 1, @@ -164,10 +95,7 @@ enum { ARMul_INC = 3 }; -/***************************************************************************\ -* Definitions of things in the host environment * -\***************************************************************************/ - +// Instruction condition codes enum ConditionCode { EQ = 0, NE = 1, @@ -213,3 +141,112 @@ enum { ONCE = 2, // Execute just one iteration RUN = 3 // Continuous execution }; + + +struct ARMul_State final +{ +public: + explicit ARMul_State(PrivilegeMode initial_mode); + + void ChangePrivilegeMode(u32 new_mode); + void Reset(); + + // Reads/writes data in big/little endian format based on the + // state of the E (endian) bit in the APSR. 
+ u16 ReadMemory16(u32 address) const; + u32 ReadMemory32(u32 address) const; + u64 ReadMemory64(u32 address) const; + void WriteMemory16(u32 address, u16 data); + void WriteMemory32(u32 address, u32 data); + void WriteMemory64(u32 address, u64 data); + + u32 ReadCP15Register(u32 crn, u32 opcode_1, u32 crm, u32 opcode_2) const; + void WriteCP15Register(u32 value, u32 crn, u32 opcode_1, u32 crm, u32 opcode_2); + + // Exclusive memory access functions + bool IsExclusiveMemoryAccess(u32 address) const { + return exclusive_state && exclusive_tag == (address & RESERVATION_GRANULE_MASK); + } + void SetExclusiveMemoryAddress(u32 address) { + exclusive_tag = address & RESERVATION_GRANULE_MASK; + exclusive_state = true; + } + void UnsetExclusiveMemoryAddress() { + exclusive_tag = 0xFFFFFFFF; + exclusive_state = false; + } + + // Whether or not the given CPU is in big endian mode (E bit is set) + bool InBigEndianMode() const { + return (Cpsr & (1 << 9)) != 0; + } + // Whether or not the given CPU is in a mode other than user mode. + bool InAPrivilegedMode() const { + return (Mode != USER32MODE); + } + // Note that for the 3DS, a Thumb instruction will only ever be + // two bytes in size. Thus we don't need to worry about ThumbEE + // or Thumb-2 where instructions can be 4 bytes in length. + u32 GetInstructionSize() const { + return TFlag ? 2 : 4; + } + + std::array<u32, 16> Reg; // The current register file + std::array<u32, 2> Reg_usr; + std::array<u32, 2> Reg_svc; // R13_SVC R14_SVC + std::array<u32, 2> Reg_abort; // R13_ABORT R14_ABORT + std::array<u32, 2> Reg_undef; // R13 UNDEF R14 UNDEF + std::array<u32, 2> Reg_irq; // R13_IRQ R14_IRQ + std::array<u32, 7> Reg_firq; // R8---R14 FIRQ + std::array<u32, 7> Spsr; // The exception psr's + std::array<u32, CP15_REGISTER_COUNT> CP15; + + // FPSID, FPSCR, and FPEXC + std::array<u32, VFP_SYSTEM_REGISTER_COUNT> VFP; + + // VFPv2 and VFPv3-D16 has 16 doubleword registers (D0-D16 or S0-S31). 
+ // VFPv3-D32/ASIMD may have up to 32 doubleword registers (D0-D31), + // and only 32 singleword registers are accessible (S0-S31). + std::array<u32, 64> ExtReg; + + u32 Emulate; // To start and stop emulation + u32 Cpsr; // The current PSR + u32 Spsr_copy; + u32 phys_pc; + + u32 Mode; // The current mode + u32 Bank; // The current register bank + + u32 NFlag, ZFlag, CFlag, VFlag, IFFlags; // Dummy flags for speed + unsigned int shifter_carry_out; + + u32 TFlag; // Thumb state + + unsigned long long NumInstrs; // The number of instructions executed + unsigned NumInstrsToExecute; + + unsigned NresetSig; // Reset the processor + unsigned NfiqSig; + unsigned NirqSig; + + unsigned abortSig; + unsigned NtransSig; + unsigned bigendSig; + unsigned syscallSig; + + // TODO(bunnei): Move this cache to a better place - it should be per codeset (likely per + // process for our purposes), not per ARMul_State (which tracks CPU core state). + std::unordered_map<u32, int> instruction_cache; + +private: + void ResetMPCoreCP15Registers(); + + // Defines a reservation granule of 2 words, which protects the first 2 words starting at the tag. + // This is the smallest granule allowed by the v7 spec, and is coincidentally just large enough to + // support LDR/STREXD. 
+ static const u32 RESERVATION_GRANULE_MASK = 0xFFFFFFF8; + + u32 exclusive_tag; // The address for which the local monitor is in exclusive access mode + u32 exclusive_result; + bool exclusive_state; +}; diff --git a/src/core/arm/skyeye_common/armsupp.cpp b/src/core/arm/skyeye_common/armsupp.cpp index affbf193a..d31fb9449 100644 --- a/src/core/arm/skyeye_common/armsupp.cpp +++ b/src/core/arm/skyeye_common/armsupp.cpp @@ -206,433 +206,3 @@ u32 ARMul_UnsignedSatQ(s32 value, u8 shift, bool* saturation_occurred) *saturation_occurred = false; return (u32)value; } - -// Whether or not the given CPU is in big endian mode (E bit is set) -bool InBigEndianMode(ARMul_State* cpu) -{ - return (cpu->Cpsr & (1 << 9)) != 0; -} - -// Whether or not the given CPU is in a mode other than user mode. -bool InAPrivilegedMode(ARMul_State* cpu) -{ - return (cpu->Mode != USER32MODE); -} - -// Reads from the CP15 registers. Used with implementation of the MRC instruction. -// Note that since the 3DS does not have the hypervisor extensions, these registers -// are not implemented. 
-u32 ReadCP15Register(ARMul_State* cpu, u32 crn, u32 opcode_1, u32 crm, u32 opcode_2) -{ - // Unprivileged registers - if (crn == 13 && opcode_1 == 0 && crm == 0) - { - if (opcode_2 == 2) - return cpu->CP15[CP15_THREAD_UPRW]; - - if (opcode_2 == 3) - return cpu->CP15[CP15_THREAD_URO]; - } - - if (InAPrivilegedMode(cpu)) - { - if (crn == 0 && opcode_1 == 0) - { - if (crm == 0) - { - if (opcode_2 == 0) - return cpu->CP15[CP15_MAIN_ID]; - - if (opcode_2 == 1) - return cpu->CP15[CP15_CACHE_TYPE]; - - if (opcode_2 == 3) - return cpu->CP15[CP15_TLB_TYPE]; - - if (opcode_2 == 5) - return cpu->CP15[CP15_CPU_ID]; - } - else if (crm == 1) - { - if (opcode_2 == 0) - return cpu->CP15[CP15_PROCESSOR_FEATURE_0]; - - if (opcode_2 == 1) - return cpu->CP15[CP15_PROCESSOR_FEATURE_1]; - - if (opcode_2 == 2) - return cpu->CP15[CP15_DEBUG_FEATURE_0]; - - if (opcode_2 == 4) - return cpu->CP15[CP15_MEMORY_MODEL_FEATURE_0]; - - if (opcode_2 == 5) - return cpu->CP15[CP15_MEMORY_MODEL_FEATURE_1]; - - if (opcode_2 == 6) - return cpu->CP15[CP15_MEMORY_MODEL_FEATURE_2]; - - if (opcode_2 == 7) - return cpu->CP15[CP15_MEMORY_MODEL_FEATURE_3]; - } - else if (crm == 2) - { - if (opcode_2 == 0) - return cpu->CP15[CP15_ISA_FEATURE_0]; - - if (opcode_2 == 1) - return cpu->CP15[CP15_ISA_FEATURE_1]; - - if (opcode_2 == 2) - return cpu->CP15[CP15_ISA_FEATURE_2]; - - if (opcode_2 == 3) - return cpu->CP15[CP15_ISA_FEATURE_3]; - - if (opcode_2 == 4) - return cpu->CP15[CP15_ISA_FEATURE_4]; - } - } - - if (crn == 1 && opcode_1 == 0 && crm == 0) - { - if (opcode_2 == 0) - return cpu->CP15[CP15_CONTROL]; - - if (opcode_2 == 1) - return cpu->CP15[CP15_AUXILIARY_CONTROL]; - - if (opcode_2 == 2) - return cpu->CP15[CP15_COPROCESSOR_ACCESS_CONTROL]; - } - - if (crn == 2 && opcode_1 == 0 && crm == 0) - { - if (opcode_2 == 0) - return cpu->CP15[CP15_TRANSLATION_BASE_TABLE_0]; - - if (opcode_2 == 1) - return cpu->CP15[CP15_TRANSLATION_BASE_TABLE_1]; - - if (opcode_2 == 2) - return 
cpu->CP15[CP15_TRANSLATION_BASE_CONTROL]; - } - - if (crn == 3 && opcode_1 == 0 && crm == 0 && opcode_2 == 0) - return cpu->CP15[CP15_DOMAIN_ACCESS_CONTROL]; - - if (crn == 5 && opcode_1 == 0 && crm == 0) - { - if (opcode_2 == 0) - return cpu->CP15[CP15_FAULT_STATUS]; - - if (opcode_2 == 1) - return cpu->CP15[CP15_INSTR_FAULT_STATUS]; - } - - if (crn == 6 && opcode_1 == 0 && crm == 0) - { - if (opcode_2 == 0) - return cpu->CP15[CP15_FAULT_ADDRESS]; - - if (opcode_2 == 1) - return cpu->CP15[CP15_WFAR]; - } - - if (crn == 7 && opcode_1 == 0 && crm == 4 && opcode_2 == 0) - return cpu->CP15[CP15_PHYS_ADDRESS]; - - if (crn == 9 && opcode_1 == 0 && crm == 0 && opcode_2 == 0) - return cpu->CP15[CP15_DATA_CACHE_LOCKDOWN]; - - if (crn == 10 && opcode_1 == 0) - { - if (crm == 0 && opcode_2 == 0) - return cpu->CP15[CP15_TLB_LOCKDOWN]; - - if (crm == 2) - { - if (opcode_2 == 0) - return cpu->CP15[CP15_PRIMARY_REGION_REMAP]; - - if (opcode_2 == 1) - return cpu->CP15[CP15_NORMAL_REGION_REMAP]; - } - } - - if (crn == 13 && crm == 0) - { - if (opcode_2 == 0) - return cpu->CP15[CP15_PID]; - - if (opcode_2 == 1) - return cpu->CP15[CP15_CONTEXT_ID]; - - if (opcode_2 == 4) - return cpu->CP15[CP15_THREAD_PRW]; - } - - if (crn == 15) - { - if (opcode_1 == 0 && crm == 12) - { - if (opcode_2 == 0) - return cpu->CP15[CP15_PERFORMANCE_MONITOR_CONTROL]; - - if (opcode_2 == 1) - return cpu->CP15[CP15_CYCLE_COUNTER]; - - if (opcode_2 == 2) - return cpu->CP15[CP15_COUNT_0]; - - if (opcode_2 == 3) - return cpu->CP15[CP15_COUNT_1]; - } - - if (opcode_1 == 5 && opcode_2 == 2) - { - if (crm == 5) - return cpu->CP15[CP15_MAIN_TLB_LOCKDOWN_VIRT_ADDRESS]; - - if (crm == 6) - return cpu->CP15[CP15_MAIN_TLB_LOCKDOWN_PHYS_ADDRESS]; - - if (crm == 7) - return cpu->CP15[CP15_MAIN_TLB_LOCKDOWN_ATTRIBUTE]; - } - - if (opcode_1 == 7 && crm == 1 && opcode_2 == 0) - return cpu->CP15[CP15_TLB_DEBUG_CONTROL]; - } - } - - LOG_ERROR(Core_ARM11, "MRC CRn=%u, CRm=%u, OP1=%u OP2=%u is not implemented. 
Returning zero.", crn, crm, opcode_1, opcode_2); - return 0; -} - -// Write to the CP15 registers. Used with implementation of the MCR instruction. -// Note that since the 3DS does not have the hypervisor extensions, these registers -// are not implemented. -void WriteCP15Register(ARMul_State* cpu, u32 value, u32 crn, u32 opcode_1, u32 crm, u32 opcode_2) -{ - if (InAPrivilegedMode(cpu)) - { - if (crn == 1 && opcode_1 == 0 && crm == 0) - { - if (opcode_2 == 0) - cpu->CP15[CP15_CONTROL] = value; - else if (opcode_2 == 1) - cpu->CP15[CP15_AUXILIARY_CONTROL] = value; - else if (opcode_2 == 2) - cpu->CP15[CP15_COPROCESSOR_ACCESS_CONTROL] = value; - } - else if (crn == 2 && opcode_1 == 0 && crm == 0) - { - if (opcode_2 == 0) - cpu->CP15[CP15_TRANSLATION_BASE_TABLE_0] = value; - else if (opcode_2 == 1) - cpu->CP15[CP15_TRANSLATION_BASE_TABLE_1] = value; - else if (opcode_2 == 2) - cpu->CP15[CP15_TRANSLATION_BASE_CONTROL] = value; - } - else if (crn == 3 && opcode_1 == 0 && crm == 0 && opcode_2 == 0) - { - cpu->CP15[CP15_DOMAIN_ACCESS_CONTROL] = value; - } - else if (crn == 5 && opcode_1 == 0 && crm == 0) - { - if (opcode_2 == 0) - cpu->CP15[CP15_FAULT_STATUS] = value; - else if (opcode_2 == 1) - cpu->CP15[CP15_INSTR_FAULT_STATUS] = value; - } - else if (crn == 6 && opcode_1 == 0 && crm == 0) - { - if (opcode_2 == 0) - cpu->CP15[CP15_FAULT_ADDRESS] = value; - else if (opcode_2 == 1) - cpu->CP15[CP15_WFAR] = value; - } - else if (crn == 7 && opcode_1 == 0) - { - if (crm == 0 && opcode_2 == 4) - { - cpu->CP15[CP15_WAIT_FOR_INTERRUPT] = value; - } - else if (crm == 4 && opcode_2 == 0) - { - // NOTE: Not entirely accurate. This should do permission checks. 
- cpu->CP15[CP15_PHYS_ADDRESS] = Memory::VirtualToPhysicalAddress(value); - } - else if (crm == 5) - { - if (opcode_2 == 0) - cpu->CP15[CP15_INVALIDATE_INSTR_CACHE] = value; - else if (opcode_2 == 1) - cpu->CP15[CP15_INVALIDATE_INSTR_CACHE_USING_MVA] = value; - else if (opcode_2 == 2) - cpu->CP15[CP15_INVALIDATE_INSTR_CACHE_USING_INDEX] = value; - else if (opcode_2 == 6) - cpu->CP15[CP15_FLUSH_BRANCH_TARGET_CACHE] = value; - else if (opcode_2 == 7) - cpu->CP15[CP15_FLUSH_BRANCH_TARGET_CACHE_ENTRY] = value; - } - else if (crm == 6) - { - if (opcode_2 == 0) - cpu->CP15[CP15_INVALIDATE_DATA_CACHE] = value; - else if (opcode_2 == 1) - cpu->CP15[CP15_INVALIDATE_DATA_CACHE_LINE_USING_MVA] = value; - else if (opcode_2 == 2) - cpu->CP15[CP15_INVALIDATE_DATA_CACHE_LINE_USING_INDEX] = value; - } - else if (crm == 7 && opcode_2 == 0) - { - cpu->CP15[CP15_INVALIDATE_DATA_AND_INSTR_CACHE] = value; - } - else if (crm == 10) - { - if (opcode_2 == 0) - cpu->CP15[CP15_CLEAN_DATA_CACHE] = value; - else if (opcode_2 == 1) - cpu->CP15[CP15_CLEAN_DATA_CACHE_LINE_USING_MVA] = value; - else if (opcode_2 == 2) - cpu->CP15[CP15_CLEAN_DATA_CACHE_LINE_USING_INDEX] = value; - } - else if (crm == 14) - { - if (opcode_2 == 0) - cpu->CP15[CP15_CLEAN_AND_INVALIDATE_DATA_CACHE] = value; - else if (opcode_2 == 1) - cpu->CP15[CP15_CLEAN_AND_INVALIDATE_DATA_CACHE_LINE_USING_MVA] = value; - else if (opcode_2 == 2) - cpu->CP15[CP15_CLEAN_AND_INVALIDATE_DATA_CACHE_LINE_USING_INDEX] = value; - } - } - else if (crn == 8 && opcode_1 == 0) - { - LOG_WARNING(Core_ARM11, "TLB operations not fully implemented."); - - if (crm == 5) - { - if (opcode_2 == 0) - cpu->CP15[CP15_INVALIDATE_ITLB] = value; - else if (opcode_2 == 1) - cpu->CP15[CP15_INVALIDATE_ITLB_SINGLE_ENTRY] = value; - else if (opcode_2 == 2) - cpu->CP15[CP15_INVALIDATE_ITLB_ENTRY_ON_ASID_MATCH] = value; - else if (opcode_2 == 3) - cpu->CP15[CP15_INVALIDATE_ITLB_ENTRY_ON_MVA] = value; - } - else if (crm == 6) - { - if (opcode_2 == 0) - 
cpu->CP15[CP15_INVALIDATE_DTLB] = value; - else if (opcode_2 == 1) - cpu->CP15[CP15_INVALIDATE_DTLB_SINGLE_ENTRY] = value; - else if (opcode_2 == 2) - cpu->CP15[CP15_INVALIDATE_DTLB_ENTRY_ON_ASID_MATCH] = value; - else if (opcode_2 == 3) - cpu->CP15[CP15_INVALIDATE_DTLB_ENTRY_ON_MVA] = value; - } - else if (crm == 7) - { - if (opcode_2 == 0) - cpu->CP15[CP15_INVALIDATE_UTLB] = value; - else if (opcode_2 == 1) - cpu->CP15[CP15_INVALIDATE_UTLB_SINGLE_ENTRY] = value; - else if (opcode_2 == 2) - cpu->CP15[CP15_INVALIDATE_UTLB_ENTRY_ON_ASID_MATCH] = value; - else if (opcode_2 == 3) - cpu->CP15[CP15_INVALIDATE_UTLB_ENTRY_ON_MVA] = value; - } - } - else if (crn == 9 && opcode_1 == 0 && crm == 0 && opcode_2 == 0) - { - cpu->CP15[CP15_DATA_CACHE_LOCKDOWN] = value; - } - else if (crn == 10 && opcode_1 == 0) - { - if (crm == 0 && opcode_2 == 0) - { - cpu->CP15[CP15_TLB_LOCKDOWN] = value; - } - else if (crm == 2) - { - if (opcode_2 == 0) - cpu->CP15[CP15_PRIMARY_REGION_REMAP] = value; - else if (opcode_2 == 1) - cpu->CP15[CP15_NORMAL_REGION_REMAP] = value; - } - } - else if (crn == 13 && opcode_1 == 0 && crm == 0) - { - if (opcode_2 == 0) - cpu->CP15[CP15_PID] = value; - else if (opcode_2 == 1) - cpu->CP15[CP15_CONTEXT_ID] = value; - else if (opcode_2 == 3) - cpu->CP15[CP15_THREAD_URO] = value; - else if (opcode_2 == 4) - cpu->CP15[CP15_THREAD_PRW] = value; - } - else if (crn == 15) - { - if (opcode_1 == 0 && crm == 12) - { - if (opcode_2 == 0) - cpu->CP15[CP15_PERFORMANCE_MONITOR_CONTROL] = value; - else if (opcode_2 == 1) - cpu->CP15[CP15_CYCLE_COUNTER] = value; - else if (opcode_2 == 2) - cpu->CP15[CP15_COUNT_0] = value; - else if (opcode_2 == 3) - cpu->CP15[CP15_COUNT_1] = value; - } - else if (opcode_1 == 5) - { - if (crm == 4) - { - if (opcode_2 == 2) - cpu->CP15[CP15_READ_MAIN_TLB_LOCKDOWN_ENTRY] = value; - else if (opcode_2 == 4) - cpu->CP15[CP15_WRITE_MAIN_TLB_LOCKDOWN_ENTRY] = value; - } - else if (crm == 5 && opcode_2 == 2) - { - 
cpu->CP15[CP15_MAIN_TLB_LOCKDOWN_VIRT_ADDRESS] = value; - } - else if (crm == 6 && opcode_2 == 2) - { - cpu->CP15[CP15_MAIN_TLB_LOCKDOWN_PHYS_ADDRESS] = value; - } - else if (crm == 7 && opcode_2 == 2) - { - cpu->CP15[CP15_MAIN_TLB_LOCKDOWN_ATTRIBUTE] = value; - } - } - else if (opcode_1 == 7 && crm == 1 && opcode_2 == 0) - { - cpu->CP15[CP15_TLB_DEBUG_CONTROL] = value; - } - } - } - - // Unprivileged registers - if (crn == 7 && opcode_1 == 0 && crm == 5 && opcode_2 == 4) - { - cpu->CP15[CP15_FLUSH_PREFETCH_BUFFER] = value; - } - else if (crn == 7 && opcode_1 == 0 && crm == 10) - { - if (opcode_2 == 4) - cpu->CP15[CP15_DATA_SYNC_BARRIER] = value; - else if (opcode_2 == 5) - cpu->CP15[CP15_DATA_MEMORY_BARRIER] = value; - - } - else if (crn == 13 && opcode_1 == 0 && crm == 0 && opcode_2 == 2) - { - cpu->CP15[CP15_THREAD_UPRW] = value; - } -} diff --git a/src/core/arm/skyeye_common/armsupp.h b/src/core/arm/skyeye_common/armsupp.h index 5cf1cd1d3..391309fa8 100644 --- a/src/core/arm/skyeye_common/armsupp.h +++ b/src/core/arm/skyeye_common/armsupp.h @@ -6,8 +6,6 @@ #include "common/common_types.h" -struct ARMul_State; - #define BITS(s, a, b) ((s << ((sizeof(s) * 8 - 1) - b)) >> (sizeof(s) * 8 - b + a - 1)) #define BIT(s, n) ((s >> (n)) & 1) @@ -32,9 +30,3 @@ u16 ARMul_UnsignedSaturatedSub16(u16, u16); u8 ARMul_UnsignedAbsoluteDifference(u8, u8); u32 ARMul_SignedSatQ(s32, u8, bool*); u32 ARMul_UnsignedSatQ(s32, u8, bool*); - -bool InBigEndianMode(ARMul_State*); -bool InAPrivilegedMode(ARMul_State*); - -u32 ReadCP15Register(ARMul_State* cpu, u32 crn, u32 opcode_1, u32 crm, u32 opcode_2); -void WriteCP15Register(ARMul_State* cpu, u32 value, u32 crn, u32 opcode_1, u32 crm, u32 opcode_2); diff --git a/src/core/arm/skyeye_common/vfp/vfp.cpp b/src/core/arm/skyeye_common/vfp/vfp.cpp index 26f303de4..0537135e2 100644 --- a/src/core/arm/skyeye_common/vfp/vfp.cpp +++ b/src/core/arm/skyeye_common/vfp/vfp.cpp @@ -21,6 +21,7 @@ /* Note: this file handles interface with arm core and 
vfp registers */ #include "common/common_funcs.h" +#include "common/common_types.h" #include "common/logging/log.h" #include "core/arm/skyeye_common/armstate.h" @@ -110,30 +111,30 @@ void VMOVR(ARMul_State* state, u32 single, u32 d, u32 m) } /* Miscellaneous functions */ -int32_t vfp_get_float(ARMul_State* state, unsigned int reg) +s32 vfp_get_float(ARMul_State* state, unsigned int reg) { LOG_TRACE(Core_ARM11, "VFP get float: s%d=[%08x]\n", reg, state->ExtReg[reg]); return state->ExtReg[reg]; } -void vfp_put_float(ARMul_State* state, int32_t val, unsigned int reg) +void vfp_put_float(ARMul_State* state, s32 val, unsigned int reg) { LOG_TRACE(Core_ARM11, "VFP put float: s%d <= [%08x]\n", reg, val); state->ExtReg[reg] = val; } -uint64_t vfp_get_double(ARMul_State* state, unsigned int reg) +u64 vfp_get_double(ARMul_State* state, unsigned int reg) { - uint64_t result = ((uint64_t) state->ExtReg[reg*2+1])<<32 | state->ExtReg[reg*2]; + u64 result = ((u64) state->ExtReg[reg*2+1])<<32 | state->ExtReg[reg*2]; LOG_TRACE(Core_ARM11, "VFP get double: s[%d-%d]=[%016llx]\n", reg * 2 + 1, reg * 2, result); return result; } -void vfp_put_double(ARMul_State* state, uint64_t val, unsigned int reg) +void vfp_put_double(ARMul_State* state, u64 val, unsigned int reg) { - LOG_TRACE(Core_ARM11, "VFP put double: s[%d-%d] <= [%08x-%08x]\n", reg * 2 + 1, reg * 2, (uint32_t)(val >> 32), (uint32_t)(val & 0xffffffff)); - state->ExtReg[reg*2] = (uint32_t) (val & 0xffffffff); - state->ExtReg[reg*2+1] = (uint32_t) (val>>32); + LOG_TRACE(Core_ARM11, "VFP put double: s[%d-%d] <= [%08x-%08x]\n", reg * 2 + 1, reg * 2, (u32)(val >> 32), (u32)(val & 0xffffffff)); + state->ExtReg[reg*2] = (u32) (val & 0xffffffff); + state->ExtReg[reg*2+1] = (u32) (val>>32); } /* diff --git a/src/core/arm/skyeye_common/vfp/vfpdouble.cpp b/src/core/arm/skyeye_common/vfp/vfpdouble.cpp index 1d844a66e..47a9fe804 100644 --- a/src/core/arm/skyeye_common/vfp/vfpdouble.cpp +++ b/src/core/arm/skyeye_common/vfp/vfpdouble.cpp @@ 
-51,6 +51,7 @@ * =========================================================================== */ +#include <algorithm> #include "common/logging/log.h" #include "core/arm/skyeye_common/vfp/vfp.h" #include "core/arm/skyeye_common/vfp/vfp_helper.h" @@ -785,9 +786,7 @@ u32 vfp_double_add(struct vfp_double *vdd, struct vfp_double *vdn,struct vfp_dou * This ensures that NaN propagation works correctly. */ if (vdn->exponent < vdm->exponent) { - struct vfp_double *t = vdn; - vdn = vdm; - vdm = t; + std::swap(vdm, vdn); } /* @@ -843,9 +842,7 @@ vfp_double_multiply(struct vfp_double *vdd, struct vfp_double *vdn, * This ensures that NaN propagation works correctly. */ if (vdn->exponent < vdm->exponent) { - struct vfp_double *t = vdn; - vdn = vdm; - vdm = t; + std::swap(vdm, vdn); LOG_TRACE(Core_ARM11, "VFP: swapping M <-> N\n"); } diff --git a/src/core/arm/skyeye_common/vfp/vfpinstr.cpp b/src/core/arm/skyeye_common/vfp/vfpinstr.cpp index 8efcbab1c..49298d7ba 100644 --- a/src/core/arm/skyeye_common/vfp/vfpinstr.cpp +++ b/src/core/arm/skyeye_common/vfp/vfpinstr.cpp @@ -51,7 +51,7 @@ VMLA_INST: CHECK_VFP_CDP_RET; } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(vmla_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -100,7 +100,7 @@ VMLS_INST: CHECK_VFP_CDP_RET; } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(vmls_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -149,7 +149,7 @@ VNMLA_INST: CHECK_VFP_CDP_RET; } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(vnmla_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -199,7 +199,7 @@ VNMLS_INST: CHECK_VFP_CDP_RET; } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(vnmls_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -248,7 +248,7 @@ VNMUL_INST: CHECK_VFP_CDP_RET; } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(vnmul_inst)); 
FETCH_INST; GOTO_NEXT_INST; @@ -297,7 +297,7 @@ VMUL_INST: CHECK_VFP_CDP_RET; } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(vmul_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -346,7 +346,7 @@ VADD_INST: CHECK_VFP_CDP_RET; } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(vadd_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -395,7 +395,7 @@ VSUB_INST: CHECK_VFP_CDP_RET; } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(vsub_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -444,7 +444,7 @@ VDIV_INST: CHECK_VFP_CDP_RET; } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(vdiv_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -492,7 +492,7 @@ VMOVI_INST: VMOVI(cpu, inst_cream->single, inst_cream->d, inst_cream->imm); } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(vmovi_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -536,7 +536,7 @@ VMOVR_INST: VMOVR(cpu, inst_cream->single, inst_cream->d, inst_cream->m); } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(vmovr_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -585,7 +585,7 @@ VABS_INST: CHECK_VFP_CDP_RET; } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(vabs_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -635,7 +635,7 @@ VNEG_INST: CHECK_VFP_CDP_RET; } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(vneg_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -684,7 +684,7 @@ VSQRT_INST: CHECK_VFP_CDP_RET; } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(vsqrt_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -733,7 +733,7 @@ VCMP_INST: CHECK_VFP_CDP_RET; } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(vcmp_inst)); 
FETCH_INST; GOTO_NEXT_INST; @@ -782,7 +782,7 @@ VCMP2_INST: CHECK_VFP_CDP_RET; } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(vcmp2_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -831,7 +831,7 @@ VCVTBDS_INST: CHECK_VFP_CDP_RET; } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(vcvtbds_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -882,7 +882,7 @@ VCVTBFF_INST: CHECK_VFP_CDP_RET; } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(vcvtbff_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -931,7 +931,7 @@ VCVTBFI_INST: CHECK_VFP_CDP_RET; } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(vcvtbfi_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -981,7 +981,7 @@ VMOVBRS_INST: VMOVBRS(cpu, inst_cream->to_arm, inst_cream->t, inst_cream->n, &(cpu->Reg[inst_cream->t])); } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(vmovbrs_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -1032,7 +1032,7 @@ VMSR_INST: { cpu->VFP[VFP_FPSCR] = cpu->Reg[rt]; } - else if (InAPrivilegedMode(cpu)) + else if (cpu->InAPrivilegedMode()) { if (reg == 8) cpu->VFP[VFP_FPEXC] = cpu->Reg[rt]; @@ -1042,7 +1042,7 @@ VMSR_INST: cpu->VFP[VFP_FPINST2] = cpu->Reg[rt]; } } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(vmsr_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -1090,7 +1090,7 @@ VMOVBRC_INST: cpu->ExtReg[(2 * inst_cream->d) + inst_cream->index] = cpu->Reg[inst_cream->t]; } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(vmovbrc_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -1163,7 +1163,7 @@ VMRS_INST: { cpu->Reg[rt] = cpu->VFP[VFP_MVFR0]; } - else if (InAPrivilegedMode(cpu)) + else if (cpu->InAPrivilegedMode()) { if (reg == 8) cpu->Reg[rt] = cpu->VFP[VFP_FPEXC]; @@ -1173,7 +1173,7 @@ VMRS_INST: cpu->Reg[rt] = 
cpu->VFP[VFP_FPINST2]; } } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(vmrs_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -1221,7 +1221,7 @@ VMOVBCR_INST: cpu->Reg[inst_cream->t] = cpu->ExtReg[(2 * inst_cream->d) + inst_cream->index]; } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(vmovbcr_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -1274,7 +1274,7 @@ VMOVBRRSS_INST: VMOVBRRSS(cpu, inst_cream->to_arm, inst_cream->t, inst_cream->t2, inst_cream->m, &cpu->Reg[inst_cream->t], &cpu->Reg[inst_cream->t2]); } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(vmovbrrss_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -1322,7 +1322,7 @@ VMOVBRRD_INST: VMOVBRRD(cpu, inst_cream->to_arm, inst_cream->t, inst_cream->t2, inst_cream->m, &(cpu->Reg[inst_cream->t]), &(cpu->Reg[inst_cream->t2])); } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(vmovbrrd_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -1378,23 +1378,23 @@ VSTR_INST: if (inst_cream->single) { - WriteMemory32(cpu, addr, cpu->ExtReg[inst_cream->d]); + cpu->WriteMemory32(addr, cpu->ExtReg[inst_cream->d]); } else { const u32 word1 = cpu->ExtReg[inst_cream->d*2+0]; const u32 word2 = cpu->ExtReg[inst_cream->d*2+1]; - if (InBigEndianMode(cpu)) { - WriteMemory32(cpu, addr + 0, word2); - WriteMemory32(cpu, addr + 4, word1); + if (cpu->InBigEndianMode()) { + cpu->WriteMemory32(addr + 0, word2); + cpu->WriteMemory32(addr + 4, word1); } else { - WriteMemory32(cpu, addr + 0, word1); - WriteMemory32(cpu, addr + 4, word2); + cpu->WriteMemory32(addr + 0, word1); + cpu->WriteMemory32(addr + 4, word2); } } } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(vstr_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -1444,7 +1444,7 @@ VPUSH_INST: { if (inst_cream->single) { - WriteMemory32(cpu, addr, cpu->ExtReg[inst_cream->d+i]); + 
cpu->WriteMemory32(addr, cpu->ExtReg[inst_cream->d+i]); addr += 4; } else @@ -1452,12 +1452,12 @@ VPUSH_INST: const u32 word1 = cpu->ExtReg[(inst_cream->d+i)*2+0]; const u32 word2 = cpu->ExtReg[(inst_cream->d+i)*2+1]; - if (InBigEndianMode(cpu)) { - WriteMemory32(cpu, addr + 0, word2); - WriteMemory32(cpu, addr + 4, word1); + if (cpu->InBigEndianMode()) { + cpu->WriteMemory32(addr + 0, word2); + cpu->WriteMemory32(addr + 4, word1); } else { - WriteMemory32(cpu, addr + 0, word1); - WriteMemory32(cpu, addr + 4, word2); + cpu->WriteMemory32(addr + 0, word1); + cpu->WriteMemory32(addr + 4, word2); } addr += 8; @@ -1466,7 +1466,7 @@ VPUSH_INST: cpu->Reg[R13] -= inst_cream->imm32; } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(vpush_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -1511,37 +1511,44 @@ static ARM_INST_PTR INTERPRETER_TRANSLATE(vstm)(unsigned int inst, int index) #ifdef VFP_INTERPRETER_IMPL VSTM_INST: /* encoding 1 */ { - if ((inst_base->cond == 0xe) || CondPassed(cpu, inst_base->cond)) { + if (inst_base->cond == 0xE || CondPassed(cpu, inst_base->cond)) { CHECK_VFP_ENABLED; - vstm_inst *inst_cream = (vstm_inst *)inst_base->component; + vstm_inst* inst_cream = (vstm_inst*)inst_base->component; + + u32 address = cpu->Reg[inst_cream->n]; - addr = (inst_cream->add ? cpu->Reg[inst_cream->n] : cpu->Reg[inst_cream->n] - inst_cream->imm32); + // Only possible in ARM mode, where PC accesses have an 8 byte offset. 
+ if (inst_cream->n == 15) + address += 8; + + if (inst_cream->add == 0) + address -= inst_cream->imm32; for (unsigned int i = 0; i < inst_cream->regs; i++) { if (inst_cream->single) { - WriteMemory32(cpu, addr, cpu->ExtReg[inst_cream->d+i]); - addr += 4; + cpu->WriteMemory32(address, cpu->ExtReg[inst_cream->d+i]); + address += 4; } else { const u32 word1 = cpu->ExtReg[(inst_cream->d+i)*2+0]; const u32 word2 = cpu->ExtReg[(inst_cream->d+i)*2+1]; - if (InBigEndianMode(cpu)) { - WriteMemory32(cpu, addr + 0, word2); - WriteMemory32(cpu, addr + 4, word1); + if (cpu->InBigEndianMode()) { + cpu->WriteMemory32(address + 0, word2); + cpu->WriteMemory32(address + 4, word1); } else { - WriteMemory32(cpu, addr + 0, word1); - WriteMemory32(cpu, addr + 4, word2); + cpu->WriteMemory32(address + 0, word1); + cpu->WriteMemory32(address + 4, word2); } - addr += 8; + address += 8; } } - if (inst_cream->wback){ + if (inst_cream->wback) { cpu->Reg[inst_cream->n] = (inst_cream->add ? cpu->Reg[inst_cream->n] + inst_cream->imm32 : cpu->Reg[inst_cream->n] - inst_cream->imm32); } @@ -1597,15 +1604,15 @@ VPOP_INST: { if (inst_cream->single) { - cpu->ExtReg[inst_cream->d+i] = ReadMemory32(cpu, addr); + cpu->ExtReg[inst_cream->d+i] = cpu->ReadMemory32(addr); addr += 4; } else { - const u32 word1 = ReadMemory32(cpu, addr + 0); - const u32 word2 = ReadMemory32(cpu, addr + 4); + const u32 word1 = cpu->ReadMemory32(addr + 0); + const u32 word2 = cpu->ReadMemory32(addr + 4); - if (InBigEndianMode(cpu)) { + if (cpu->InBigEndianMode()) { cpu->ExtReg[(inst_cream->d+i)*2+0] = word2; cpu->ExtReg[(inst_cream->d+i)*2+1] = word1; } else { @@ -1618,7 +1625,7 @@ VPOP_INST: } cpu->Reg[R13] += inst_cream->imm32; } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(vpop_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -1670,14 +1677,14 @@ VLDR_INST: if (inst_cream->single) { - cpu->ExtReg[inst_cream->d] = ReadMemory32(cpu, addr); + cpu->ExtReg[inst_cream->d] = 
cpu->ReadMemory32(addr); } else { - const u32 word1 = ReadMemory32(cpu, addr + 0); - const u32 word2 = ReadMemory32(cpu, addr + 4); + const u32 word1 = cpu->ReadMemory32(addr + 0); + const u32 word2 = cpu->ReadMemory32(addr + 4); - if (InBigEndianMode(cpu)) { + if (cpu->InBigEndianMode()) { cpu->ExtReg[inst_cream->d*2+0] = word2; cpu->ExtReg[inst_cream->d*2+1] = word1; } else { @@ -1686,7 +1693,7 @@ VLDR_INST: } } } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(vldr_inst)); FETCH_INST; GOTO_NEXT_INST; @@ -1731,26 +1738,33 @@ static ARM_INST_PTR INTERPRETER_TRANSLATE(vldm)(unsigned int inst, int index) #ifdef VFP_INTERPRETER_IMPL VLDM_INST: { - if ((inst_base->cond == 0xe) || CondPassed(cpu, inst_base->cond)) { + if (inst_base->cond == 0xE || CondPassed(cpu, inst_base->cond)) { CHECK_VFP_ENABLED; - vldm_inst *inst_cream = (vldm_inst *)inst_base->component; + vldm_inst* inst_cream = (vldm_inst*)inst_base->component; + + u32 address = cpu->Reg[inst_cream->n]; - addr = (inst_cream->add ? cpu->Reg[inst_cream->n] : cpu->Reg[inst_cream->n] - inst_cream->imm32); + // Only possible in ARM mode, where PC accesses have an 8 byte offset. 
+ if (inst_cream->n == 15) + address += 8; + + if (inst_cream->add == 0) + address -= inst_cream->imm32; for (unsigned int i = 0; i < inst_cream->regs; i++) { if (inst_cream->single) { - cpu->ExtReg[inst_cream->d+i] = ReadMemory32(cpu, addr); - addr += 4; + cpu->ExtReg[inst_cream->d+i] = cpu->ReadMemory32(address); + address += 4; } else { - const u32 word1 = ReadMemory32(cpu, addr + 0); - const u32 word2 = ReadMemory32(cpu, addr + 4); + const u32 word1 = cpu->ReadMemory32(address + 0); + const u32 word2 = cpu->ReadMemory32(address + 4); - if (InBigEndianMode(cpu)) { + if (cpu->InBigEndianMode()) { cpu->ExtReg[(inst_cream->d+i)*2+0] = word2; cpu->ExtReg[(inst_cream->d+i)*2+1] = word1; } else { @@ -1758,15 +1772,15 @@ VLDM_INST: cpu->ExtReg[(inst_cream->d+i)*2+1] = word2; } - addr += 8; + address += 8; } } - if (inst_cream->wback){ + if (inst_cream->wback) { cpu->Reg[inst_cream->n] = (inst_cream->add ? cpu->Reg[inst_cream->n] + inst_cream->imm32 : cpu->Reg[inst_cream->n] - inst_cream->imm32); } } - cpu->Reg[15] += GET_INST_SIZE(cpu); + cpu->Reg[15] += cpu->GetInstructionSize(); INC_PC(sizeof(vldm_inst)); FETCH_INST; GOTO_NEXT_INST; diff --git a/src/core/arm/skyeye_common/vfp/vfpsingle.cpp b/src/core/arm/skyeye_common/vfp/vfpsingle.cpp index e5d339252..0fb3c3bf1 100644 --- a/src/core/arm/skyeye_common/vfp/vfpsingle.cpp +++ b/src/core/arm/skyeye_common/vfp/vfpsingle.cpp @@ -51,6 +51,7 @@ * =========================================================================== */ +#include <algorithm> #include <cinttypes> #include "common/common_funcs.h" @@ -815,9 +816,7 @@ vfp_single_add(struct vfp_single *vsd, struct vfp_single *vsn, * This ensures that NaN propagation works correctly. */ if (vsn->exponent < vsm->exponent) { - struct vfp_single *t = vsn; - vsn = vsm; - vsm = t; + std::swap(vsm, vsn); } /* @@ -872,9 +871,7 @@ vfp_single_multiply(struct vfp_single *vsd, struct vfp_single *vsn, struct vfp_s * This ensures that NaN propagation works correctly. 
*/ if (vsn->exponent < vsm->exponent) { - struct vfp_single *t = vsn; - vsn = vsm; - vsm = t; + std::swap(vsm, vsn); LOG_TRACE(Core_ARM11, "swapping M <-> N"); } diff --git a/src/core/hle/applets/applet.cpp b/src/core/hle/applets/applet.cpp index 826f6cbb6..bc2a1829e 100644 --- a/src/core/hle/applets/applet.cpp +++ b/src/core/hle/applets/applet.cpp @@ -89,12 +89,21 @@ ResultCode Applet::Start(const Service::APT::AppletStartupParameter& parameter) return result; } +bool IsLibraryAppletRunning() { + // Check the applets map for instances of any applet + for (auto itr = applets.begin(); itr != applets.end(); ++itr) + if (itr->second != nullptr) + return true; + return false; +} + void Init() { // Register the applet update callback applet_update_event = CoreTiming::RegisterEvent("HLE Applet Update Event", AppletUpdateEvent); } void Shutdown() { + CoreTiming::RemoveEvent(applet_update_event); } } diff --git a/src/core/hle/applets/applet.h b/src/core/hle/applets/applet.h index b235d0b8a..af442f81d 100644 --- a/src/core/hle/applets/applet.h +++ b/src/core/hle/applets/applet.h @@ -67,6 +67,9 @@ protected: Service::APT::AppletId id; ///< Id of this Applet }; +/// Returns whether a library applet is currently running +bool IsLibraryAppletRunning(); + /// Initializes the HLE applets void Init(); diff --git a/src/core/hle/service/am/am.cpp b/src/core/hle/service/am/am.cpp index 7332478fb..7ae4859a7 100644 --- a/src/core/hle/service/am/am.cpp +++ b/src/core/hle/service/am/am.cpp @@ -5,6 +5,7 @@ #include "common/logging/log.h" #include "core/hle/service/service.h" +#include "core/hle/service/am/am.h" #include "core/hle/service/am/am_app.h" #include "core/hle/service/am/am_net.h" #include "core/hle/service/am/am_sys.h" @@ -35,7 +36,7 @@ void GetTitleIDList(Service::Interface* self) { cmd_buff[1] = RESULT_SUCCESS.raw; cmd_buff[2] = 0; - LOG_WARNING(Service_AM, "(STUBBED) Requested %u titles from media type %u", num_titles, media_type); + LOG_WARNING(Service_AM, "(STUBBED) 
Requested %u titles from media type %u. Address=0x%08X", num_titles, media_type, addr); } void GetNumContentInfos(Service::Interface* self) { diff --git a/src/core/hle/service/am/am_net.cpp b/src/core/hle/service/am/am_net.cpp index b1af0e9d8..aa391f3b2 100644 --- a/src/core/hle/service/am/am_net.cpp +++ b/src/core/hle/service/am/am_net.cpp @@ -28,7 +28,8 @@ const Interface::FunctionInfo FunctionTable[] = { {0x08130000, nullptr, "GetTotalContents"}, {0x08140042, nullptr, "GetContentIndexes"}, {0x08150044, nullptr, "GetContentsInfo"}, - {0x08190108, nullptr, "Unknown"}, + {0x08180042, nullptr, "GetCTCert"}, + {0x08190108, nullptr, "SetCertificates"}, {0x081B00C2, nullptr, "InstallTitlesFinish"}, }; diff --git a/src/core/hle/service/apt/apt.cpp b/src/core/hle/service/apt/apt.cpp index 7b6ab4ce0..35402341b 100644 --- a/src/core/hle/service/apt/apt.cpp +++ b/src/core/hle/service/apt/apt.cpp @@ -101,18 +101,19 @@ void NotifyToWait(Service::Interface* self) { void GetLockHandle(Service::Interface* self) { u32* cmd_buff = Kernel::GetCommandBuffer(); - u32 flags = cmd_buff[1]; // TODO(bunnei): Figure out the purpose of the flag field + // Bits [0:2] are the applet type (System, Library, etc) + // Bit 5 tells the application that there's a pending APT parameter, + // this will cause the app to wait until parameter_event is signaled. + u32 applet_attributes = cmd_buff[1]; cmd_buff[1] = RESULT_SUCCESS.raw; // No error - // Not sure what these parameters are used for, but retail apps check that they are 0 after - // GetLockHandle has been called. - cmd_buff[2] = 0; // Applet Attributes, this value is passed to Enable. - cmd_buff[3] = 0; - cmd_buff[4] = 0; - + cmd_buff[2] = applet_attributes; // Applet Attributes, this value is passed to Enable. 
+ cmd_buff[3] = 0; // Least significant bit = power button state + cmd_buff[4] = IPC::CopyHandleDesc(); cmd_buff[5] = Kernel::g_handle_table.Create(lock).MoveFrom(); - LOG_TRACE(Service_APT, "called handle=0x%08X", cmd_buff[5]); + + LOG_WARNING(Service_APT, "(STUBBED) called handle=0x%08X applet_attributes=0x%08X", cmd_buff[5], applet_attributes); } void Enable(Service::Interface* self) { @@ -139,13 +140,16 @@ void IsRegistered(Service::Interface* self) { u32* cmd_buff = Kernel::GetCommandBuffer(); u32 app_id = cmd_buff[1]; cmd_buff[1] = RESULT_SUCCESS.raw; // No error - /// TODO(Subv): It is currently unknown what this value (0x400) means, - /// but i believe it is used as a global "LibraryApplet" id, to verify if there's - /// any LibApplet currently running. This is not verified. - if (app_id != 0x400) + + // TODO(Subv): An application is considered "registered" if it has already called APT::Enable + // handle this properly once we implement multiprocess support. + cmd_buff[2] = 0; // Set to not registered by default + + if (app_id == static_cast<u32>(AppletId::AnyLibraryApplet)) { + cmd_buff[2] = HLE::Applets::IsLibraryAppletRunning() ? 
1 : 0; + } else if (auto applet = HLE::Applets::Applet::Get(static_cast<AppletId>(app_id))) { cmd_buff[2] = 1; // Set to registered - else - cmd_buff[2] = 0; // Set to not registered + } LOG_WARNING(Service_APT, "(STUBBED) called app_id=0x%08X", app_id); } @@ -330,7 +334,26 @@ void GetAppCpuTimeLimit(Service::Interface* self) { void PrepareToStartLibraryApplet(Service::Interface* self) { u32* cmd_buff = Kernel::GetCommandBuffer(); AppletId applet_id = static_cast<AppletId>(cmd_buff[1]); - cmd_buff[1] = HLE::Applets::Applet::Create(applet_id).raw; + auto applet = HLE::Applets::Applet::Get(applet_id); + if (applet) { + LOG_WARNING(Service_APT, "applet has already been started id=%08X", applet_id); + cmd_buff[1] = RESULT_SUCCESS.raw; + } else { + cmd_buff[1] = HLE::Applets::Applet::Create(applet_id).raw; + } + LOG_DEBUG(Service_APT, "called applet_id=%08X", applet_id); +} + +void PreloadLibraryApplet(Service::Interface* self) { + u32* cmd_buff = Kernel::GetCommandBuffer(); + AppletId applet_id = static_cast<AppletId>(cmd_buff[1]); + auto applet = HLE::Applets::Applet::Get(applet_id); + if (applet) { + LOG_WARNING(Service_APT, "applet has already been started id=%08X", applet_id); + cmd_buff[1] = RESULT_SUCCESS.raw; + } else { + cmd_buff[1] = HLE::Applets::Applet::Create(applet_id).raw; + } LOG_DEBUG(Service_APT, "called applet_id=%08X", applet_id); } diff --git a/src/core/hle/service/apt/apt.h b/src/core/hle/service/apt/apt.h index 72972d05b..4a72b6b5c 100644 --- a/src/core/hle/service/apt/apt.h +++ b/src/core/hle/service/apt/apt.h @@ -62,6 +62,7 @@ enum class AppletId : u32 { Extrapad = 0x208, Memolib = 0x209, Application = 0x300, + AnyLibraryApplet = 0x400, SoftwareKeyboard2 = 0x401, }; @@ -96,8 +97,26 @@ void GetSharedFont(Service::Interface* self); */ void NotifyToWait(Service::Interface* self); +/** + * APT::GetLockHandle service function + * Inputs: + * 1 : Applet attributes + * Outputs: + * 1 : Result of function, 0 on success, otherwise error code + * 2 : 
Applet attributes + * 3 : Power button state + * 4 : IPC handle descriptor + * 5 : APT mutex handle + */ void GetLockHandle(Service::Interface* self); +/** + * APT::Enable service function + * Inputs: + * 1 : Applet attributes + * Outputs: + * 1 : Result of function, 0 on success, otherwise error code + */ void Enable(Service::Interface* self); /** @@ -284,6 +303,17 @@ void GetAppCpuTimeLimit(Service::Interface* self); void PrepareToStartLibraryApplet(Service::Interface* self); /** + * APT::PreloadLibraryApplet service function + * Inputs: + * 0 : Command header [0x00160040] + * 1 : Id of the applet to start + * Outputs: + * 0 : Return header + * 1 : Result of function, 0 on success, otherwise error code + */ +void PreloadLibraryApplet(Service::Interface* self); + +/** * APT::StartLibraryApplet service function * Inputs: * 0 : Command header [0x001E0084] diff --git a/src/core/hle/service/apt/apt_a.cpp b/src/core/hle/service/apt/apt_a.cpp index 88de339f9..22800c56f 100644 --- a/src/core/hle/service/apt/apt_a.cpp +++ b/src/core/hle/service/apt/apt_a.cpp @@ -21,6 +21,7 @@ const Interface::FunctionInfo FunctionTable[] = { {0x000D0080, ReceiveParameter, "ReceiveParameter"}, {0x000E0080, GlanceParameter, "GlanceParameter"}, {0x000F0100, CancelParameter, "CancelParameter"}, + {0x00160040, PreloadLibraryApplet, "PreloadLibraryApplet"}, {0x00180040, PrepareToStartLibraryApplet, "PrepareToStartLibraryApplet"}, {0x001E0084, StartLibraryApplet, "StartLibraryApplet"}, {0x003B0040, nullptr, "CancelLibraryApplet?"}, diff --git a/src/core/hle/service/apt/apt_s.cpp b/src/core/hle/service/apt/apt_s.cpp index 396d1f04a..3ac6ff94f 100644 --- a/src/core/hle/service/apt/apt_s.cpp +++ b/src/core/hle/service/apt/apt_s.cpp @@ -32,9 +32,9 @@ const Interface::FunctionInfo FunctionTable[] = { {0x00130000, nullptr, "GetPreparationState"}, {0x00140040, nullptr, "SetPreparationState"}, {0x00150140, nullptr, "PrepareToStartApplication"}, - {0x00160040, nullptr, "PreloadLibraryApplet"}, + 
{0x00160040, PreloadLibraryApplet, "PreloadLibraryApplet"}, {0x00170040, nullptr, "FinishPreloadingLibraryApplet"}, - {0x00180040, nullptr, "PrepareToStartLibraryApplet"}, + {0x00180040, PrepareToStartLibraryApplet,"PrepareToStartLibraryApplet"}, {0x00190040, nullptr, "PrepareToStartSystemApplet"}, {0x001A0000, nullptr, "PrepareToStartNewestHomeMenu"}, {0x001B00C4, nullptr, "StartApplication"}, diff --git a/src/core/hle/service/apt/apt_u.cpp b/src/core/hle/service/apt/apt_u.cpp index b724cd72b..146bfd595 100644 --- a/src/core/hle/service/apt/apt_u.cpp +++ b/src/core/hle/service/apt/apt_u.cpp @@ -33,7 +33,7 @@ const Interface::FunctionInfo FunctionTable[] = { {0x00130000, nullptr, "GetPreparationState"}, {0x00140040, nullptr, "SetPreparationState"}, {0x00150140, nullptr, "PrepareToStartApplication"}, - {0x00160040, nullptr, "PreloadLibraryApplet"}, + {0x00160040, PreloadLibraryApplet, "PreloadLibraryApplet"}, {0x00170040, nullptr, "FinishPreloadingLibraryApplet"}, {0x00180040, PrepareToStartLibraryApplet, "PrepareToStartLibraryApplet"}, {0x00190040, nullptr, "PrepareToStartSystemApplet"}, diff --git a/src/core/hle/service/hid/hid.cpp b/src/core/hle/service/hid/hid.cpp index 70caa7d80..c35b13b25 100644 --- a/src/core/hle/service/hid/hid.cpp +++ b/src/core/hle/service/hid/hid.cpp @@ -35,6 +35,16 @@ static Kernel::SharedPtr<Kernel::Event> event_debug_pad; static u32 next_pad_index; static u32 next_touch_index; +const std::array<Service::HID::PadState, Settings::NativeInput::NUM_INPUTS> pad_mapping = { + Service::HID::PAD_A, Service::HID::PAD_B, Service::HID::PAD_X, Service::HID::PAD_Y, + Service::HID::PAD_L, Service::HID::PAD_R, Service::HID::PAD_ZL, Service::HID::PAD_ZR, + Service::HID::PAD_START, Service::HID::PAD_SELECT, Service::HID::PAD_NONE, + Service::HID::PAD_UP, Service::HID::PAD_DOWN, Service::HID::PAD_LEFT, Service::HID::PAD_RIGHT, + Service::HID::PAD_CIRCLE_UP, Service::HID::PAD_CIRCLE_DOWN, Service::HID::PAD_CIRCLE_LEFT, Service::HID::PAD_CIRCLE_RIGHT, + 
Service::HID::PAD_C_UP, Service::HID::PAD_C_DOWN, Service::HID::PAD_C_LEFT, Service::HID::PAD_C_RIGHT +}; + + // TODO(peachum): // Add a method for setting analog input from joystick device for the circle Pad. // diff --git a/src/core/hle/service/hid/hid.h b/src/core/hle/service/hid/hid.h index d50d479f8..517f4f2ae 100644 --- a/src/core/hle/service/hid/hid.h +++ b/src/core/hle/service/hid/hid.h @@ -9,7 +9,7 @@ #ifndef _MSC_VER #include <cstddef> #endif - +#include "core/settings.h" #include "common/bit_field.h" #include "common/common_funcs.h" #include "common/common_types.h" @@ -157,6 +157,9 @@ const PadState PAD_CIRCLE_LEFT = {{1u << 29}}; const PadState PAD_CIRCLE_UP = {{1u << 30}}; const PadState PAD_CIRCLE_DOWN = {{1u << 31}}; + +extern const std::array<Service::HID::PadState, Settings::NativeInput::NUM_INPUTS> pad_mapping; + /** * HID::GetIPCHandles service function * Inputs: diff --git a/src/core/hle/service/ldr_ro.cpp b/src/core/hle/service/ldr_ro.cpp index 155b97f69..f84ce4d72 100644 --- a/src/core/hle/service/ldr_ro.cpp +++ b/src/core/hle/service/ldr_ro.cpp @@ -40,7 +40,8 @@ static void Initialize(Service::Interface* self) { cmd_buff[1] = RESULT_SUCCESS.raw; // No error - LOG_WARNING(Service_LDR, "(STUBBED) called"); + LOG_WARNING(Service_LDR, "(STUBBED) called. crs_buffer_ptr=0x%08X, crs_size=0x%08X, address=0x%08X, value=0x%08X, process=0x%08X", + crs_buffer_ptr, crs_size, address, value, process); } /** @@ -69,7 +70,8 @@ static void LoadCRR(Service::Interface* self) { cmd_buff[1] = RESULT_SUCCESS.raw; // No error - LOG_WARNING(Service_LDR, "(STUBBED) called"); + LOG_WARNING(Service_LDR, "(STUBBED) called. 
crs_buffer_ptr=0x%08X, crs_size=0x%08X, value=0x%08X, process=0x%08X", + crs_buffer_ptr, crs_size, value, process); } const Interface::FunctionInfo FunctionTable[] = { diff --git a/src/core/hle/service/soc_u.cpp b/src/core/hle/service/soc_u.cpp index d0e166fdf..d768a3fc7 100644 --- a/src/core/hle/service/soc_u.cpp +++ b/src/core/hle/service/soc_u.cpp @@ -481,11 +481,17 @@ static void GetHostId(Service::Interface* self) { char name[128]; gethostname(name, sizeof(name)); - hostent* host = gethostbyname(name); - in_addr* addr = reinterpret_cast<in_addr*>(host->h_addr); + addrinfo hints = {}; + addrinfo* res; + + hints.ai_family = AF_INET; + getaddrinfo(name, NULL, &hints, &res); + sockaddr_in* sock_addr = reinterpret_cast<sockaddr_in*>(res->ai_addr); + in_addr* addr = &sock_addr->sin_addr; cmd_buffer[2] = addr->s_addr; cmd_buffer[1] = 0; + freeaddrinfo(res); } static void Close(Service::Interface* self) { diff --git a/src/core/hw/y2r.cpp b/src/core/hw/y2r.cpp index f80e26ecd..082a4db82 100644 --- a/src/core/hw/y2r.cpp +++ b/src/core/hw/y2r.cpp @@ -14,6 +14,7 @@ #include "common/vector_math.h" #include "core/hle/service/y2r_u.h" +#include "core/hw/y2r.h" #include "core/memory.h" namespace HW { diff --git a/src/core/loader/loader.cpp b/src/core/loader/loader.cpp index f5b349a77..062291006 100644 --- a/src/core/loader/loader.cpp +++ b/src/core/loader/loader.cpp @@ -77,6 +77,8 @@ static const char* GetFileTypeString(FileType type) { return "NCSD"; case FileType::CXI: return "NCCH"; + case FileType::CIA: + return "CIA"; case FileType::ELF: return "ELF"; case FileType::THREEDSX: @@ -134,6 +136,10 @@ ResultStatus LoadFile(const std::string& filename) { break; } + // CIA file format... + case FileType::CIA: + return ResultStatus::ErrorNotImplemented; + // Error occurred durring IdentifyFile... 
case FileType::Error: diff --git a/src/core/settings.h b/src/core/settings.h index 5a70d157a..6ca0e1afc 100644 --- a/src/core/settings.h +++ b/src/core/settings.h @@ -5,34 +5,42 @@ #pragma once #include <string> +#include <array> namespace Settings { +namespace NativeInput { +enum Values { + A, B, X, Y, + L, R, ZL, ZR, + START, SELECT, HOME, + DUP, DDOWN, DLEFT, DRIGHT, + SUP, SDOWN, SLEFT, SRIGHT, + CUP, CDOWN, CLEFT, CRIGHT, + NUM_INPUTS +}; +static const std::array<const char*, NUM_INPUTS> Mapping = { + "pad_a", "pad_b", "pad_x", "pad_y", + "pad_l", "pad_r", "pad_zl", "pad_zr", + "pad_start", "pad_select", "pad_home", + "pad_dup", "pad_ddown", "pad_dleft", "pad_dright", + "pad_sup", "pad_sdown", "pad_sleft", "pad_sright", + "pad_cup", "pad_cdown", "pad_cleft", "pad_cright" +}; +static const std::array<Values, NUM_INPUTS> All = { + A, B, X, Y, + L, R, ZL, ZR, + START, SELECT, HOME, + DUP, DDOWN, DLEFT, DRIGHT, + SUP, SDOWN, SLEFT, SRIGHT, + CUP, CDOWN, CLEFT, CRIGHT +}; +} + + struct Values { // Controls - int pad_a_key; - int pad_b_key; - int pad_x_key; - int pad_y_key; - int pad_l_key; - int pad_r_key; - int pad_zl_key; - int pad_zr_key; - int pad_start_key; - int pad_select_key; - int pad_home_key; - int pad_dup_key; - int pad_ddown_key; - int pad_dleft_key; - int pad_dright_key; - int pad_sup_key; - int pad_sdown_key; - int pad_sleft_key; - int pad_sright_key; - int pad_cup_key; - int pad_cdown_key; - int pad_cleft_key; - int pad_cright_key; + std::array<int, NativeInput::NUM_INPUTS> input_mappings; // Core int frame_skip; @@ -45,6 +53,7 @@ struct Values { // Renderer bool use_hw_renderer; + bool use_shader_jit; float bg_red; float bg_green; diff --git a/src/core/tracer/citrace.h b/src/core/tracer/citrace.h index 5deb6ce9e..709abdfb3 100644 --- a/src/core/tracer/citrace.h +++ b/src/core/tracer/citrace.h @@ -4,7 +4,7 @@ #pragma once -#include <cstdint> +#include "common/common_types.h" namespace CiTrace { @@ -17,38 +17,38 @@ struct CTHeader { return "CiTr"; } 
- static uint32_t ExpectedVersion() { + static u32 ExpectedVersion() { return 1; } char magic[4]; - uint32_t version; - uint32_t header_size; + u32 version; + u32 header_size; struct { // NOTE: Register range sizes are technically hardware-constants, but the actual limits // aren't known. Hence we store the presumed limits along the offsets. - // Sizes are given in uint32_t units. - uint32_t gpu_registers; - uint32_t gpu_registers_size; - uint32_t lcd_registers; - uint32_t lcd_registers_size; - uint32_t pica_registers; - uint32_t pica_registers_size; - uint32_t default_attributes; - uint32_t default_attributes_size; - uint32_t vs_program_binary; - uint32_t vs_program_binary_size; - uint32_t vs_swizzle_data; - uint32_t vs_swizzle_data_size; - uint32_t vs_float_uniforms; - uint32_t vs_float_uniforms_size; - uint32_t gs_program_binary; - uint32_t gs_program_binary_size; - uint32_t gs_swizzle_data; - uint32_t gs_swizzle_data_size; - uint32_t gs_float_uniforms; - uint32_t gs_float_uniforms_size; + // Sizes are given in u32 units. 
+ u32 gpu_registers; + u32 gpu_registers_size; + u32 lcd_registers; + u32 lcd_registers_size; + u32 pica_registers; + u32 pica_registers_size; + u32 default_attributes; + u32 default_attributes_size; + u32 vs_program_binary; + u32 vs_program_binary_size; + u32 vs_swizzle_data; + u32 vs_swizzle_data_size; + u32 vs_float_uniforms; + u32 vs_float_uniforms_size; + u32 gs_program_binary; + u32 gs_program_binary_size; + u32 gs_swizzle_data; + u32 gs_swizzle_data_size; + u32 gs_float_uniforms; + u32 gs_float_uniforms_size; // Other things we might want to store here: // - Initial framebuffer data, maybe even a full copy of FCRAM/VRAM @@ -56,27 +56,27 @@ struct CTHeader { // - Lookup tables for procedural textures } initial_state_offsets; - uint32_t stream_offset; - uint32_t stream_size; + u32 stream_offset; + u32 stream_size; }; -enum CTStreamElementType : uint32_t { +enum CTStreamElementType : u32 { FrameMarker = 0xE1, MemoryLoad = 0xE2, RegisterWrite = 0xE3, }; struct CTMemoryLoad { - uint32_t file_offset; - uint32_t size; - uint32_t physical_address; - uint32_t pad; + u32 file_offset; + u32 size; + u32 physical_address; + u32 pad; }; struct CTRegisterWrite { - uint32_t physical_address; + u32 physical_address; - enum : uint32_t { + enum : u32 { SIZE_8 = 0xD1, SIZE_16 = 0xD2, SIZE_32 = 0xD3, @@ -84,7 +84,7 @@ struct CTRegisterWrite { } size; // TODO: Make it clearer which bits of this member are used for sizes other than 32 bits - uint64_t value; + u64 value; }; struct CTStreamElement { diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 5c7f4ae18..183709d8b 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -2,7 +2,6 @@ set(SRCS renderer_opengl/generated/gl_3_2_core.c renderer_opengl/gl_rasterizer.cpp renderer_opengl/gl_rasterizer_cache.cpp - renderer_opengl/gl_resource_manager.cpp renderer_opengl/gl_shader_util.cpp renderer_opengl/gl_state.cpp renderer_opengl/renderer_opengl.cpp @@ -12,8 +11,9 @@ set(SRCS 
pica.cpp primitive_assembly.cpp rasterizer.cpp + shader/shader.cpp + shader/shader_interpreter.cpp utils.cpp - vertex_shader.cpp video_core.cpp ) @@ -36,11 +36,20 @@ set(HEADERS primitive_assembly.h rasterizer.h renderer_base.h + shader/shader.h + shader/shader_interpreter.h utils.h - vertex_shader.h video_core.h ) +if(ARCHITECTURE_x86_64) + set(SRCS ${SRCS} + shader/shader_jit_x64.cpp) + + set(HEADERS ${HEADERS} + shader/shader_jit_x64.h) +endif() + create_directory_groups(${SRCS} ${HEADERS}) add_library(video_core STATIC ${SRCS} ${HEADERS}) diff --git a/src/video_core/clipper.cpp b/src/video_core/clipper.cpp index 558b49d60..bb6048cc0 100644 --- a/src/video_core/clipper.cpp +++ b/src/video_core/clipper.cpp @@ -7,7 +7,7 @@ #include "clipper.h" #include "pica.h" #include "rasterizer.h" -#include "vertex_shader.h" +#include "shader/shader_interpreter.h" namespace Pica { diff --git a/src/video_core/clipper.h b/src/video_core/clipper.h index 19ce8e140..6ed01e877 100644 --- a/src/video_core/clipper.h +++ b/src/video_core/clipper.h @@ -6,13 +6,13 @@ namespace Pica { -namespace VertexShader { +namespace Shader { struct OutputVertex; } namespace Clipper { -using VertexShader::OutputVertex; +using Shader::OutputVertex; void ProcessTriangle(OutputVertex& v0, OutputVertex& v1, OutputVertex& v2); diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp index deed24412..d82e20f86 100644 --- a/src/video_core/command_processor.cpp +++ b/src/video_core/command_processor.cpp @@ -18,7 +18,7 @@ #include "pica.h" #include "primitive_assembly.h" #include "renderer_base.h" -#include "vertex_shader.h" +#include "shader/shader_interpreter.h" #include "video_core.h" namespace Pica { @@ -127,7 +127,9 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { { Common::Profiling::ScopeTimer scope_timer(category_drawing); +#if PICA_LOG_TEV DebugUtils::DumpTevStageConfig(regs.GetTevStages()); +#endif if (g_debug_context) 
g_debug_context->OnEvent(DebugContext::Event::IncomingPrimitiveBatch, nullptr); @@ -170,9 +172,11 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { const u16* index_address_16 = (u16*)index_address_8; bool index_u16 = index_info.format != 0; +#if PICA_DUMP_GEOMETRY DebugUtils::GeometryDumper geometry_dumper; - PrimitiveAssembler<VertexShader::OutputVertex> primitive_assembler(regs.triangle_topology.Value()); PrimitiveAssembler<DebugUtils::GeometryDumper::Vertex> dumping_primitive_assembler(regs.triangle_topology.Value()); +#endif + PrimitiveAssembler<Shader::OutputVertex> primitive_assembler(regs.triangle_topology.Value()); if (g_debug_context) { for (int i = 0; i < 3; ++i) { @@ -213,97 +217,124 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { std::map<u32, u32> ranges; } memory_accesses; + // Simple circular-replacement vertex cache + // The size has been tuned for optimal balance between hit-rate and the cost of lookup + const size_t VERTEX_CACHE_SIZE = 32; + std::array<u16, VERTEX_CACHE_SIZE> vertex_cache_ids; + std::array<Shader::OutputVertex, VERTEX_CACHE_SIZE> vertex_cache; + + unsigned int vertex_cache_pos = 0; + vertex_cache_ids.fill(-1); + + Shader::UnitState<false> shader_unit; + Shader::Setup(shader_unit); + for (unsigned int index = 0; index < regs.num_vertices; ++index) { unsigned int vertex = is_indexed ? (index_u16 ? index_address_16[index] : index_address_8[index]) : index; + // -1 is a common special value used for primitive restart. Since it's unknown if + // the PICA supports it, and it would mess up the caching, guard against it here. + ASSERT(vertex != -1); + + bool vertex_cache_hit = false; + Shader::OutputVertex output; + if (is_indexed) { - // TODO: Implement some sort of vertex cache! if (g_debug_context && Pica::g_debug_context->recorder) { int size = index_u16 ? 
2 : 1; memory_accesses.AddAccess(base_address + index_info.offset + size * index, size); } - } - // Initialize data for the current vertex - VertexShader::InputVertex input; - - // Load a debugging token to check whether this gets loaded by the running - // application or not. - static const float24 debug_token = float24::FromRawFloat24(0x00abcdef); - input.attr[0].w = debug_token; - - for (int i = 0; i < attribute_config.GetNumTotalAttributes(); ++i) { - // Load the default attribute if we're configured to do so, this data will be overwritten by the loader data if it's set - if (attribute_config.IsDefaultAttribute(i)) { - input.attr[i] = g_state.vs.default_attributes[i]; - LOG_TRACE(HW_GPU, "Loaded default attribute %x for vertex %x (index %x): (%f, %f, %f, %f)", - i, vertex, index, - input.attr[i][0].ToFloat32(), input.attr[i][1].ToFloat32(), - input.attr[i][2].ToFloat32(), input.attr[i][3].ToFloat32()); + for (unsigned int i = 0; i < VERTEX_CACHE_SIZE; ++i) { + if (vertex == vertex_cache_ids[i]) { + output = vertex_cache[i]; + vertex_cache_hit = true; + break; + } } + } - // Load per-vertex data from the loader arrays - for (unsigned int comp = 0; comp < vertex_attribute_elements[i]; ++comp) { - u32 source_addr = vertex_attribute_sources[i] + vertex_attribute_strides[i] * vertex + comp * vertex_attribute_element_size[i]; - const u8* srcdata = Memory::GetPhysicalPointer(source_addr); - - if (g_debug_context && Pica::g_debug_context->recorder) { - memory_accesses.AddAccess(source_addr, - (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::FLOAT) ? 4 - : (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::SHORT) ? 2 : 1); + if (!vertex_cache_hit) { + // Initialize data for the current vertex + Shader::InputVertex input; + + for (int i = 0; i < attribute_config.GetNumTotalAttributes(); ++i) { + if (vertex_attribute_elements[i] != 0) { + // Default attribute values set if array elements have < 4 components. 
This + // is *not* carried over from the default attribute settings even if they're + // enabled for this attribute. + static const float24 zero = float24::FromFloat32(0.0f); + static const float24 one = float24::FromFloat32(1.0f); + input.attr[i] = Math::Vec4<float24>(zero, zero, zero, one); + + // Load per-vertex data from the loader arrays + for (unsigned int comp = 0; comp < vertex_attribute_elements[i]; ++comp) { + u32 source_addr = vertex_attribute_sources[i] + vertex_attribute_strides[i] * vertex + comp * vertex_attribute_element_size[i]; + const u8* srcdata = Memory::GetPhysicalPointer(source_addr); + + if (g_debug_context && Pica::g_debug_context->recorder) { + memory_accesses.AddAccess(source_addr, + (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::FLOAT) ? 4 + : (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::SHORT) ? 2 : 1); + } + + const float srcval = (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::BYTE) ? *(s8*)srcdata : + (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::UBYTE) ? *(u8*)srcdata : + (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::SHORT) ? 
*(s16*)srcdata : + *(float*)srcdata; + + input.attr[i][comp] = float24::FromFloat32(srcval); + LOG_TRACE(HW_GPU, "Loaded component %x of attribute %x for vertex %x (index %x) from 0x%08x + 0x%08lx + 0x%04lx: %f", + comp, i, vertex, index, + attribute_config.GetPhysicalBaseAddress(), + vertex_attribute_sources[i] - base_address, + vertex_attribute_strides[i] * vertex + comp * vertex_attribute_element_size[i], + input.attr[i][comp].ToFloat32()); + } + } else if (attribute_config.IsDefaultAttribute(i)) { + // Load the default attribute if we're configured to do so + input.attr[i] = g_state.vs.default_attributes[i]; + LOG_TRACE(HW_GPU, "Loaded default attribute %x for vertex %x (index %x): (%f, %f, %f, %f)", + i, vertex, index, + input.attr[i][0].ToFloat32(), input.attr[i][1].ToFloat32(), + input.attr[i][2].ToFloat32(), input.attr[i][3].ToFloat32()); + } else { + // TODO(yuriks): In this case, no data gets loaded and the vertex + // remains with the last value it had. This isn't currently maintained + // as global state, however, and so won't work in Citra yet. } - - const float srcval = (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::BYTE) ? *(s8*)srcdata : - (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::UBYTE) ? *(u8*)srcdata : - (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::SHORT) ? *(s16*)srcdata : - *(float*)srcdata; - - input.attr[i][comp] = float24::FromFloat32(srcval); - LOG_TRACE(HW_GPU, "Loaded component %x of attribute %x for vertex %x (index %x) from 0x%08x + 0x%08lx + 0x%04lx: %f", - comp, i, vertex, index, - attribute_config.GetPhysicalBaseAddress(), - vertex_attribute_sources[i] - base_address, - vertex_attribute_strides[i] * vertex + comp * vertex_attribute_element_size[i], - input.attr[i][comp].ToFloat32()); } - } - // HACK: Some games do not initialize the vertex position's w component. This leads - // to critical issues since it messes up perspective division. 
As a - // workaround, we force the fourth component to 1.0 if we find this to be the - // case. - // To do this, we additionally have to assume that the first input attribute - // is the vertex position, since there's no information about this other than - // the empiric observation that this is usually the case. - if (input.attr[0].w == debug_token) - input.attr[0].w = float24::FromFloat32(1.0); - - if (g_debug_context) - g_debug_context->OnEvent(DebugContext::Event::VertexLoaded, (void*)&input); - - // NOTE: When dumping geometry, we simply assume that the first input attribute - // corresponds to the position for now. - DebugUtils::GeometryDumper::Vertex dumped_vertex = { - input.attr[0][0].ToFloat32(), input.attr[0][1].ToFloat32(), input.attr[0][2].ToFloat32() - }; - using namespace std::placeholders; - dumping_primitive_assembler.SubmitVertex(dumped_vertex, - std::bind(&DebugUtils::GeometryDumper::AddTriangle, - &geometry_dumper, _1, _2, _3)); - - // Send to vertex shader - VertexShader::OutputVertex output = VertexShader::RunShader(input, attribute_config.GetNumTotalAttributes(), g_state.regs.vs, g_state.vs); + if (g_debug_context) + g_debug_context->OnEvent(DebugContext::Event::VertexLoaded, (void*)&input); - if (is_indexed) { - // TODO: Add processed vertex to vertex cache! +#if PICA_DUMP_GEOMETRY + // NOTE: When dumping geometry, we simply assume that the first input attribute + // corresponds to the position for now. 
+ DebugUtils::GeometryDumper::Vertex dumped_vertex = { + input.attr[0][0].ToFloat32(), input.attr[0][1].ToFloat32(), input.attr[0][2].ToFloat32() + }; + using namespace std::placeholders; + dumping_primitive_assembler.SubmitVertex(dumped_vertex, + std::bind(&DebugUtils::GeometryDumper::AddTriangle, + &geometry_dumper, _1, _2, _3)); +#endif + // Send to vertex shader + output = Shader::Run(shader_unit, input, attribute_config.GetNumTotalAttributes()); + + if (is_indexed) { + vertex_cache[vertex_cache_pos] = output; + vertex_cache_ids[vertex_cache_pos] = vertex; + vertex_cache_pos = (vertex_cache_pos + 1) % VERTEX_CACHE_SIZE; + } } if (Settings::values.use_hw_renderer) { // Send to hardware renderer - static auto AddHWTriangle = [](const Pica::VertexShader::OutputVertex& v0, - const Pica::VertexShader::OutputVertex& v1, - const Pica::VertexShader::OutputVertex& v2) { + static auto AddHWTriangle = [](const Pica::Shader::OutputVertex& v0, + const Pica::Shader::OutputVertex& v1, + const Pica::Shader::OutputVertex& v2) { VideoCore::g_renderer->hw_rasterizer->AddTriangle(v0, v1, v2); }; @@ -323,7 +354,9 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { VideoCore::g_renderer->hw_rasterizer->DrawTriangles(); } +#if PICA_DUMP_GEOMETRY geometry_dumper.Dump(); +#endif if (g_debug_context) { g_debug_context->OnEvent(DebugContext::Event::FinishedPrimitiveBatch, nullptr); diff --git a/src/video_core/debug_utils/debug_utils.cpp b/src/video_core/debug_utils/debug_utils.cpp index 27000283d..e4b397303 100644 --- a/src/video_core/debug_utils/debug_utils.cpp +++ b/src/video_core/debug_utils/debug_utils.cpp @@ -14,10 +14,12 @@ #include <png.h> #endif +#include <nihstro/float24.h> #include <nihstro/shader_binary.h> #include "common/assert.h" #include "common/color.h" +#include "common/common_types.h" #include "common/file_util.h" #include "common/math_util.h" #include "common/vector_math.h" @@ -90,10 +92,6 @@ void GeometryDumper::AddTriangle(Vertex& v0, Vertex& v1, Vertex& v2) { 
} void GeometryDumper::Dump() { - // NOTE: Permanently enabling this just trashes the hard disk for no reason. - // Hence, this is currently disabled. - return; - static int index = 0; std::string filename = std::string("geometry_dump") + std::to_string(++index) + ".obj"; @@ -113,13 +111,8 @@ void GeometryDumper::Dump() { } -void DumpShader(const u32* binary_data, u32 binary_size, const u32* swizzle_data, u32 swizzle_size, - u32 main_offset, const Regs::VSOutputAttributes* output_attributes) +void DumpShader(const std::string& filename, const Regs::ShaderConfig& config, const State::ShaderSetup& setup, const Regs::VSOutputAttributes* output_attributes) { - // NOTE: Permanently enabling this just trashes hard disks for no reason. - // Hence, this is currently disabled. - return; - struct StuffToWrite { u8* pointer; u32 size; @@ -138,11 +131,14 @@ void DumpShader(const u32* binary_data, u32 binary_size, const u32* swizzle_data // into shbin format (separate type and component mask). union OutputRegisterInfo { enum Type : u64 { - POSITION = 0, - COLOR = 2, - TEXCOORD0 = 3, - TEXCOORD1 = 5, - TEXCOORD2 = 6, + POSITION = 0, + QUATERNION = 1, + COLOR = 2, + TEXCOORD0 = 3, + TEXCOORD1 = 5, + TEXCOORD2 = 6, + + VIEW = 8, }; BitField< 0, 64, u64> hex; @@ -164,6 +160,10 @@ void DumpShader(const u32* binary_data, u32 binary_size, const u32* swizzle_data { OutputAttributes::POSITION_Y, { OutputRegisterInfo::POSITION, 2} }, { OutputAttributes::POSITION_Z, { OutputRegisterInfo::POSITION, 4} }, { OutputAttributes::POSITION_W, { OutputRegisterInfo::POSITION, 8} }, + { OutputAttributes::QUATERNION_X, { OutputRegisterInfo::QUATERNION, 1} }, + { OutputAttributes::QUATERNION_Y, { OutputRegisterInfo::QUATERNION, 2} }, + { OutputAttributes::QUATERNION_Z, { OutputRegisterInfo::QUATERNION, 4} }, + { OutputAttributes::QUATERNION_W, { OutputRegisterInfo::QUATERNION, 8} }, { OutputAttributes::COLOR_R, { OutputRegisterInfo::COLOR, 1} }, { OutputAttributes::COLOR_G, { 
OutputRegisterInfo::COLOR, 2} }, { OutputAttributes::COLOR_B, { OutputRegisterInfo::COLOR, 4} }, @@ -173,7 +173,10 @@ void DumpShader(const u32* binary_data, u32 binary_size, const u32* swizzle_data { OutputAttributes::TEXCOORD1_U, { OutputRegisterInfo::TEXCOORD1, 1} }, { OutputAttributes::TEXCOORD1_V, { OutputRegisterInfo::TEXCOORD1, 2} }, { OutputAttributes::TEXCOORD2_U, { OutputRegisterInfo::TEXCOORD2, 1} }, - { OutputAttributes::TEXCOORD2_V, { OutputRegisterInfo::TEXCOORD2, 2} } + { OutputAttributes::TEXCOORD2_V, { OutputRegisterInfo::TEXCOORD2, 2} }, + { OutputAttributes::VIEW_X, { OutputRegisterInfo::VIEW, 1} }, + { OutputAttributes::VIEW_Y, { OutputRegisterInfo::VIEW, 2} }, + { OutputAttributes::VIEW_Z, { OutputRegisterInfo::VIEW, 4} } }; for (const auto& semantic : std::vector<OutputAttributes::Semantic>{ @@ -228,28 +231,69 @@ void DumpShader(const u32* binary_data, u32 binary_size, const u32* swizzle_data // TODO: Reduce the amount of binary code written to relevant portions dvlp.binary_offset = write_offset - dvlp_offset; - dvlp.binary_size_words = binary_size; - QueueForWriting((u8*)binary_data, binary_size * sizeof(u32)); + dvlp.binary_size_words = setup.program_code.size(); + QueueForWriting((u8*)setup.program_code.data(), setup.program_code.size() * sizeof(u32)); dvlp.swizzle_info_offset = write_offset - dvlp_offset; - dvlp.swizzle_info_num_entries = swizzle_size; + dvlp.swizzle_info_num_entries = setup.swizzle_data.size(); u32 dummy = 0; - for (unsigned int i = 0; i < swizzle_size; ++i) { - QueueForWriting((u8*)&swizzle_data[i], sizeof(swizzle_data[i])); + for (unsigned int i = 0; i < setup.swizzle_data.size(); ++i) { + QueueForWriting((u8*)&setup.swizzle_data[i], sizeof(setup.swizzle_data[i])); QueueForWriting((u8*)&dummy, sizeof(dummy)); } - dvle.main_offset_words = main_offset; + dvle.main_offset_words = config.main_offset; dvle.output_register_table_offset = write_offset - dvlb.dvle_offset; - dvle.output_register_table_size = 
static_cast<uint32_t>(output_info_table.size()); + dvle.output_register_table_size = static_cast<u32>(output_info_table.size()); QueueForWriting((u8*)output_info_table.data(), static_cast<u32>(output_info_table.size() * sizeof(OutputRegisterInfo))); // TODO: Create a label table for "main" + std::vector<nihstro::ConstantInfo> constant_table; + for (unsigned i = 0; i < setup.uniforms.b.size(); ++i) { + nihstro::ConstantInfo constant; + memset(&constant, 0, sizeof(constant)); + constant.type = nihstro::ConstantInfo::Bool; + constant.regid = i; + constant.b = setup.uniforms.b[i]; + constant_table.emplace_back(constant); + } + for (unsigned i = 0; i < setup.uniforms.i.size(); ++i) { + nihstro::ConstantInfo constant; + memset(&constant, 0, sizeof(constant)); + constant.type = nihstro::ConstantInfo::Int; + constant.regid = i; + constant.i.x = setup.uniforms.i[i].x; + constant.i.y = setup.uniforms.i[i].y; + constant.i.z = setup.uniforms.i[i].z; + constant.i.w = setup.uniforms.i[i].w; + constant_table.emplace_back(constant); + } + for (unsigned i = 0; i < sizeof(setup.uniforms.f) / sizeof(setup.uniforms.f[0]); ++i) { + nihstro::ConstantInfo constant; + memset(&constant, 0, sizeof(constant)); + constant.type = nihstro::ConstantInfo::Float; + constant.regid = i; + constant.f.x = nihstro::to_float24(setup.uniforms.f[i].x.ToFloat32()); + constant.f.y = nihstro::to_float24(setup.uniforms.f[i].y.ToFloat32()); + constant.f.z = nihstro::to_float24(setup.uniforms.f[i].z.ToFloat32()); + constant.f.w = nihstro::to_float24(setup.uniforms.f[i].w.ToFloat32()); + + // Store constant if it's different from zero.. 
+ if (setup.uniforms.f[i].x.ToFloat32() != 0.0 || + setup.uniforms.f[i].y.ToFloat32() != 0.0 || + setup.uniforms.f[i].z.ToFloat32() != 0.0 || + setup.uniforms.f[i].w.ToFloat32() != 0.0) + constant_table.emplace_back(constant); + } + dvle.constant_table_offset = write_offset - dvlb.dvle_offset; + dvle.constant_table_size = constant_table.size(); + for (const auto& constant : constant_table) { + QueueForWriting((uint8_t*)&constant, sizeof(constant)); + } // Write data to file static int dump_index = 0; - std::string filename = std::string("shader_dump") + std::to_string(++dump_index) + std::string(".shbin"); std::ofstream file(filename, std::ios_base::out | std::ios_base::binary); for (auto& chunk : writing_queue) { @@ -564,10 +608,6 @@ TextureInfo TextureInfo::FromPicaRegister(const Regs::TextureConfig& config, } void DumpTexture(const Pica::Regs::TextureConfig& texture_config, u8* data) { - // NOTE: Permanently enabling this just trashes hard disks for no reason. - // Hence, this is currently disabled. 
- return; - #ifndef HAVE_PNG return; #else diff --git a/src/video_core/debug_utils/debug_utils.h b/src/video_core/debug_utils/debug_utils.h index acb75a4b2..85762f5b4 100644 --- a/src/video_core/debug_utils/debug_utils.h +++ b/src/video_core/debug_utils/debug_utils.h @@ -157,6 +157,10 @@ extern std::shared_ptr<DebugContext> g_debug_context; // TODO: Get rid of this g namespace DebugUtils { +#define PICA_DUMP_GEOMETRY 0 +#define PICA_DUMP_TEXTURES 0 +#define PICA_LOG_TEV 0 + // Simple utility class for dumping geometry data to an OBJ file class GeometryDumper { public: @@ -177,8 +181,8 @@ private: std::vector<Face> faces; }; -void DumpShader(const u32* binary_data, u32 binary_size, const u32* swizzle_data, u32 swizzle_size, - u32 main_offset, const Regs::VSOutputAttributes* output_attributes); +void DumpShader(const std::string& filename, const Regs::ShaderConfig& config, + const State::ShaderSetup& setup, const Regs::VSOutputAttributes* output_attributes); // Utility class to log Pica commands. 
diff --git a/src/video_core/hwrasterizer_base.h b/src/video_core/hwrasterizer_base.h index c8746c608..54b8892fb 100644 --- a/src/video_core/hwrasterizer_base.h +++ b/src/video_core/hwrasterizer_base.h @@ -7,7 +7,7 @@ #include "common/common_types.h" namespace Pica { -namespace VertexShader { +namespace Shader { struct OutputVertex; } } @@ -24,9 +24,9 @@ public: virtual void Reset() = 0; /// Queues the primitive formed by the given vertices for rendering - virtual void AddTriangle(const Pica::VertexShader::OutputVertex& v0, - const Pica::VertexShader::OutputVertex& v1, - const Pica::VertexShader::OutputVertex& v2) = 0; + virtual void AddTriangle(const Pica::Shader::OutputVertex& v0, + const Pica::Shader::OutputVertex& v1, + const Pica::Shader::OutputVertex& v2) = 0; /// Draw the current batch of triangles virtual void DrawTriangles() = 0; diff --git a/src/video_core/pica.cpp b/src/video_core/pica.cpp index 543d9c443..c73a8178e 100644 --- a/src/video_core/pica.cpp +++ b/src/video_core/pica.cpp @@ -2,18 +2,91 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. 
-#include <string.h> +#include <cstring> +#include <unordered_map> #include "pica.h" +#include "shader/shader.h" namespace Pica { State g_state; +std::string Regs::GetCommandName(int index) { + static std::unordered_map<u32, std::string> map; + + if (map.empty()) { + #define ADD_FIELD(name) \ + map.insert({static_cast<u32>(PICA_REG_INDEX(name)), #name}); \ + /* TODO: change to Regs::name when VS2015 and other compilers support it */ \ + for (u32 i = PICA_REG_INDEX(name) + 1; i < PICA_REG_INDEX(name) + sizeof(Regs().name) / 4; ++i) \ + map.insert({i, #name + std::string("+") + std::to_string(i-PICA_REG_INDEX(name))}); \ + + ADD_FIELD(trigger_irq); + ADD_FIELD(cull_mode); + ADD_FIELD(viewport_size_x); + ADD_FIELD(viewport_size_y); + ADD_FIELD(viewport_depth_range); + ADD_FIELD(viewport_depth_far_plane); + ADD_FIELD(viewport_corner); + ADD_FIELD(texture0_enable); + ADD_FIELD(texture0); + ADD_FIELD(texture0_format); + ADD_FIELD(texture1); + ADD_FIELD(texture1_format); + ADD_FIELD(texture2); + ADD_FIELD(texture2_format); + ADD_FIELD(tev_stage0); + ADD_FIELD(tev_stage1); + ADD_FIELD(tev_stage2); + ADD_FIELD(tev_stage3); + ADD_FIELD(tev_combiner_buffer_input); + ADD_FIELD(tev_stage4); + ADD_FIELD(tev_stage5); + ADD_FIELD(tev_combiner_buffer_color); + ADD_FIELD(output_merger); + ADD_FIELD(framebuffer); + ADD_FIELD(vertex_attributes); + ADD_FIELD(index_array); + ADD_FIELD(num_vertices); + ADD_FIELD(trigger_draw); + ADD_FIELD(trigger_draw_indexed); + ADD_FIELD(vs_default_attributes_setup); + ADD_FIELD(command_buffer); + ADD_FIELD(triangle_topology); + ADD_FIELD(gs.bool_uniforms); + ADD_FIELD(gs.int_uniforms); + ADD_FIELD(gs.main_offset); + ADD_FIELD(gs.input_register_map); + ADD_FIELD(gs.uniform_setup); + ADD_FIELD(gs.program); + ADD_FIELD(gs.swizzle_patterns); + ADD_FIELD(vs.bool_uniforms); + ADD_FIELD(vs.int_uniforms); + ADD_FIELD(vs.main_offset); + ADD_FIELD(vs.input_register_map); + ADD_FIELD(vs.uniform_setup); + ADD_FIELD(vs.program); + ADD_FIELD(vs.swizzle_patterns); + 
+#undef ADD_FIELD + } + + // Return empty string if no match is found + auto it = map.find(index); + if (it != map.end()) { + return it->second; + } else { + return std::string(); + } +} + void Init() { } void Shutdown() { + Shader::Shutdown(); + memset(&g_state, 0, sizeof(State)); } diff --git a/src/video_core/pica.h b/src/video_core/pica.h index 38599a7a3..36916f862 100644 --- a/src/video_core/pica.h +++ b/src/video_core/pica.h @@ -7,7 +7,6 @@ #include <array> #include <cmath> #include <cstddef> -#include <map> #include <string> #include "common/assert.h" @@ -81,6 +80,11 @@ struct Regs { POSITION_Z = 2, POSITION_W = 3, + QUATERNION_X = 4, + QUATERNION_Y = 5, + QUATERNION_Z = 6, + QUATERNION_W = 7, + COLOR_R = 8, COLOR_G = 9, COLOR_B = 10, @@ -90,6 +94,12 @@ struct Regs { TEXCOORD0_V = 13, TEXCOORD1_U = 14, TEXCOORD1_V = 15, + + // TODO: Not verified + VIEW_X = 18, + VIEW_Y = 19, + VIEW_Z = 20, + TEXCOORD2_U = 22, TEXCOORD2_V = 23, @@ -908,69 +918,7 @@ struct Regs { // Map register indices to names readable by humans // Used for debugging purposes, so performance is not an issue here - static std::string GetCommandName(int index) { - std::map<u32, std::string> map; - - #define ADD_FIELD(name) \ - do { \ - map.insert({static_cast<u32>(PICA_REG_INDEX(name)), #name}); \ - /* TODO: change to Regs::name when VS2015 and other compilers support it */ \ - for (u32 i = PICA_REG_INDEX(name) + 1; i < PICA_REG_INDEX(name) + sizeof(Regs().name) / 4; ++i) \ - map.insert({i, #name + std::string("+") + std::to_string(i-PICA_REG_INDEX(name))}); \ - } while(false) - - ADD_FIELD(trigger_irq); - ADD_FIELD(cull_mode); - ADD_FIELD(viewport_size_x); - ADD_FIELD(viewport_size_y); - ADD_FIELD(viewport_depth_range); - ADD_FIELD(viewport_depth_far_plane); - ADD_FIELD(viewport_corner); - ADD_FIELD(texture0_enable); - ADD_FIELD(texture0); - ADD_FIELD(texture0_format); - ADD_FIELD(texture1); - ADD_FIELD(texture1_format); - ADD_FIELD(texture2); - ADD_FIELD(texture2_format); - 
ADD_FIELD(tev_stage0); - ADD_FIELD(tev_stage1); - ADD_FIELD(tev_stage2); - ADD_FIELD(tev_stage3); - ADD_FIELD(tev_combiner_buffer_input); - ADD_FIELD(tev_stage4); - ADD_FIELD(tev_stage5); - ADD_FIELD(tev_combiner_buffer_color); - ADD_FIELD(output_merger); - ADD_FIELD(framebuffer); - ADD_FIELD(vertex_attributes); - ADD_FIELD(index_array); - ADD_FIELD(num_vertices); - ADD_FIELD(trigger_draw); - ADD_FIELD(trigger_draw_indexed); - ADD_FIELD(vs_default_attributes_setup); - ADD_FIELD(command_buffer); - ADD_FIELD(triangle_topology); - ADD_FIELD(gs.bool_uniforms); - ADD_FIELD(gs.int_uniforms); - ADD_FIELD(gs.main_offset); - ADD_FIELD(gs.input_register_map); - ADD_FIELD(gs.uniform_setup); - ADD_FIELD(gs.program); - ADD_FIELD(gs.swizzle_patterns); - ADD_FIELD(vs.bool_uniforms); - ADD_FIELD(vs.int_uniforms); - ADD_FIELD(vs.main_offset); - ADD_FIELD(vs.input_register_map); - ADD_FIELD(vs.uniform_setup); - ADD_FIELD(vs.program); - ADD_FIELD(vs.swizzle_patterns); - - #undef ADD_FIELD - - // Return empty string if no match is found - return map[index]; - } + static std::string GetCommandName(int index); static inline size_t NumIds() { return sizeof(Regs) / sizeof(u32); @@ -1146,6 +1094,7 @@ private: // TODO: Perform proper arithmetic on this! float value; }; +static_assert(sizeof(float24) == sizeof(float), "Shader JIT assumes float24 is implemented as a 32-bit float"); /// Struct used to describe current Pica state struct State { @@ -1155,7 +1104,10 @@ struct State { /// Vertex shader memory struct ShaderSetup { struct { - Math::Vec4<float24> f[96]; + // The float uniforms are accessed by the shader JIT using SSE instructions, and are + // therefore required to be 16-byte aligned. 
+ Math::Vec4<float24> MEMORY_ALIGNED16(f[96]); + std::array<bool, 16> b; std::array<Math::Vec4<u8>, 4> i; } uniforms; diff --git a/src/video_core/primitive_assembly.cpp b/src/video_core/primitive_assembly.cpp index 2f22bdcce..e2b1df44c 100644 --- a/src/video_core/primitive_assembly.cpp +++ b/src/video_core/primitive_assembly.cpp @@ -4,7 +4,7 @@ #include "pica.h" #include "primitive_assembly.h" -#include "vertex_shader.h" +#include "shader/shader_interpreter.h" #include "common/logging/log.h" #include "video_core/debug_utils/debug_utils.h" @@ -56,7 +56,7 @@ void PrimitiveAssembler<VertexType>::SubmitVertex(VertexType& vtx, TriangleHandl // explicitly instantiate use cases template -struct PrimitiveAssembler<VertexShader::OutputVertex>; +struct PrimitiveAssembler<Shader::OutputVertex>; template struct PrimitiveAssembler<DebugUtils::GeometryDumper::Vertex>; diff --git a/src/video_core/primitive_assembly.h b/src/video_core/primitive_assembly.h index 52ff4cd89..80432d68a 100644 --- a/src/video_core/primitive_assembly.h +++ b/src/video_core/primitive_assembly.h @@ -8,7 +8,7 @@ #include "video_core/pica.h" -#include "video_core/vertex_shader.h" +#include "video_core/shader/shader_interpreter.h" namespace Pica { diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp index e2b90ad1c..b83798b0f 100644 --- a/src/video_core/rasterizer.cpp +++ b/src/video_core/rasterizer.cpp @@ -16,7 +16,7 @@ #include "math.h" #include "pica.h" #include "rasterizer.h" -#include "vertex_shader.h" +#include "shader/shader_interpreter.h" #include "video_core/utils.h" namespace Pica { @@ -272,9 +272,9 @@ static Common::Profiling::TimingCategory rasterization_category("Rasterization") * Helper function for ProcessTriangle with the "reversed" flag to allow for implementing * culling via recursion. 
*/ -static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0, - const VertexShader::OutputVertex& v1, - const VertexShader::OutputVertex& v2, +static void ProcessTriangleInternal(const Shader::OutputVertex& v0, + const Shader::OutputVertex& v1, + const Shader::OutputVertex& v2, bool reversed = false) { const auto& regs = g_state.regs; @@ -462,7 +462,9 @@ static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0, // TODO: Apply the min and mag filters to the texture texture_color[i] = DebugUtils::LookupTexture(texture_data, s, t, info); +#if PICA_DUMP_TEXTURES DebugUtils::DumpTexture(texture.config, texture_data); +#endif } } @@ -1105,9 +1107,9 @@ static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0, } } -void ProcessTriangle(const VertexShader::OutputVertex& v0, - const VertexShader::OutputVertex& v1, - const VertexShader::OutputVertex& v2) { +void ProcessTriangle(const Shader::OutputVertex& v0, + const Shader::OutputVertex& v1, + const Shader::OutputVertex& v2) { ProcessTriangleInternal(v0, v1, v2); } diff --git a/src/video_core/rasterizer.h b/src/video_core/rasterizer.h index 42148f8b1..a6a9634b4 100644 --- a/src/video_core/rasterizer.h +++ b/src/video_core/rasterizer.h @@ -6,15 +6,15 @@ namespace Pica { -namespace VertexShader { +namespace Shader { struct OutputVertex; } namespace Rasterizer { -void ProcessTriangle(const VertexShader::OutputVertex& v0, - const VertexShader::OutputVertex& v1, - const VertexShader::OutputVertex& v2); +void ProcessTriangle(const Shader::OutputVertex& v0, + const Shader::OutputVertex& v1, + const Shader::OutputVertex& v2); } // namespace Rasterizer diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index 2db845da6..9f1552adf 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -7,6 +7,7 @@ #include "common/color.h" #include "common/math_util.h" +#include 
"common/profiler.h" #include "core/hw/gpu.h" #include "core/memory.h" @@ -98,7 +99,6 @@ void RasterizerOpenGL::InitObjects() { fb_color_texture.texture.Create(); ReconfigureColorTexture(fb_color_texture, Pica::Regs::ColorFormat::RGBA8, 1, 1); - state.texture_units[0].enabled_2d = true; state.texture_units[0].texture_2d = fb_color_texture.texture.handle; state.Apply(); @@ -114,7 +114,6 @@ void RasterizerOpenGL::InitObjects() { fb_depth_texture.texture.Create(); ReconfigureDepthTexture(fb_depth_texture, Pica::Regs::DepthFormat::D16, 1, 1); - state.texture_units[0].enabled_2d = true; state.texture_units[0].texture_2d = fb_depth_texture.texture.handle; state.Apply(); @@ -203,9 +202,9 @@ void RasterizerOpenGL::Reset() { res_cache.FullFlush(); } -void RasterizerOpenGL::AddTriangle(const Pica::VertexShader::OutputVertex& v0, - const Pica::VertexShader::OutputVertex& v1, - const Pica::VertexShader::OutputVertex& v2) { +void RasterizerOpenGL::AddTriangle(const Pica::Shader::OutputVertex& v0, + const Pica::Shader::OutputVertex& v1, + const Pica::Shader::OutputVertex& v2) { vertex_batch.push_back(HardwareVertex(v0)); vertex_batch.push_back(HardwareVertex(v1)); vertex_batch.push_back(HardwareVertex(v2)); @@ -492,7 +491,6 @@ void RasterizerOpenGL::ReconfigureColorTexture(TextureInfo& texture, Pica::Regs: break; } - state.texture_units[0].enabled_2d = true; state.texture_units[0].texture_2d = texture.texture.handle; state.Apply(); @@ -536,7 +534,6 @@ void RasterizerOpenGL::ReconfigureDepthTexture(DepthTextureInfo& texture, Pica:: break; } - state.texture_units[0].enabled_2d = true; state.texture_units[0].texture_2d = texture.texture.handle; state.Apply(); @@ -765,10 +762,9 @@ void RasterizerOpenGL::SyncDrawState() { const auto& texture = pica_textures[texture_index]; if (texture.enabled) { - state.texture_units[texture_index].enabled_2d = true; res_cache.LoadAndBindTexture(state, texture_index, texture); } else { - state.texture_units[texture_index].enabled_2d = false; + 
state.texture_units[texture_index].texture_2d = 0; } } @@ -803,7 +799,6 @@ void RasterizerOpenGL::ReloadColorBuffer() { } } - state.texture_units[0].enabled_2d = true; state.texture_units[0].texture_2d = fb_color_texture.texture.handle; state.Apply(); @@ -861,7 +856,6 @@ void RasterizerOpenGL::ReloadDepthBuffer() { } } - state.texture_units[0].enabled_2d = true; state.texture_units[0].texture_2d = fb_depth_texture.texture.handle; state.Apply(); @@ -873,16 +867,19 @@ void RasterizerOpenGL::ReloadDepthBuffer() { state.Apply(); } +Common::Profiling::TimingCategory buffer_commit_category("Framebuffer Commit"); + void RasterizerOpenGL::CommitColorBuffer() { if (last_fb_color_addr != 0) { u8* color_buffer = Memory::GetPhysicalPointer(last_fb_color_addr); if (color_buffer != nullptr) { + Common::Profiling::ScopeTimer timer(buffer_commit_category); + u32 bytes_per_pixel = Pica::Regs::BytesPerColorPixel(fb_color_texture.format); std::unique_ptr<u8[]> temp_gl_color_buffer(new u8[fb_color_texture.width * fb_color_texture.height * bytes_per_pixel]); - state.texture_units[0].enabled_2d = true; state.texture_units[0].texture_2d = fb_color_texture.texture.handle; state.Apply(); @@ -913,6 +910,8 @@ void RasterizerOpenGL::CommitDepthBuffer() { u8* depth_buffer = Memory::GetPhysicalPointer(last_fb_depth_addr); if (depth_buffer != nullptr) { + Common::Profiling::ScopeTimer timer(buffer_commit_category); + u32 bytes_per_pixel = Pica::Regs::BytesPerDepthPixel(fb_depth_texture.format); // OpenGL needs 4 bpp alignment for D24 @@ -920,7 +919,6 @@ void RasterizerOpenGL::CommitDepthBuffer() { std::unique_ptr<u8[]> temp_gl_depth_buffer(new u8[fb_depth_texture.width * fb_depth_texture.height * gl_bpp]); - state.texture_units[0].enabled_2d = true; state.texture_units[0].texture_2d = fb_depth_texture.texture.handle; state.Apply(); diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index ae7b26fc6..a02d5c856 100644 --- 
a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -9,7 +9,7 @@ #include "common/common_types.h" #include "video_core/hwrasterizer_base.h" -#include "video_core/vertex_shader.h" +#include "video_core/shader/shader_interpreter.h" #include "gl_state.h" #include "gl_rasterizer_cache.h" @@ -27,9 +27,9 @@ public: void Reset() override; /// Queues the primitive formed by the given vertices for rendering - void AddTriangle(const Pica::VertexShader::OutputVertex& v0, - const Pica::VertexShader::OutputVertex& v1, - const Pica::VertexShader::OutputVertex& v2) override; + void AddTriangle(const Pica::Shader::OutputVertex& v0, + const Pica::Shader::OutputVertex& v1, + const Pica::Shader::OutputVertex& v2) override; /// Draw the current batch of triangles void DrawTriangles() override; @@ -82,7 +82,7 @@ private: /// Structure that the hardware rendered vertices are composed of struct HardwareVertex { - HardwareVertex(const Pica::VertexShader::OutputVertex& v) { + HardwareVertex(const Pica::Shader::OutputVertex& v) { position[0] = v.pos.x.ToFloat32(); position[1] = v.pos.y.ToFloat32(); position[2] = v.pos.z.ToFloat32(); diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp index dc3ffdf22..70f0ba5f1 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp @@ -30,6 +30,7 @@ void RasterizerCacheOpenGL::LoadAndBindTexture(OpenGLState &state, unsigned text new_texture->texture.Create(); state.texture_units[texture_unit].texture_2d = new_texture->texture.handle; state.Apply(); + glActiveTexture(GL_TEXTURE0 + texture_unit); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, PicaToGL::TextureFilterMode(config.config.mag_filter)); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, PicaToGL::TextureFilterMode(config.config.min_filter)); diff --git 
a/src/video_core/renderer_opengl/gl_resource_manager.cpp b/src/video_core/renderer_opengl/gl_resource_manager.cpp deleted file mode 100644 index 8f4ae28a4..000000000 --- a/src/video_core/renderer_opengl/gl_resource_manager.cpp +++ /dev/null @@ -1,111 +0,0 @@ -// Copyright 2015 Citra Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#include "video_core/renderer_opengl/gl_resource_manager.h" -#include "video_core/renderer_opengl/gl_shader_util.h" - -// Textures -OGLTexture::OGLTexture() : handle(0) { -} - -OGLTexture::~OGLTexture() { - Release(); -} - -void OGLTexture::Create() { - if (handle != 0) { - return; - } - - glGenTextures(1, &handle); -} - -void OGLTexture::Release() { - glDeleteTextures(1, &handle); - handle = 0; -} - -// Shaders -OGLShader::OGLShader() : handle(0) { -} - -OGLShader::~OGLShader() { - Release(); -} - -void OGLShader::Create(const char* vert_shader, const char* frag_shader) { - if (handle != 0) { - return; - } - - handle = ShaderUtil::LoadShaders(vert_shader, frag_shader); -} - -void OGLShader::Release() { - glDeleteProgram(handle); - handle = 0; -} - -// Buffer objects -OGLBuffer::OGLBuffer() : handle(0) { -} - -OGLBuffer::~OGLBuffer() { - Release(); -} - -void OGLBuffer::Create() { - if (handle != 0) { - return; - } - - glGenBuffers(1, &handle); -} - -void OGLBuffer::Release() { - glDeleteBuffers(1, &handle); - handle = 0; -} - -// Vertex array objects -OGLVertexArray::OGLVertexArray() : handle(0) { -} - -OGLVertexArray::~OGLVertexArray() { - Release(); -} - -void OGLVertexArray::Create() { - if (handle != 0) { - return; - } - - glGenVertexArrays(1, &handle); -} - -void OGLVertexArray::Release() { - glDeleteVertexArrays(1, &handle); - handle = 0; -} - -// Framebuffers -OGLFramebuffer::OGLFramebuffer() : handle(0) { -} - -OGLFramebuffer::~OGLFramebuffer() { - Release(); -} - -void OGLFramebuffer::Create() { - if (handle != 0) { - return; - } - - glGenFramebuffers(1, &handle); -} - 
-void OGLFramebuffer::Release() { - glDeleteFramebuffers(1, &handle); - handle = 0; -} diff --git a/src/video_core/renderer_opengl/gl_resource_manager.h b/src/video_core/renderer_opengl/gl_resource_manager.h index 975720d0a..82173d59a 100644 --- a/src/video_core/renderer_opengl/gl_resource_manager.h +++ b/src/video_core/renderer_opengl/gl_resource_manager.h @@ -4,76 +4,130 @@ #pragma once +#include <utility> + #include "common/common_types.h" -#include "generated/gl_3_2_core.h" +#include "video_core/renderer_opengl/generated/gl_3_2_core.h" +#include "video_core/renderer_opengl/gl_shader_util.h" +#include "video_core/renderer_opengl/gl_state.h" -class OGLTexture : public NonCopyable { +class OGLTexture : private NonCopyable { public: - OGLTexture(); - ~OGLTexture(); + OGLTexture() = default; + OGLTexture(OGLTexture&& o) { std::swap(handle, o.handle); } + ~OGLTexture() { Release(); } + OGLTexture& operator=(OGLTexture&& o) { std::swap(handle, o.handle); return *this; } /// Creates a new internal OpenGL resource and stores the handle - void Create(); + void Create() { + if (handle != 0) return; + glGenTextures(1, &handle); + } /// Deletes the internal OpenGL resource - void Release(); - - GLuint handle; + void Release() { + if (handle == 0) return; + glDeleteTextures(1, &handle); + OpenGLState::ResetTexture(handle); + handle = 0; + } + + GLuint handle = 0; }; -class OGLShader : public NonCopyable { +class OGLShader : private NonCopyable { public: - OGLShader(); - ~OGLShader(); + OGLShader() = default; + OGLShader(OGLShader&& o) { std::swap(handle, o.handle); } + ~OGLShader() { Release(); } + OGLShader& operator=(OGLShader&& o) { std::swap(handle, o.handle); return *this; } /// Creates a new internal OpenGL resource and stores the handle - void Create(const char* vert_shader, const char* frag_shader); + void Create(const char* vert_shader, const char* frag_shader) { + if (handle != 0) return; + handle = ShaderUtil::LoadShaders(vert_shader, frag_shader); + } /// Deletes 
the internal OpenGL resource - void Release(); - - GLuint handle; + void Release() { + if (handle == 0) return; + glDeleteProgram(handle); + OpenGLState::ResetProgram(handle); + handle = 0; + } + + GLuint handle = 0; }; -class OGLBuffer : public NonCopyable { +class OGLBuffer : private NonCopyable { public: - OGLBuffer(); - ~OGLBuffer(); + OGLBuffer() = default; + OGLBuffer(OGLBuffer&& o) { std::swap(handle, o.handle); } + ~OGLBuffer() { Release(); } + OGLBuffer& operator=(OGLBuffer&& o) { std::swap(handle, o.handle); return *this; } /// Creates a new internal OpenGL resource and stores the handle - void Create(); + void Create() { + if (handle != 0) return; + glGenBuffers(1, &handle); + } /// Deletes the internal OpenGL resource - void Release(); - - GLuint handle; + void Release() { + if (handle == 0) return; + glDeleteBuffers(1, &handle); + OpenGLState::ResetBuffer(handle); + handle = 0; + } + + GLuint handle = 0; }; -class OGLVertexArray : public NonCopyable { +class OGLVertexArray : private NonCopyable { public: - OGLVertexArray(); - ~OGLVertexArray(); + OGLVertexArray() = default; + OGLVertexArray(OGLVertexArray&& o) { std::swap(handle, o.handle); } + ~OGLVertexArray() { Release(); } + OGLVertexArray& operator=(OGLVertexArray&& o) { std::swap(handle, o.handle); return *this; } /// Creates a new internal OpenGL resource and stores the handle - void Create(); + void Create() { + if (handle != 0) return; + glGenVertexArrays(1, &handle); + } /// Deletes the internal OpenGL resource - void Release(); - - GLuint handle; + void Release() { + if (handle == 0) return; + glDeleteVertexArrays(1, &handle); + OpenGLState::ResetVertexArray(handle); + handle = 0; + } + + GLuint handle = 0; }; -class OGLFramebuffer : public NonCopyable { +class OGLFramebuffer : private NonCopyable { public: - OGLFramebuffer(); - ~OGLFramebuffer(); + OGLFramebuffer() = default; + OGLFramebuffer(OGLFramebuffer&& o) { std::swap(handle, o.handle); } + ~OGLFramebuffer() { Release(); } + 
OGLFramebuffer& operator=(OGLFramebuffer&& o) { std::swap(handle, o.handle); return *this; } /// Creates a new internal OpenGL resource and stores the handle - void Create(); + void Create() { + if (handle != 0) return; + glGenFramebuffers(1, &handle); + } /// Deletes the internal OpenGL resource - void Release(); - - GLuint handle; + void Release() { + if (handle == 0) return; + glDeleteFramebuffers(1, &handle); + OpenGLState::ResetFramebuffer(handle); + handle = 0; + } + + GLuint handle = 0; }; diff --git a/src/video_core/renderer_opengl/gl_state.cpp b/src/video_core/renderer_opengl/gl_state.cpp index 9efc15337..871324014 100644 --- a/src/video_core/renderer_opengl/gl_state.cpp +++ b/src/video_core/renderer_opengl/gl_state.cpp @@ -40,7 +40,6 @@ OpenGLState::OpenGLState() { logic_op = GL_COPY; for (auto& texture_unit : texture_units) { - texture_unit.enabled_2d = false; texture_unit.texture_2d = 0; } @@ -147,16 +146,9 @@ void OpenGLState::Apply() { // Textures for (unsigned texture_index = 0; texture_index < ARRAY_SIZE(texture_units); ++texture_index) { - if (texture_units[texture_index].enabled_2d != cur_state.texture_units[texture_index].enabled_2d || - texture_units[texture_index].texture_2d != cur_state.texture_units[texture_index].texture_2d) { - + if (texture_units[texture_index].texture_2d != cur_state.texture_units[texture_index].texture_2d) { glActiveTexture(GL_TEXTURE0 + texture_index); - - if (texture_units[texture_index].enabled_2d) { - glBindTexture(GL_TEXTURE_2D, texture_units[texture_index].texture_2d); - } else { - glBindTexture(GL_TEXTURE_2D, 0); - } + glBindTexture(GL_TEXTURE_2D, texture_units[texture_index].texture_2d); } } @@ -182,3 +174,35 @@ void OpenGLState::Apply() { cur_state = *this; } + +void OpenGLState::ResetTexture(GLuint id) { + for (auto& unit : cur_state.texture_units) { + if (unit.texture_2d == id) { + unit.texture_2d = 0; + } + } +} + +void OpenGLState::ResetProgram(GLuint id) { + if (cur_state.draw.shader_program == id) { + 
cur_state.draw.shader_program = 0; + } +} + +void OpenGLState::ResetBuffer(GLuint id) { + if (cur_state.draw.vertex_buffer == id) { + cur_state.draw.vertex_buffer = 0; + } +} + +void OpenGLState::ResetVertexArray(GLuint id) { + if (cur_state.draw.vertex_array == id) { + cur_state.draw.vertex_array = 0; + } +} + +void OpenGLState::ResetFramebuffer(GLuint id) { + if (cur_state.draw.framebuffer == id) { + cur_state.draw.framebuffer = 0; + } +} diff --git a/src/video_core/renderer_opengl/gl_state.h b/src/video_core/renderer_opengl/gl_state.h index 26b916360..3e2379021 100644 --- a/src/video_core/renderer_opengl/gl_state.h +++ b/src/video_core/renderer_opengl/gl_state.h @@ -53,7 +53,6 @@ public: // 3 texture units - one for each that is used in PICA fragment shader emulation struct { - bool enabled_2d; // GL_TEXTURE_2D GLuint texture_2d; // GL_TEXTURE_BINDING_2D } texture_units[3]; @@ -74,6 +73,12 @@ public: /// Apply this state as the current OpenGL state void Apply(); + static void ResetTexture(GLuint id); + static void ResetProgram(GLuint id); + static void ResetBuffer(GLuint id); + static void ResetVertexArray(GLuint id); + static void ResetFramebuffer(GLuint id); + private: static OpenGLState cur_state; }; diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp index 96e12839a..79a940ff6 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.cpp +++ b/src/video_core/renderer_opengl/renderer_opengl.cpp @@ -163,7 +163,6 @@ void RendererOpenGL::LoadFBToActiveGLTexture(const GPU::Regs::FramebufferConfig& // only allows rows to have a memory alignement of 4. 
ASSERT(pixel_stride % 4 == 0); - state.texture_units[0].enabled_2d = true; state.texture_units[0].texture_2d = texture.handle; state.Apply(); @@ -191,7 +190,6 @@ void RendererOpenGL::LoadFBToActiveGLTexture(const GPU::Regs::FramebufferConfig& */ void RendererOpenGL::LoadColorToActiveGLTexture(u8 color_r, u8 color_g, u8 color_b, const TextureInfo& texture) { - state.texture_units[0].enabled_2d = true; state.texture_units[0].texture_2d = texture.handle; state.Apply(); @@ -239,7 +237,6 @@ void RendererOpenGL::InitOpenGLObjects() { // Allocation of storage is deferred until the first frame, when we // know the framebuffer size. - state.texture_units[0].enabled_2d = true; state.texture_units[0].texture_2d = texture.handle; state.Apply(); @@ -305,7 +302,6 @@ void RendererOpenGL::ConfigureFramebufferTexture(TextureInfo& texture, UNIMPLEMENTED(); } - state.texture_units[0].enabled_2d = true; state.texture_units[0].texture_2d = texture.handle; state.Apply(); @@ -325,7 +321,6 @@ void RendererOpenGL::DrawSingleScreenRotated(const TextureInfo& texture, float x ScreenRectVertex(x+w, y+h, 0.f, 1.f), }; - state.texture_units[0].enabled_2d = true; state.texture_units[0].texture_2d = texture.handle; state.Apply(); diff --git a/src/video_core/shader/shader.cpp b/src/video_core/shader/shader.cpp new file mode 100644 index 000000000..4e9836c80 --- /dev/null +++ b/src/video_core/shader/shader.cpp @@ -0,0 +1,180 @@ +// Copyright 2015 Citra Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#include <memory> +#include <unordered_map> + +#include <boost/range/algorithm/fill.hpp> + +#include "common/hash.h" +#include "common/make_unique.h" +#include "common/profiler.h" + +#include "video_core/debug_utils/debug_utils.h" +#include "video_core/pica.h" +#include "video_core/video_core.h" + +#include "shader.h" +#include "shader_interpreter.h" + +#ifdef ARCHITECTURE_x86_64 +#include "shader_jit_x64.h" +#endif // ARCHITECTURE_x86_64 + +namespace Pica { + +namespace Shader { + +#ifdef ARCHITECTURE_x86_64 +static std::unordered_map<u64, CompiledShader*> shader_map; +static JitCompiler jit; +static CompiledShader* jit_shader; +#endif // ARCHITECTURE_x86_64 + +void Setup(UnitState<false>& state) { +#ifdef ARCHITECTURE_x86_64 + if (VideoCore::g_shader_jit_enabled) { + u64 cache_key = (Common::ComputeHash64(&g_state.vs.program_code, sizeof(g_state.vs.program_code)) ^ + Common::ComputeHash64(&g_state.vs.swizzle_data, sizeof(g_state.vs.swizzle_data)) ^ + g_state.regs.vs.main_offset); + + auto iter = shader_map.find(cache_key); + if (iter != shader_map.end()) { + jit_shader = iter->second; + } else { + jit_shader = jit.Compile(); + shader_map.emplace(cache_key, jit_shader); + } + } +#endif // ARCHITECTURE_x86_64 +} + +void Shutdown() { + shader_map.clear(); +} + +static Common::Profiling::TimingCategory shader_category("Vertex Shader"); + +OutputVertex Run(UnitState<false>& state, const InputVertex& input, int num_attributes) { + auto& config = g_state.regs.vs; + + Common::Profiling::ScopeTimer timer(shader_category); + + state.program_counter = config.main_offset; + state.debug.max_offset = 0; + state.debug.max_opdesc_id = 0; + + // Setup input register table + const auto& attribute_register_map = config.input_register_map; + + // TODO: Instead of this cumbersome logic, just load the input data directly like + // for (int attr = 0; attr < num_attributes; ++attr) { input_attr[0] = state.registers.input[attribute_register_map.attribute0_register]; } + if 
(num_attributes > 0) state.registers.input[attribute_register_map.attribute0_register] = input.attr[0]; + if (num_attributes > 1) state.registers.input[attribute_register_map.attribute1_register] = input.attr[1]; + if (num_attributes > 2) state.registers.input[attribute_register_map.attribute2_register] = input.attr[2]; + if (num_attributes > 3) state.registers.input[attribute_register_map.attribute3_register] = input.attr[3]; + if (num_attributes > 4) state.registers.input[attribute_register_map.attribute4_register] = input.attr[4]; + if (num_attributes > 5) state.registers.input[attribute_register_map.attribute5_register] = input.attr[5]; + if (num_attributes > 6) state.registers.input[attribute_register_map.attribute6_register] = input.attr[6]; + if (num_attributes > 7) state.registers.input[attribute_register_map.attribute7_register] = input.attr[7]; + if (num_attributes > 8) state.registers.input[attribute_register_map.attribute8_register] = input.attr[8]; + if (num_attributes > 9) state.registers.input[attribute_register_map.attribute9_register] = input.attr[9]; + if (num_attributes > 10) state.registers.input[attribute_register_map.attribute10_register] = input.attr[10]; + if (num_attributes > 11) state.registers.input[attribute_register_map.attribute11_register] = input.attr[11]; + if (num_attributes > 12) state.registers.input[attribute_register_map.attribute12_register] = input.attr[12]; + if (num_attributes > 13) state.registers.input[attribute_register_map.attribute13_register] = input.attr[13]; + if (num_attributes > 14) state.registers.input[attribute_register_map.attribute14_register] = input.attr[14]; + if (num_attributes > 15) state.registers.input[attribute_register_map.attribute15_register] = input.attr[15]; + + state.conditional_code[0] = false; + state.conditional_code[1] = false; + +#ifdef ARCHITECTURE_x86_64 + if (VideoCore::g_shader_jit_enabled) + jit_shader(&state.registers); + else + RunInterpreter(state); +#else + RunInterpreter(state); 
+#endif // ARCHITECTURE_x86_64 + + // Setup output data + OutputVertex ret; + // TODO(neobrain): Under some circumstances, up to 16 attributes may be output. We need to + // figure out what those circumstances are and enable the remaining outputs then. + for (int i = 0; i < 7; ++i) { + const auto& output_register_map = g_state.regs.vs_output_attributes[i]; // TODO: Don't hardcode VS here + + u32 semantics[4] = { + output_register_map.map_x, output_register_map.map_y, + output_register_map.map_z, output_register_map.map_w + }; + + for (int comp = 0; comp < 4; ++comp) { + float24* out = ((float24*)&ret) + semantics[comp]; + if (semantics[comp] != Regs::VSOutputAttributes::INVALID) { + *out = state.registers.output[i][comp]; + } else { + // Zero output so that attributes which aren't output won't have denormals in them, + // which would slow us down later. + memset(out, 0, sizeof(*out)); + } + } + } + + // The hardware takes the absolute and saturates vertex colors like this, *before* doing interpolation + for (int i = 0; i < 4; ++i) { + ret.color[i] = float24::FromFloat32( + std::fmin(std::fabs(ret.color[i].ToFloat32()), 1.0f)); + } + + LOG_TRACE(Render_Software, "Output vertex: pos (%.2f, %.2f, %.2f, %.2f), quat (%.2f, %.2f, %.2f, %.2f), col(%.2f, %.2f, %.2f, %.2f), tc0(%.2f, %.2f)", + ret.pos.x.ToFloat32(), ret.pos.y.ToFloat32(), ret.pos.z.ToFloat32(), ret.pos.w.ToFloat32(), + ret.quat.x.ToFloat32(), ret.quat.y.ToFloat32(), ret.quat.z.ToFloat32(), ret.quat.w.ToFloat32(), + ret.color.x.ToFloat32(), ret.color.y.ToFloat32(), ret.color.z.ToFloat32(), ret.color.w.ToFloat32(), + ret.tc0.u().ToFloat32(), ret.tc0.v().ToFloat32()); + + return ret; +} + +DebugData<true> ProduceDebugInfo(const InputVertex& input, int num_attributes, const Regs::ShaderConfig& config, const State::ShaderSetup& setup) { + UnitState<true> state; + + const auto& shader_memory = setup.program_code; + state.program_counter = config.main_offset; + state.debug.max_offset = 0; + 
state.debug.max_opdesc_id = 0; + + // Setup input register table + const auto& attribute_register_map = config.input_register_map; + float24 dummy_register; + boost::fill(state.registers.input, &dummy_register); + + if (num_attributes > 0) state.registers.input[attribute_register_map.attribute0_register] = &input.attr[0].x; + if (num_attributes > 1) state.registers.input[attribute_register_map.attribute1_register] = &input.attr[1].x; + if (num_attributes > 2) state.registers.input[attribute_register_map.attribute2_register] = &input.attr[2].x; + if (num_attributes > 3) state.registers.input[attribute_register_map.attribute3_register] = &input.attr[3].x; + if (num_attributes > 4) state.registers.input[attribute_register_map.attribute4_register] = &input.attr[4].x; + if (num_attributes > 5) state.registers.input[attribute_register_map.attribute5_register] = &input.attr[5].x; + if (num_attributes > 6) state.registers.input[attribute_register_map.attribute6_register] = &input.attr[6].x; + if (num_attributes > 7) state.registers.input[attribute_register_map.attribute7_register] = &input.attr[7].x; + if (num_attributes > 8) state.registers.input[attribute_register_map.attribute8_register] = &input.attr[8].x; + if (num_attributes > 9) state.registers.input[attribute_register_map.attribute9_register] = &input.attr[9].x; + if (num_attributes > 10) state.registers.input[attribute_register_map.attribute10_register] = &input.attr[10].x; + if (num_attributes > 11) state.registers.input[attribute_register_map.attribute11_register] = &input.attr[11].x; + if (num_attributes > 12) state.registers.input[attribute_register_map.attribute12_register] = &input.attr[12].x; + if (num_attributes > 13) state.registers.input[attribute_register_map.attribute13_register] = &input.attr[13].x; + if (num_attributes > 14) state.registers.input[attribute_register_map.attribute14_register] = &input.attr[14].x; + if (num_attributes > 15) 
state.registers.input[attribute_register_map.attribute15_register] = &input.attr[15].x; + + state.conditional_code[0] = false; + state.conditional_code[1] = false; + + RunInterpreter(state); + return state.debug; +} + +} // namespace Shader + +} // namespace Pica diff --git a/src/video_core/shader/shader.h b/src/video_core/shader/shader.h new file mode 100644 index 000000000..58d21f7cd --- /dev/null +++ b/src/video_core/shader/shader.h @@ -0,0 +1,352 @@ +// Copyright 2015 Citra Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <vector> + +#include <boost/container/static_vector.hpp> + +#include <nihstro/shader_binary.h> + +#include "common/common_funcs.h" +#include "common/common_types.h" +#include "common/vector_math.h" + +#include "video_core/pica.h" + +using nihstro::RegisterType; +using nihstro::SourceRegister; +using nihstro::DestRegister; + +namespace Pica { + +namespace Shader { + +struct InputVertex { + Math::Vec4<float24> attr[16]; +}; + +struct OutputVertex { + OutputVertex() = default; + + // VS output attributes + Math::Vec4<float24> pos; + Math::Vec4<float24> dummy; // quaternions (not implemented, yet) + Math::Vec4<float24> color; + Math::Vec2<float24> tc0; + Math::Vec2<float24> tc1; + float24 pad[6]; + Math::Vec2<float24> tc2; + + // Padding for optimal alignment + float24 pad2[4]; + + // Attributes used to store intermediate results + + // position after perspective divide + Math::Vec3<float24> screenpos; + float24 pad3; + + // Linear interpolation + // factor: 0=this, 1=vtx + void Lerp(float24 factor, const OutputVertex& vtx) { + pos = pos * factor + vtx.pos * (float24::FromFloat32(1) - factor); + + // TODO: Should perform perspective correct interpolation here... 
+ tc0 = tc0 * factor + vtx.tc0 * (float24::FromFloat32(1) - factor); + tc1 = tc1 * factor + vtx.tc1 * (float24::FromFloat32(1) - factor); + tc2 = tc2 * factor + vtx.tc2 * (float24::FromFloat32(1) - factor); + + screenpos = screenpos * factor + vtx.screenpos * (float24::FromFloat32(1) - factor); + + color = color * factor + vtx.color * (float24::FromFloat32(1) - factor); + } + + // Linear interpolation + // factor: 0=v0, 1=v1 + static OutputVertex Lerp(float24 factor, const OutputVertex& v0, const OutputVertex& v1) { + OutputVertex ret = v0; + ret.Lerp(factor, v1); + return ret; + } +}; +static_assert(std::is_pod<OutputVertex>::value, "Structure is not POD"); +static_assert(sizeof(OutputVertex) == 32 * sizeof(float), "OutputVertex has invalid size"); + + +// Helper structure used to keep track of data useful for inspection of shader emulation +template<bool full_debugging> +struct DebugData; + +template<> +struct DebugData<false> { + // TODO: Hide these behind and interface and move them to DebugData<true> + u32 max_offset; // maximum program counter ever reached + u32 max_opdesc_id; // maximum swizzle pattern index ever used +}; + +template<> +struct DebugData<true> { + // Records store the input and output operands of a particular instruction. 
+ struct Record { + enum Type { + // Floating point arithmetic operands + SRC1 = 0x1, + SRC2 = 0x2, + SRC3 = 0x4, + + // Initial and final output operand value + DEST_IN = 0x8, + DEST_OUT = 0x10, + + // Current and next instruction offset (in words) + CUR_INSTR = 0x20, + NEXT_INSTR = 0x40, + + // Output address register value + ADDR_REG_OUT = 0x80, + + // Result of a comparison instruction + CMP_RESULT = 0x100, + + // Input values for conditional flow control instructions + COND_BOOL_IN = 0x200, + COND_CMP_IN = 0x400, + + // Input values for a loop + LOOP_INT_IN = 0x800, + }; + + Math::Vec4<float24> src1; + Math::Vec4<float24> src2; + Math::Vec4<float24> src3; + + Math::Vec4<float24> dest_in; + Math::Vec4<float24> dest_out; + + s32 address_registers[2]; + bool conditional_code[2]; + bool cond_bool; + bool cond_cmp[2]; + Math::Vec4<u8> loop_int; + + u32 instruction_offset; + u32 next_instruction; + + // set of enabled fields (as a combination of Type flags) + unsigned mask = 0; + }; + + u32 max_offset; // maximum program counter ever reached + u32 max_opdesc_id; // maximum swizzle pattern index ever used + + // List of records for each executed shader instruction + std::vector<DebugData<true>::Record> records; +}; + +// Type alias for better readability +using DebugDataRecord = DebugData<true>::Record; + +// Helper function to set a DebugData<true>::Record field based on the template enum parameter. 
+template<DebugDataRecord::Type type, typename ValueType> +inline void SetField(DebugDataRecord& record, ValueType value); + +template<> +inline void SetField<DebugDataRecord::SRC1>(DebugDataRecord& record, float24* value) { + record.src1.x = value[0]; + record.src1.y = value[1]; + record.src1.z = value[2]; + record.src1.w = value[3]; +} + +template<> +inline void SetField<DebugDataRecord::SRC2>(DebugDataRecord& record, float24* value) { + record.src2.x = value[0]; + record.src2.y = value[1]; + record.src2.z = value[2]; + record.src2.w = value[3]; +} + +template<> +inline void SetField<DebugDataRecord::SRC3>(DebugDataRecord& record, float24* value) { + record.src3.x = value[0]; + record.src3.y = value[1]; + record.src3.z = value[2]; + record.src3.w = value[3]; +} + +template<> +inline void SetField<DebugDataRecord::DEST_IN>(DebugDataRecord& record, float24* value) { + record.dest_in.x = value[0]; + record.dest_in.y = value[1]; + record.dest_in.z = value[2]; + record.dest_in.w = value[3]; +} + +template<> +inline void SetField<DebugDataRecord::DEST_OUT>(DebugDataRecord& record, float24* value) { + record.dest_out.x = value[0]; + record.dest_out.y = value[1]; + record.dest_out.z = value[2]; + record.dest_out.w = value[3]; +} + +template<> +inline void SetField<DebugDataRecord::ADDR_REG_OUT>(DebugDataRecord& record, s32* value) { + record.address_registers[0] = value[0]; + record.address_registers[1] = value[1]; +} + +template<> +inline void SetField<DebugDataRecord::CMP_RESULT>(DebugDataRecord& record, bool* value) { + record.conditional_code[0] = value[0]; + record.conditional_code[1] = value[1]; +} + +template<> +inline void SetField<DebugDataRecord::COND_BOOL_IN>(DebugDataRecord& record, bool value) { + record.cond_bool = value; +} + +template<> +inline void SetField<DebugDataRecord::COND_CMP_IN>(DebugDataRecord& record, bool* value) { + record.cond_cmp[0] = value[0]; + record.cond_cmp[1] = value[1]; +} + +template<> +inline void 
SetField<DebugDataRecord::LOOP_INT_IN>(DebugDataRecord& record, Math::Vec4<u8> value) { + record.loop_int = value; +} + +template<> +inline void SetField<DebugDataRecord::CUR_INSTR>(DebugDataRecord& record, u32 value) { + record.instruction_offset = value; +} + +template<> +inline void SetField<DebugDataRecord::NEXT_INSTR>(DebugDataRecord& record, u32 value) { + record.next_instruction = value; +} + +// Helper function to set debug information on the current shader iteration. +template<DebugDataRecord::Type type, typename ValueType> +inline void Record(DebugData<false>& debug_data, u32 offset, ValueType value) { + // Debugging disabled => nothing to do +} + +template<DebugDataRecord::Type type, typename ValueType> +inline void Record(DebugData<true>& debug_data, u32 offset, ValueType value) { + if (offset >= debug_data.records.size()) + debug_data.records.resize(offset + 1); + + SetField<type, ValueType>(debug_data.records[offset], value); + debug_data.records[offset].mask |= type; +} + + +/** + * This structure contains the state information that needs to be unique for a shader unit. The 3DS + * has four shader units that process shaders in parallel. At the present, Citra only implements a + * single shader unit that processes all shaders serially. Putting the state information in a struct + * here will make it easier for us to parallelize the shader processing later. + */ +template<bool Debug> +struct UnitState { + struct Registers { + // The registers are accessed by the shader JIT using SSE instructions, and are therefore + // required to be 16-byte aligned. + Math::Vec4<float24> MEMORY_ALIGNED16(input[16]); + Math::Vec4<float24> MEMORY_ALIGNED16(output[16]); + Math::Vec4<float24> MEMORY_ALIGNED16(temporary[16]); + } registers; + static_assert(std::is_pod<Registers>::value, "Structure is not POD"); + + u32 program_counter; + bool conditional_code[2]; + + // Two Address registers and one loop counter + // TODO: How many bits do these actually have? 
+ s32 address_registers[3]; + + enum { + INVALID_ADDRESS = 0xFFFFFFFF + }; + + struct CallStackElement { + u32 final_address; // Address upon which we jump to return_address + u32 return_address; // Where to jump when leaving scope + u8 repeat_counter; // How often to repeat until this call stack element is removed + u8 loop_increment; // Which value to add to the loop counter after an iteration + // TODO: Should this be a signed value? Does it even matter? + u32 loop_address; // The address where we'll return to after each loop iteration + }; + + // TODO: Is there a maximal size for this? + boost::container::static_vector<CallStackElement, 16> call_stack; + + DebugData<Debug> debug; + + static int InputOffset(const SourceRegister& reg) { + switch (reg.GetRegisterType()) { + case RegisterType::Input: + return (int)offsetof(UnitState::Registers, input) + reg.GetIndex()*sizeof(Math::Vec4<float24>); + + case RegisterType::Temporary: + return (int)offsetof(UnitState::Registers, temporary) + reg.GetIndex()*sizeof(Math::Vec4<float24>); + + default: + UNREACHABLE(); + return 0; + } + } + + static int OutputOffset(const DestRegister& reg) { + switch (reg.GetRegisterType()) { + case RegisterType::Output: + return (int)offsetof(UnitState::Registers, output) + reg.GetIndex()*sizeof(Math::Vec4<float24>); + + case RegisterType::Temporary: + return (int)offsetof(UnitState::Registers, temporary) + reg.GetIndex()*sizeof(Math::Vec4<float24>); + + default: + UNREACHABLE(); + return 0; + } + } +}; + +/** + * Performs any shader unit setup that only needs to happen once per shader (as opposed to once per + * vertex, which would happen within the `Run` function). 
+ * @param state Shader unit state, must be setup per shader and per shader unit + */ +void Setup(UnitState<false>& state); + +/// Performs any cleanup when the emulator is shutdown +void Shutdown(); + +/** + * Runs the currently setup shader + * @param state Shader unit state, must be setup per shader and per shader unit + * @param input Input vertex into the shader + * @param num_attributes The number of vertex shader attributes + * @return The output vertex, after having been processed by the vertex shader + */ +OutputVertex Run(UnitState<false>& state, const InputVertex& input, int num_attributes); + +/** + * Produce debug information based on the given shader and input vertex + * @param input Input vertex into the shader + * @param num_attributes The number of vertex shader attributes + * @param config Configuration object for the shader pipeline + * @param setup Setup object for the shader pipeline + * @return Debug information for this shader with regards to the given vertex + */ +DebugData<true> ProduceDebugInfo(const InputVertex& input, int num_attributes, const Regs::ShaderConfig& config, const State::ShaderSetup& setup); + +} // namespace Shader + +} // namespace Pica diff --git a/src/video_core/vertex_shader.cpp b/src/video_core/shader/shader_interpreter.cpp index e73a1d365..e14de0768 100644 --- a/src/video_core/vertex_shader.cpp +++ b/src/video_core/shader/shader_interpreter.cpp @@ -2,18 +2,14 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. 
-#include <boost/container/static_vector.hpp> -#include <boost/range/algorithm.hpp> - #include <common/file_util.h> #include <nihstro/shader_bytecode.h> -#include "common/profiler.h" +#include "video_core/pica.h" -#include "pica.h" -#include "vertex_shader.h" -#include "debug_utils/debug_utils.h" +#include "shader.h" +#include "shader_interpreter.h" using nihstro::OpCode; using nihstro::Instruction; @@ -23,44 +19,10 @@ using nihstro::SwizzlePattern; namespace Pica { -namespace VertexShader { - -struct VertexShaderState { - u32 program_counter; - - const float24* input_register_table[16]; - Math::Vec4<float24> output_registers[16]; - - Math::Vec4<float24> temporary_registers[16]; - bool conditional_code[2]; - - // Two Address registers and one loop counter - // TODO: How many bits do these actually have? - s32 address_registers[3]; - - enum { - INVALID_ADDRESS = 0xFFFFFFFF - }; - - struct CallStackElement { - u32 final_address; // Address upon which we jump to return_address - u32 return_address; // Where to jump when leaving scope - u8 repeat_counter; // How often to repeat until this call stack element is removed - u8 loop_increment; // Which value to add to the loop counter after an iteration - // TODO: Should this be a signed value? Does it even matter? - u32 loop_address; // The address where we'll return to after each loop iteration - }; +namespace Shader { - // TODO: Is there a maximal size for this? 
- boost::container::static_vector<CallStackElement, 16> call_stack; - - struct { - u32 max_offset; // maximum program counter ever reached - u32 max_opdesc_id; // maximum swizzle pattern index ever used - } debug; -}; - -static void ProcessShaderCode(VertexShaderState& state) { +template<bool Debug> +void RunInterpreter(UnitState<Debug>& state) { const auto& uniforms = g_state.vs.uniforms; const auto& swizzle_data = g_state.vs.swizzle_data; const auto& program_code = g_state.vs.program_code; @@ -68,7 +30,9 @@ static void ProcessShaderCode(VertexShaderState& state) { // Placeholder for invalid inputs static float24 dummy_vec4_float24[4]; - while (true) { + unsigned iteration = 0; + bool exit_loop = false; + while (!exit_loop) { if (!state.call_stack.empty()) { auto& top = state.call_stack.back(); if (state.program_counter == top.final_address) { @@ -86,25 +50,28 @@ static void ProcessShaderCode(VertexShaderState& state) { } } - bool exit_loop = false; const Instruction instr = { program_code[state.program_counter] }; const SwizzlePattern swizzle = { swizzle_data[instr.common.operand_desc_id] }; - static auto call = [](VertexShaderState& state, u32 offset, u32 num_instructions, + static auto call = [](UnitState<Debug>& state, u32 offset, u32 num_instructions, u32 return_offset, u8 repeat_count, u8 loop_increment) { state.program_counter = offset - 1; // -1 to make sure when incrementing the PC we end up at the correct offset ASSERT(state.call_stack.size() < state.call_stack.capacity()); state.call_stack.push_back({ offset + num_instructions, return_offset, repeat_count, loop_increment, offset }); }; + Record<DebugDataRecord::CUR_INSTR>(state.debug, iteration, state.program_counter); + if (iteration > 0) + Record<DebugDataRecord::NEXT_INSTR>(state.debug, iteration - 1, state.program_counter); + state.debug.max_offset = std::max<u32>(state.debug.max_offset, 1 + state.program_counter); auto LookupSourceRegister = [&](const SourceRegister& source_reg) -> const float24* { 
switch (source_reg.GetRegisterType()) { case RegisterType::Input: - return state.input_register_table[source_reg.GetIndex()]; + return &state.registers.input[source_reg.GetIndex()].x; case RegisterType::Temporary: - return &state.temporary_registers[source_reg.GetIndex()].x; + return &state.registers.temporary[source_reg.GetIndex()].x; case RegisterType::FloatUniform: return &uniforms.f[source_reg.GetIndex()].x; @@ -153,8 +120,8 @@ static void ProcessShaderCode(VertexShaderState& state) { src2[3] = src2[3] * float24::FromFloat32(-1); } - float24* dest = (instr.common.dest.Value() < 0x10) ? &state.output_registers[instr.common.dest.Value().GetIndex()][0] - : (instr.common.dest.Value() < 0x20) ? &state.temporary_registers[instr.common.dest.Value().GetIndex()][0] + float24* dest = (instr.common.dest.Value() < 0x10) ? &state.registers.output[instr.common.dest.Value().GetIndex()][0] + : (instr.common.dest.Value() < 0x20) ? &state.registers.temporary[instr.common.dest.Value().GetIndex()][0] : dummy_vec4_float24; state.debug.max_opdesc_id = std::max<u32>(state.debug.max_opdesc_id, 1+instr.common.operand_desc_id); @@ -162,58 +129,78 @@ static void ProcessShaderCode(VertexShaderState& state) { switch (instr.opcode.Value().EffectiveOpCode()) { case OpCode::Id::ADD: { + Record<DebugDataRecord::SRC1>(state.debug, iteration, src1); + Record<DebugDataRecord::SRC2>(state.debug, iteration, src2); + Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest); for (int i = 0; i < 4; ++i) { if (!swizzle.DestComponentEnabled(i)) continue; dest[i] = src1[i] + src2[i]; } - + Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest); break; } case OpCode::Id::MUL: { + Record<DebugDataRecord::SRC1>(state.debug, iteration, src1); + Record<DebugDataRecord::SRC2>(state.debug, iteration, src2); + Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest); for (int i = 0; i < 4; ++i) { if (!swizzle.DestComponentEnabled(i)) continue; dest[i] = src1[i] * src2[i]; } - + 
Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest); break; } case OpCode::Id::FLR: + Record<DebugDataRecord::SRC1>(state.debug, iteration, src1); + Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest); for (int i = 0; i < 4; ++i) { if (!swizzle.DestComponentEnabled(i)) continue; dest[i] = float24::FromFloat32(std::floor(src1[i].ToFloat32())); } + Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest); break; case OpCode::Id::MAX: + Record<DebugDataRecord::SRC1>(state.debug, iteration, src1); + Record<DebugDataRecord::SRC2>(state.debug, iteration, src2); + Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest); for (int i = 0; i < 4; ++i) { if (!swizzle.DestComponentEnabled(i)) continue; dest[i] = std::max(src1[i], src2[i]); } + Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest); break; case OpCode::Id::MIN: + Record<DebugDataRecord::SRC1>(state.debug, iteration, src1); + Record<DebugDataRecord::SRC2>(state.debug, iteration, src2); + Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest); for (int i = 0; i < 4; ++i) { if (!swizzle.DestComponentEnabled(i)) continue; dest[i] = std::min(src1[i], src2[i]); } + Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest); break; case OpCode::Id::DP3: case OpCode::Id::DP4: { + Record<DebugDataRecord::SRC1>(state.debug, iteration, src1); + Record<DebugDataRecord::SRC2>(state.debug, iteration, src2); + Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest); float24 dot = float24::FromFloat32(0.f); int num_components = (instr.opcode.Value() == OpCode::Id::DP3) ? 
3 : 4; for (int i = 0; i < num_components; ++i) @@ -225,12 +212,15 @@ static void ProcessShaderCode(VertexShaderState& state) { dest[i] = dot; } + Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest); break; } // Reciprocal case OpCode::Id::RCP: { + Record<DebugDataRecord::SRC1>(state.debug, iteration, src1); + Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest); for (int i = 0; i < 4; ++i) { if (!swizzle.DestComponentEnabled(i)) continue; @@ -239,13 +229,15 @@ static void ProcessShaderCode(VertexShaderState& state) { // TODO: I think this might be wrong... we should only use one component here dest[i] = float24::FromFloat32(1.0f / src1[i].ToFloat32()); } - + Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest); break; } // Reciprocal Square Root case OpCode::Id::RSQ: { + Record<DebugDataRecord::SRC1>(state.debug, iteration, src1); + Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest); for (int i = 0; i < 4; ++i) { if (!swizzle.DestComponentEnabled(i)) continue; @@ -254,12 +246,13 @@ static void ProcessShaderCode(VertexShaderState& state) { // TODO: I think this might be wrong... 
we should only use one component here dest[i] = float24::FromFloat32(1.0f / sqrt(src1[i].ToFloat32())); } - + Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest); break; } case OpCode::Id::MOVA: { + Record<DebugDataRecord::SRC1>(state.debug, iteration, src1); for (int i = 0; i < 2; ++i) { if (!swizzle.DestComponentEnabled(i)) continue; @@ -267,32 +260,41 @@ static void ProcessShaderCode(VertexShaderState& state) { // TODO: Figure out how the rounding is done on hardware state.address_registers[i] = static_cast<s32>(src1[i].ToFloat32()); } - + Record<DebugDataRecord::ADDR_REG_OUT>(state.debug, iteration, state.address_registers); break; } case OpCode::Id::MOV: { + Record<DebugDataRecord::SRC1>(state.debug, iteration, src1); + Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest); for (int i = 0; i < 4; ++i) { if (!swizzle.DestComponentEnabled(i)) continue; dest[i] = src1[i]; } + Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest); break; } case OpCode::Id::SLT: case OpCode::Id::SLTI: + Record<DebugDataRecord::SRC1>(state.debug, iteration, src1); + Record<DebugDataRecord::SRC2>(state.debug, iteration, src2); + Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest); for (int i = 0; i < 4; ++i) { if (!swizzle.DestComponentEnabled(i)) continue; dest[i] = (src1[i] < src2[i]) ? float24::FromFloat32(1.0f) : float24::FromFloat32(0.0f); } + Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest); break; case OpCode::Id::CMP: + Record<DebugDataRecord::SRC1>(state.debug, iteration, src1); + Record<DebugDataRecord::SRC2>(state.debug, iteration, src2); for (int i = 0; i < 2; ++i) { // TODO: Can you restrict to one compare via dest masking? @@ -300,27 +302,27 @@ static void ProcessShaderCode(VertexShaderState& state) { auto op = (i == 0) ? 
compare_op.x.Value() : compare_op.y.Value(); switch (op) { - case compare_op.Equal: + case Instruction::Common::CompareOpType::Equal: state.conditional_code[i] = (src1[i] == src2[i]); break; - case compare_op.NotEqual: + case Instruction::Common::CompareOpType::NotEqual: state.conditional_code[i] = (src1[i] != src2[i]); break; - case compare_op.LessThan: + case Instruction::Common::CompareOpType::LessThan: state.conditional_code[i] = (src1[i] < src2[i]); break; - case compare_op.LessEqual: + case Instruction::Common::CompareOpType::LessEqual: state.conditional_code[i] = (src1[i] <= src2[i]); break; - case compare_op.GreaterThan: + case Instruction::Common::CompareOpType::GreaterThan: state.conditional_code[i] = (src1[i] > src2[i]); break; - case compare_op.GreaterEqual: + case Instruction::Common::CompareOpType::GreaterEqual: state.conditional_code[i] = (src1[i] >= src2[i]); break; @@ -329,6 +331,7 @@ static void ProcessShaderCode(VertexShaderState& state) { break; } } + Record<DebugDataRecord::CMP_RESULT>(state.debug, iteration, state.conditional_code); break; default: @@ -394,16 +397,21 @@ static void ProcessShaderCode(VertexShaderState& state) { src3[3] = src3[3] * float24::FromFloat32(-1); } - float24* dest = (instr.mad.dest.Value() < 0x10) ? &state.output_registers[instr.mad.dest.Value().GetIndex()][0] - : (instr.mad.dest.Value() < 0x20) ? &state.temporary_registers[instr.mad.dest.Value().GetIndex()][0] + float24* dest = (instr.mad.dest.Value() < 0x10) ? &state.registers.output[instr.mad.dest.Value().GetIndex()][0] + : (instr.mad.dest.Value() < 0x20) ? 
&state.registers.temporary[instr.mad.dest.Value().GetIndex()][0] : dummy_vec4_float24; + Record<DebugDataRecord::SRC1>(state.debug, iteration, src1); + Record<DebugDataRecord::SRC2>(state.debug, iteration, src2); + Record<DebugDataRecord::SRC3>(state.debug, iteration, src3); + Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest); for (int i = 0; i < 4; ++i) { if (!swizzle.DestComponentEnabled(i)) continue; dest[i] = src1[i] * src2[i] + src3[i]; } + Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest); } else { LOG_ERROR(HW_GPU, "Unhandled multiply-add instruction: 0x%02x (%s): 0x%08x", (int)instr.opcode.Value().EffectiveOpCode(), instr.opcode.Value().GetInfo().name, instr.hex); @@ -413,7 +421,7 @@ static void ProcessShaderCode(VertexShaderState& state) { default: { - static auto evaluate_condition = [](const VertexShaderState& state, bool refx, bool refy, Instruction::FlowControlType flow_control) { + static auto evaluate_condition = [](const UnitState<Debug>& state, bool refx, bool refy, Instruction::FlowControlType flow_control) { bool results[2] = { refx == state.conditional_code[0], refy == state.conditional_code[1] }; @@ -439,12 +447,14 @@ static void ProcessShaderCode(VertexShaderState& state) { break; case OpCode::Id::JMPC: + Record<DebugDataRecord::COND_CMP_IN>(state.debug, iteration, state.conditional_code); if (evaluate_condition(state, instr.flow_control.refx, instr.flow_control.refy, instr.flow_control)) { state.program_counter = instr.flow_control.dest_offset - 1; } break; case OpCode::Id::JMPU: + Record<DebugDataRecord::COND_BOOL_IN>(state.debug, iteration, uniforms.b[instr.flow_control.bool_uniform_id]); if (uniforms.b[instr.flow_control.bool_uniform_id]) { state.program_counter = instr.flow_control.dest_offset - 1; } @@ -458,6 +468,7 @@ static void ProcessShaderCode(VertexShaderState& state) { break; case OpCode::Id::CALLU: + Record<DebugDataRecord::COND_BOOL_IN>(state.debug, iteration, 
uniforms.b[instr.flow_control.bool_uniform_id]); if (uniforms.b[instr.flow_control.bool_uniform_id]) { call(state, instr.flow_control.dest_offset, @@ -467,6 +478,7 @@ static void ProcessShaderCode(VertexShaderState& state) { break; case OpCode::Id::CALLC: + Record<DebugDataRecord::COND_CMP_IN>(state.debug, iteration, state.conditional_code); if (evaluate_condition(state, instr.flow_control.refx, instr.flow_control.refy, instr.flow_control)) { call(state, instr.flow_control.dest_offset, @@ -479,6 +491,7 @@ static void ProcessShaderCode(VertexShaderState& state) { break; case OpCode::Id::IFU: + Record<DebugDataRecord::COND_BOOL_IN>(state.debug, iteration, uniforms.b[instr.flow_control.bool_uniform_id]); if (uniforms.b[instr.flow_control.bool_uniform_id]) { call(state, state.program_counter + 1, @@ -497,6 +510,7 @@ static void ProcessShaderCode(VertexShaderState& state) { { // TODO: Do we need to consider swizzlers here? + Record<DebugDataRecord::COND_CMP_IN>(state.debug, iteration, state.conditional_code); if (evaluate_condition(state, instr.flow_control.refx, instr.flow_control.refy, instr.flow_control)) { call(state, state.program_counter + 1, @@ -514,14 +528,19 @@ static void ProcessShaderCode(VertexShaderState& state) { case OpCode::Id::LOOP: { - state.address_registers[2] = uniforms.i[instr.flow_control.int_uniform_id].y; + Math::Vec4<u8> loop_param(uniforms.i[instr.flow_control.int_uniform_id].x, + uniforms.i[instr.flow_control.int_uniform_id].y, + uniforms.i[instr.flow_control.int_uniform_id].z, + uniforms.i[instr.flow_control.int_uniform_id].w); + state.address_registers[2] = loop_param.y; + Record<DebugDataRecord::LOOP_INT_IN>(state.debug, iteration, loop_param); call(state, state.program_counter + 1, instr.flow_control.dest_offset - state.program_counter + 1, instr.flow_control.dest_offset + 1, - uniforms.i[instr.flow_control.int_uniform_id].x, - uniforms.i[instr.flow_control.int_uniform_id].z); + loop_param.x, + loop_param.z); break; } @@ -536,85 +555,13 
@@ static void ProcessShaderCode(VertexShaderState& state) { } ++state.program_counter; - - if (exit_loop) - break; + ++iteration; } } -static Common::Profiling::TimingCategory shader_category("Vertex Shader"); - -OutputVertex RunShader(const InputVertex& input, int num_attributes, const Regs::ShaderConfig& config, const State::ShaderSetup& setup) { - Common::Profiling::ScopeTimer timer(shader_category); - - VertexShaderState state; - - state.program_counter = config.main_offset; - state.debug.max_offset = 0; - state.debug.max_opdesc_id = 0; - - // Setup input register table - const auto& attribute_register_map = config.input_register_map; - float24 dummy_register; - boost::fill(state.input_register_table, &dummy_register); - - if (num_attributes > 0) state.input_register_table[attribute_register_map.attribute0_register] = &input.attr[0].x; - if (num_attributes > 1) state.input_register_table[attribute_register_map.attribute1_register] = &input.attr[1].x; - if (num_attributes > 2) state.input_register_table[attribute_register_map.attribute2_register] = &input.attr[2].x; - if (num_attributes > 3) state.input_register_table[attribute_register_map.attribute3_register] = &input.attr[3].x; - if (num_attributes > 4) state.input_register_table[attribute_register_map.attribute4_register] = &input.attr[4].x; - if (num_attributes > 5) state.input_register_table[attribute_register_map.attribute5_register] = &input.attr[5].x; - if (num_attributes > 6) state.input_register_table[attribute_register_map.attribute6_register] = &input.attr[6].x; - if (num_attributes > 7) state.input_register_table[attribute_register_map.attribute7_register] = &input.attr[7].x; - if (num_attributes > 8) state.input_register_table[attribute_register_map.attribute8_register] = &input.attr[8].x; - if (num_attributes > 9) state.input_register_table[attribute_register_map.attribute9_register] = &input.attr[9].x; - if (num_attributes > 10) 
state.input_register_table[attribute_register_map.attribute10_register] = &input.attr[10].x; - if (num_attributes > 11) state.input_register_table[attribute_register_map.attribute11_register] = &input.attr[11].x; - if (num_attributes > 12) state.input_register_table[attribute_register_map.attribute12_register] = &input.attr[12].x; - if (num_attributes > 13) state.input_register_table[attribute_register_map.attribute13_register] = &input.attr[13].x; - if (num_attributes > 14) state.input_register_table[attribute_register_map.attribute14_register] = &input.attr[14].x; - if (num_attributes > 15) state.input_register_table[attribute_register_map.attribute15_register] = &input.attr[15].x; - - state.conditional_code[0] = false; - state.conditional_code[1] = false; - - ProcessShaderCode(state); - DebugUtils::DumpShader(setup.program_code.data(), state.debug.max_offset, setup.swizzle_data.data(), - state.debug.max_opdesc_id, config.main_offset, - g_state.regs.vs_output_attributes); // TODO: Don't hardcode VS here - - // Setup output data - OutputVertex ret; - // TODO(neobrain): Under some circumstances, up to 16 attributes may be output. We need to - // figure out what those circumstances are and enable the remaining outputs then. - for (int i = 0; i < 7; ++i) { - const auto& output_register_map = g_state.regs.vs_output_attributes[i]; // TODO: Don't hardcode VS here - - u32 semantics[4] = { - output_register_map.map_x, output_register_map.map_y, - output_register_map.map_z, output_register_map.map_w - }; - - for (int comp = 0; comp < 4; ++comp) { - float24* out = ((float24*)&ret) + semantics[comp]; - if (semantics[comp] != Regs::VSOutputAttributes::INVALID) { - *out = state.output_registers[i][comp]; - } else { - // Zero output so that attributes which aren't output won't have denormals in them, - // which would slow us down later. 
- memset(out, 0, sizeof(*out)); - } - } - } - - LOG_TRACE(Render_Software, "Output vertex: pos (%.2f, %.2f, %.2f, %.2f), col(%.2f, %.2f, %.2f, %.2f), tc0(%.2f, %.2f)", - ret.pos.x.ToFloat32(), ret.pos.y.ToFloat32(), ret.pos.z.ToFloat32(), ret.pos.w.ToFloat32(), - ret.color.x.ToFloat32(), ret.color.y.ToFloat32(), ret.color.z.ToFloat32(), ret.color.w.ToFloat32(), - ret.tc0.u().ToFloat32(), ret.tc0.v().ToFloat32()); - - return ret; -} - +// Explicit instantiation +template void RunInterpreter(UnitState<false>& state); +template void RunInterpreter(UnitState<true>& state); } // namespace diff --git a/src/video_core/shader/shader_interpreter.h b/src/video_core/shader/shader_interpreter.h new file mode 100644 index 000000000..71bcad5ac --- /dev/null +++ b/src/video_core/shader/shader_interpreter.h @@ -0,0 +1,20 @@ +// Copyright 2014 Citra Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include "video_core/pica.h" + +#include "shader.h" + +namespace Pica { + +namespace Shader { + +template<bool Debug> +void RunInterpreter(UnitState<Debug>& state); + +} // namespace + +} // namespace diff --git a/src/video_core/shader/shader_jit_x64.cpp b/src/video_core/shader/shader_jit_x64.cpp new file mode 100644 index 000000000..836942c6b --- /dev/null +++ b/src/video_core/shader/shader_jit_x64.cpp @@ -0,0 +1,675 @@ +// Copyright 2015 Citra Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#include <smmintrin.h> + +#include "common/x64/abi.h" +#include "common/x64/cpu_detect.h" +#include "common/x64/emitter.h" + +#include "shader.h" +#include "shader_jit_x64.h" + +namespace Pica { + +namespace Shader { + +using namespace Gen; + +typedef void (JitCompiler::*JitFunction)(Instruction instr); + +const JitFunction instr_table[64] = { + &JitCompiler::Compile_ADD, // add + &JitCompiler::Compile_DP3, // dp3 + &JitCompiler::Compile_DP4, // dp4 + nullptr, // dph + nullptr, // unknown + nullptr, // ex2 + nullptr, // lg2 + nullptr, // unknown + &JitCompiler::Compile_MUL, // mul + nullptr, // lge + nullptr, // slt + &JitCompiler::Compile_FLR, // flr + &JitCompiler::Compile_MAX, // max + &JitCompiler::Compile_MIN, // min + &JitCompiler::Compile_RCP, // rcp + &JitCompiler::Compile_RSQ, // rsq + nullptr, // unknown + nullptr, // unknown + &JitCompiler::Compile_MOVA, // mova + &JitCompiler::Compile_MOV, // mov + nullptr, // unknown + nullptr, // unknown + nullptr, // unknown + nullptr, // unknown + nullptr, // dphi + nullptr, // unknown + nullptr, // sgei + &JitCompiler::Compile_SLTI, // slti + nullptr, // unknown + nullptr, // unknown + nullptr, // unknown + nullptr, // unknown + nullptr, // unknown + &JitCompiler::Compile_NOP, // nop + &JitCompiler::Compile_END, // end + nullptr, // break + &JitCompiler::Compile_CALL, // call + &JitCompiler::Compile_CALLC, // callc + &JitCompiler::Compile_CALLU, // callu + &JitCompiler::Compile_IF, // ifu + &JitCompiler::Compile_IF, // ifc + &JitCompiler::Compile_LOOP, // loop + nullptr, // emit + nullptr, // sete + &JitCompiler::Compile_JMP, // jmpc + &JitCompiler::Compile_JMP, // jmpu + &JitCompiler::Compile_CMP, // cmp + &JitCompiler::Compile_CMP, // cmp + &JitCompiler::Compile_MAD, // madi + &JitCompiler::Compile_MAD, // madi + &JitCompiler::Compile_MAD, // madi + &JitCompiler::Compile_MAD, // madi + &JitCompiler::Compile_MAD, // madi + &JitCompiler::Compile_MAD, // madi + &JitCompiler::Compile_MAD, // madi + 
&JitCompiler::Compile_MAD, // madi + &JitCompiler::Compile_MAD, // mad + &JitCompiler::Compile_MAD, // mad + &JitCompiler::Compile_MAD, // mad + &JitCompiler::Compile_MAD, // mad + &JitCompiler::Compile_MAD, // mad + &JitCompiler::Compile_MAD, // mad + &JitCompiler::Compile_MAD, // mad + &JitCompiler::Compile_MAD, // mad +}; + +// The following is used to alias some commonly used registers. Generally, RAX-RDX and XMM0-XMM3 can +// be used as scratch registers within a compiler function. The other registers have designated +// purposes, as documented below: + +/// Pointer to the uniform memory +static const X64Reg UNIFORMS = R9; +/// The two 32-bit VS address offset registers set by the MOVA instruction +static const X64Reg ADDROFFS_REG_0 = R10; +static const X64Reg ADDROFFS_REG_1 = R11; +/// VS loop count register +static const X64Reg LOOPCOUNT_REG = R12; +/// Current VS loop iteration number (we could probably use LOOPCOUNT_REG, but this quicker) +static const X64Reg LOOPCOUNT = RSI; +/// Number to increment LOOPCOUNT_REG by on each loop iteration +static const X64Reg LOOPINC = RDI; +/// Result of the previous CMP instruction for the X-component comparison +static const X64Reg COND0 = R13; +/// Result of the previous CMP instruction for the Y-component comparison +static const X64Reg COND1 = R14; +/// Pointer to the UnitState instance for the current VS unit +static const X64Reg REGISTERS = R15; +/// SIMD scratch register +static const X64Reg SCRATCH = XMM0; +/// Loaded with the first swizzled source register, otherwise can be used as a scratch register +static const X64Reg SRC1 = XMM1; +/// Loaded with the second swizzled source register, otherwise can be used as a scratch register +static const X64Reg SRC2 = XMM2; +/// Loaded with the third swizzled source register, otherwise can be used as a scratch register +static const X64Reg SRC3 = XMM3; +/// Constant vector of [1.0f, 1.0f, 1.0f, 1.0f], used to efficiently set a vector to one +static const X64Reg ONE = 
XMM14; +/// Constant vector of [-0.f, -0.f, -0.f, -0.f], used to efficiently negate a vector with XOR +static const X64Reg NEGBIT = XMM15; + +/// Raw constant for the source register selector that indicates no swizzling is performed +static const u8 NO_SRC_REG_SWIZZLE = 0x1b; +/// Raw constant for the destination register enable mask that indicates all components are enabled +static const u8 NO_DEST_REG_MASK = 0xf; + +/** + * Loads and swizzles a source register into the specified XMM register. + * @param instr VS instruction, used for determining how to load the source register + * @param src_num Number indicating which source register to load (1 = src1, 2 = src2, 3 = src3) + * @param src_reg SourceRegister object corresponding to the source register to load + * @param dest Destination XMM register to store the loaded, swizzled source register + */ +void JitCompiler::Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg, X64Reg dest) { + X64Reg src_ptr; + int src_offset; + + if (src_reg.GetRegisterType() == RegisterType::FloatUniform) { + src_ptr = UNIFORMS; + src_offset = src_reg.GetIndex() * sizeof(float24) * 4; + } else { + src_ptr = REGISTERS; + src_offset = UnitState<false>::InputOffset(src_reg); + } + + unsigned operand_desc_id; + if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MAD || + instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI) { + // The MAD and MADI instructions do not use the address offset registers, so loading the + // source is a bit simpler here + + operand_desc_id = instr.mad.operand_desc_id; + + // Load the source + MOVAPS(dest, MDisp(src_ptr, src_offset)); + } else { + operand_desc_id = instr.common.operand_desc_id; + + const bool is_inverted = (0 != (instr.opcode.Value().GetInfo().subtype & OpCode::Info::SrcInversed)); + unsigned offset_src = is_inverted ? 
2 : 1; + + if (src_num == offset_src && instr.common.address_register_index != 0) { + switch (instr.common.address_register_index) { + case 1: // address offset 1 + MOVAPS(dest, MComplex(src_ptr, ADDROFFS_REG_0, 1, src_offset)); + break; + case 2: // address offset 2 + MOVAPS(dest, MComplex(src_ptr, ADDROFFS_REG_1, 1, src_offset)); + break; + case 3: // adddress offet 3 + MOVAPS(dest, MComplex(src_ptr, LOOPCOUNT_REG, 1, src_offset)); + break; + default: + UNREACHABLE(); + break; + } + } else { + // Load the source + MOVAPS(dest, MDisp(src_ptr, src_offset)); + } + } + + SwizzlePattern swiz = { g_state.vs.swizzle_data[operand_desc_id] }; + + // Generate instructions for source register swizzling as needed + u8 sel = swiz.GetRawSelector(src_num); + if (sel != NO_SRC_REG_SWIZZLE) { + // Selector component order needs to be reversed for the SHUFPS instruction + sel = ((sel & 0xc0) >> 6) | ((sel & 3) << 6) | ((sel & 0xc) << 2) | ((sel & 0x30) >> 2); + + // Shuffle inputs for swizzle + SHUFPS(dest, R(dest), sel); + } + + // If the source register should be negated, flip the negative bit using XOR + const bool negate[] = { swiz.negate_src1, swiz.negate_src2, swiz.negate_src3 }; + if (negate[src_num - 1]) { + XORPS(dest, R(NEGBIT)); + } +} + +void JitCompiler::Compile_DestEnable(Instruction instr,X64Reg src) { + DestRegister dest; + unsigned operand_desc_id; + if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MAD || + instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI) { + operand_desc_id = instr.mad.operand_desc_id; + dest = instr.mad.dest.Value(); + } else { + operand_desc_id = instr.common.operand_desc_id; + dest = instr.common.dest.Value(); + } + + SwizzlePattern swiz = { g_state.vs.swizzle_data[operand_desc_id] }; + + // If all components are enabled, write the result to the destination register + if (swiz.dest_mask == NO_DEST_REG_MASK) { + // Store dest back to memory + MOVAPS(MDisp(REGISTERS, UnitState<false>::OutputOffset(dest)), src); + + } else { + 
// Not all components are enabled, so mask the result when storing to the destination register... + MOVAPS(SCRATCH, MDisp(REGISTERS, UnitState<false>::OutputOffset(dest))); + + if (Common::GetCPUCaps().sse4_1) { + u8 mask = ((swiz.dest_mask & 1) << 3) | ((swiz.dest_mask & 8) >> 3) | ((swiz.dest_mask & 2) << 1) | ((swiz.dest_mask & 4) >> 1); + BLENDPS(SCRATCH, R(src), mask); + } else { + MOVAPS(XMM4, R(src)); + UNPCKHPS(XMM4, R(SCRATCH)); // Unpack X/Y components of source and destination + UNPCKLPS(SCRATCH, R(src)); // Unpack Z/W components of source and destination + + // Compute selector to selectively copy source components to destination for SHUFPS instruction + u8 sel = ((swiz.DestComponentEnabled(0) ? 1 : 0) << 0) | + ((swiz.DestComponentEnabled(1) ? 3 : 2) << 2) | + ((swiz.DestComponentEnabled(2) ? 0 : 1) << 4) | + ((swiz.DestComponentEnabled(3) ? 2 : 3) << 6); + SHUFPS(SCRATCH, R(XMM4), sel); + } + + // Store dest back to memory + MOVAPS(MDisp(REGISTERS, UnitState<false>::OutputOffset(dest)), SCRATCH); + } +} + +void JitCompiler::Compile_EvaluateCondition(Instruction instr) { + // Note: NXOR is used below to check for equality + switch (instr.flow_control.op) { + case Instruction::FlowControlType::Or: + MOV(32, R(RAX), R(COND0)); + MOV(32, R(RBX), R(COND1)); + XOR(32, R(RAX), Imm32(instr.flow_control.refx.Value() ^ 1)); + XOR(32, R(RBX), Imm32(instr.flow_control.refy.Value() ^ 1)); + OR(32, R(RAX), R(RBX)); + break; + + case Instruction::FlowControlType::And: + MOV(32, R(RAX), R(COND0)); + MOV(32, R(RBX), R(COND1)); + XOR(32, R(RAX), Imm32(instr.flow_control.refx.Value() ^ 1)); + XOR(32, R(RBX), Imm32(instr.flow_control.refy.Value() ^ 1)); + AND(32, R(RAX), R(RBX)); + break; + + case Instruction::FlowControlType::JustX: + MOV(32, R(RAX), R(COND0)); + XOR(32, R(RAX), Imm32(instr.flow_control.refx.Value() ^ 1)); + break; + + case Instruction::FlowControlType::JustY: + MOV(32, R(RAX), R(COND1)); + XOR(32, R(RAX), Imm32(instr.flow_control.refy.Value() ^ 1)); + 
break; + } +} + +void JitCompiler::Compile_UniformCondition(Instruction instr) { + int offset = offsetof(decltype(g_state.vs.uniforms), b) + (instr.flow_control.bool_uniform_id * sizeof(bool)); + CMP(sizeof(bool) * 8, MDisp(UNIFORMS, offset), Imm8(0)); +} + +void JitCompiler::Compile_ADD(Instruction instr) { + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); + ADDPS(SRC1, R(SRC2)); + Compile_DestEnable(instr, SRC1); +} + +void JitCompiler::Compile_DP3(Instruction instr) { + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); + + if (Common::GetCPUCaps().sse4_1) { + DPPS(SRC1, R(SRC2), 0x7f); + } else { + MULPS(SRC1, R(SRC2)); + + MOVAPS(SRC2, R(SRC1)); + SHUFPS(SRC2, R(SRC2), _MM_SHUFFLE(1, 1, 1, 1)); + + MOVAPS(SRC3, R(SRC1)); + SHUFPS(SRC3, R(SRC3), _MM_SHUFFLE(2, 2, 2, 2)); + + SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 0, 0, 0)); + ADDPS(SRC1, R(SRC2)); + ADDPS(SRC1, R(SRC3)); + } + + Compile_DestEnable(instr, SRC1); +} + +void JitCompiler::Compile_DP4(Instruction instr) { + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); + + if (Common::GetCPUCaps().sse4_1) { + DPPS(SRC1, R(SRC2), 0xff); + } else { + MULPS(SRC1, R(SRC2)); + + MOVAPS(SRC2, R(SRC1)); + SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY + ADDPS(SRC1, R(SRC2)); + + MOVAPS(SRC2, R(SRC1)); + SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX + ADDPS(SRC1, R(SRC2)); + } + + Compile_DestEnable(instr, SRC1); +} + +void JitCompiler::Compile_MUL(Instruction instr) { + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); + MULPS(SRC1, R(SRC2)); + Compile_DestEnable(instr, SRC1); +} + +void JitCompiler::Compile_FLR(Instruction instr) { + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + + if (Common::GetCPUCaps().sse4_1) { + 
ROUNDFLOORPS(SRC1, R(SRC1)); + } else { + CVTPS2DQ(SRC1, R(SRC1)); + CVTDQ2PS(SRC1, R(SRC1)); + } + + Compile_DestEnable(instr, SRC1); +} + +void JitCompiler::Compile_MAX(Instruction instr) { + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); + MAXPS(SRC1, R(SRC2)); + Compile_DestEnable(instr, SRC1); +} + +void JitCompiler::Compile_MIN(Instruction instr) { + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); + MINPS(SRC1, R(SRC2)); + Compile_DestEnable(instr, SRC1); +} + +void JitCompiler::Compile_MOVA(Instruction instr) { + SwizzlePattern swiz = { g_state.vs.swizzle_data[instr.common.operand_desc_id] }; + + if (!swiz.DestComponentEnabled(0) && !swiz.DestComponentEnabled(1)) { + return; // NoOp + } + + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + + // Convert floats to integers (only care about X and Y components) + CVTPS2DQ(SRC1, R(SRC1)); + + // Get result + MOVQ_xmm(R(RAX), SRC1); + + // Handle destination enable + if (swiz.DestComponentEnabled(0) && swiz.DestComponentEnabled(1)) { + // Move and sign-extend low 32 bits + MOVSX(64, 32, ADDROFFS_REG_0, R(RAX)); + + // Move and sign-extend high 32 bits + SHR(64, R(RAX), Imm8(32)); + MOVSX(64, 32, ADDROFFS_REG_1, R(RAX)); + + // Multiply by 16 to be used as an offset later + SHL(64, R(ADDROFFS_REG_0), Imm8(4)); + SHL(64, R(ADDROFFS_REG_1), Imm8(4)); + } else { + if (swiz.DestComponentEnabled(0)) { + // Move and sign-extend low 32 bits + MOVSX(64, 32, ADDROFFS_REG_0, R(RAX)); + + // Multiply by 16 to be used as an offset later + SHL(64, R(ADDROFFS_REG_0), Imm8(4)); + } else if (swiz.DestComponentEnabled(1)) { + // Move and sign-extend high 32 bits + SHR(64, R(RAX), Imm8(32)); + MOVSX(64, 32, ADDROFFS_REG_1, R(RAX)); + + // Multiply by 16 to be used as an offset later + SHL(64, R(ADDROFFS_REG_1), Imm8(4)); + } + } +} + +void JitCompiler::Compile_MOV(Instruction instr) { + 
Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + Compile_DestEnable(instr, SRC1); +} + +void JitCompiler::Compile_SLTI(Instruction instr) { + Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1); + Compile_SwizzleSrc(instr, 1, instr.common.src2i, SRC2); + + CMPSS(SRC1, R(SRC2), CMP_LT); + ANDPS(SRC1, R(ONE)); + + Compile_DestEnable(instr, SRC1); +} + +void JitCompiler::Compile_RCP(Instruction instr) { + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + + // TODO(bunnei): RCPPS is a pretty rough approximation, this might cause problems if Pica + // performs this operation more accurately. This should be checked on hardware. + RCPPS(SRC1, R(SRC1)); + + Compile_DestEnable(instr, SRC1); +} + +void JitCompiler::Compile_RSQ(Instruction instr) { + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + + // TODO(bunnei): RSQRTPS is a pretty rough approximation, this might cause problems if Pica + // performs this operation more accurately. This should be checked on hardware. + RSQRTPS(SRC1, R(SRC1)); + + Compile_DestEnable(instr, SRC1); +} + +void JitCompiler::Compile_NOP(Instruction instr) { +} + +void JitCompiler::Compile_END(Instruction instr) { + ABI_PopAllCalleeSavedRegsAndAdjustStack(); + RET(); +} + +void JitCompiler::Compile_CALL(Instruction instr) { + unsigned offset = instr.flow_control.dest_offset; + while (offset < (instr.flow_control.dest_offset + instr.flow_control.num_instructions)) { + Compile_NextInstr(&offset); + } +} + +void JitCompiler::Compile_CALLC(Instruction instr) { + Compile_EvaluateCondition(instr); + FixupBranch b = J_CC(CC_Z, true); + Compile_CALL(instr); + SetJumpTarget(b); +} + +void JitCompiler::Compile_CALLU(Instruction instr) { + Compile_UniformCondition(instr); + FixupBranch b = J_CC(CC_Z, true); + Compile_CALL(instr); + SetJumpTarget(b); +} + +void JitCompiler::Compile_CMP(Instruction instr) { + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); + + static 
const u8 cmp[] = { CMP_EQ, CMP_NEQ, CMP_LT, CMP_LE, CMP_NLE, CMP_NLT }; + + if (instr.common.compare_op.x == instr.common.compare_op.y) { + // Compare X-component and Y-component together + CMPPS(SRC1, R(SRC2), cmp[instr.common.compare_op.x]); + + MOVQ_xmm(R(COND0), SRC1); + MOV(64, R(COND1), R(COND0)); + } else { + // Compare X-component + MOVAPS(SCRATCH, R(SRC1)); + CMPSS(SCRATCH, R(SRC2), cmp[instr.common.compare_op.x]); + + // Compare Y-component + CMPPS(SRC1, R(SRC2), cmp[instr.common.compare_op.y]); + + MOVQ_xmm(R(COND0), SCRATCH); + MOVQ_xmm(R(COND1), SRC1); + } + + SHR(32, R(COND0), Imm8(31)); // COND0 = top bit of the X compare mask (low dword) + SHR(64, R(COND1), Imm8(63)); // COND1 = top bit of the Y compare mask (high dword) +} + +void JitCompiler::Compile_MAD(Instruction instr) { + Compile_SwizzleSrc(instr, 1, instr.mad.src1, SRC1); + + if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI) { + Compile_SwizzleSrc(instr, 2, instr.mad.src2i, SRC2); + Compile_SwizzleSrc(instr, 3, instr.mad.src3i, SRC3); + } else { + Compile_SwizzleSrc(instr, 2, instr.mad.src2, SRC2); + Compile_SwizzleSrc(instr, 3, instr.mad.src3, SRC3); + } + + if (Common::GetCPUCaps().fma) { + VFMADD213PS(SRC1, SRC2, R(SRC3)); + } else { + MULPS(SRC1, R(SRC2)); + ADDPS(SRC1, R(SRC3)); + } + + Compile_DestEnable(instr, SRC1); +} + +void JitCompiler::Compile_IF(Instruction instr) { + ASSERT_MSG(instr.flow_control.dest_offset > *offset_ptr, "Backwards if-statements not supported"); + + // Evaluate the "IF" condition + if (instr.opcode.Value() == OpCode::Id::IFU) { + Compile_UniformCondition(instr); + } else if (instr.opcode.Value() == OpCode::Id::IFC) { + Compile_EvaluateCondition(instr); + } + FixupBranch b = J_CC(CC_Z, true); + + // Compile the code that corresponds to the condition evaluating as true + Compile_Block(instr.flow_control.dest_offset - 1); + + // If there isn't an "ELSE" condition, we are done here + if (instr.flow_control.num_instructions == 0) { + SetJumpTarget(b); + return; + } + + FixupBranch b2 = J(true); + + SetJumpTarget(b); + + // This code corresponds to the 
"ELSE" condition + // Compile the code that corresponds to the condition evaluating as false + Compile_Block(instr.flow_control.dest_offset + instr.flow_control.num_instructions - 1); + + SetJumpTarget(b2); +} + +void JitCompiler::Compile_LOOP(Instruction instr) { + ASSERT_MSG(instr.flow_control.dest_offset > *offset_ptr, "Backwards loops not supported"); + ASSERT_MSG(!looping, "Nested loops not supported"); + + looping = true; + + int offset = offsetof(decltype(g_state.vs.uniforms), i) + (instr.flow_control.int_uniform_id * sizeof(Math::Vec4<u8>)); + MOV(32, R(LOOPCOUNT), MDisp(UNIFORMS, offset)); + MOV(32, R(LOOPCOUNT_REG), R(LOOPCOUNT)); + SHR(32, R(LOOPCOUNT_REG), Imm8(8)); + AND(32, R(LOOPCOUNT_REG), Imm32(0xff)); // Y-component is the start + MOV(32, R(LOOPINC), R(LOOPCOUNT)); + SHR(32, R(LOOPINC), Imm8(16)); + MOVZX(32, 8, LOOPINC, R(LOOPINC)); // Z-component is the incrementer + MOVZX(32, 8, LOOPCOUNT, R(LOOPCOUNT)); // X-component is iteration count + ADD(32, R(LOOPCOUNT), Imm8(1)); // Iteration count is X-component + 1 + + auto loop_start = GetCodePtr(); + + Compile_Block(instr.flow_control.dest_offset); + + ADD(32, R(LOOPCOUNT_REG), R(LOOPINC)); // Increment LOOPCOUNT_REG by Z-component + SUB(32, R(LOOPCOUNT), Imm8(1)); // Decrement the remaining iteration count by 1 + J_CC(CC_NZ, loop_start); // Loop again while the remaining count is non-zero + + looping = false; +} + +void JitCompiler::Compile_JMP(Instruction instr) { + ASSERT_MSG(instr.flow_control.dest_offset > *offset_ptr, "Backwards jumps not supported"); + + if (instr.opcode.Value() == OpCode::Id::JMPC) + Compile_EvaluateCondition(instr); + else if (instr.opcode.Value() == OpCode::Id::JMPU) + Compile_UniformCondition(instr); + else + UNREACHABLE(); + + FixupBranch b = J_CC(CC_NZ, true); + + Compile_Block(instr.flow_control.dest_offset - 1); // Compile_Block's stop is inclusive, so stop one short: the jump must land on dest_offset itself + + SetJumpTarget(b); +} + +void JitCompiler::Compile_Block(unsigned stop) { + // Save current offset pointer + unsigned* prev_offset_ptr = offset_ptr; + unsigned offset = *prev_offset_ptr; + + while (offset <= 
stop) + Compile_NextInstr(&offset); + + // Restore the caller's offset pointer and advance its offset past the block just compiled + offset_ptr = prev_offset_ptr; + *offset_ptr = offset; +} + +void JitCompiler::Compile_NextInstr(unsigned* offset) { + offset_ptr = offset; // Make this the current offset so instruction handlers can read it through offset_ptr + + Instruction instr = *(Instruction*)&g_state.vs.program_code[(*offset_ptr)++]; // Fetch the instruction at the current offset, then advance past it + OpCode::Id opcode = instr.opcode.Value(); + auto instr_func = instr_table[static_cast<unsigned>(opcode)]; + + if (instr_func) { + // JIT the instruction! + ((*this).*instr_func)(instr); + } else { + // Unhandled instruction + LOG_CRITICAL(HW_GPU, "Unhandled instruction: 0x%02x (0x%08x)", instr.opcode.Value(), instr.hex); + } +} + +CompiledShader* JitCompiler::Compile() { + const u8* start = GetCodePtr(); + const auto& code = g_state.vs.program_code; // NOTE(review): `code` is not referenced again in this function + unsigned offset = g_state.regs.vs.main_offset; + + ABI_PushAllCalleeSavedRegsAndAdjustStack(); + + MOV(PTRBITS, R(REGISTERS), R(ABI_PARAM1)); // ABI_PARAM1 is presumably the `registers` argument of CompiledShader - confirm + MOV(PTRBITS, R(UNIFORMS), ImmPtr(&g_state.vs.uniforms)); + + // Zero address/loop registers + XOR(64, R(ADDROFFS_REG_0), R(ADDROFFS_REG_0)); + XOR(64, R(ADDROFFS_REG_1), R(ADDROFFS_REG_1)); + XOR(64, R(LOOPCOUNT_REG), R(LOOPCOUNT_REG)); + + // Used to set a register to one + static const __m128 one = { 1.f, 1.f, 1.f, 1.f }; + MOV(PTRBITS, R(RAX), ImmPtr(&one)); + MOVAPS(ONE, MDisp(RAX, 0)); + + // Used to negate registers (-0.f has only the sign bit set in each lane) + static const __m128 neg = { -0.f, -0.f, -0.f, -0.f }; + MOV(PTRBITS, R(RAX), ImmPtr(&neg)); + MOVAPS(NEGBIT, MDisp(RAX, 0)); + + looping = false; + + while (offset < g_state.vs.program_code.size()) { + Compile_NextInstr(&offset); + } + + return (CompiledShader*)start; +} + +JitCompiler::JitCompiler() { + AllocCodeSpace(1024 * 1024 * 4); +} + +void JitCompiler::Clear() { + ClearCodeSpace(); +} + +} // namespace Shader + +} // namespace Pica diff --git a/src/video_core/shader/shader_jit_x64.h b/src/video_core/shader/shader_jit_x64.h new file mode 100644 index 000000000..b88f2a0d2 --- /dev/null +++ b/src/video_core/shader/shader_jit_x64.h @@ -0,0 +1,79 @@ +// 
Copyright 2015 Citra Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <nihstro/shader_bytecode.h> + +#include "common/x64/emitter.h" + +#include "video_core/pica.h" + +#include "shader.h" + +using nihstro::Instruction; +using nihstro::OpCode; +using nihstro::SwizzlePattern; + +namespace Pica { + +namespace Shader { + +using CompiledShader = void(void* registers); + +/** + * This class implements the shader JIT compiler. It recompiles a Pica shader program into x86_64 + * code that can be executed on the host machine directly. + */ +class JitCompiler : public Gen::XCodeBlock { +public: + JitCompiler(); + + CompiledShader* Compile(); + + void Clear(); + + void Compile_ADD(Instruction instr); + void Compile_DP3(Instruction instr); + void Compile_DP4(Instruction instr); + void Compile_MUL(Instruction instr); + void Compile_FLR(Instruction instr); + void Compile_MAX(Instruction instr); + void Compile_MIN(Instruction instr); + void Compile_RCP(Instruction instr); + void Compile_RSQ(Instruction instr); + void Compile_MOVA(Instruction instr); + void Compile_MOV(Instruction instr); + void Compile_SLTI(Instruction instr); + void Compile_NOP(Instruction instr); + void Compile_END(Instruction instr); + void Compile_CALL(Instruction instr); + void Compile_CALLC(Instruction instr); + void Compile_CALLU(Instruction instr); + void Compile_IF(Instruction instr); + void Compile_LOOP(Instruction instr); + void Compile_JMP(Instruction instr); + void Compile_CMP(Instruction instr); + void Compile_MAD(Instruction instr); + +private: + void Compile_Block(unsigned stop); + void Compile_NextInstr(unsigned* offset); + + void Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg, Gen::X64Reg dest); + void Compile_DestEnable(Instruction instr, Gen::X64Reg dest); + + void Compile_EvaluateCondition(Instruction instr); + void Compile_UniformCondition(Instruction instr); + + /// Pointer 
to the variable that stores the current Pica code offset. Used to handle nested code blocks. + unsigned* offset_ptr = nullptr; + + /// Set to true if currently in a loop, used to check for the existence of nested loops + bool looping = false; +}; + +} // Shader + +} // Pica diff --git a/src/video_core/vertex_shader.h b/src/video_core/vertex_shader.h deleted file mode 100644 index 97f9250dd..000000000 --- a/src/video_core/vertex_shader.h +++ /dev/null @@ -1,73 +0,0 @@ -// Copyright 2014 Citra Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#pragma once - -#include <type_traits> - -#include "common/vector_math.h" - -#include "pica.h" - -namespace Pica { - -namespace VertexShader { - -struct InputVertex { - Math::Vec4<float24> attr[16]; -}; - -struct OutputVertex { - OutputVertex() = default; - - // VS output attributes - Math::Vec4<float24> pos; - Math::Vec4<float24> dummy; // quaternions (not implemented, yet) - Math::Vec4<float24> color; - Math::Vec2<float24> tc0; - Math::Vec2<float24> tc1; - float24 pad[6]; - Math::Vec2<float24> tc2; - - // Padding for optimal alignment - float24 pad2[4]; - - // Attributes used to store intermediate results - - // position after perspective divide - Math::Vec3<float24> screenpos; - float24 pad3; - - // Linear interpolation - // factor: 0=this, 1=vtx - void Lerp(float24 factor, const OutputVertex& vtx) { - pos = pos * factor + vtx.pos * (float24::FromFloat32(1) - factor); - - // TODO: Should perform perspective correct interpolation here... 
- tc0 = tc0 * factor + vtx.tc0 * (float24::FromFloat32(1) - factor); - tc1 = tc1 * factor + vtx.tc1 * (float24::FromFloat32(1) - factor); - tc2 = tc2 * factor + vtx.tc2 * (float24::FromFloat32(1) - factor); - - screenpos = screenpos * factor + vtx.screenpos * (float24::FromFloat32(1) - factor); - - color = color * factor + vtx.color * (float24::FromFloat32(1) - factor); - } - - // Linear interpolation - // factor: 0=v0, 1=v1 - static OutputVertex Lerp(float24 factor, const OutputVertex& v0, const OutputVertex& v1) { - OutputVertex ret = v0; - ret.Lerp(factor, v1); - return ret; - } -}; -static_assert(std::is_pod<OutputVertex>::value, "Structure is not POD"); -static_assert(sizeof(OutputVertex) == 32 * sizeof(float), "OutputVertex has invalid size"); - -OutputVertex RunShader(const InputVertex& input, int num_attributes, const Regs::ShaderConfig& config, const State::ShaderSetup& setup); - -} // namespace - -} // namespace - diff --git a/src/video_core/video_core.cpp b/src/video_core/video_core.cpp index 3becc4261..943fde5ee 100644 --- a/src/video_core/video_core.cpp +++ b/src/video_core/video_core.cpp @@ -23,6 +23,7 @@ EmuWindow* g_emu_window = nullptr; ///< Frontend emulator window RendererBase* g_renderer = nullptr; ///< Renderer plugin std::atomic<bool> g_hw_renderer_enabled; +std::atomic<bool> g_shader_jit_enabled; /// Initialize the video core void Init(EmuWindow* emu_window) { diff --git a/src/video_core/video_core.h b/src/video_core/video_core.h index 14b33c9dd..2867bf03e 100644 --- a/src/video_core/video_core.h +++ b/src/video_core/video_core.h @@ -32,8 +32,9 @@ static const int kScreenBottomHeight = 240; ///< 3DS bottom screen height extern RendererBase* g_renderer; ///< Renderer plugin extern EmuWindow* g_emu_window; ///< Emu window -// TODO: Wrap this in a user settings struct along with any other graphics settings (often set from qt ui) +// TODO: Wrap these in a user settings struct along with any other graphics settings (often set from qt ui) extern 
std::atomic<bool> g_hw_renderer_enabled; +extern std::atomic<bool> g_shader_jit_enabled; /// Start the video core void Start(); |