forked from eden-emu/eden
		
	Merge pull request #1546 from bunnei/refactor-shader-jit
Shader JIT Part 2
This commit is contained in:
		
						commit
						d89e48679e
					
				
					 7 changed files with 271 additions and 162 deletions
				
			
		|  | @ -455,6 +455,18 @@ void XEmitter::CALL(const void* fnptr) | |||
|     Write32(u32(distance)); | ||||
| } | ||||
| 
 | ||||
/**
 * Emits a 5-byte CALL (opcode byte 0xE8 followed by a 32-bit displacement) whose displacement is
 * left as zero, and returns a FixupBranch describing where that displacement must be patched.
 *
 * The returned branch has type 1, which SetJumpTarget(branch, target) treats as a 32-bit
 * displacement stored in the four bytes immediately before branch.ptr.
 *
 * @return FixupBranch whose ptr is `code + 5`, i.e. the address just past the emitted instruction
 *         (assumes `code` is the emitter's current write position when this is called - confirm)
 */
| FixupBranch XEmitter::CALL() | ||||
| { | ||||
|     FixupBranch branch; | ||||
|     branch.type = 1; | ||||
      // One opcode byte plus four displacement bytes are written below, hence the +5
|     branch.ptr = code + 5; | ||||
| 
 | ||||
      // Opcode byte, then a zero placeholder for the displacement that is patched later
|     Write8(0xE8); | ||||
|     Write32(0); | ||||
| 
 | ||||
|     return branch; | ||||
| } | ||||
| 
 | ||||
| FixupBranch XEmitter::J(bool force5bytes) | ||||
| { | ||||
|     FixupBranch branch; | ||||
|  | @ -531,6 +543,22 @@ void XEmitter::SetJumpTarget(const FixupBranch& branch) | |||
|     } | ||||
| } | ||||
| 
 | ||||
/**
 * Patches a previously emitted branch so that it lands on an explicit target address.
 *
 * The displacement is computed as `target - branch.ptr` and written into the bytes immediately
 * preceding branch.ptr. Only the emitted code bytes are modified; the FixupBranch itself is not.
 *
 * @param branch Branch to patch. Type 0 gets an 8-bit displacement, type 1 a 32-bit displacement;
 *               any other type value is ignored and nothing is written.
 * @param target Address the branch should transfer control to
 */
| void XEmitter::SetJumpTarget(const FixupBranch& branch, const u8* target) | ||||
| { | ||||
|     if (branch.type == 0) | ||||
|     { | ||||
          // Short form: the displacement must fit in a signed byte, stored in the byte before ptr
|         s64 distance = (s64)(target - branch.ptr); | ||||
|         ASSERT_MSG(distance >= -0x80 && distance < 0x80, "Jump target too far away, needs force5Bytes = true"); | ||||
|         branch.ptr[-1] = (u8)(s8)distance; | ||||
|     } | ||||
|     else if (branch.type == 1) | ||||
|     { | ||||
          // Long form: the displacement must fit in a signed 32-bit value, stored in the four bytes
          // before ptr
|         s64 distance = (s64)(target - branch.ptr); | ||||
|         ASSERT_MSG(distance >= -0x80000000LL && distance < 0x80000000LL, "Jump target too far away, needs indirect register"); | ||||
|         ((s32*)branch.ptr)[-1] = (s32)distance; | ||||
|     } | ||||
| } | ||||
| 
 | ||||
| //Single byte opcodes
 | ||||
| //There is no PUSHAD/POPAD in 64-bit mode.
 | ||||
| void XEmitter::INT3() {Write8(0xCC);} | ||||
|  |  | |||
|  | @ -425,12 +425,14 @@ public: | |||
| #undef CALL | ||||
| #endif | ||||
|     void CALL(const void* fnptr); | ||||
|     FixupBranch CALL(); | ||||
|     void CALLptr(OpArg arg); | ||||
| 
 | ||||
|     FixupBranch J_CC(CCFlags conditionCode, bool force5bytes = false); | ||||
|     void J_CC(CCFlags conditionCode, const u8* addr, bool force5Bytes = false); | ||||
| 
 | ||||
|     void SetJumpTarget(const FixupBranch& branch); | ||||
|     void SetJumpTarget(const FixupBranch& branch, const u8* target); | ||||
| 
 | ||||
|     void SETcc(CCFlags flag, OpArg dest); | ||||
|     // Note: CMOV brings small if any benefit on current cpus.
 | ||||
|  |  | |||
|  | @ -140,7 +140,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { | |||
|                         immediate_attribute_id = 0; | ||||
| 
 | ||||
|                         Shader::UnitState<false> shader_unit; | ||||
|                         Shader::Setup(shader_unit); | ||||
|                         Shader::Setup(); | ||||
| 
 | ||||
|                         if (g_debug_context) | ||||
|                             g_debug_context->OnEvent(DebugContext::Event::VertexLoaded, static_cast<void*>(&immediate_input)); | ||||
|  | @ -300,7 +300,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { | |||
|             vertex_cache_ids.fill(-1); | ||||
| 
 | ||||
|             Shader::UnitState<false> shader_unit; | ||||
|             Shader::Setup(shader_unit); | ||||
|             Shader::Setup(); | ||||
| 
 | ||||
|             for (unsigned int index = 0; index < regs.num_vertices; ++index) | ||||
|             { | ||||
|  |  | |||
|  | @ -28,36 +28,24 @@ namespace Pica { | |||
| namespace Shader { | ||||
| 
 | ||||
| #ifdef ARCHITECTURE_x86_64 | ||||
| static std::unordered_map<u64, CompiledShader*> shader_map; | ||||
| static JitCompiler jit; | ||||
| static CompiledShader* jit_shader; | ||||
| 
 | ||||
| static void ClearCache() { | ||||
|     shader_map.clear(); | ||||
|     jit.Clear(); | ||||
|     LOG_INFO(HW_GPU, "Shader JIT cache cleared"); | ||||
| } | ||||
| static std::unordered_map<u64, std::unique_ptr<JitShader>> shader_map; | ||||
| static const JitShader* jit_shader; | ||||
| #endif // ARCHITECTURE_x86_64
 | ||||
| 
 | ||||
| void Setup(UnitState<false>& state) { | ||||
| void Setup() { | ||||
| #ifdef ARCHITECTURE_x86_64 | ||||
|     if (VideoCore::g_shader_jit_enabled) { | ||||
|         u64 cache_key = (Common::ComputeHash64(&g_state.vs.program_code, sizeof(g_state.vs.program_code)) ^ | ||||
|             Common::ComputeHash64(&g_state.vs.swizzle_data, sizeof(g_state.vs.swizzle_data)) ^ | ||||
|             g_state.regs.vs.main_offset); | ||||
|             Common::ComputeHash64(&g_state.vs.swizzle_data, sizeof(g_state.vs.swizzle_data))); | ||||
| 
 | ||||
|         auto iter = shader_map.find(cache_key); | ||||
|         if (iter != shader_map.end()) { | ||||
|             jit_shader = iter->second; | ||||
|             jit_shader = iter->second.get(); | ||||
|         } else { | ||||
|             // Check if remaining JIT code space is enough for at least one more (massive) shader
 | ||||
|             if (jit.GetSpaceLeft() < jit_shader_size) { | ||||
|                 // If not, clear the cache of all previously compiled shaders
 | ||||
|                 ClearCache(); | ||||
|             } | ||||
| 
 | ||||
|             jit_shader = jit.Compile(); | ||||
|             shader_map.emplace(cache_key, jit_shader); | ||||
|             auto shader = std::make_unique<JitShader>(); | ||||
|             shader->Compile(); | ||||
|             jit_shader = shader.get(); | ||||
|             shader_map[cache_key] = std::move(shader); | ||||
|         } | ||||
|     } | ||||
| #endif // ARCHITECTURE_x86_64
 | ||||
|  | @ -65,7 +53,7 @@ void Setup(UnitState<false>& state) { | |||
| 
 | ||||
| void Shutdown() { | ||||
| #ifdef ARCHITECTURE_x86_64 | ||||
|     ClearCache(); | ||||
|     shader_map.clear(); | ||||
| #endif // ARCHITECTURE_x86_64
 | ||||
| } | ||||
| 
 | ||||
|  | @ -109,7 +97,7 @@ OutputVertex Run(UnitState<false>& state, const InputVertex& input, int num_attr | |||
| 
 | ||||
| #ifdef ARCHITECTURE_x86_64 | ||||
|     if (VideoCore::g_shader_jit_enabled) | ||||
|         jit_shader(&state.registers); | ||||
|         jit_shader->Run(&state.registers, g_state.regs.vs.main_offset); | ||||
|     else | ||||
|         RunInterpreter(state); | ||||
| #else | ||||
|  |  | |||
|  | @ -339,9 +339,8 @@ struct UnitState { | |||
| /**
 | ||||
|  * Performs any shader unit setup that only needs to happen once per shader (as opposed to once per | ||||
|  * vertex, which would happen within the `Run` function). | ||||
|  * @param state Shader unit state, must be setup per shader and per shader unit | ||||
|  */ | ||||
| void Setup(UnitState<false>& state); | ||||
| void Setup(); | ||||
| 
 | ||||
| /// Performs any cleanup when the emulator is shutdown
 | ||||
| void Shutdown(); | ||||
|  |  | |||
|  | @ -2,6 +2,7 @@ | |||
| // Licensed under GPLv2 or any later version
 | ||||
| // Refer to the license.txt file included.
 | ||||
| 
 | ||||
| #include <algorithm> | ||||
| #include <smmintrin.h> | ||||
| 
 | ||||
| #include "common/x64/abi.h" | ||||
|  | @ -19,73 +20,73 @@ namespace Shader { | |||
| 
 | ||||
| using namespace Gen; | ||||
| 
 | ||||
| typedef void (JitCompiler::*JitFunction)(Instruction instr); | ||||
| typedef void (JitShader::*JitFunction)(Instruction instr); | ||||
| 
 | ||||
| const JitFunction instr_table[64] = { | ||||
|     &JitCompiler::Compile_ADD,      // add
 | ||||
|     &JitCompiler::Compile_DP3,      // dp3
 | ||||
|     &JitCompiler::Compile_DP4,      // dp4
 | ||||
|     &JitCompiler::Compile_DPH,      // dph
 | ||||
|     &JitShader::Compile_ADD,        // add
 | ||||
|     &JitShader::Compile_DP3,        // dp3
 | ||||
|     &JitShader::Compile_DP4,        // dp4
 | ||||
|     &JitShader::Compile_DPH,        // dph
 | ||||
|     nullptr,                        // unknown
 | ||||
|     &JitCompiler::Compile_EX2,      // ex2
 | ||||
|     &JitCompiler::Compile_LG2,      // lg2
 | ||||
|     &JitShader::Compile_EX2,        // ex2
 | ||||
|     &JitShader::Compile_LG2,        // lg2
 | ||||
|     nullptr,                        // unknown
 | ||||
|     &JitCompiler::Compile_MUL,      // mul
 | ||||
|     &JitCompiler::Compile_SGE,      // sge
 | ||||
|     &JitCompiler::Compile_SLT,      // slt
 | ||||
|     &JitCompiler::Compile_FLR,      // flr
 | ||||
|     &JitCompiler::Compile_MAX,      // max
 | ||||
|     &JitCompiler::Compile_MIN,      // min
 | ||||
|     &JitCompiler::Compile_RCP,      // rcp
 | ||||
|     &JitCompiler::Compile_RSQ,      // rsq
 | ||||
|     &JitShader::Compile_MUL,        // mul
 | ||||
|     &JitShader::Compile_SGE,        // sge
 | ||||
|     &JitShader::Compile_SLT,        // slt
 | ||||
|     &JitShader::Compile_FLR,        // flr
 | ||||
|     &JitShader::Compile_MAX,        // max
 | ||||
|     &JitShader::Compile_MIN,        // min
 | ||||
|     &JitShader::Compile_RCP,        // rcp
 | ||||
|     &JitShader::Compile_RSQ,        // rsq
 | ||||
|     nullptr,                        // unknown
 | ||||
|     nullptr,                        // unknown
 | ||||
|     &JitCompiler::Compile_MOVA,     // mova
 | ||||
|     &JitCompiler::Compile_MOV,      // mov
 | ||||
|     &JitShader::Compile_MOVA,       // mova
 | ||||
|     &JitShader::Compile_MOV,        // mov
 | ||||
|     nullptr,                        // unknown
 | ||||
|     nullptr,                        // unknown
 | ||||
|     nullptr,                        // unknown
 | ||||
|     nullptr,                        // unknown
 | ||||
|     &JitCompiler::Compile_DPH,      // dphi
 | ||||
|     &JitShader::Compile_DPH,        // dphi
 | ||||
|     nullptr,                        // unknown
 | ||||
|     &JitCompiler::Compile_SGE,      // sgei
 | ||||
|     &JitCompiler::Compile_SLT,      // slti
 | ||||
|     &JitShader::Compile_SGE,        // sgei
 | ||||
|     &JitShader::Compile_SLT,        // slti
 | ||||
|     nullptr,                        // unknown
 | ||||
|     nullptr,                        // unknown
 | ||||
|     nullptr,                        // unknown
 | ||||
|     nullptr,                        // unknown
 | ||||
|     nullptr,                        // unknown
 | ||||
|     &JitCompiler::Compile_NOP,      // nop
 | ||||
|     &JitCompiler::Compile_END,      // end
 | ||||
|     &JitShader::Compile_NOP,        // nop
 | ||||
|     &JitShader::Compile_END,        // end
 | ||||
|     nullptr,                        // break
 | ||||
|     &JitCompiler::Compile_CALL,     // call
 | ||||
|     &JitCompiler::Compile_CALLC,    // callc
 | ||||
|     &JitCompiler::Compile_CALLU,    // callu
 | ||||
|     &JitCompiler::Compile_IF,       // ifu
 | ||||
|     &JitCompiler::Compile_IF,       // ifc
 | ||||
|     &JitCompiler::Compile_LOOP,     // loop
 | ||||
|     &JitShader::Compile_CALL,       // call
 | ||||
|     &JitShader::Compile_CALLC,      // callc
 | ||||
|     &JitShader::Compile_CALLU,      // callu
 | ||||
|     &JitShader::Compile_IF,         // ifu
 | ||||
|     &JitShader::Compile_IF,         // ifc
 | ||||
|     &JitShader::Compile_LOOP,       // loop
 | ||||
|     nullptr,                        // emit
 | ||||
|     nullptr,                        // sete
 | ||||
|     &JitCompiler::Compile_JMP,      // jmpc
 | ||||
|     &JitCompiler::Compile_JMP,      // jmpu
 | ||||
|     &JitCompiler::Compile_CMP,      // cmp
 | ||||
|     &JitCompiler::Compile_CMP,      // cmp
 | ||||
|     &JitCompiler::Compile_MAD,      // madi
 | ||||
|     &JitCompiler::Compile_MAD,      // madi
 | ||||
|     &JitCompiler::Compile_MAD,      // madi
 | ||||
|     &JitCompiler::Compile_MAD,      // madi
 | ||||
|     &JitCompiler::Compile_MAD,      // madi
 | ||||
|     &JitCompiler::Compile_MAD,      // madi
 | ||||
|     &JitCompiler::Compile_MAD,      // madi
 | ||||
|     &JitCompiler::Compile_MAD,      // madi
 | ||||
|     &JitCompiler::Compile_MAD,      // mad
 | ||||
|     &JitCompiler::Compile_MAD,      // mad
 | ||||
|     &JitCompiler::Compile_MAD,      // mad
 | ||||
|     &JitCompiler::Compile_MAD,      // mad
 | ||||
|     &JitCompiler::Compile_MAD,      // mad
 | ||||
|     &JitCompiler::Compile_MAD,      // mad
 | ||||
|     &JitCompiler::Compile_MAD,      // mad
 | ||||
|     &JitCompiler::Compile_MAD,      // mad
 | ||||
|     &JitShader::Compile_JMP,        // jmpc
 | ||||
|     &JitShader::Compile_JMP,        // jmpu
 | ||||
|     &JitShader::Compile_CMP,        // cmp
 | ||||
|     &JitShader::Compile_CMP,        // cmp
 | ||||
|     &JitShader::Compile_MAD,        // madi
 | ||||
|     &JitShader::Compile_MAD,        // madi
 | ||||
|     &JitShader::Compile_MAD,        // madi
 | ||||
|     &JitShader::Compile_MAD,        // madi
 | ||||
|     &JitShader::Compile_MAD,        // madi
 | ||||
|     &JitShader::Compile_MAD,        // madi
 | ||||
|     &JitShader::Compile_MAD,        // madi
 | ||||
|     &JitShader::Compile_MAD,        // madi
 | ||||
|     &JitShader::Compile_MAD,        // mad
 | ||||
|     &JitShader::Compile_MAD,        // mad
 | ||||
|     &JitShader::Compile_MAD,        // mad
 | ||||
|     &JitShader::Compile_MAD,        // mad
 | ||||
|     &JitShader::Compile_MAD,        // mad
 | ||||
|     &JitShader::Compile_MAD,        // mad
 | ||||
|     &JitShader::Compile_MAD,        // mad
 | ||||
|     &JitShader::Compile_MAD,        // mad
 | ||||
| }; | ||||
| 
 | ||||
| // The following is used to alias some commonly used registers. Generally, RAX-RDX and XMM0-XMM3 can
 | ||||
|  | @ -137,6 +138,25 @@ static const u8 NO_SRC_REG_SWIZZLE = 0x1b; | |||
| /// Raw constant for the destination register enable mask that indicates all components are enabled
 | ||||
| static const u8 NO_DEST_REG_MASK = 0xf; | ||||
| 
 | ||||
| /**
 | ||||
|  * Get the vertex shader instruction for a given offset in the current shader program | ||||
|  * @param offset Offset in the current shader program of the instruction | ||||
|  * @return Instruction at the specified offset | ||||
|  * @note The offset indexes g_state.vs.program_code directly; no range check is performed here, | ||||
|  *       so callers are responsible for keeping it below program_code.size() | ||||
|  */ | ||||
| static Instruction GetVertexShaderInstruction(size_t offset) { | ||||
|     return { g_state.vs.program_code[offset] }; | ||||
| } | ||||
| 
 | ||||
/**
 * Logs a message at critical severity on the HW_GPU log class.
 *
 * The message is supplied as an argument to a constant "%s" format instead of being used as the
 * format string itself, so any '%' characters it contains are printed literally rather than being
 * interpreted as conversion specifiers.
 *
 * @param msg Null-terminated message text to log
 */
| static void LogCritical(const char* msg) { | ||||
|     LOG_CRITICAL(HW_GPU, "%s", msg); | ||||
| } | ||||
| 
 | ||||
/**
 * Reports a failed compile-time expectation through the critical log instead of aborting.
 *
 * The condition is evaluated when this function runs. If it is false, LogCritical and the message
 * pointer are handed to ABI_CallFunctionP (presumably emitting a call into the generated shader so
 * the message is logged when that code executes - confirm against the ABI helper). Compilation
 * continues either way.
 *
 * @param condition Expectation that should hold; nothing is done when it is true
 * @param msg Message passed to LogCritical. Only the pointer is forwarded, so the string
 *            presumably needs to outlive the generated code - confirm.
 */
| void JitShader::Compile_Assert(bool condition, const char* msg) { | ||||
|     if (!condition) { | ||||
|         ABI_CallFunctionP(reinterpret_cast<const void*>(LogCritical), const_cast<char*>(msg)); | ||||
|     } | ||||
| } | ||||
| 
 | ||||
| /**
 | ||||
|  * Loads and swizzles a source register into the specified XMM register. | ||||
|  * @param instr VS instruction, used for determining how to load the source register | ||||
|  | @ -144,7 +164,7 @@ static const u8 NO_DEST_REG_MASK = 0xf; | |||
|  * @param src_reg SourceRegister object corresponding to the source register to load | ||||
|  * @param dest Destination XMM register to store the loaded, swizzled source register | ||||
|  */ | ||||
| void JitCompiler::Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg, X64Reg dest) { | ||||
| void JitShader::Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg, X64Reg dest) { | ||||
|     X64Reg src_ptr; | ||||
|     size_t src_offset; | ||||
| 
 | ||||
|  | @ -216,7 +236,7 @@ void JitCompiler::Compile_SwizzleSrc(Instruction instr, unsigned src_num, Source | |||
|     } | ||||
| } | ||||
| 
 | ||||
| void JitCompiler::Compile_DestEnable(Instruction instr,X64Reg src) { | ||||
| void JitShader::Compile_DestEnable(Instruction instr,X64Reg src) { | ||||
|     DestRegister dest; | ||||
|     unsigned operand_desc_id; | ||||
|     if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MAD || | ||||
|  | @ -263,7 +283,7 @@ void JitCompiler::Compile_DestEnable(Instruction instr,X64Reg src) { | |||
|     } | ||||
| } | ||||
| 
 | ||||
| void JitCompiler::Compile_SanitizedMul(Gen::X64Reg src1, Gen::X64Reg src2, Gen::X64Reg scratch) { | ||||
| void JitShader::Compile_SanitizedMul(Gen::X64Reg src1, Gen::X64Reg src2, Gen::X64Reg scratch) { | ||||
|     MOVAPS(scratch, R(src1)); | ||||
|     CMPPS(scratch, R(src2), CMP_ORD); | ||||
| 
 | ||||
|  | @ -276,7 +296,7 @@ void JitCompiler::Compile_SanitizedMul(Gen::X64Reg src1, Gen::X64Reg src2, Gen:: | |||
|     ANDPS(src1, R(scratch)); | ||||
| } | ||||
| 
 | ||||
| void JitCompiler::Compile_EvaluateCondition(Instruction instr) { | ||||
| void JitShader::Compile_EvaluateCondition(Instruction instr) { | ||||
|     // Note: NXOR is used below to check for equality
 | ||||
|     switch (instr.flow_control.op) { | ||||
|     case Instruction::FlowControlType::Or: | ||||
|  | @ -307,23 +327,23 @@ void JitCompiler::Compile_EvaluateCondition(Instruction instr) { | |||
|     } | ||||
| } | ||||
| 
 | ||||
| void JitCompiler::Compile_UniformCondition(Instruction instr) { | ||||
| void JitShader::Compile_UniformCondition(Instruction instr) { | ||||
|     int offset = offsetof(decltype(g_state.vs.uniforms), b) + (instr.flow_control.bool_uniform_id * sizeof(bool)); | ||||
|     CMP(sizeof(bool) * 8, MDisp(UNIFORMS, offset), Imm8(0)); | ||||
| } | ||||
| 
 | ||||
| BitSet32 JitCompiler::PersistentCallerSavedRegs() { | ||||
| BitSet32 JitShader::PersistentCallerSavedRegs() { | ||||
|     return persistent_regs & ABI_ALL_CALLER_SAVED; | ||||
| } | ||||
| 
 | ||||
| void JitCompiler::Compile_ADD(Instruction instr) { | ||||
| void JitShader::Compile_ADD(Instruction instr) { | ||||
|     Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||||
|     Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); | ||||
|     ADDPS(SRC1, R(SRC2)); | ||||
|     Compile_DestEnable(instr, SRC1); | ||||
| } | ||||
| 
 | ||||
| void JitCompiler::Compile_DP3(Instruction instr) { | ||||
| void JitShader::Compile_DP3(Instruction instr) { | ||||
|     Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||||
|     Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); | ||||
| 
 | ||||
|  | @ -342,7 +362,7 @@ void JitCompiler::Compile_DP3(Instruction instr) { | |||
|     Compile_DestEnable(instr, SRC1); | ||||
| } | ||||
| 
 | ||||
| void JitCompiler::Compile_DP4(Instruction instr) { | ||||
| void JitShader::Compile_DP4(Instruction instr) { | ||||
|     Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||||
|     Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); | ||||
| 
 | ||||
|  | @ -359,7 +379,7 @@ void JitCompiler::Compile_DP4(Instruction instr) { | |||
|     Compile_DestEnable(instr, SRC1); | ||||
| } | ||||
| 
 | ||||
| void JitCompiler::Compile_DPH(Instruction instr) { | ||||
| void JitShader::Compile_DPH(Instruction instr) { | ||||
|     if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::DPHI) { | ||||
|         Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1); | ||||
|         Compile_SwizzleSrc(instr, 2, instr.common.src2i, SRC2); | ||||
|  | @ -391,7 +411,7 @@ void JitCompiler::Compile_DPH(Instruction instr) { | |||
|     Compile_DestEnable(instr, SRC1); | ||||
| } | ||||
| 
 | ||||
| void JitCompiler::Compile_EX2(Instruction instr) { | ||||
| void JitShader::Compile_EX2(Instruction instr) { | ||||
|     Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||||
|     MOVSS(XMM0, R(SRC1)); | ||||
| 
 | ||||
|  | @ -404,7 +424,7 @@ void JitCompiler::Compile_EX2(Instruction instr) { | |||
|     Compile_DestEnable(instr, SRC1); | ||||
| } | ||||
| 
 | ||||
| void JitCompiler::Compile_LG2(Instruction instr) { | ||||
| void JitShader::Compile_LG2(Instruction instr) { | ||||
|     Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||||
|     MOVSS(XMM0, R(SRC1)); | ||||
| 
 | ||||
|  | @ -417,14 +437,14 @@ void JitCompiler::Compile_LG2(Instruction instr) { | |||
|     Compile_DestEnable(instr, SRC1); | ||||
| } | ||||
| 
 | ||||
| void JitCompiler::Compile_MUL(Instruction instr) { | ||||
| void JitShader::Compile_MUL(Instruction instr) { | ||||
|     Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||||
|     Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); | ||||
|     Compile_SanitizedMul(SRC1, SRC2, SCRATCH); | ||||
|     Compile_DestEnable(instr, SRC1); | ||||
| } | ||||
| 
 | ||||
| void JitCompiler::Compile_SGE(Instruction instr) { | ||||
| void JitShader::Compile_SGE(Instruction instr) { | ||||
|     if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::SGEI) { | ||||
|         Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1); | ||||
|         Compile_SwizzleSrc(instr, 2, instr.common.src2i, SRC2); | ||||
|  | @ -439,7 +459,7 @@ void JitCompiler::Compile_SGE(Instruction instr) { | |||
|     Compile_DestEnable(instr, SRC2); | ||||
| } | ||||
| 
 | ||||
| void JitCompiler::Compile_SLT(Instruction instr) { | ||||
| void JitShader::Compile_SLT(Instruction instr) { | ||||
|     if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::SLTI) { | ||||
|         Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1); | ||||
|         Compile_SwizzleSrc(instr, 2, instr.common.src2i, SRC2); | ||||
|  | @ -454,7 +474,7 @@ void JitCompiler::Compile_SLT(Instruction instr) { | |||
|     Compile_DestEnable(instr, SRC1); | ||||
| } | ||||
| 
 | ||||
| void JitCompiler::Compile_FLR(Instruction instr) { | ||||
| void JitShader::Compile_FLR(Instruction instr) { | ||||
|     Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||||
| 
 | ||||
|     if (Common::GetCPUCaps().sse4_1) { | ||||
|  | @ -467,7 +487,7 @@ void JitCompiler::Compile_FLR(Instruction instr) { | |||
|     Compile_DestEnable(instr, SRC1); | ||||
| } | ||||
| 
 | ||||
| void JitCompiler::Compile_MAX(Instruction instr) { | ||||
| void JitShader::Compile_MAX(Instruction instr) { | ||||
|     Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||||
|     Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); | ||||
|     // SSE semantics match PICA200 ones: In case of NaN, SRC2 is returned.
 | ||||
|  | @ -475,7 +495,7 @@ void JitCompiler::Compile_MAX(Instruction instr) { | |||
|     Compile_DestEnable(instr, SRC1); | ||||
| } | ||||
| 
 | ||||
| void JitCompiler::Compile_MIN(Instruction instr) { | ||||
| void JitShader::Compile_MIN(Instruction instr) { | ||||
|     Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||||
|     Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); | ||||
|     // SSE semantics match PICA200 ones: In case of NaN, SRC2 is returned.
 | ||||
|  | @ -483,7 +503,7 @@ void JitCompiler::Compile_MIN(Instruction instr) { | |||
|     Compile_DestEnable(instr, SRC1); | ||||
| } | ||||
| 
 | ||||
| void JitCompiler::Compile_MOVA(Instruction instr) { | ||||
| void JitShader::Compile_MOVA(Instruction instr) { | ||||
|     SwizzlePattern swiz = { g_state.vs.swizzle_data[instr.common.operand_desc_id] }; | ||||
| 
 | ||||
|     if (!swiz.DestComponentEnabled(0) && !swiz.DestComponentEnabled(1)) { | ||||
|  | @ -528,12 +548,12 @@ void JitCompiler::Compile_MOVA(Instruction instr) { | |||
|     } | ||||
| } | ||||
| 
 | ||||
| void JitCompiler::Compile_MOV(Instruction instr) { | ||||
| void JitShader::Compile_MOV(Instruction instr) { | ||||
|     Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||||
|     Compile_DestEnable(instr, SRC1); | ||||
| } | ||||
| 
 | ||||
| void JitCompiler::Compile_RCP(Instruction instr) { | ||||
| void JitShader::Compile_RCP(Instruction instr) { | ||||
|     Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||||
| 
 | ||||
|     // TODO(bunnei): RCPSS is a pretty rough approximation, this might cause problems if Pica
 | ||||
|  | @ -544,7 +564,7 @@ void JitCompiler::Compile_RCP(Instruction instr) { | |||
|     Compile_DestEnable(instr, SRC1); | ||||
| } | ||||
| 
 | ||||
| void JitCompiler::Compile_RSQ(Instruction instr) { | ||||
| void JitShader::Compile_RSQ(Instruction instr) { | ||||
|     Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||||
| 
 | ||||
|     // TODO(bunnei): RSQRTSS is a pretty rough approximation, this might cause problems if Pica
 | ||||
|  | @ -555,36 +575,41 @@ void JitCompiler::Compile_RSQ(Instruction instr) { | |||
|     Compile_DestEnable(instr, SRC1); | ||||
| } | ||||
| 
 | ||||
| void JitCompiler::Compile_NOP(Instruction instr) { | ||||
| void JitShader::Compile_NOP(Instruction instr) { | ||||
| } | ||||
| 
 | ||||
| void JitCompiler::Compile_END(Instruction instr) { | ||||
| void JitShader::Compile_END(Instruction instr) { | ||||
|     ABI_PopRegistersAndAdjustStack(ABI_ALL_CALLEE_SAVED, 8); | ||||
|     RET(); | ||||
| } | ||||
| 
 | ||||
| void JitCompiler::Compile_CALL(Instruction instr) { | ||||
|     unsigned offset = instr.flow_control.dest_offset; | ||||
|     while (offset < (instr.flow_control.dest_offset + instr.flow_control.num_instructions)) { | ||||
|         Compile_NextInstr(&offset); | ||||
|     } | ||||
| void JitShader::Compile_CALL(Instruction instr) { | ||||
|     // Push offset of the return
 | ||||
|     PUSH(64, Imm32(instr.flow_control.dest_offset + instr.flow_control.num_instructions)); | ||||
| 
 | ||||
|     // Call the subroutine
 | ||||
|     FixupBranch b = CALL(); | ||||
|     fixup_branches.push_back({ b, instr.flow_control.dest_offset }); | ||||
| 
 | ||||
|     // Skip over the return offset that's on the stack
 | ||||
|     ADD(64, R(RSP), Imm32(8)); | ||||
| } | ||||
| 
 | ||||
| void JitCompiler::Compile_CALLC(Instruction instr) { | ||||
| void JitShader::Compile_CALLC(Instruction instr) { | ||||
|     Compile_EvaluateCondition(instr); | ||||
|     FixupBranch b = J_CC(CC_Z, true); | ||||
|     Compile_CALL(instr); | ||||
|     SetJumpTarget(b); | ||||
| } | ||||
| 
 | ||||
| void JitCompiler::Compile_CALLU(Instruction instr) { | ||||
| void JitShader::Compile_CALLU(Instruction instr) { | ||||
|     Compile_UniformCondition(instr); | ||||
|     FixupBranch b = J_CC(CC_Z, true); | ||||
|     Compile_CALL(instr); | ||||
|     SetJumpTarget(b); | ||||
| } | ||||
| 
 | ||||
| void JitCompiler::Compile_CMP(Instruction instr) { | ||||
| void JitShader::Compile_CMP(Instruction instr) { | ||||
|     using Op = Instruction::Common::CompareOpType::Op; | ||||
|     Op op_x = instr.common.compare_op.x; | ||||
|     Op op_y = instr.common.compare_op.y; | ||||
|  | @ -627,7 +652,7 @@ void JitCompiler::Compile_CMP(Instruction instr) { | |||
|     SHR(64, R(COND1), Imm8(63)); | ||||
| } | ||||
| 
 | ||||
| void JitCompiler::Compile_MAD(Instruction instr) { | ||||
| void JitShader::Compile_MAD(Instruction instr) { | ||||
|     Compile_SwizzleSrc(instr, 1, instr.mad.src1, SRC1); | ||||
| 
 | ||||
|     if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI) { | ||||
|  | @ -644,9 +669,8 @@ void JitCompiler::Compile_MAD(Instruction instr) { | |||
|     Compile_DestEnable(instr, SRC1); | ||||
| } | ||||
| 
 | ||||
| void JitCompiler::Compile_IF(Instruction instr) { | ||||
|     ASSERT_MSG(instr.flow_control.dest_offset > *offset_ptr, "Backwards if-statements (%d -> %d) not supported", | ||||
|             *offset_ptr, instr.flow_control.dest_offset.Value()); | ||||
| void JitShader::Compile_IF(Instruction instr) { | ||||
|     Compile_Assert(instr.flow_control.dest_offset >= program_counter, "Backwards if-statements not supported"); | ||||
| 
 | ||||
|     // Evaluate the "IF" condition
 | ||||
|     if (instr.opcode.Value() == OpCode::Id::IFU) { | ||||
|  | @ -676,10 +700,9 @@ void JitCompiler::Compile_IF(Instruction instr) { | |||
|     SetJumpTarget(b2); | ||||
| } | ||||
| 
 | ||||
| void JitCompiler::Compile_LOOP(Instruction instr) { | ||||
|     ASSERT_MSG(instr.flow_control.dest_offset > *offset_ptr, "Backwards loops (%d -> %d) not supported", | ||||
|             *offset_ptr, instr.flow_control.dest_offset.Value()); | ||||
|     ASSERT_MSG(!looping, "Nested loops not supported"); | ||||
| void JitShader::Compile_LOOP(Instruction instr) { | ||||
|     Compile_Assert(instr.flow_control.dest_offset >= program_counter, "Backwards loops not supported"); | ||||
|     Compile_Assert(!looping, "Nested loops not supported"); | ||||
| 
 | ||||
|     looping = true; | ||||
| 
 | ||||
|  | @ -705,10 +728,7 @@ void JitCompiler::Compile_LOOP(Instruction instr) { | |||
|     looping = false; | ||||
| } | ||||
| 
 | ||||
| void JitCompiler::Compile_JMP(Instruction instr) { | ||||
|     ASSERT_MSG(instr.flow_control.dest_offset > *offset_ptr, "Backwards jumps (%d -> %d) not supported", | ||||
|             *offset_ptr, instr.flow_control.dest_offset.Value()); | ||||
| 
 | ||||
| void JitShader::Compile_JMP(Instruction instr) { | ||||
|     if (instr.opcode.Value() == OpCode::Id::JMPC) | ||||
|         Compile_EvaluateCondition(instr); | ||||
|     else if (instr.opcode.Value() == OpCode::Id::JMPU) | ||||
|  | @ -718,30 +738,38 @@ void JitCompiler::Compile_JMP(Instruction instr) { | |||
| 
 | ||||
|     bool inverted_condition = (instr.opcode.Value() == OpCode::Id::JMPU) && | ||||
|         (instr.flow_control.num_instructions & 1); | ||||
| 
 | ||||
|     FixupBranch b = J_CC(inverted_condition ? CC_Z : CC_NZ, true); | ||||
|     fixup_branches.push_back({ b, instr.flow_control.dest_offset }); | ||||
| } | ||||
| 
 | ||||
|     Compile_Block(instr.flow_control.dest_offset); | ||||
/**
 * Compiles shader instructions starting at the current program_counter until it reaches `end`.
 * program_counter is advanced by Compile_NextInstr, so nothing is compiled if it is already at
 * or past `end`.
 * @param end Program offset at which to stop (exclusive)
 */
| void JitShader::Compile_Block(unsigned end) { | ||||
|     while (program_counter < end) { | ||||
|         Compile_NextInstr(); | ||||
|     } | ||||
| } | ||||
| 
 | ||||
/**
 * Emits a conditional subroutine return at the current program offset.
 *
 * The generated code loads the 64-bit value at [RSP + 8], compares its low 32 bits against the
 * program_counter value captured at compile time, and executes RET only when they are equal.
 * Compile_CALL pushes the return offset and then emits a CALL, which is presumably why the offset
 * sits one slot above the return address - confirm against the emitted stack layout.
 */
| void JitShader::Compile_Return() { | ||||
|     // Peek return offset on the stack and check if we're at that offset
 | ||||
|     MOV(64, R(RAX), MDisp(RSP, 8)); | ||||
|     CMP(32, R(RAX), Imm32(program_counter)); | ||||
| 
 | ||||
|     // If so, return to the caller (execution resumes after the CALL); otherwise skip over the RET
 | ||||
|     FixupBranch b = J_CC(CC_NZ, true); | ||||
|     RET(); | ||||
|     SetJumpTarget(b); | ||||
| } | ||||
| 
 | ||||
| void JitCompiler::Compile_Block(unsigned end) { | ||||
|     // Save current offset pointer
 | ||||
|     unsigned* prev_offset_ptr = offset_ptr; | ||||
|     unsigned offset = *prev_offset_ptr; | ||||
| void JitShader::Compile_NextInstr() { | ||||
|     if (std::binary_search(return_offsets.begin(), return_offsets.end(), program_counter)) { | ||||
|         Compile_Return(); | ||||
|     } | ||||
| 
 | ||||
|     while (offset < end) | ||||
|         Compile_NextInstr(&offset); | ||||
|     ASSERT_MSG(code_ptr[program_counter] == nullptr, "Tried to compile already compiled shader location!"); | ||||
|     code_ptr[program_counter] = GetCodePtr(); | ||||
| 
 | ||||
|     // Restore current offset pointer
 | ||||
|     offset_ptr = prev_offset_ptr; | ||||
|     *offset_ptr = offset; | ||||
| } | ||||
|     Instruction instr = GetVertexShaderInstruction(program_counter++); | ||||
| 
 | ||||
| void JitCompiler::Compile_NextInstr(unsigned* offset) { | ||||
|     offset_ptr = offset; | ||||
| 
 | ||||
|     Instruction instr = *(Instruction*)&g_state.vs.program_code[(*offset_ptr)++]; | ||||
|     OpCode::Id opcode = instr.opcode.Value(); | ||||
|     auto instr_func = instr_table[static_cast<unsigned>(opcode)]; | ||||
| 
 | ||||
|  | @ -755,9 +783,35 @@ void JitCompiler::Compile_NextInstr(unsigned* offset) { | |||
|     } | ||||
| } | ||||
| 
 | ||||
| CompiledShader* JitCompiler::Compile() { | ||||
|     const u8* start = GetCodePtr(); | ||||
|     unsigned offset = g_state.regs.vs.main_offset; | ||||
/**
 * Rebuilds return_offsets with the program offset that follows every subroutine called by a
 * CALL, CALLC or CALLU instruction.
 *
 * Every entry of g_state.vs.program_code is scanned. For each call instruction the value
 * dest_offset + num_instructions is recorded. The result is sorted; duplicate offsets are kept.
 */
| void JitShader::FindReturnOffsets() { | ||||
|     return_offsets.clear(); | ||||
| 
 | ||||
|     for (size_t offset = 0; offset < g_state.vs.program_code.size(); ++offset) { | ||||
|         Instruction instr = GetVertexShaderInstruction(offset); | ||||
| 
 | ||||
|         switch (instr.opcode.Value()) { | ||||
|         case OpCode::Id::CALL: | ||||
|         case OpCode::Id::CALLC: | ||||
|         case OpCode::Id::CALLU: | ||||
              // First offset past the called range, i.e. where the subroutine has to return
|             return_offsets.push_back(instr.flow_control.dest_offset + instr.flow_control.num_instructions); | ||||
|             break; | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
|     // Sort for efficient binary search later (Compile_NextInstr uses std::binary_search)
 | ||||
|     std::sort(return_offsets.begin(), return_offsets.end()); | ||||
| } | ||||
| 
 | ||||
| void JitShader::Compile() { | ||||
|     // Reset flow control state
 | ||||
|     program = (CompiledShader*)GetCodePtr(); | ||||
|     program_counter = 0; | ||||
|     looping = false; | ||||
|     code_ptr.fill(nullptr); | ||||
|     fixup_branches.clear(); | ||||
| 
 | ||||
|     // Find all `CALL` instructions and identify return locations
 | ||||
|     FindReturnOffsets(); | ||||
| 
 | ||||
|     // The stack pointer is 8 modulo 16 at the entry of a procedure
 | ||||
|     ABI_PushRegistersAndAdjustStack(ABI_ALL_CALLEE_SAVED, 8); | ||||
|  | @ -780,21 +834,31 @@ CompiledShader* JitCompiler::Compile() { | |||
|     MOV(PTRBITS, R(RAX), ImmPtr(&neg)); | ||||
|     MOVAPS(NEGBIT, MatR(RAX)); | ||||
| 
 | ||||
|     looping = false; | ||||
|     // Jump to start of the shader program
 | ||||
|     JMPptr(R(ABI_PARAM2)); | ||||
| 
 | ||||
|     while (offset < g_state.vs.program_code.size()) { | ||||
|         Compile_NextInstr(&offset); | ||||
|     // Compile entire program
 | ||||
|     Compile_Block(static_cast<unsigned>(g_state.vs.program_code.size())); | ||||
| 
 | ||||
|     // Set the target for any incomplete branches now that the entire shader program has been emitted
 | ||||
|     for (const auto& branch : fixup_branches) { | ||||
|         SetJumpTarget(branch.first, code_ptr[branch.second]); | ||||
|     } | ||||
| 
 | ||||
|     return (CompiledShader*)start; | ||||
|     // Free memory that's no longer needed
 | ||||
|     return_offsets.clear(); | ||||
|     return_offsets.shrink_to_fit(); | ||||
|     fixup_branches.clear(); | ||||
|     fixup_branches.shrink_to_fit(); | ||||
| 
 | ||||
|     uintptr_t size = reinterpret_cast<uintptr_t>(GetCodePtr()) - reinterpret_cast<uintptr_t>(program); | ||||
|     ASSERT_MSG(size <= MAX_SHADER_SIZE, "Compiled a shader that exceeds the allocated size!"); | ||||
| 
 | ||||
|     LOG_DEBUG(HW_GPU, "Compiled shader size=%d", size); | ||||
| } | ||||
| 
 | ||||
| JitCompiler::JitCompiler() { | ||||
|     AllocCodeSpace(jit_cache_size); | ||||
| } | ||||
| 
 | ||||
| void JitCompiler::Clear() { | ||||
|     ClearCodeSpace(); | ||||
| JitShader::JitShader() { | ||||
|     AllocCodeSpace(MAX_SHADER_SIZE); | ||||
| } | ||||
| 
 | ||||
| } // namespace Shader
 | ||||
|  |  | |||
|  | @ -4,6 +4,9 @@ | |||
| 
 | ||||
| #pragma once | ||||
| 
 | ||||
| #include <utility> | ||||
| #include <vector> | ||||
| 
 | ||||
| #include <nihstro/shader_bytecode.h> | ||||
| 
 | ||||
| #include "common/x64/emitter.h" | ||||
|  | @ -19,24 +22,22 @@ namespace Pica { | |||
| 
 | ||||
| namespace Shader { | ||||
| 
 | ||||
| /// Memory needed to be available to compile the next shader (otherwise, clear the cache)
 | ||||
| constexpr size_t jit_shader_size = 1024 * 512; | ||||
| /// Memory allocated for the JIT code space cache
 | ||||
| constexpr size_t jit_cache_size = 1024 * 1024 * 8; | ||||
| 
 | ||||
| using CompiledShader = void(void* registers); | ||||
| /// Memory allocated for each compiled shader (64 KiB)
 | ||||
| constexpr size_t MAX_SHADER_SIZE = 1024 * 64; | ||||
| 
 | ||||
| /**
 | ||||
|  * This class implements the shader JIT compiler. It recompiles a Pica shader program into x86_64 | ||||
|  * code that can be executed on the host machine directly. | ||||
|  */ | ||||
| class JitCompiler : public Gen::XCodeBlock { | ||||
| class JitShader : public Gen::XCodeBlock { | ||||
| public: | ||||
|     JitCompiler(); | ||||
|     JitShader(); | ||||
| 
 | ||||
|     CompiledShader* Compile(); | ||||
|     void Run(void* registers, unsigned offset) const { | ||||
|         program(registers, code_ptr[offset]); | ||||
|     } | ||||
| 
 | ||||
|     void Clear(); | ||||
|     void Compile(); | ||||
| 
 | ||||
|     void Compile_ADD(Instruction instr); | ||||
|     void Compile_DP3(Instruction instr); | ||||
|  | @ -66,8 +67,9 @@ public: | |||
|     void Compile_MAD(Instruction instr); | ||||
| 
 | ||||
| private: | ||||
| 
 | ||||
|     void Compile_Block(unsigned end); | ||||
|     void Compile_NextInstr(unsigned* offset); | ||||
|     void Compile_NextInstr(); | ||||
| 
 | ||||
|     void Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg, Gen::X64Reg dest); | ||||
|     void Compile_DestEnable(Instruction instr, Gen::X64Reg dest); | ||||
|  | @ -81,13 +83,39 @@ private: | |||
|     void Compile_EvaluateCondition(Instruction instr); | ||||
|     void Compile_UniformCondition(Instruction instr); | ||||
| 
 | ||||
|     /**
 | ||||
|      * Emits the code to conditionally return from a subroutine invoked by the `CALL` instruction. | ||||
|      */ | ||||
|     void Compile_Return(); | ||||
| 
 | ||||
|     BitSet32 PersistentCallerSavedRegs(); | ||||
| 
 | ||||
|     /// Pointer to the variable that stores the current Pica code offset. Used to handle nested code blocks.
 | ||||
|     unsigned* offset_ptr = nullptr; | ||||
|     /**
 | ||||
|      * Assertion evaluated at compile-time, but only triggered if executed at runtime. | ||||
|      * @param msg Message to be logged if the assertion fails. | ||||
|      */ | ||||
|     void Compile_Assert(bool condition, const char* msg); | ||||
| 
 | ||||
|     /// Set to true if currently in a loop, used to check for the existence of nested loops
 | ||||
|     bool looping = false; | ||||
|     /**
 | ||||
|      * Analyzes the entire shader program for `CALL` instructions before emitting any code, | ||||
|      * identifying the locations where a return needs to be inserted. | ||||
|      */ | ||||
|     void FindReturnOffsets(); | ||||
| 
 | ||||
|     /// Mapping of Pica VS instructions to pointers in the emitted code
 | ||||
|     std::array<const u8*, 1024> code_ptr; | ||||
| 
 | ||||
|     /// Offsets in code where a return needs to be inserted
 | ||||
|     std::vector<unsigned> return_offsets; | ||||
| 
 | ||||
|     unsigned program_counter = 0;       ///< Offset of the next instruction to decode
 | ||||
|     bool looping = false;               ///< True if compiling a loop, used to check for nested loops
 | ||||
| 
 | ||||
|     /// Branches that need to be fixed up once the entire shader program is compiled
 | ||||
|     std::vector<std::pair<Gen::FixupBranch, unsigned>> fixup_branches; | ||||
| 
 | ||||
|     using CompiledShader = void(void* registers, const u8* start_addr); | ||||
|     CompiledShader* program = nullptr; | ||||
| }; | ||||
| 
 | ||||
| } // namespace Shader
 | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 bunnei
						bunnei