[dynarmic] reduce opt pass latency

Signed-off-by: lizzie <lizzie@eden-emu.dev>
This commit is contained in:
lizzie 2025-08-30 05:54:53 +00:00 committed by crueter
parent 84fadd1506
commit 24cbb3a558
33 changed files with 1808 additions and 2302 deletions

View file

@ -23,15 +23,17 @@ One complication dynarmic has is that a compiled block is not uniquely identifia
the PC alone, but bits in the FPSCR and CPSR are also relevant. We resolve this by
computing a 64-bit `UniqueHash` that is guaranteed to uniquely identify a block.
u64 LocationDescriptor::UniqueHash() const {
// This value MUST BE UNIQUE.
// This calculation has to match up with EmitX64::EmitTerminalPopRSBHint
u64 pc_u64 = u64(arm_pc) << 32;
u64 fpscr_u64 = u64(fpscr.Value());
u64 t_u64 = cpsr.T() ? 1 : 0;
u64 e_u64 = cpsr.E() ? 2 : 0;
return pc_u64 | fpscr_u64 | t_u64 | e_u64;
}
```c++
// Packs the state that identifies a compiled block into a single 64-bit key:
// arm_pc shifted into the upper 32 bits, OR-ed with fpscr.Value(), the CPSR T
// flag (value 1, bit 0) and the CPSR E flag (value 2, bit 1).
u64 LocationDescriptor::UniqueHash() const {
// This value MUST BE UNIQUE.
// This calculation has to match up with EmitX64::EmitTerminalPopRSBHint
// NOTE(review): uniqueness presumably relies on fpscr.Value() never setting
// bits 0-1, otherwise it would collide with the T/E bits below; confirm.
u64 pc_u64 = u64(arm_pc) << 32; // PC goes in the upper half of the key
u64 fpscr_u64 = u64(fpscr.Value());
u64 t_u64 = cpsr.T() ? 1 : 0; // bit 0
u64 e_u64 = cpsr.E() ? 2 : 0; // bit 1
return pc_u64 | fpscr_u64 | t_u64 | e_u64;
}
```
## Our implementation isn't actually a stack
@ -49,97 +51,107 @@ host addresses for the corresponding the compiled blocks.
size of the real RSB in hardware (which has 3 entries). Larger RSBs than 8
showed degraded performance.
struct JitState {
// ...
```c++
struct JitState {
// ...
static constexpr size_t RSBSize = 8; // MUST be a power of 2.
u32 rsb_ptr = 0;
std::array<u64, RSBSize> rsb_location_descriptors;
std::array<u64, RSBSize> rsb_codeptrs;
void ResetRSB();
static constexpr size_t RSBSize = 8; // MUST be a power of 2.
u32 rsb_ptr = 0;
std::array<u64, RSBSize> rsb_location_descriptors;
std::array<u64, RSBSize> rsb_codeptrs;
void ResetRSB();
// ...
};
// ...
};
```
### RSB Push
We insert our prediction at the insertion point iff the RSB doesn't already
contain a prediction with the same `UniqueHash`.
void EmitX64::EmitPushRSB(IR::Block&, IR::Inst* inst) {
using namespace Xbyak::util;
```c++
void EmitX64::EmitPushRSB(IR::Block&, IR::Inst* inst) {
using namespace Xbyak::util;
ASSERT(inst->GetArg(0).IsImmediate());
u64 imm64 = inst->GetArg(0).GetU64();
ASSERT(inst->GetArg(0).IsImmediate());
u64 imm64 = inst->GetArg(0).GetU64();
Xbyak::Reg64 code_ptr_reg = reg_alloc.ScratchGpr({HostLoc::RCX});
Xbyak::Reg64 loc_desc_reg = reg_alloc.ScratchGpr();
Xbyak::Reg32 index_reg = reg_alloc.ScratchGpr().cvt32();
u64 code_ptr = unique_hash_to_code_ptr.find(imm64) != unique_hash_to_code_ptr.end()
? u64(unique_hash_to_code_ptr[imm64])
: u64(code->GetReturnFromRunCodeAddress());
Xbyak::Reg64 code_ptr_reg = reg_alloc.ScratchGpr({HostLoc::RCX});
Xbyak::Reg64 loc_desc_reg = reg_alloc.ScratchGpr();
Xbyak::Reg32 index_reg = reg_alloc.ScratchGpr().cvt32();
u64 code_ptr = unique_hash_to_code_ptr.find(imm64) != unique_hash_to_code_ptr.end()
? u64(unique_hash_to_code_ptr[imm64])
: u64(code->GetReturnFromRunCodeAddress());
code->mov(index_reg, dword[code.ABI_JIT_PTR + offsetof(JitState, rsb_ptr)]);
code->add(index_reg, 1);
code->and_(index_reg, u32(JitState::RSBSize - 1));
code->mov(index_reg, dword[code.ABI_JIT_PTR + offsetof(JitState, rsb_ptr)]);
code->add(index_reg, 1);
code->and_(index_reg, u32(JitState::RSBSize - 1));
code->mov(loc_desc_reg, u64(imm64));
CodePtr patch_location = code->getCurr<CodePtr>();
patch_unique_hash_locations[imm64].emplace_back(patch_location);
code->mov(code_ptr_reg, u64(code_ptr)); // This line has to match up with EmitX64::Patch.
code->EnsurePatchLocationSize(patch_location, 10);
code->mov(loc_desc_reg, u64(imm64));
CodePtr patch_location = code->getCurr<CodePtr>();
patch_unique_hash_locations[imm64].emplace_back(patch_location);
code->mov(code_ptr_reg, u64(code_ptr)); // This line has to match up with EmitX64::Patch.
code->EnsurePatchLocationSize(patch_location, 10);
Xbyak::Label label;
for (size_t i = 0; i < JitState::RSBSize; ++i) {
code->cmp(loc_desc_reg, qword[code.ABI_JIT_PTR + offsetof(JitState, rsb_location_descriptors) + i * sizeof(u64)]);
code->je(label, code->T_SHORT);
}
code->mov(dword[code.ABI_JIT_PTR + offsetof(JitState, rsb_ptr)], index_reg);
code->mov(qword[code.ABI_JIT_PTR + index_reg.cvt64() * 8 + offsetof(JitState, rsb_location_descriptors)], loc_desc_reg);
code->mov(qword[code.ABI_JIT_PTR + index_reg.cvt64() * 8 + offsetof(JitState, rsb_codeptrs)], code_ptr_reg);
code->L(label);
Xbyak::Label label;
for (size_t i = 0; i < JitState::RSBSize; ++i) {
code->cmp(loc_desc_reg, qword[code.ABI_JIT_PTR + offsetof(JitState, rsb_location_descriptors) + i * sizeof(u64)]);
code->je(label, code->T_SHORT);
}
code->mov(dword[code.ABI_JIT_PTR + offsetof(JitState, rsb_ptr)], index_reg);
code->mov(qword[code.ABI_JIT_PTR + index_reg.cvt64() * 8 + offsetof(JitState, rsb_location_descriptors)], loc_desc_reg);
code->mov(qword[code.ABI_JIT_PTR + index_reg.cvt64() * 8 + offsetof(JitState, rsb_codeptrs)], code_ptr_reg);
code->L(label);
}
```
In pseudocode:
for (i := 0 .. RSBSize-1)
if (rsb_location_descriptors[i] == imm64)
goto label;
rsb_ptr++;
rsb_ptr %= RSBSize;
rsb_location_descriptors[rsb_ptr] = imm64; //< The UniqueHash
rsb_codeptrs[rsb_ptr] = /* codeptr corresponding to the UniqueHash */;
label:
```c++
for (i := 0 .. RSBSize-1)
if (rsb_location_descriptors[i] == imm64)
goto label;
rsb_ptr++;
rsb_ptr %= RSBSize;
rsb_location_descriptors[rsb_ptr] = imm64; //< The UniqueHash
rsb_codeptrs[rsb_ptr] = /* codeptr corresponding to the UniqueHash */;
label:
```
### RSB Pop
To check if a prediction is in the RSB, we linearly scan the RSB.
void EmitX64::EmitTerminalPopRSBHint(IR::Term::PopRSBHint, IR::LocationDescriptor initial_location) {
using namespace Xbyak::util;
```c++
void EmitX64::EmitTerminalPopRSBHint(IR::Term::PopRSBHint, IR::LocationDescriptor initial_location) {
using namespace Xbyak::util;
// This calculation has to match up with IREmitter::PushRSB
code->mov(ecx, MJitStateReg(Arm::Reg::PC));
code->shl(rcx, 32);
code->mov(ebx, dword[code.ABI_JIT_PTR + offsetof(JitState, FPSCR_mode)]);
code->or_(ebx, dword[code.ABI_JIT_PTR + offsetof(JitState, CPSR_et)]);
code->or_(rbx, rcx);
// This calculation has to match up with IREmitter::PushRSB
code->mov(ecx, MJitStateReg(Arm::Reg::PC));
code->shl(rcx, 32);
code->mov(ebx, dword[code.ABI_JIT_PTR + offsetof(JitState, FPSCR_mode)]);
code->or_(ebx, dword[code.ABI_JIT_PTR + offsetof(JitState, CPSR_et)]);
code->or_(rbx, rcx);
code->mov(rax, u64(code->GetReturnFromRunCodeAddress()));
for (size_t i = 0; i < JitState::RSBSize; ++i) {
code->cmp(rbx, qword[code.ABI_JIT_PTR + offsetof(JitState, rsb_location_descriptors) + i * sizeof(u64)]);
code->cmove(rax, qword[code.ABI_JIT_PTR + offsetof(JitState, rsb_codeptrs) + i * sizeof(u64)]);
}
code->jmp(rax);
code->mov(rax, u64(code->GetReturnFromRunCodeAddress()));
for (size_t i = 0; i < JitState::RSBSize; ++i) {
code->cmp(rbx, qword[code.ABI_JIT_PTR + offsetof(JitState, rsb_location_descriptors) + i * sizeof(u64)]);
code->cmove(rax, qword[code.ABI_JIT_PTR + offsetof(JitState, rsb_codeptrs) + i * sizeof(u64)]);
}
code->jmp(rax);
}
```
In pseudocode:
rbx := ComputeUniqueHash()
rax := ReturnToDispatch
for (i := 0 .. RSBSize-1)
if (rbx == rsb_location_descriptors[i])
rax = rsb_codeptrs[i]
goto rax
```c++
rbx := ComputeUniqueHash()
rax := ReturnToDispatch
for (i := 0 .. RSBSize-1)
if (rbx == rsb_location_descriptors[i])
rax = rsb_codeptrs[i]
goto rax
```