[dynarmic] spill GPRs to XMM registers when possible; use sub/add instead of inc/dec, per Intel's optimization recommendations

This commit is contained in:
lizzie 2025-07-28 06:22:09 +01:00
parent a01fb901e5
commit d3fccce260
Signed by untrusted user: Lizzie
GPG key ID: D9E134A23AD395CE
8 changed files with 47 additions and 47 deletions

View file

@ -233,7 +233,7 @@ void A32EmitX64::GenTerminalHandlers() {
terminal_handler_pop_rsb_hint = code.getCurr<const void*>(); terminal_handler_pop_rsb_hint = code.getCurr<const void*>();
calculate_location_descriptor(); calculate_location_descriptor();
code.mov(eax, dword[code.ABI_JIT_PTR + offsetof(A32JitState, rsb_ptr)]); code.mov(eax, dword[code.ABI_JIT_PTR + offsetof(A32JitState, rsb_ptr)]);
code.dec(eax); code.sub(eax, 1);
code.and_(eax, u32(A32JitState::RSBPtrMask)); code.and_(eax, u32(A32JitState::RSBPtrMask));
code.mov(dword[code.ABI_JIT_PTR + offsetof(A32JitState, rsb_ptr)], eax); code.mov(dword[code.ABI_JIT_PTR + offsetof(A32JitState, rsb_ptr)], eax);
code.cmp(rbx, qword[code.ABI_JIT_PTR + offsetof(A32JitState, rsb_location_descriptors) + rax * sizeof(u64)]); code.cmp(rbx, qword[code.ABI_JIT_PTR + offsetof(A32JitState, rsb_location_descriptors) + rax * sizeof(u64)]);

View file

@ -208,7 +208,7 @@ void A64EmitX64::GenTerminalHandlers() {
terminal_handler_pop_rsb_hint = code.getCurr<const void*>(); terminal_handler_pop_rsb_hint = code.getCurr<const void*>();
calculate_location_descriptor(); calculate_location_descriptor();
code.mov(eax, dword[code.ABI_JIT_PTR + offsetof(A64JitState, rsb_ptr)]); code.mov(eax, dword[code.ABI_JIT_PTR + offsetof(A64JitState, rsb_ptr)]);
code.dec(eax); code.sub(eax, 1);
code.and_(eax, u32(A64JitState::RSBPtrMask)); code.and_(eax, u32(A64JitState::RSBPtrMask));
code.mov(dword[code.ABI_JIT_PTR + offsetof(A64JitState, rsb_ptr)], eax); code.mov(dword[code.ABI_JIT_PTR + offsetof(A64JitState, rsb_ptr)], eax);
code.cmp(rbx, qword[code.ABI_JIT_PTR + offsetof(A64JitState, rsb_location_descriptors) + rax * sizeof(u64)]); code.cmp(rbx, qword[code.ABI_JIT_PTR + offsetof(A64JitState, rsb_location_descriptors) + rax * sizeof(u64)]);

View file

@ -1206,7 +1206,7 @@ static void EmitFPRSqrtEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* i
} }
// a > 0 && a < 0x00800000; // a > 0 && a < 0x00800000;
code.dec(tmp); code.sub(tmp, 1);
code.cmp(tmp, 0x007FFFFF); code.cmp(tmp, 0x007FFFFF);
code.jb(fallback, code.T_NEAR); //within -127,128 code.jb(fallback, code.T_NEAR); //within -127,128
needs_fallback = true; needs_fallback = true;

View file

@ -3326,7 +3326,7 @@ void EmitX64::EmitVectorPolynomialMultiply8(EmitContext& ctx, IR::Inst* inst) {
code.paddb(mask, mask); code.paddb(mask, mask);
code.paddb(xmm_a, xmm_a); code.paddb(xmm_a, xmm_a);
code.pblendvb(result, alternate); code.pblendvb(result, alternate);
code.dec(counter); code.sub(counter, 1);
code.jnz(loop); code.jnz(loop);
ctx.reg_alloc.DefineValue(inst, result); ctx.reg_alloc.DefineValue(inst, result);
@ -3370,7 +3370,7 @@ void EmitX64::EmitVectorPolynomialMultiplyLong8(EmitContext& ctx, IR::Inst* inst
code.paddw(mask, mask); code.paddw(mask, mask);
code.paddw(xmm_a, xmm_a); code.paddw(xmm_a, xmm_a);
code.pblendvb(result, alternate); code.pblendvb(result, alternate);
code.dec(counter); code.sub(counter, 1);
code.jnz(loop); code.jnz(loop);
ctx.reg_alloc.DefineValue(inst, result); ctx.reg_alloc.DefineValue(inst, result);

View file

@ -78,16 +78,16 @@ inline bool HostLocIsFlag(HostLoc reg) {
inline HostLoc HostLocRegIdx(int idx) { inline HostLoc HostLocRegIdx(int idx) {
ASSERT(idx >= 0 && idx <= 15); ASSERT(idx >= 0 && idx <= 15);
return static_cast<HostLoc>(idx); return HostLoc(idx);
} }
inline HostLoc HostLocXmmIdx(int idx) { inline HostLoc HostLocXmmIdx(int idx) {
ASSERT(idx >= 0 && idx <= 15); ASSERT(idx >= 0 && idx <= 15);
return static_cast<HostLoc>(static_cast<size_t>(HostLoc::XMM0) + idx); return HostLoc(size_t(HostLoc::XMM0) + idx);
} }
inline HostLoc HostLocSpill(size_t i) { inline HostLoc HostLocSpill(size_t i) {
return static_cast<HostLoc>(static_cast<size_t>(HostLoc::FirstSpill) + i); return HostLoc(size_t(HostLoc::FirstSpill) + i);
} }
inline bool HostLocIsSpill(HostLoc reg) { inline bool HostLocIsSpill(HostLoc reg) {

View file

@ -440,10 +440,13 @@ HostLoc RegAlloc::SelectARegister(const boost::container::static_vector<HostLoc,
// all over the place - it also fixes bugs with high reg pressure // all over the place - it also fixes bugs with high reg pressure
} else if (*it >= HostLoc::R13 && *it <= HostLoc::R15) { } else if (*it >= HostLoc::R13 && *it <= HostLoc::R15) {
// skip, do not touch // skip, do not touch
// Intel recommends to reuse registers as soon as they're overwritable (DO NOT SPILL)
} else if (loc_info.IsEmpty()) {
it_empty_candidate = it;
break;
// No empty registers for some reason (very evil) - just do normal LRU
} else { } else {
if (loc_info.lru_counter < min_lru_counter) { if (loc_info.lru_counter < min_lru_counter) {
if (loc_info.IsEmpty())
it_empty_candidate = it;
// Otherwise a "quasi"-LRU // Otherwise a "quasi"-LRU
min_lru_counter = loc_info.lru_counter; min_lru_counter = loc_info.lru_counter;
if (*it >= HostLoc::R8 && *it <= HostLoc::R15) { if (*it >= HostLoc::R8 && *it <= HostLoc::R15) {
@ -454,9 +457,6 @@ HostLoc RegAlloc::SelectARegister(const boost::container::static_vector<HostLoc,
if (min_lru_counter == 0) if (min_lru_counter == 0)
break; //early exit break; //early exit
} }
// only if not assigned (i.e for failcase of all LRU=0)
if (it_empty_candidate == desired_locations.cend() && loc_info.IsEmpty())
it_empty_candidate = it;
} }
} }
// Final resolution goes as follows: // Final resolution goes as follows:
@ -527,11 +527,10 @@ void RegAlloc::Move(HostLoc to, HostLoc from) noexcept {
ASSERT(LocInfo(to).IsEmpty() && !LocInfo(from).IsLocked()); ASSERT(LocInfo(to).IsEmpty() && !LocInfo(from).IsLocked());
ASSERT(bit_width <= HostLocBitWidth(to)); ASSERT(bit_width <= HostLocBitWidth(to));
ASSERT_MSG(!LocInfo(from).IsEmpty(), "Mov eliminated");
if (!LocInfo(from).IsEmpty()) { EmitMove(bit_width, to, from);
EmitMove(bit_width, to, from); LocInfo(to) = std::exchange(LocInfo(from), {});
LocInfo(to) = std::exchange(LocInfo(from), {});
}
} }
void RegAlloc::CopyToScratch(size_t bit_width, HostLoc to, HostLoc from) noexcept { void RegAlloc::CopyToScratch(size_t bit_width, HostLoc to, HostLoc from) noexcept {
@ -565,30 +564,36 @@ void RegAlloc::SpillRegister(HostLoc loc) noexcept {
ASSERT_MSG(HostLocIsRegister(loc), "Only registers can be spilled"); ASSERT_MSG(HostLocIsRegister(loc), "Only registers can be spilled");
ASSERT_MSG(!LocInfo(loc).IsEmpty(), "There is no need to spill unoccupied registers"); ASSERT_MSG(!LocInfo(loc).IsEmpty(), "There is no need to spill unoccupied registers");
ASSERT_MSG(!LocInfo(loc).IsLocked(), "Registers that have been allocated must not be spilt"); ASSERT_MSG(!LocInfo(loc).IsLocked(), "Registers that have been allocated must not be spilt");
auto const new_loc = FindFreeSpill(HostLocIsXMM(loc));
const HostLoc new_loc = FindFreeSpill();
Move(new_loc, loc); Move(new_loc, loc);
} }
HostLoc RegAlloc::FindFreeSpill() const noexcept { HostLoc RegAlloc::FindFreeSpill(bool is_xmm) const noexcept {
for (size_t i = static_cast<size_t>(HostLoc::FirstSpill); i < hostloc_info.size(); i++) { // Do not spill XMM into other XMM silly
const auto loc = static_cast<HostLoc>(i); if (!is_xmm) {
if (LocInfo(loc).IsEmpty()) { // TODO(lizzie): Using lower (xmm0 and such) registers results in issues/crashes - INVESTIGATE WHY
return loc; // Intel recommends to spill GPR onto XMM registers IF POSSIBLE
} for (size_t i = size_t(HostLoc::XMM15); i >= size_t(HostLoc::XMM0); --i)
if (const auto loc = HostLoc(i); LocInfo(loc).IsEmpty())
return loc;
} }
// Otherwise go to stack spilling
for (size_t i = size_t(HostLoc::FirstSpill); i < hostloc_info.size(); ++i)
if (const auto loc = HostLoc(i); LocInfo(loc).IsEmpty())
return loc;
ASSERT_FALSE("All spill locations are full"); ASSERT_FALSE("All spill locations are full");
} };
inline static Xbyak::RegExp SpillToOpArg_Helper1(HostLoc loc, size_t reserved_stack_space) noexcept {
ASSERT(HostLocIsSpill(loc));
size_t i = static_cast<size_t>(loc) - static_cast<size_t>(HostLoc::FirstSpill);
ASSERT_MSG(i < SpillCount, "Spill index greater than number of available spill locations");
return Xbyak::util::rsp + reserved_stack_space + ABI_SHADOW_SPACE + offsetof(StackLayout, spill) + i * sizeof(StackLayout::spill[0]);
}
void RegAlloc::EmitMove(const size_t bit_width, const HostLoc to, const HostLoc from) noexcept { void RegAlloc::EmitMove(const size_t bit_width, const HostLoc to, const HostLoc from) noexcept {
auto const spill_to_op_arg_helper = [&](HostLoc loc, size_t reserved_stack_space) {
ASSERT(HostLocIsSpill(loc));
size_t i = size_t(loc) - size_t(HostLoc::FirstSpill);
ASSERT_MSG(i < SpillCount, "Spill index greater than number of available spill locations");
return Xbyak::util::rsp + reserved_stack_space + ABI_SHADOW_SPACE + offsetof(StackLayout, spill) + i * sizeof(StackLayout::spill[0]);
};
auto const spill_xmm_to_op = [&](const HostLoc loc) {
return Xbyak::util::xword[spill_to_op_arg_helper(loc, reserved_stack_space)];
};
if (HostLocIsXMM(to) && HostLocIsXMM(from)) { if (HostLocIsXMM(to) && HostLocIsXMM(from)) {
MAYBE_AVX(movaps, HostLocToXmm(to), HostLocToXmm(from)); MAYBE_AVX(movaps, HostLocToXmm(to), HostLocToXmm(from));
} else if (HostLocIsGPR(to) && HostLocIsGPR(from)) { } else if (HostLocIsGPR(to) && HostLocIsGPR(from)) {
@ -613,7 +618,7 @@ void RegAlloc::EmitMove(const size_t bit_width, const HostLoc to, const HostLoc
MAYBE_AVX(movd, HostLocToReg64(to).cvt32(), HostLocToXmm(from)); MAYBE_AVX(movd, HostLocToReg64(to).cvt32(), HostLocToXmm(from));
} }
} else if (HostLocIsXMM(to) && HostLocIsSpill(from)) { } else if (HostLocIsXMM(to) && HostLocIsSpill(from)) {
const Xbyak::Address spill_addr = SpillToOpArg(from); const Xbyak::Address spill_addr = spill_xmm_to_op(from);
ASSERT(spill_addr.getBit() >= bit_width); ASSERT(spill_addr.getBit() >= bit_width);
switch (bit_width) { switch (bit_width) {
case 128: case 128:
@ -631,7 +636,7 @@ void RegAlloc::EmitMove(const size_t bit_width, const HostLoc to, const HostLoc
UNREACHABLE(); UNREACHABLE();
} }
} else if (HostLocIsSpill(to) && HostLocIsXMM(from)) { } else if (HostLocIsSpill(to) && HostLocIsXMM(from)) {
const Xbyak::Address spill_addr = SpillToOpArg(to); const Xbyak::Address spill_addr = spill_xmm_to_op(to);
ASSERT(spill_addr.getBit() >= bit_width); ASSERT(spill_addr.getBit() >= bit_width);
switch (bit_width) { switch (bit_width) {
case 128: case 128:
@ -651,16 +656,16 @@ void RegAlloc::EmitMove(const size_t bit_width, const HostLoc to, const HostLoc
} else if (HostLocIsGPR(to) && HostLocIsSpill(from)) { } else if (HostLocIsGPR(to) && HostLocIsSpill(from)) {
ASSERT(bit_width != 128); ASSERT(bit_width != 128);
if (bit_width == 64) { if (bit_width == 64) {
code->mov(HostLocToReg64(to), Xbyak::util::qword[SpillToOpArg_Helper1(from, reserved_stack_space)]); code->mov(HostLocToReg64(to), Xbyak::util::qword[spill_to_op_arg_helper(from, reserved_stack_space)]);
} else { } else {
code->mov(HostLocToReg64(to).cvt32(), Xbyak::util::dword[SpillToOpArg_Helper1(from, reserved_stack_space)]); code->mov(HostLocToReg64(to).cvt32(), Xbyak::util::dword[spill_to_op_arg_helper(from, reserved_stack_space)]);
} }
} else if (HostLocIsSpill(to) && HostLocIsGPR(from)) { } else if (HostLocIsSpill(to) && HostLocIsGPR(from)) {
ASSERT(bit_width != 128); ASSERT(bit_width != 128);
if (bit_width == 64) { if (bit_width == 64) {
code->mov(Xbyak::util::qword[SpillToOpArg_Helper1(to, reserved_stack_space)], HostLocToReg64(from)); code->mov(Xbyak::util::qword[spill_to_op_arg_helper(to, reserved_stack_space)], HostLocToReg64(from));
} else { } else {
code->mov(Xbyak::util::dword[SpillToOpArg_Helper1(to, reserved_stack_space)], HostLocToReg64(from).cvt32()); code->mov(Xbyak::util::dword[spill_to_op_arg_helper(to, reserved_stack_space)], HostLocToReg64(from).cvt32());
} }
} else { } else {
ASSERT_FALSE("Invalid RegAlloc::EmitMove"); ASSERT_FALSE("Invalid RegAlloc::EmitMove");
@ -677,8 +682,4 @@ void RegAlloc::EmitExchange(const HostLoc a, const HostLoc b) noexcept {
} }
} }
Xbyak::Address RegAlloc::SpillToOpArg(const HostLoc loc) noexcept {
return Xbyak::util::xword[SpillToOpArg_Helper1(loc, reserved_stack_space)];
}
} // namespace Dynarmic::Backend::X64 } // namespace Dynarmic::Backend::X64

View file

@ -243,7 +243,7 @@ private:
void MoveOutOfTheWay(HostLoc reg) noexcept; void MoveOutOfTheWay(HostLoc reg) noexcept;
void SpillRegister(HostLoc loc) noexcept; void SpillRegister(HostLoc loc) noexcept;
HostLoc FindFreeSpill() const noexcept; HostLoc FindFreeSpill(bool is_xmm) const noexcept;
inline HostLocInfo& LocInfo(const HostLoc loc) noexcept { inline HostLocInfo& LocInfo(const HostLoc loc) noexcept {
ASSERT(loc != HostLoc::RSP && loc != ABI_JIT_PTR); ASSERT(loc != HostLoc::RSP && loc != ABI_JIT_PTR);
@ -256,7 +256,6 @@ private:
void EmitMove(const size_t bit_width, const HostLoc to, const HostLoc from) noexcept; void EmitMove(const size_t bit_width, const HostLoc to, const HostLoc from) noexcept;
void EmitExchange(const HostLoc a, const HostLoc b) noexcept; void EmitExchange(const HostLoc a, const HostLoc b) noexcept;
Xbyak::Address SpillToOpArg(const HostLoc loc) noexcept;
//data //data
alignas(64) boost::container::static_vector<HostLoc, 28> gpr_order; alignas(64) boost::container::static_vector<HostLoc, 28> gpr_order;

View file

@ -22,7 +22,7 @@ void PrintVerboseDebuggingOutputLine(RegisterData& reg_data, HostLoc hostloc, si
} else if (HostLocIsXMM(hostloc)) { } else if (HostLocIsXMM(hostloc)) {
return reg_data.xmms[HostLocToXmm(hostloc).getIdx()]; return reg_data.xmms[HostLocToXmm(hostloc).getIdx()];
} else if (HostLocIsSpill(hostloc)) { } else if (HostLocIsSpill(hostloc)) {
return (*reg_data.spill)[static_cast<size_t>(hostloc) - static_cast<size_t>(HostLoc::FirstSpill)]; return (*reg_data.spill)[size_t(hostloc) - size_t(HostLoc::FirstSpill)];
} else { } else {
fmt::print("invalid hostloc! "); fmt::print("invalid hostloc! ");
return {0, 0}; return {0, 0};