[dynarmic] spill to XMM if possible, use sub/add instead of inc/dec as per recommendation

lizzie 2025-07-28 06:22:09 +01:00 committed by crueter
parent 3fd586db42
commit 0e574c397f
8 changed files with 47 additions and 47 deletions
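Background for the two changes in this diff (editor's note, not part of the commit): inc/dec leave CF untouched while updating the other arithmetic flags, which can cost a partial-flags merge on several x86 microarchitectures, so Intel's optimization guidance suggests add/sub with an immediate of 1 instead; likewise, parking a GPR in an idle XMM register is cheaper than a store/load round trip through a stack spill slot. A minimal xbyak sketch of the inc/dec vs add/sub point, assuming xbyak is available (the DecVsSub name is illustrative only, not dynarmic code):

#include <xbyak/xbyak.h>

// Hypothetical stand-alone generator, not part of dynarmic.
struct DecVsSub : Xbyak::CodeGenerator {
    DecVsSub() {
        // dec(eax) would preserve CF and update only the remaining flags,
        // which can force a partial-flags merge on some cores.
        sub(eax, 1); // writes the full flag set; eax ends up with the same value
        ret();
    }
};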


@@ -233,7 +233,7 @@ void A32EmitX64::GenTerminalHandlers() {
terminal_handler_pop_rsb_hint = code.getCurr<const void*>();
calculate_location_descriptor();
code.mov(eax, dword[code.ABI_JIT_PTR + offsetof(A32JitState, rsb_ptr)]);
-code.dec(eax);
+code.sub(eax, 1);
code.and_(eax, u32(A32JitState::RSBPtrMask));
code.mov(dword[code.ABI_JIT_PTR + offsetof(A32JitState, rsb_ptr)], eax);
code.cmp(rbx, qword[code.ABI_JIT_PTR + offsetof(A32JitState, rsb_location_descriptors) + rax * sizeof(u64)]);


@@ -208,7 +208,7 @@ void A64EmitX64::GenTerminalHandlers() {
terminal_handler_pop_rsb_hint = code.getCurr<const void*>();
calculate_location_descriptor();
code.mov(eax, dword[code.ABI_JIT_PTR + offsetof(A64JitState, rsb_ptr)]);
-code.dec(eax);
+code.sub(eax, 1);
code.and_(eax, u32(A64JitState::RSBPtrMask));
code.mov(dword[code.ABI_JIT_PTR + offsetof(A64JitState, rsb_ptr)], eax);
code.cmp(rbx, qword[code.ABI_JIT_PTR + offsetof(A64JitState, rsb_location_descriptors) + rax * sizeof(u64)]);


@@ -1206,7 +1206,7 @@ static void EmitFPRSqrtEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* i
}
// a > 0 && a < 0x00800000;
-code.dec(tmp);
+code.sub(tmp, 1);
code.cmp(tmp, 0x007FFFFF);
code.jb(fallback, code.T_NEAR); //within -127,128
needs_fallback = true;


@@ -3326,7 +3326,7 @@ void EmitX64::EmitVectorPolynomialMultiply8(EmitContext& ctx, IR::Inst* inst) {
code.paddb(mask, mask);
code.paddb(xmm_a, xmm_a);
code.pblendvb(result, alternate);
-code.dec(counter);
+code.sub(counter, 1);
code.jnz(loop);
ctx.reg_alloc.DefineValue(inst, result);
@@ -3370,7 +3370,7 @@ void EmitX64::EmitVectorPolynomialMultiplyLong8(EmitContext& ctx, IR::Inst* inst
code.paddw(mask, mask);
code.paddw(xmm_a, xmm_a);
code.pblendvb(result, alternate);
-code.dec(counter);
+code.sub(counter, 1);
code.jnz(loop);
ctx.reg_alloc.DefineValue(inst, result);


@@ -78,16 +78,16 @@ inline bool HostLocIsFlag(HostLoc reg) {
inline HostLoc HostLocRegIdx(int idx) {
ASSERT(idx >= 0 && idx <= 15);
-return static_cast<HostLoc>(idx);
+return HostLoc(idx);
}
inline HostLoc HostLocXmmIdx(int idx) {
ASSERT(idx >= 0 && idx <= 15);
-return static_cast<HostLoc>(static_cast<size_t>(HostLoc::XMM0) + idx);
+return HostLoc(size_t(HostLoc::XMM0) + idx);
}
inline HostLoc HostLocSpill(size_t i) {
-return static_cast<HostLoc>(static_cast<size_t>(HostLoc::FirstSpill) + i);
+return HostLoc(size_t(HostLoc::FirstSpill) + i);
}
inline bool HostLocIsSpill(HostLoc reg) {


@@ -440,10 +440,13 @@ HostLoc RegAlloc::SelectARegister(const boost::container::static_vector<HostLoc,
// all over the place - it also fixes bugs with high reg pressure
} else if (*it >= HostLoc::R13 && *it <= HostLoc::R15) {
// skip, do not touch
+// Intel recommends to reuse registers as soon as they're overwritable (DO NOT SPILL)
+} else if (loc_info.IsEmpty()) {
+it_empty_candidate = it;
+break;
+// No empty registers for some reason (very evil) - just do normal LRU
} else {
if (loc_info.lru_counter < min_lru_counter) {
-if (loc_info.IsEmpty())
-it_empty_candidate = it;
// Otherwise a "quasi"-LRU
min_lru_counter = loc_info.lru_counter;
if (*it >= HostLoc::R8 && *it <= HostLoc::R15) {
@@ -454,9 +457,6 @@ HostLoc RegAlloc::SelectARegister(const boost::container::static_vector<HostLoc,
if (min_lru_counter == 0)
break; //early exit
}
-// only if not assigned (i.e for failcase of all LRU=0)
-if (it_empty_candidate == desired_locations.cend() && loc_info.IsEmpty())
-it_empty_candidate = it;
}
}
// Final resolution goes as follows:
@@ -527,11 +527,10 @@ void RegAlloc::Move(HostLoc to, HostLoc from) noexcept {
ASSERT(LocInfo(to).IsEmpty() && !LocInfo(from).IsLocked());
ASSERT(bit_width <= HostLocBitWidth(to));
+ASSERT_MSG(!LocInfo(from).IsEmpty(), "Mov eliminated");
-if (!LocInfo(from).IsEmpty()) {
-EmitMove(bit_width, to, from);
-LocInfo(to) = std::exchange(LocInfo(from), {});
-}
+EmitMove(bit_width, to, from);
+LocInfo(to) = std::exchange(LocInfo(from), {});
}
void RegAlloc::CopyToScratch(size_t bit_width, HostLoc to, HostLoc from) noexcept {
@@ -565,30 +564,36 @@ void RegAlloc::SpillRegister(HostLoc loc) noexcept {
ASSERT_MSG(HostLocIsRegister(loc), "Only registers can be spilled");
ASSERT_MSG(!LocInfo(loc).IsEmpty(), "There is no need to spill unoccupied registers");
ASSERT_MSG(!LocInfo(loc).IsLocked(), "Registers that have been allocated must not be spilt");
-const HostLoc new_loc = FindFreeSpill();
+auto const new_loc = FindFreeSpill(HostLocIsXMM(loc));
Move(new_loc, loc);
}
-HostLoc RegAlloc::FindFreeSpill() const noexcept {
-for (size_t i = static_cast<size_t>(HostLoc::FirstSpill); i < hostloc_info.size(); i++) {
-const auto loc = static_cast<HostLoc>(i);
-if (LocInfo(loc).IsEmpty()) {
-return loc;
-}
+HostLoc RegAlloc::FindFreeSpill(bool is_xmm) const noexcept {
+// Do not spill XMM into other XMM silly
+if (!is_xmm) {
+// TODO(lizzie): Using lower (xmm0 and such) registers results in issues/crashes - INVESTIGATE WHY
+// Intel recommends to spill GPR onto XMM registers IF POSSIBLE
+for (size_t i = size_t(HostLoc::XMM15); i >= size_t(HostLoc::XMM0); --i)
+if (const auto loc = HostLoc(i); LocInfo(loc).IsEmpty())
+return loc;
+}
+// Otherwise go to stack spilling
+for (size_t i = size_t(HostLoc::FirstSpill); i < hostloc_info.size(); ++i)
+if (const auto loc = HostLoc(i); LocInfo(loc).IsEmpty())
+return loc;
ASSERT_FALSE("All spill locations are full");
}
-inline static Xbyak::RegExp SpillToOpArg_Helper1(HostLoc loc, size_t reserved_stack_space) noexcept {
-ASSERT(HostLocIsSpill(loc));
-size_t i = static_cast<size_t>(loc) - static_cast<size_t>(HostLoc::FirstSpill);
-ASSERT_MSG(i < SpillCount, "Spill index greater than number of available spill locations");
-return Xbyak::util::rsp + reserved_stack_space + ABI_SHADOW_SPACE + offsetof(StackLayout, spill) + i * sizeof(StackLayout::spill[0]);
-}
};
void RegAlloc::EmitMove(const size_t bit_width, const HostLoc to, const HostLoc from) noexcept {
+auto const spill_to_op_arg_helper = [&](HostLoc loc, size_t reserved_stack_space) {
+ASSERT(HostLocIsSpill(loc));
+size_t i = size_t(loc) - size_t(HostLoc::FirstSpill);
+ASSERT_MSG(i < SpillCount, "Spill index greater than number of available spill locations");
+return Xbyak::util::rsp + reserved_stack_space + ABI_SHADOW_SPACE + offsetof(StackLayout, spill) + i * sizeof(StackLayout::spill[0]);
+};
+auto const spill_xmm_to_op = [&](const HostLoc loc) {
+return Xbyak::util::xword[spill_to_op_arg_helper(loc, reserved_stack_space)];
+};
if (HostLocIsXMM(to) && HostLocIsXMM(from)) {
MAYBE_AVX(movaps, HostLocToXmm(to), HostLocToXmm(from));
} else if (HostLocIsGPR(to) && HostLocIsGPR(from)) {
@@ -613,7 +618,7 @@ void RegAlloc::EmitMove(const size_t bit_width, const HostLoc to, const HostLoc
MAYBE_AVX(movd, HostLocToReg64(to).cvt32(), HostLocToXmm(from));
}
} else if (HostLocIsXMM(to) && HostLocIsSpill(from)) {
-const Xbyak::Address spill_addr = SpillToOpArg(from);
+const Xbyak::Address spill_addr = spill_xmm_to_op(from);
ASSERT(spill_addr.getBit() >= bit_width);
switch (bit_width) {
case 128:
@@ -631,7 +636,7 @@ void RegAlloc::EmitMove(const size_t bit_width, const HostLoc to, const HostLoc
UNREACHABLE();
}
} else if (HostLocIsSpill(to) && HostLocIsXMM(from)) {
-const Xbyak::Address spill_addr = SpillToOpArg(to);
+const Xbyak::Address spill_addr = spill_xmm_to_op(to);
ASSERT(spill_addr.getBit() >= bit_width);
switch (bit_width) {
case 128:
@@ -651,16 +656,16 @@ void RegAlloc::EmitMove(const size_t bit_width, const HostLoc to, const HostLoc
} else if (HostLocIsGPR(to) && HostLocIsSpill(from)) {
ASSERT(bit_width != 128);
if (bit_width == 64) {
-code->mov(HostLocToReg64(to), Xbyak::util::qword[SpillToOpArg_Helper1(from, reserved_stack_space)]);
+code->mov(HostLocToReg64(to), Xbyak::util::qword[spill_to_op_arg_helper(from, reserved_stack_space)]);
} else {
-code->mov(HostLocToReg64(to).cvt32(), Xbyak::util::dword[SpillToOpArg_Helper1(from, reserved_stack_space)]);
+code->mov(HostLocToReg64(to).cvt32(), Xbyak::util::dword[spill_to_op_arg_helper(from, reserved_stack_space)]);
}
} else if (HostLocIsSpill(to) && HostLocIsGPR(from)) {
ASSERT(bit_width != 128);
if (bit_width == 64) {
-code->mov(Xbyak::util::qword[SpillToOpArg_Helper1(to, reserved_stack_space)], HostLocToReg64(from));
+code->mov(Xbyak::util::qword[spill_to_op_arg_helper(to, reserved_stack_space)], HostLocToReg64(from));
} else {
-code->mov(Xbyak::util::dword[SpillToOpArg_Helper1(to, reserved_stack_space)], HostLocToReg64(from).cvt32());
+code->mov(Xbyak::util::dword[spill_to_op_arg_helper(to, reserved_stack_space)], HostLocToReg64(from).cvt32());
}
} else {
ASSERT_FALSE("Invalid RegAlloc::EmitMove");
@@ -677,8 +682,4 @@ void RegAlloc::EmitExchange(const HostLoc a, const HostLoc b) noexcept {
}
}
-Xbyak::Address RegAlloc::SpillToOpArg(const HostLoc loc) noexcept {
-return Xbyak::util::xword[SpillToOpArg_Helper1(loc, reserved_stack_space)];
-}
} // namespace Dynarmic::Backend::X64
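For context on the FindFreeSpill/EmitMove changes above, a minimal xbyak sketch (illustrative only, not the emitter itself) of what spilling a GPR into an unused XMM register looks like; movq keeps the value on-core instead of going through the StackLayout spill slot:

#include <xbyak/xbyak.h>

// Hypothetical sketch: park rbx in an idle vector register, reuse rbx, restore it.
struct GprXmmSpillSketch : Xbyak::CodeGenerator {
    GprXmmSpillSketch() {
        movq(xmm15, rbx);  // "spill": rbx is now free for the allocator
        mov(rbx, 0x42);    // ... rbx reused for something else ...
        movq(rbx, xmm15);  // "reload": original value comes back without touching memory
        ret();
    }
};

The new FindFreeSpill walks XMM15 down to XMM0 (skipping this path entirely when the value being spilled is itself an XMM) and only falls back to the stack-based spill slots once every vector register is occupied.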


@@ -243,7 +243,7 @@ private:
void MoveOutOfTheWay(HostLoc reg) noexcept;
void SpillRegister(HostLoc loc) noexcept;
-HostLoc FindFreeSpill() const noexcept;
+HostLoc FindFreeSpill(bool is_xmm) const noexcept;
inline HostLocInfo& LocInfo(const HostLoc loc) noexcept {
ASSERT(loc != HostLoc::RSP && loc != ABI_JIT_PTR);
@@ -256,7 +256,6 @@ private:
void EmitMove(const size_t bit_width, const HostLoc to, const HostLoc from) noexcept;
void EmitExchange(const HostLoc a, const HostLoc b) noexcept;
-Xbyak::Address SpillToOpArg(const HostLoc loc) noexcept;
//data
alignas(64) boost::container::static_vector<HostLoc, 28> gpr_order;


@@ -22,7 +22,7 @@ void PrintVerboseDebuggingOutputLine(RegisterData& reg_data, HostLoc hostloc, si
} else if (HostLocIsXMM(hostloc)) {
return reg_data.xmms[HostLocToXmm(hostloc).getIdx()];
} else if (HostLocIsSpill(hostloc)) {
-return (*reg_data.spill)[static_cast<size_t>(hostloc) - static_cast<size_t>(HostLoc::FirstSpill)];
+return (*reg_data.spill)[size_t(hostloc) - size_t(HostLoc::FirstSpill)];
} else {
fmt::print("invalid hostloc! ");
return {0, 0};