[dynarmic] new regalloc scheme #81

Merged
Lizzie merged 3 commits from dynlruregs into master 2025-07-25 02:13:29 +02:00
10 changed files with 365 additions and 202 deletions

View file

@ -415,21 +415,54 @@ void RegAlloc::ReleaseStackSpace(const size_t stack_space) noexcept {
}
HostLoc RegAlloc::SelectARegister(const boost::container::static_vector<HostLoc, 28>& desired_locations) const noexcept {
boost::container::static_vector<HostLoc, 28> candidates = desired_locations; //Who let someone copy an ENTIRE VECTOR here?
// Find all locations that have not been allocated..
const auto allocated_locs = std::partition(candidates.begin(), candidates.end(), [this](auto loc) noexcept {
return !this->LocInfo(loc).IsLocked();
});
candidates.erase(allocated_locs, candidates.end());
ASSERT_MSG(!candidates.empty(), "All candidate registers have already been allocated");
// TODO(lizzie): Overspill causes issues (reads to 0 and such) on some games, I need to make a testbench
// to later track this down - however I just modified the LRU algo so it prefers empty registers first
// we need to test high register pressure (and spills, maybe 32 regs?)
// Selects the best location out of the available locations.
// NOTE: Using last is BAD because new REX prefix for each insn using the last regs
// TODO: Actually do LRU or something. Currently we just try to pick something without a value if possible.
auto const it = std::find_if(candidates.begin(), candidates.end(), [this](auto const loc) noexcept {
return this->LocInfo(loc).IsEmpty();
});
return it != candidates.end() ? *it : candidates.front();
auto min_lru_counter = size_t(-1);
auto it_candidate = desired_locations.cend(); //default fallback if everything fails
auto it_rex_candidate = desired_locations.cend();
auto it_empty_candidate = desired_locations.cend();
for (auto it = desired_locations.cbegin(); it != desired_locations.cend(); it++) {
auto const& loc_info = LocInfo(*it);
// Abstain from using upper registers unless absolutely necessary
if (loc_info.IsLocked()) {
// skip, not suitable for allocation
} else {
if (loc_info.lru_counter < min_lru_counter) {
if (loc_info.IsEmpty())
it_empty_candidate = it;
// Otherwise a "quasi"-LRU
min_lru_counter = loc_info.lru_counter;
if (*it >= HostLoc::R8 && *it <= HostLoc::R15) {
it_rex_candidate = it;
} else {
it_candidate = it;
}
if (min_lru_counter == 0)
break; //early exit
}
// only if not yet assigned (i.e. for the fail case of all LRU=0)
if (it_empty_candidate == desired_locations.cend() && loc_info.IsEmpty())
it_empty_candidate = it;
}
}
// Final resolution goes as follows:
// 1 => Try an empty candidate
// 2 => Try normal candidate (no REX prefix)
// 3 => Try using a REX prefixed one
// We avoid using REX-addressable registers because each instruction that uses
// them needs an extra REX prefix. Do we really need them? The trade-off may not be worth it.
auto const it_final = it_empty_candidate != desired_locations.cend()
? it_empty_candidate : it_candidate != desired_locations.cend()
? it_candidate : it_rex_candidate;
ASSERT_MSG(it_final != desired_locations.cend(), "All candidate registers have already been allocated");
// Evil magic - increment LRU counter (will wrap at 256)
const_cast<RegAlloc*>(this)->LocInfo(*it_final).lru_counter++;
return *it_final;
}
void RegAlloc::DefineValueImpl(IR::Inst* def_inst, HostLoc host_loc) noexcept {

View file

@ -92,8 +92,8 @@ private:
uint8_t max_bit_width = 0; //Valid values: 1,2,4,8,16,32,128
bool is_scratch : 1 = false; //1
bool is_set_last_use : 1 = false; //1
alignas(16) char padding;
alignas(16) uint8_t lru_counter = 0; //1
friend class RegAlloc;
};
static_assert(sizeof(HostLocInfo) == 64);

View file

@ -539,7 +539,8 @@ TEST_CASE("arm: Memory access (fastmem)", "[arm][A32]") {
char* backing_memory = reinterpret_cast<char*>(std::align(page_size, memory_size, buffer_ptr, buffer_size_nconst));
A32FastmemTestEnv env{backing_memory};
Dynarmic::A32::UserConfig config{&env};
Dynarmic::A32::UserConfig config{};
config.callbacks = &env;
config.fastmem_pointer = reinterpret_cast<uintptr_t>(backing_memory);
config.recompile_on_fastmem_failure = false;
config.processor_id = 0;

File diff suppressed because one or more lines are too long

View file

@ -64,7 +64,9 @@ u32 force_default_nan(u32 value) {
template<typename Fn>
void run_test(u32 instruction, Fn fn) {
A64TestEnv env;
A64::Jit jit{A64::UserConfig{&env}};
A64::UserConfig jit_user_config{};
jit_user_config.callbacks = &env;
A64::Jit jit{jit_user_config};
env.code_mem.emplace_back(instruction); // FMAX S0, S1, S2
env.code_mem.emplace_back(0x14000000); // B .

View file

@ -154,7 +154,8 @@ static u32 GenFloatInst(u64 pc, bool is_last_inst) {
}
static Dynarmic::A64::UserConfig GetUserConfig(A64TestEnv& jit_env) {
Dynarmic::A64::UserConfig jit_user_config{&jit_env};
Dynarmic::A64::UserConfig jit_user_config{};
jit_user_config.callbacks = &jit_env;
jit_user_config.optimizations &= ~OptimizationFlag::FastDispatch;
// The below corresponds to the settings for qemu's aarch64_max_initfn
jit_user_config.dczid_el0 = 7;

View file

@ -10,7 +10,8 @@
TEST_CASE("misaligned load/store do not use page_table when detect_misaligned_access_via_page_table is set", "[a64]") {
A64TestEnv env;
Dynarmic::A64::UserConfig conf{&env};
Dynarmic::A64::UserConfig conf{};
conf.callbacks = &env;
conf.page_table = nullptr;
conf.detect_misaligned_access_via_page_table = 128;
conf.only_detect_misalignment_via_page_table_on_page_boundary = true;

View file

@ -12,8 +12,8 @@ using namespace Dynarmic;
TEST_CASE("ensure fast dispatch entry is cleared even when a block does not have any patching requirements", "[a64]") {
A64TestEnv env;
A64::UserConfig conf{&env};
A64::UserConfig conf{};
conf.callbacks = &env;
A64::Jit jit{conf};
REQUIRE(conf.HasOptimization(OptimizationFlag::FastDispatch));
@ -64,8 +64,8 @@ TEST_CASE("ensure fast dispatch entry is cleared even when a block does not have
TEST_CASE("ensure fast dispatch entry is cleared even when a block does not have any patching requirements 2", "[a64]") {
A64TestEnv env;
A64::UserConfig conf{&env};
A64::UserConfig conf{};
conf.callbacks = &env;
A64::Jit jit{conf};
REQUIRE(conf.HasOptimization(OptimizationFlag::FastDispatch));

View file

@ -23,6 +23,7 @@
#include "./rand_int.h"
#include "dynarmic/common/fp/fpcr.h"
#include "dynarmic/common/fp/fpsr.h"
#include "dynarmic/common/llvm_disassemble.h"
#include "dynarmic/frontend/A32/ITState.h"
#include "dynarmic/frontend/A32/a32_location_descriptor.h"
#include "dynarmic/frontend/A32/a32_types.h"
@ -396,39 +397,41 @@ Dynarmic::A32::UserConfig GetA32UserConfig(TestEnv& testenv, bool noopt) {
template<size_t num_jit_reruns = 1, typename TestEnv>
void RunTestInstance(Dynarmic::A32::Jit& jit,
TestEnv& jit_env,
const std::array<u32, 16>& regs,
const std::array<u32, 64>& vecs,
const std::vector<typename TestEnv::InstructionType>& instructions,
const u32 cpsr,
const u32 fpscr,
const size_t ticks_left) {
TestEnv& jit_env,
const std::array<u32, 16>& regs,
const std::array<u32, 64>& vecs,
const std::vector<typename TestEnv::InstructionType>& instructions,
const u32 cpsr,
const u32 fpscr,
const size_t ticks_left,
const bool show_disas) {
const u32 initial_pc = regs[15];
const u32 num_words = initial_pc / sizeof(typename TestEnv::InstructionType);
const u32 code_mem_size = num_words + static_cast<u32>(instructions.size());
fmt::print("instructions:");
for (auto instruction : instructions) {
if constexpr (sizeof(decltype(instruction)) == 2) {
fmt::print(" {:04x}", instruction);
} else {
fmt::print(" {:08x}", instruction);
if (show_disas) {
fmt::print("instructions:\n");
auto current_pc = initial_pc;
for (auto instruction : instructions) {
if constexpr (sizeof(decltype(instruction)) == 2) {
fmt::print("{:04x} ?\n", instruction);
} else {
fmt::print("{}", Dynarmic::Common::DisassembleAArch64(instruction, current_pc));
}
current_pc += sizeof(decltype(instruction));
}
}
fmt::print("\n");
fmt::print("initial_regs:");
for (u32 i : regs) {
fmt::print(" {:08x}", i);
fmt::print("initial_regs:");
for (u32 i : regs)
fmt::print(" {:08x}", i);
fmt::print("\n");
fmt::print("initial_vecs:");
for (u32 i : vecs)
fmt::print(" {:08x}", i);
fmt::print("\n");
fmt::print("initial_cpsr: {:08x}\n", cpsr);
fmt::print("initial_fpcr: {:08x}\n", fpscr);
}
fmt::print("\n");
fmt::print("initial_vecs:");
for (u32 i : vecs) {
fmt::print(" {:08x}", i);
}
fmt::print("\n");
fmt::print("initial_cpsr: {:08x}\n", cpsr);
fmt::print("initial_fpcr: {:08x}\n", fpscr);
jit.ClearCache();
@ -450,36 +453,37 @@ void RunTestInstance(Dynarmic::A32::Jit& jit,
jit.Run();
}
fmt::print("final_regs:");
for (u32 i : jit.Regs()) {
fmt::print(" {:08x}", i);
if (show_disas) {
fmt::print("final_regs:");
for (u32 i : jit.Regs()) {
fmt::print(" {:08x}", i);
}
fmt::print("\n");
fmt::print("final_vecs:");
for (u32 i : jit.ExtRegs()) {
fmt::print(" {:08x}", i);
}
fmt::print("\n");
fmt::print("final_cpsr: {:08x}\n", jit.Cpsr());
fmt::print("final_fpsr: {:08x}\n", mask_fpsr_cum_bits ? jit.Fpscr() & 0xffffff00 : jit.Fpscr());
fmt::print("mod_mem: ");
for (auto [addr, value] : jit_env.modified_memory) {
fmt::print("{:08x}:{:02x} ", addr, value);
}
fmt::print("\n");
fmt::print("interrupts:\n");
for (const auto& i : jit_env.interrupts) {
std::puts(i.c_str());
}
fmt::print("===\n");
jit.DumpDisassembly();
}
fmt::print("\n");
fmt::print("final_vecs:");
for (u32 i : jit.ExtRegs()) {
fmt::print(" {:08x}", i);
}
fmt::print("\n");
fmt::print("final_cpsr: {:08x}\n", jit.Cpsr());
fmt::print("final_fpsr: {:08x}\n", mask_fpsr_cum_bits ? jit.Fpscr() & 0xffffff00 : jit.Fpscr());
fmt::print("mod_mem: ");
for (auto [addr, value] : jit_env.modified_memory) {
fmt::print("{:08x}:{:02x} ", addr, value);
}
fmt::print("\n");
fmt::print("interrupts:\n");
for (const auto& i : jit_env.interrupts) {
std::puts(i.c_str());
}
fmt::print("===\n");
}
Dynarmic::A64::UserConfig GetA64UserConfig(A64TestEnv& jit_env, bool noopt) {
Dynarmic::A64::UserConfig jit_user_config{&jit_env};
jit_user_config.optimizations &= ~OptimizationFlag::FastDispatch;
Dynarmic::A64::UserConfig jit_user_config{};
jit_user_config.callbacks = &jit_env;
jit_user_config.optimizations = all_safe_optimizations;
// The below corresponds to the settings for qemu's aarch64_max_initfn
jit_user_config.dczid_el0 = 7;
jit_user_config.ctr_el0 = 0x80038003;
@ -491,15 +495,16 @@ Dynarmic::A64::UserConfig GetA64UserConfig(A64TestEnv& jit_env, bool noopt) {
template<size_t num_jit_reruns = 2>
void RunTestInstance(Dynarmic::A64::Jit& jit,
A64TestEnv& jit_env,
const std::array<u64, 31>& regs,
const std::array<std::array<u64, 2>, 32>& vecs,
const std::vector<u32>& instructions,
const u32 pstate,
const u32 fpcr,
const u64 initial_sp,
const u64 start_address,
const size_t ticks_left) {
A64TestEnv& jit_env,
const std::array<u64, 31>& regs,
const std::array<std::array<u64, 2>, 32>& vecs,
const std::vector<u32>& instructions,
const u32 pstate,
const u32 fpcr,
const u64 initial_sp,
const u64 start_address,
const size_t ticks_left,
const bool show_disas) {
jit.ClearCache();
for (size_t jit_rerun_count = 0; jit_rerun_count < num_jit_reruns; ++jit_rerun_count) {
@ -522,59 +527,53 @@ void RunTestInstance(Dynarmic::A64::Jit& jit,
jit.Run();
}
fmt::print("instructions:");
for (u32 instruction : instructions) {
fmt::print(" {:08x}", instruction);
}
fmt::print("\n");
if (show_disas) {
fmt::print("instructions:\n");
auto current_pc = start_address;
for (u32 instruction : instructions) {
fmt::print("{}", Dynarmic::Common::DisassembleAArch64(instruction, current_pc));
current_pc += 4;
}
fmt::print("initial_regs:");
for (u64 i : regs) {
fmt::print(" {:016x}", i);
fmt::print("initial_regs:");
for (u64 i : regs)
fmt::print(" {:016x}", i);
fmt::print("\n");
fmt::print("initial_vecs:");
for (auto i : vecs)
fmt::print(" {:016x}:{:016x}", i[0], i[1]);
fmt::print("\n");
fmt::print("initial_sp: {:016x}\n", initial_sp);
fmt::print("initial_pstate: {:08x}\n", pstate);
fmt::print("initial_fpcr: {:08x}\n", fpcr);
fmt::print("final_regs:");
for (u64 i : jit.GetRegisters())
fmt::print(" {:016x}", i);
fmt::print("\n");
fmt::print("final_vecs:");
for (auto i : jit.GetVectors())
fmt::print(" {:016x}:{:016x}", i[0], i[1]);
fmt::print("\n");
fmt::print("final_sp: {:016x}\n", jit.GetSP());
fmt::print("final_pc: {:016x}\n", jit.GetPC());
fmt::print("final_pstate: {:08x}\n", jit.GetPstate());
fmt::print("final_fpcr: {:08x}\n", jit.GetFpcr());
fmt::print("final_qc : {}\n", FP::FPSR{jit.GetFpsr()}.QC());
fmt::print("mod_mem:");
for (auto [addr, value] : jit_env.modified_memory)
fmt::print(" {:08x}:{:02x}", addr, value);
fmt::print("\n");
fmt::print("interrupts:\n");
for (const auto& i : jit_env.interrupts)
std::puts(i.c_str());
fmt::print("===\n");
jit.DumpDisassembly();
}
fmt::print("\n");
fmt::print("initial_vecs:");
for (auto i : vecs) {
fmt::print(" {:016x}:{:016x}", i[0], i[1]);
}
fmt::print("\n");
fmt::print("initial_sp: {:016x}\n", initial_sp);
fmt::print("initial_pstate: {:08x}\n", pstate);
fmt::print("initial_fpcr: {:08x}\n", fpcr);
fmt::print("final_regs:");
for (u64 i : jit.GetRegisters()) {
fmt::print(" {:016x}", i);
}
fmt::print("\n");
fmt::print("final_vecs:");
for (auto i : jit.GetVectors()) {
fmt::print(" {:016x}:{:016x}", i[0], i[1]);
}
fmt::print("\n");
fmt::print("final_sp: {:016x}\n", jit.GetSP());
fmt::print("final_pc: {:016x}\n", jit.GetPC());
fmt::print("final_pstate: {:08x}\n", jit.GetPstate());
fmt::print("final_fpcr: {:08x}\n", jit.GetFpcr());
fmt::print("final_qc : {}\n", FP::FPSR{jit.GetFpsr()}.QC());
fmt::print("mod_mem:");
for (auto [addr, value] : jit_env.modified_memory) {
fmt::print(" {:08x}:{:02x}", addr, value);
}
fmt::print("\n");
fmt::print("interrupts:\n");
for (const auto& i : jit_env.interrupts) {
std::puts(i.c_str());
}
fmt::print("===\n");
}
} // Anonymous namespace
void TestThumb(size_t num_instructions, size_t num_iterations, bool noopt) {
void TestThumb(size_t num_instructions, size_t num_iterations, bool noopt, bool show_disas) {
ThumbTestEnv jit_env{};
Dynarmic::A32::Jit jit{GetA32UserConfig(jit_env, noopt)};
@ -597,11 +596,11 @@ void TestThumb(size_t num_instructions, size_t num_iterations, bool noopt) {
}
regs[15] = start_address;
RunTestInstance(jit, jit_env, regs, ext_reg, instructions, cpsr, fpcr, num_instructions);
RunTestInstance(jit, jit_env, regs, ext_reg, instructions, cpsr, fpcr, num_instructions, show_disas);
}
}
void TestArm(size_t num_instructions, size_t num_iterations, bool noopt) {
void TestArm(size_t num_instructions, size_t num_iterations, bool noopt, bool show_disas) {
ArmTestEnv jit_env{};
Dynarmic::A32::Jit jit{GetA32UserConfig(jit_env, noopt)};
@ -623,11 +622,11 @@ void TestArm(size_t num_instructions, size_t num_iterations, bool noopt) {
}
regs[15] = start_address;
RunTestInstance(jit, jit_env, regs, ext_reg, instructions, cpsr, fpcr, num_instructions);
RunTestInstance(jit, jit_env, regs, ext_reg, instructions, cpsr, fpcr, num_instructions, show_disas);
}
}
void TestA64(size_t num_instructions, size_t num_iterations, bool noopt) {
void TestA64(size_t num_instructions, size_t num_iterations, bool noopt, bool show_disas) {
A64TestEnv jit_env{};
Dynarmic::A64::Jit jit{GetA64UserConfig(jit_env, noopt)};
@ -649,7 +648,7 @@ void TestA64(size_t num_instructions, size_t num_iterations, bool noopt) {
instructions.emplace_back(GenRandomA64Inst(static_cast<u32>(start_address + 4 * instructions.size()), i == num_instructions - 1));
}
RunTestInstance(jit, jit_env, regs, vecs, instructions, pstate, fpcr, initial_sp, start_address, num_instructions);
RunTestInstance(jit, jit_env, regs, vecs, instructions, pstate, fpcr, initial_sp, start_address, num_instructions, show_disas);
}
}
@ -677,6 +676,7 @@ int main(int argc, char* argv[]) {
const auto instruction_count = str2sz(argv[3]);
const auto iterator_count = str2sz(argv[4]);
const bool noopt = argc == 6 && (strcmp(argv[5], "noopt") == 0);
const bool show_disas = argc == 6 && (strcmp(argv[5], "disas") == 0);
if (!seed || !instruction_count || !iterator_count) {
fmt::print("invalid numeric arguments\n");
@ -686,11 +686,11 @@ int main(int argc, char* argv[]) {
detail::g_rand_int_generator.seed(static_cast<std::mt19937::result_type>(*seed));
if (strcmp(argv[1], "thumb") == 0) {
TestThumb(*instruction_count, *iterator_count, noopt);
TestThumb(*instruction_count, *iterator_count, noopt, show_disas);
} else if (strcmp(argv[1], "arm") == 0) {
TestArm(*instruction_count, *iterator_count, noopt);
TestArm(*instruction_count, *iterator_count, noopt, show_disas);
} else if (strcmp(argv[1], "a64") == 0) {
TestA64(*instruction_count, *iterator_count, noopt);
TestA64(*instruction_count, *iterator_count, noopt, show_disas);
} else {
fmt::print("unrecognized instruction class\n");
return 1;

View file

@ -158,7 +158,8 @@ void RunTestInstance(Dynarmic::A32::Jit& jit,
}
A64::UserConfig GetA64UserConfig(A64TestEnv& jit_env, bool noopt) {
A64::UserConfig jit_user_config{&jit_env};
A64::UserConfig jit_user_config{};
jit_user_config.callbacks = &jit_env;
jit_user_config.optimizations &= ~OptimizationFlag::FastDispatch;
// The below corresponds to the settings for qemu's aarch64_max_initfn
jit_user_config.dczid_el0 = 7;