Compare commits

...

6 commits

Author SHA1 Message Date
55c1b9ddcd vk_texture_cache: Rewrite MSAA handling with blits and proper barriers
Refactor MSAA texture upload/download using intermediate blit with correct Vulkan barriers and layout handling.
2025-08-03 01:57:32 +02:00
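In rough strokes, the pattern this message describes: Vulkan forbids `vkCmdBlitImage` on multisampled images, so the MSAA side has to go through `vkCmdResolveImage` into a single-sample intermediate, with explicit layout transitions around each transfer. A hedged sketch of the download half — the names and the surrounding texture-cache plumbing are hypothetical, not the actual `vk_texture_cache` code:

```cpp
#include <vulkan/vulkan.h>

// Sketch only: resolve an MSAA image into a single-sample intermediate
// under explicit barriers, so it can then be copied or blitted normally.
void DownloadMsaa(VkCommandBuffer cmd, VkImage msaa_src, VkImage intermediate,
                  uint32_t width, uint32_t height) {
    const auto barrier = [&](VkImage image, VkImageLayout from, VkImageLayout to,
                             VkAccessFlags src_access, VkAccessFlags dst_access) {
        VkImageMemoryBarrier b{VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER};
        b.srcAccessMask = src_access;
        b.dstAccessMask = dst_access;
        b.oldLayout = from;
        b.newLayout = to;
        b.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        b.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        b.image = image;
        b.subresourceRange = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1};
        vkCmdPipelineBarrier(cmd, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
                             VK_PIPELINE_STAGE_TRANSFER_BIT, 0,
                             0, nullptr, 0, nullptr, 1, &b);
    };
    // Source must be TRANSFER_SRC and the intermediate TRANSFER_DST
    // before the resolve is recorded.
    barrier(msaa_src, VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
            VK_ACCESS_MEMORY_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT);
    barrier(intermediate, VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
            0, VK_ACCESS_TRANSFER_WRITE_BIT);

    VkImageResolve region{};
    region.srcSubresource = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, 1};
    region.dstSubresource = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, 1};
    region.extent = {width, height, 1};
    vkCmdResolveImage(cmd, msaa_src, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
                      intermediate, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, &region);
    // The intermediate is then transitioned to TRANSFER_SRC and copied or
    // blitted out like any ordinary single-sample image.
}
```

Upload presumably mirrors this: fill the intermediate first, then move the data into the MSAA target under the reverse set of transitions.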
1f34d836b4
Add cmake option to enable microprofile (#179)
Backported from dd9c743041.

Co-authored-by: PabloMK7 <hackyglitch2@gmail.com>

Co-authored-by: Shinmegumi <shinmegumi@eden-emu.dev>
Co-authored-by: Gamer64 <76565986+Gamer64ytb@users.noreply.github.com>
Reviewed-on: eden-emu/eden#179
Co-authored-by: Gamer64 <gamer64@eden-emu.dev>
Co-committed-by: Gamer64 <gamer64@eden-emu.dev>
2025-08-02 17:22:38 +02:00
b32a667d6f [android] "Disable Buffer Reorder" option translations 2025-08-02 16:58:37 +03:00
15d371c51a revert 45e7c0d62d
Revert "fix inversion of toggle for early fences" (#175)

This should fix performance regressions in games that didn't need the toggle, while keeping it working for the games it is intended for.

// Commit reverted: there are some issues with the logic of "release early fences"; they will be solved in a later commit.

Based on recommendations from a Discord contributor.

Signed-off-by: Shinmegumi <shinmegumi@eden-emu.dev>
Co-authored-by: Gamer64 <76565986+Gamer64ytb@users.noreply.github.com>
Reviewed-on: eden-emu/eden#175
Co-authored-by: Shinmegumi <shinmegumi@eden-emu.dev>
Co-committed-by: Shinmegumi <shinmegumi@eden-emu.dev>
2025-08-02 02:50:17 +02:00
8cfcf1e8bf
[vector_math]: Use NEON intrinsics in Vec4 dot operation (#177)
PabloMK7: Changes the Vec4 dot operation to use NEON intrinsics on ARM devices.
This function is used every time a triangle is added to the renderer, so it can be considered hot code. The other vector operations are not used as much, so there is little gain in providing NEON versions of them.

The improvements from this change are most likely minimal.

Co-authored-by: PabloMK7 <hackyglitch2@gmail.com>
Co-authored-by: Gamer64 <76565986+Gamer64ytb@users.noreply.github.com>
Reviewed-on: eden-emu/eden#177
Co-authored-by: Gamer64 <gamer64@eden-emu.dev>
Co-committed-by: Gamer64 <gamer64@eden-emu.dev>
2025-08-02 01:48:04 +02:00
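A minimal sketch of the technique described above — a NEON Vec4 dot product — assuming a plain `float[4]` layout rather than the project's actual `Vec4` type:

```cpp
#include <arm_neon.h>

// Sketch (not the project's actual code): dot product of two 4-float vectors.
// vmulq_f32 multiplies lane-wise; the remaining work is a horizontal sum.
static inline float Dot4(const float* a, const float* b) {
    const float32x4_t va = vld1q_f32(a);
    const float32x4_t vb = vld1q_f32(b);
#if defined(__aarch64__)
    // AArch64 reduces the four products with a single across-lanes add.
    return vaddvq_f32(vmulq_f32(va, vb));
#else
    // ARMv7 fallback: two pairwise adds collapse the vector to one lane.
    float32x4_t prod = vmulq_f32(va, vb);
    float32x2_t sum2 = vadd_f32(vget_low_f32(prod), vget_high_f32(prod));
    sum2 = vpadd_f32(sum2, sum2);
    return vget_lane_f32(sum2, 0);
#endif
}
```

On AArch64, `vaddvq_f32` does the whole reduction in one instruction, which is why the win is concentrated in this single hot function.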
56acd4041a
[dynarmic] XMM spill, SSE/AVX emit, sub/add, configurable JIT state pointer, remove unnecessary stuff (#128)
Reviewed-on: eden-emu/eden#128

https://www.agner.org/optimize/
Co-authored-by: lizzie <lizzie@eden-emu.dev>
Co-committed-by: lizzie <lizzie@eden-emu.dev>
2025-08-02 00:48:10 +02:00
89 changed files with 1208 additions and 802 deletions

View file

@ -86,6 +86,8 @@ option(ENABLE_CUBEB "Enables the cubeb audio backend" ON)
option(USE_DISCORD_PRESENCE "Enables Discord Rich Presence" OFF)
option(ENABLE_MICROPROFILE "Enables microprofile capabilities" OFF)
option(YUZU_TESTS "Compile tests" "${BUILD_TESTING}")
if (${CMAKE_SYSTEM_NAME} STREQUAL "FreeBSD")

View file

@ -1,11 +1,11 @@
# Development
* **Windows**: [Windows Building Guide](./docs/build/Windows.md)
* **Linux**: [Linux Building Guide](./docs/build/Linux.md)
* **Android**: [Android Building Guide](./docs/build/Android.md)
* **Solaris**: [Solaris Building Guide](./docs/build/Solaris.md)
* **FreeBSD**: [FreeBSD Building Guide](./docs/build/FreeBSD.md)
* **macOS**: [macOS Building Guide](./docs/build/macOS.md)
* **Windows**: [Windows Building Guide](./build/Windows.md)
* **Linux**: [Linux Building Guide](./build/Linux.md)
* **Android**: [Android Building Guide](./build/Android.md)
* **Solaris**: [Solaris Building Guide](./build/Solaris.md)
* **FreeBSD**: [FreeBSD Building Guide](./build/FreeBSD.md)
* **macOS**: [macOS Building Guide](./build/macOS.md)
# Guidelines
@ -61,7 +61,7 @@ Then type `target remote localhost:1234` and type `c` (for continue) - and then
### gdb cheatsheet
- `mo <cmd>`: Monitor commands, `get info`, `get fastmem` and `get mappings` are available.
- `mo <cmd>`: Monitor commands, `get info`, `get fastmem` and `get mappings` are available. Type `mo help` for more info.
- `detach`: Detach from remote (i.e. restarting the emulator).
- `c`: Continue
- `p <expr>`: Print variable, `p/x <expr>` for hexadecimal.

View file

@ -1,3 +1,6 @@
# SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
# SPDX-License-Identifier: GPL-3.0-or-later
# SPDX-FileCopyrightText: 2016 Citra Emulator Project
# SPDX-License-Identifier: GPL-2.0-or-later
@ -51,6 +54,11 @@ endif()
# MicroProfile
add_library(microprofile INTERFACE)
target_include_directories(microprofile INTERFACE ./microprofile)
if (ENABLE_MICROPROFILE)
target_compile_definitions(microprofile INTERFACE MICROPROFILE_ENABLED=1)
else()
target_compile_definitions(microprofile INTERFACE MICROPROFILE_ENABLED=0)
endif()
# GCC bugs
if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL "12" AND CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND MINGW)
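For context on what the new `MICROPROFILE_ENABLED` definition controls: builds configured with `-DENABLE_MICROPROFILE=ON` get live instrumentation, while the default compiles the macros away. A hedged sketch using MicroProfile's usual macro set (the group and timer names below are invented for the example):

```cpp
#include <microprofile.h>

// With MICROPROFILE_ENABLED=0 these macros expand to nothing; with =1 they
// record a named timer that appears in the profiler output.
MICROPROFILE_DEFINE(GPU_TextureUpload, "GPU", "Texture Upload", MP_RGB(255, 255, 0));

void UploadTexture() {
    MICROPROFILE_SCOPE(GPU_TextureUpload);
    // ... the work being measured ...
}
```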

View file

@ -79,7 +79,7 @@ contain a prediction with the same `UniqueHash`.
? u64(unique_hash_to_code_ptr[imm64])
: u64(code->GetReturnFromRunCodeAddress());
code->mov(index_reg, dword[r15 + offsetof(JitState, rsb_ptr)]);
code->mov(index_reg, dword[code.ABI_JIT_PTR + offsetof(JitState, rsb_ptr)]);
code->add(index_reg, 1);
code->and_(index_reg, u32(JitState::RSBSize - 1));
@ -91,13 +91,13 @@ contain a prediction with the same `UniqueHash`.
Xbyak::Label label;
for (size_t i = 0; i < JitState::RSBSize; ++i) {
code->cmp(loc_desc_reg, qword[r15 + offsetof(JitState, rsb_location_descriptors) + i * sizeof(u64)]);
code->cmp(loc_desc_reg, qword[code.ABI_JIT_PTR + offsetof(JitState, rsb_location_descriptors) + i * sizeof(u64)]);
code->je(label, code->T_SHORT);
}
code->mov(dword[r15 + offsetof(JitState, rsb_ptr)], index_reg);
code->mov(qword[r15 + index_reg.cvt64() * 8 + offsetof(JitState, rsb_location_descriptors)], loc_desc_reg);
code->mov(qword[r15 + index_reg.cvt64() * 8 + offsetof(JitState, rsb_codeptrs)], code_ptr_reg);
code->mov(dword[code.ABI_JIT_PTR + offsetof(JitState, rsb_ptr)], index_reg);
code->mov(qword[code.ABI_JIT_PTR + index_reg.cvt64() * 8 + offsetof(JitState, rsb_location_descriptors)], loc_desc_reg);
code->mov(qword[code.ABI_JIT_PTR + index_reg.cvt64() * 8 + offsetof(JitState, rsb_codeptrs)], code_ptr_reg);
code->L(label);
}
@ -122,14 +122,14 @@ To check if a prediction is in the RSB, we linearly scan the RSB.
// This calculation has to match up with IREmitter::PushRSB
code->mov(ecx, MJitStateReg(Arm::Reg::PC));
code->shl(rcx, 32);
code->mov(ebx, dword[r15 + offsetof(JitState, FPSCR_mode)]);
code->or_(ebx, dword[r15 + offsetof(JitState, CPSR_et)]);
code->mov(ebx, dword[code.ABI_JIT_PTR + offsetof(JitState, FPSCR_mode)]);
code->or_(ebx, dword[code.ABI_JIT_PTR + offsetof(JitState, CPSR_et)]);
code->or_(rbx, rcx);
code->mov(rax, u64(code->GetReturnFromRunCodeAddress()));
for (size_t i = 0; i < JitState::RSBSize; ++i) {
code->cmp(rbx, qword[r15 + offsetof(JitState, rsb_location_descriptors) + i * sizeof(u64)]);
code->cmove(rax, qword[r15 + offsetof(JitState, rsb_codeptrs) + i * sizeof(u64)]);
code->cmp(rbx, qword[code.ABI_JIT_PTR + offsetof(JitState, rsb_location_descriptors) + i * sizeof(u64)]);
code->cmove(rax, qword[code.ABI_JIT_PTR + offsetof(JitState, rsb_codeptrs) + i * sizeof(u64)]);
}
code->jmp(rax);
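A plain-C++ model of what the emitted x64 above computes — a sketch for illustration, not dynarmic's actual code. Push advances a masked ring pointer and records a (location descriptor, host code pointer) prediction; the pop path linearly scans the ring and falls back to the dispatcher on a miss:

```cpp
#include <array>
#include <cstdint>

struct JitState {
    static constexpr std::size_t RSBSize = 8;  // must be a power of two
    std::uint32_t rsb_ptr = 0;
    std::array<std::uint64_t, RSBSize> rsb_location_descriptors{};
    std::array<std::uint64_t, RSBSize> rsb_codeptrs{};
};

void PushRSB(JitState& js, std::uint64_t loc_desc, std::uint64_t code_ptr) {
    // Skip the store if an entry with this descriptor already exists,
    // mirroring the cmp/je loop in the emitted code.
    for (std::uint64_t d : js.rsb_location_descriptors)
        if (d == loc_desc)
            return;
    const std::uint32_t index = (js.rsb_ptr + 1) & (JitState::RSBSize - 1);
    js.rsb_ptr = index;
    js.rsb_location_descriptors[index] = loc_desc;
    js.rsb_codeptrs[index] = code_ptr;
}

std::uint64_t PopRSB(const JitState& js, std::uint64_t loc_desc,
                     std::uint64_t return_from_run_code) {
    // Linear scan, as in the cmp/cmove sequence: default to the dispatcher.
    std::uint64_t target = return_from_run_code;
    for (std::size_t i = 0; i < JitState::RSBSize; ++i)
        if (js.rsb_location_descriptors[i] == loc_desc)
            target = js.rsb_codeptrs[i];
    return target;
}
```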

View file

@ -20,7 +20,7 @@ struct Label;
} // namespace oaknut
namespace Dynarmic::IR {
enum class Type;
enum class Type : u16;
} // namespace Dynarmic::IR
namespace Dynarmic::Backend::Arm64 {
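Context for this one-line change: a scoped enum can only be forward-declared when its underlying type is known, so pinning `Type : u16` lets headers use the enum without pulling in its full definition. A minimal, self-contained illustration (hypothetical names, `unsigned short` standing in for `u16`):

```cpp
// An opaque enum declaration is only legal (and only yields a complete type)
// once the underlying type is fixed.
enum class Type : unsigned short;  // forward declaration: 2 bytes, usable now

struct Inst {
    Type type;  // fine as a member; the full enumerator list is not needed
};

// The full definition, elsewhere, must repeat the same underlying type.
enum class Type : unsigned short {
    Void,
    U8,
    U16,
};
```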

View file

@ -44,21 +44,21 @@ namespace Dynarmic::Backend::X64 {
using namespace Xbyak::util;
static Xbyak::Address MJitStateReg(A32::Reg reg) {
return dword[r15 + offsetof(A32JitState, Reg) + sizeof(u32) * static_cast<size_t>(reg)];
return dword[BlockOfCode::ABI_JIT_PTR + offsetof(A32JitState, Reg) + sizeof(u32) * static_cast<size_t>(reg)];
}
static Xbyak::Address MJitStateExtReg(A32::ExtReg reg) {
if (A32::IsSingleExtReg(reg)) {
const size_t index = static_cast<size_t>(reg) - static_cast<size_t>(A32::ExtReg::S0);
return dword[r15 + offsetof(A32JitState, ExtReg) + sizeof(u32) * index];
return dword[BlockOfCode::ABI_JIT_PTR + offsetof(A32JitState, ExtReg) + sizeof(u32) * index];
}
if (A32::IsDoubleExtReg(reg)) {
const size_t index = static_cast<size_t>(reg) - static_cast<size_t>(A32::ExtReg::D0);
return qword[r15 + offsetof(A32JitState, ExtReg) + sizeof(u64) * index];
return qword[BlockOfCode::ABI_JIT_PTR + offsetof(A32JitState, ExtReg) + sizeof(u64) * index];
}
if (A32::IsQuadExtReg(reg)) {
const size_t index = static_cast<size_t>(reg) - static_cast<size_t>(A32::ExtReg::Q0);
return xword[r15 + offsetof(A32JitState, ExtReg) + 2 * sizeof(u64) * index];
return xword[BlockOfCode::ABI_JIT_PTR + offsetof(A32JitState, ExtReg) + 2 * sizeof(u64) * index];
}
ASSERT_FALSE("Should never happen.");
}
@ -109,12 +109,12 @@ A32EmitX64::BlockDescriptor A32EmitX64::Emit(IR::Block& block) {
const boost::container::static_vector<HostLoc, 28> gpr_order = [this] {
boost::container::static_vector<HostLoc, 28> gprs{any_gpr};
if (conf.page_table) {
gprs.erase(std::find(gprs.begin(), gprs.end(), HostLoc::R14));
}
if (conf.fastmem_pointer) {
gprs.erase(std::find(gprs.begin(), gprs.end(), HostLoc::R13));
}
if (conf.page_table) {
gprs.erase(std::find(gprs.begin(), gprs.end(), HostLoc::R14));
}
return gprs;
}();
@ -220,7 +220,7 @@ void A32EmitX64::GenTerminalHandlers() {
// PC ends up in ebp, location_descriptor ends up in rbx
const auto calculate_location_descriptor = [this] {
// This calculation has to match up with IREmitter::PushRSB
code.mov(ebx, dword[r15 + offsetof(A32JitState, upper_location_descriptor)]);
code.mov(ebx, dword[code.ABI_JIT_PTR + offsetof(A32JitState, upper_location_descriptor)]);
code.shl(rbx, 32);
code.mov(ecx, MJitStateReg(A32::Reg::PC));
code.mov(ebp, ecx);
@ -232,17 +232,17 @@ void A32EmitX64::GenTerminalHandlers() {
code.align();
terminal_handler_pop_rsb_hint = code.getCurr<const void*>();
calculate_location_descriptor();
code.mov(eax, dword[r15 + offsetof(A32JitState, rsb_ptr)]);
code.dec(eax);
code.mov(eax, dword[code.ABI_JIT_PTR + offsetof(A32JitState, rsb_ptr)]);
code.sub(eax, 1);
code.and_(eax, u32(A32JitState::RSBPtrMask));
code.mov(dword[r15 + offsetof(A32JitState, rsb_ptr)], eax);
code.cmp(rbx, qword[r15 + offsetof(A32JitState, rsb_location_descriptors) + rax * sizeof(u64)]);
code.mov(dword[code.ABI_JIT_PTR + offsetof(A32JitState, rsb_ptr)], eax);
code.cmp(rbx, qword[code.ABI_JIT_PTR + offsetof(A32JitState, rsb_location_descriptors) + rax * sizeof(u64)]);
if (conf.HasOptimization(OptimizationFlag::FastDispatch)) {
code.jne(rsb_cache_miss);
} else {
code.jne(code.GetReturnFromRunCodeAddress());
}
code.mov(rax, qword[r15 + offsetof(A32JitState, rsb_codeptrs) + rax * sizeof(u64)]);
code.mov(rax, qword[code.ABI_JIT_PTR + offsetof(A32JitState, rsb_codeptrs) + rax * sizeof(u64)]);
code.jmp(rax);
PerfMapRegister(terminal_handler_pop_rsb_hint, code.getCurr(), "a32_terminal_handler_pop_rsb_hint");
@ -392,17 +392,17 @@ void A32EmitX64::EmitA32GetCpsr(A32EmitContext& ctx, IR::Inst* inst) {
// so we load them both at the same time with one 64-bit read. This allows us to
// extract all of their bits together at once with one pext.
static_assert(offsetof(A32JitState, upper_location_descriptor) + 4 == offsetof(A32JitState, cpsr_ge));
code.mov(result.cvt64(), qword[r15 + offsetof(A32JitState, upper_location_descriptor)]);
code.mov(result.cvt64(), qword[code.ABI_JIT_PTR + offsetof(A32JitState, upper_location_descriptor)]);
code.mov(tmp.cvt64(), 0x80808080'00000003ull);
code.pext(result.cvt64(), result.cvt64(), tmp.cvt64());
code.mov(tmp, 0x000f0220);
code.pdep(result, result, tmp);
} else {
code.mov(result, dword[r15 + offsetof(A32JitState, upper_location_descriptor)]);
code.mov(result, dword[code.ABI_JIT_PTR + offsetof(A32JitState, upper_location_descriptor)]);
code.imul(result, result, 0x120);
code.and_(result, 0x00000220);
code.mov(tmp, dword[r15 + offsetof(A32JitState, cpsr_ge)]);
code.mov(tmp, dword[code.ABI_JIT_PTR + offsetof(A32JitState, cpsr_ge)]);
code.and_(tmp, 0x80808080);
code.imul(tmp, tmp, 0x00204081);
code.shr(tmp, 12);
@ -410,11 +410,11 @@ void A32EmitX64::EmitA32GetCpsr(A32EmitContext& ctx, IR::Inst* inst) {
code.or_(result, tmp);
}
code.mov(tmp, dword[r15 + offsetof(A32JitState, cpsr_q)]);
code.mov(tmp, dword[code.ABI_JIT_PTR + offsetof(A32JitState, cpsr_q)]);
code.shl(tmp, 27);
code.or_(result, tmp);
code.mov(tmp2, dword[r15 + offsetof(A32JitState, cpsr_nzcv)]);
code.mov(tmp2, dword[code.ABI_JIT_PTR + offsetof(A32JitState, cpsr_nzcv)]);
if (code.HasHostFeature(HostFeature::FastBMI2)) {
code.mov(tmp, NZCV::x64_mask);
code.pext(tmp2, tmp2, tmp);
@ -426,7 +426,7 @@ void A32EmitX64::EmitA32GetCpsr(A32EmitContext& ctx, IR::Inst* inst) {
}
code.or_(result, tmp2);
code.or_(result, dword[r15 + offsetof(A32JitState, cpsr_jaifm)]);
code.or_(result, dword[code.ABI_JIT_PTR + offsetof(A32JitState, cpsr_jaifm)]);
ctx.reg_alloc.DefineValue(inst, result);
}
@ -444,7 +444,7 @@ void A32EmitX64::EmitA32SetCpsr(A32EmitContext& ctx, IR::Inst* inst) {
// cpsr_q
code.bt(cpsr, 27);
code.setc(code.byte[r15 + offsetof(A32JitState, cpsr_q)]);
code.setc(code.byte[code.ABI_JIT_PTR + offsetof(A32JitState, cpsr_q)]);
// cpsr_nzcv
code.mov(tmp, cpsr);
@ -456,12 +456,12 @@ void A32EmitX64::EmitA32SetCpsr(A32EmitContext& ctx, IR::Inst* inst) {
code.imul(tmp, tmp, NZCV::to_x64_multiplier);
code.and_(tmp, NZCV::x64_mask);
}
code.mov(dword[r15 + offsetof(A32JitState, cpsr_nzcv)], tmp);
code.mov(dword[code.ABI_JIT_PTR + offsetof(A32JitState, cpsr_nzcv)], tmp);
// cpsr_jaifm
code.mov(tmp, cpsr);
code.and_(tmp, 0x010001DF);
code.mov(dword[r15 + offsetof(A32JitState, cpsr_jaifm)], tmp);
code.mov(dword[code.ABI_JIT_PTR + offsetof(A32JitState, cpsr_jaifm)], tmp);
if (code.HasHostFeature(HostFeature::FastBMI2)) {
// cpsr_et and cpsr_ge
@ -469,7 +469,7 @@ void A32EmitX64::EmitA32SetCpsr(A32EmitContext& ctx, IR::Inst* inst) {
// This mask is 0x7FFF0000, because we do not want the MSB to be sign extended to the upper dword.
static_assert((A32::LocationDescriptor::FPSCR_MODE_MASK & ~0x7FFF0000) == 0);
code.and_(qword[r15 + offsetof(A32JitState, upper_location_descriptor)], u32(0x7FFF0000));
code.and_(qword[code.ABI_JIT_PTR + offsetof(A32JitState, upper_location_descriptor)], u32(0x7FFF0000));
code.mov(tmp, 0x000f0220);
code.pext(cpsr, cpsr, tmp);
code.mov(tmp.cvt64(), 0x01010101'00000003ull);
@ -479,14 +479,14 @@ void A32EmitX64::EmitA32SetCpsr(A32EmitContext& ctx, IR::Inst* inst) {
code.mov(tmp2.cvt64(), tmp.cvt64());
code.sub(tmp.cvt64(), cpsr.cvt64());
code.xor_(tmp.cvt64(), tmp2.cvt64());
code.or_(qword[r15 + offsetof(A32JitState, upper_location_descriptor)], tmp.cvt64());
code.or_(qword[code.ABI_JIT_PTR + offsetof(A32JitState, upper_location_descriptor)], tmp.cvt64());
} else {
code.and_(dword[r15 + offsetof(A32JitState, upper_location_descriptor)], u32(0xFFFF0000));
code.and_(dword[code.ABI_JIT_PTR + offsetof(A32JitState, upper_location_descriptor)], u32(0xFFFF0000));
code.mov(tmp, cpsr);
code.and_(tmp, 0x00000220);
code.imul(tmp, tmp, 0x00900000);
code.shr(tmp, 28);
code.or_(dword[r15 + offsetof(A32JitState, upper_location_descriptor)], tmp);
code.or_(dword[code.ABI_JIT_PTR + offsetof(A32JitState, upper_location_descriptor)], tmp);
code.and_(cpsr, 0x000f0000);
code.shr(cpsr, 16);
@ -495,14 +495,14 @@ void A32EmitX64::EmitA32SetCpsr(A32EmitContext& ctx, IR::Inst* inst) {
code.mov(tmp, 0x80808080);
code.sub(tmp, cpsr);
code.xor_(tmp, 0x80808080);
code.mov(dword[r15 + offsetof(A32JitState, cpsr_ge)], tmp);
code.mov(dword[code.ABI_JIT_PTR + offsetof(A32JitState, cpsr_ge)], tmp);
}
}
void A32EmitX64::EmitA32SetCpsrNZCV(A32EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Reg32 to_store = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
code.mov(dword[r15 + offsetof(A32JitState, cpsr_nzcv)], to_store);
code.mov(dword[code.ABI_JIT_PTR + offsetof(A32JitState, cpsr_nzcv)], to_store);
}
void A32EmitX64::EmitA32SetCpsrNZCVRaw(A32EmitContext& ctx, IR::Inst* inst) {
@ -510,7 +510,7 @@ void A32EmitX64::EmitA32SetCpsrNZCVRaw(A32EmitContext& ctx, IR::Inst* inst) {
if (args[0].IsImmediate()) {
const u32 imm = args[0].GetImmediateU32();
code.mov(dword[r15 + offsetof(A32JitState, cpsr_nzcv)], NZCV::ToX64(imm));
code.mov(dword[code.ABI_JIT_PTR + offsetof(A32JitState, cpsr_nzcv)], NZCV::ToX64(imm));
} else if (code.HasHostFeature(HostFeature::FastBMI2)) {
const Xbyak::Reg32 a = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
const Xbyak::Reg32 b = ctx.reg_alloc.ScratchGpr().cvt32();
@ -518,14 +518,14 @@ void A32EmitX64::EmitA32SetCpsrNZCVRaw(A32EmitContext& ctx, IR::Inst* inst) {
code.shr(a, 28);
code.mov(b, NZCV::x64_mask);
code.pdep(a, a, b);
code.mov(dword[r15 + offsetof(A32JitState, cpsr_nzcv)], a);
code.mov(dword[code.ABI_JIT_PTR + offsetof(A32JitState, cpsr_nzcv)], a);
} else {
const Xbyak::Reg32 a = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
code.shr(a, 28);
code.imul(a, a, NZCV::to_x64_multiplier);
code.and_(a, NZCV::x64_mask);
code.mov(dword[r15 + offsetof(A32JitState, cpsr_nzcv)], a);
code.mov(dword[code.ABI_JIT_PTR + offsetof(A32JitState, cpsr_nzcv)], a);
}
}
@ -534,25 +534,25 @@ void A32EmitX64::EmitA32SetCpsrNZCVQ(A32EmitContext& ctx, IR::Inst* inst) {
if (args[0].IsImmediate()) {
const u32 imm = args[0].GetImmediateU32();
code.mov(dword[r15 + offsetof(A32JitState, cpsr_nzcv)], NZCV::ToX64(imm));
code.mov(code.byte[r15 + offsetof(A32JitState, cpsr_q)], u8((imm & 0x08000000) != 0 ? 1 : 0));
code.mov(dword[code.ABI_JIT_PTR + offsetof(A32JitState, cpsr_nzcv)], NZCV::ToX64(imm));
code.mov(code.byte[code.ABI_JIT_PTR + offsetof(A32JitState, cpsr_q)], u8((imm & 0x08000000) != 0 ? 1 : 0));
} else if (code.HasHostFeature(HostFeature::FastBMI2)) {
const Xbyak::Reg32 a = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
const Xbyak::Reg32 b = ctx.reg_alloc.ScratchGpr().cvt32();
code.shr(a, 28);
code.setc(code.byte[r15 + offsetof(A32JitState, cpsr_q)]);
code.setc(code.byte[code.ABI_JIT_PTR + offsetof(A32JitState, cpsr_q)]);
code.mov(b, NZCV::x64_mask);
code.pdep(a, a, b);
code.mov(dword[r15 + offsetof(A32JitState, cpsr_nzcv)], a);
code.mov(dword[code.ABI_JIT_PTR + offsetof(A32JitState, cpsr_nzcv)], a);
} else {
const Xbyak::Reg32 a = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
code.shr(a, 28);
code.setc(code.byte[r15 + offsetof(A32JitState, cpsr_q)]);
code.setc(code.byte[code.ABI_JIT_PTR + offsetof(A32JitState, cpsr_q)]);
code.imul(a, a, NZCV::to_x64_multiplier);
code.and_(a, NZCV::x64_mask);
code.mov(dword[r15 + offsetof(A32JitState, cpsr_nzcv)], a);
code.mov(dword[code.ABI_JIT_PTR + offsetof(A32JitState, cpsr_nzcv)], a);
}
}
@ -562,10 +562,10 @@ void A32EmitX64::EmitA32SetCpsrNZ(A32EmitContext& ctx, IR::Inst* inst) {
const Xbyak::Reg32 nz = ctx.reg_alloc.UseGpr(args[0]).cvt32();
const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32();
code.movzx(tmp, code.byte[r15 + offsetof(A32JitState, cpsr_nzcv) + 1]);
code.movzx(tmp, code.byte[code.ABI_JIT_PTR + offsetof(A32JitState, cpsr_nzcv) + 1]);
code.and_(tmp, 1);
code.or_(tmp, nz);
code.mov(code.byte[r15 + offsetof(A32JitState, cpsr_nzcv) + 1], tmp.cvt8());
code.mov(code.byte[code.ABI_JIT_PTR + offsetof(A32JitState, cpsr_nzcv) + 1], tmp.cvt8());
}
void A32EmitX64::EmitA32SetCpsrNZC(A32EmitContext& ctx, IR::Inst* inst) {
@ -575,11 +575,11 @@ void A32EmitX64::EmitA32SetCpsrNZC(A32EmitContext& ctx, IR::Inst* inst) {
if (args[1].IsImmediate()) {
const bool c = args[1].GetImmediateU1();
code.mov(code.byte[r15 + offsetof(A32JitState, cpsr_nzcv) + 1], c);
code.mov(code.byte[code.ABI_JIT_PTR + offsetof(A32JitState, cpsr_nzcv) + 1], c);
} else {
const Xbyak::Reg8 c = ctx.reg_alloc.UseGpr(args[1]).cvt8();
code.mov(code.byte[r15 + offsetof(A32JitState, cpsr_nzcv) + 1], c);
code.mov(code.byte[code.ABI_JIT_PTR + offsetof(A32JitState, cpsr_nzcv) + 1], c);
}
} else {
const Xbyak::Reg32 nz = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
@ -588,19 +588,19 @@ void A32EmitX64::EmitA32SetCpsrNZC(A32EmitContext& ctx, IR::Inst* inst) {
const bool c = args[1].GetImmediateU1();
code.or_(nz, c);
code.mov(code.byte[r15 + offsetof(A32JitState, cpsr_nzcv) + 1], nz.cvt8());
code.mov(code.byte[code.ABI_JIT_PTR + offsetof(A32JitState, cpsr_nzcv) + 1], nz.cvt8());
} else {
const Xbyak::Reg32 c = ctx.reg_alloc.UseGpr(args[1]).cvt32();
code.or_(nz, c);
code.mov(code.byte[r15 + offsetof(A32JitState, cpsr_nzcv) + 1], nz.cvt8());
code.mov(code.byte[code.ABI_JIT_PTR + offsetof(A32JitState, cpsr_nzcv) + 1], nz.cvt8());
}
}
}
static void EmitGetFlag(BlockOfCode& code, A32EmitContext& ctx, IR::Inst* inst, size_t flag_bit) {
const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
code.mov(result, dword[r15 + offsetof(A32JitState, cpsr_nzcv)]);
code.mov(result, dword[code.ABI_JIT_PTR + offsetof(A32JitState, cpsr_nzcv)]);
if (flag_bit != 0) {
code.shr(result, static_cast<int>(flag_bit));
}
@ -616,18 +616,18 @@ void A32EmitX64::EmitA32OrQFlag(A32EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (args[0].IsImmediate()) {
if (args[0].GetImmediateU1()) {
code.mov(dword[r15 + offsetof(A32JitState, cpsr_q)], 1);
code.mov(dword[code.ABI_JIT_PTR + offsetof(A32JitState, cpsr_q)], 1);
}
} else {
const Xbyak::Reg8 to_store = ctx.reg_alloc.UseGpr(args[0]).cvt8();
code.or_(code.byte[r15 + offsetof(A32JitState, cpsr_q)], to_store);
code.or_(code.byte[code.ABI_JIT_PTR + offsetof(A32JitState, cpsr_q)], to_store);
}
}
void A32EmitX64::EmitA32GetGEFlags(A32EmitContext& ctx, IR::Inst* inst) {
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
code.movd(result, dword[r15 + offsetof(A32JitState, cpsr_ge)]);
code.movd(result, dword[code.ABI_JIT_PTR + offsetof(A32JitState, cpsr_ge)]);
ctx.reg_alloc.DefineValue(inst, result);
}
@ -637,10 +637,10 @@ void A32EmitX64::EmitA32SetGEFlags(A32EmitContext& ctx, IR::Inst* inst) {
if (args[0].IsInXmm()) {
const Xbyak::Xmm to_store = ctx.reg_alloc.UseXmm(args[0]);
code.movd(dword[r15 + offsetof(A32JitState, cpsr_ge)], to_store);
code.movd(dword[code.ABI_JIT_PTR + offsetof(A32JitState, cpsr_ge)], to_store);
} else {
const Xbyak::Reg32 to_store = ctx.reg_alloc.UseGpr(args[0]).cvt32();
code.mov(dword[r15 + offsetof(A32JitState, cpsr_ge)], to_store);
code.mov(dword[code.ABI_JIT_PTR + offsetof(A32JitState, cpsr_ge)], to_store);
}
}
@ -654,7 +654,7 @@ void A32EmitX64::EmitA32SetGEFlagsCompressed(A32EmitContext& ctx, IR::Inst* inst
ge |= mcl::bit::get_bit<17>(imm) ? 0x0000FF00 : 0;
ge |= mcl::bit::get_bit<16>(imm) ? 0x000000FF : 0;
code.mov(dword[r15 + offsetof(A32JitState, cpsr_ge)], ge);
code.mov(dword[code.ABI_JIT_PTR + offsetof(A32JitState, cpsr_ge)], ge);
} else if (code.HasHostFeature(HostFeature::FastBMI2)) {
const Xbyak::Reg32 a = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
const Xbyak::Reg32 b = ctx.reg_alloc.ScratchGpr().cvt32();
@ -663,7 +663,7 @@ void A32EmitX64::EmitA32SetGEFlagsCompressed(A32EmitContext& ctx, IR::Inst* inst
code.shr(a, 16);
code.pdep(a, a, b);
code.imul(a, a, 0xFF);
code.mov(dword[r15 + offsetof(A32JitState, cpsr_ge)], a);
code.mov(dword[code.ABI_JIT_PTR + offsetof(A32JitState, cpsr_ge)], a);
} else {
const Xbyak::Reg32 a = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
@ -672,7 +672,7 @@ void A32EmitX64::EmitA32SetGEFlagsCompressed(A32EmitContext& ctx, IR::Inst* inst
code.imul(a, a, 0x00204081);
code.and_(a, 0x01010101);
code.imul(a, a, 0xFF);
code.mov(dword[r15 + offsetof(A32JitState, cpsr_ge)], a);
code.mov(dword[code.ABI_JIT_PTR + offsetof(A32JitState, cpsr_ge)], a);
}
}
@ -716,7 +716,7 @@ void A32EmitX64::EmitA32BXWritePC(A32EmitContext& ctx, IR::Inst* inst) {
const u32 new_upper = upper_without_t | (mcl::bit::get_bit<0>(new_pc) ? 1 : 0);
code.mov(MJitStateReg(A32::Reg::PC), new_pc & mask);
code.mov(dword[r15 + offsetof(A32JitState, upper_location_descriptor)], new_upper);
code.mov(dword[code.ABI_JIT_PTR + offsetof(A32JitState, upper_location_descriptor)], new_upper);
} else {
const Xbyak::Reg32 new_pc = ctx.reg_alloc.UseScratchGpr(arg).cvt32();
const Xbyak::Reg32 mask = ctx.reg_alloc.ScratchGpr().cvt32();
@ -728,7 +728,7 @@ void A32EmitX64::EmitA32BXWritePC(A32EmitContext& ctx, IR::Inst* inst) {
code.lea(mask, ptr[mask.cvt64() + mask.cvt64() * 1 - 4]); // mask = pc & 1 ? 0xFFFFFFFE : 0xFFFFFFFC
code.and_(new_pc, mask);
code.mov(MJitStateReg(A32::Reg::PC), new_pc);
code.mov(dword[r15 + offsetof(A32JitState, upper_location_descriptor)], new_upper);
code.mov(dword[code.ABI_JIT_PTR + offsetof(A32JitState, upper_location_descriptor)], new_upper);
}
}
@ -798,9 +798,9 @@ static u32 GetFpscrImpl(A32JitState* jit_state) {
void A32EmitX64::EmitA32GetFpscr(A32EmitContext& ctx, IR::Inst* inst) {
ctx.reg_alloc.HostCall(inst);
code.mov(code.ABI_PARAM1, code.r15);
code.mov(code.ABI_PARAM1, code.ABI_JIT_PTR);
code.stmxcsr(code.dword[code.r15 + offsetof(A32JitState, guest_MXCSR)]);
code.stmxcsr(code.dword[code.ABI_JIT_PTR + offsetof(A32JitState, guest_MXCSR)]);
code.CallFunction(&GetFpscrImpl);
}
@ -811,15 +811,15 @@ static void SetFpscrImpl(u32 value, A32JitState* jit_state) {
void A32EmitX64::EmitA32SetFpscr(A32EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
ctx.reg_alloc.HostCall(nullptr, args[0]);
code.mov(code.ABI_PARAM2, code.r15);
code.mov(code.ABI_PARAM2, code.ABI_JIT_PTR);
code.CallFunction(&SetFpscrImpl);
code.ldmxcsr(code.dword[code.r15 + offsetof(A32JitState, guest_MXCSR)]);
code.ldmxcsr(code.dword[code.ABI_JIT_PTR + offsetof(A32JitState, guest_MXCSR)]);
}
void A32EmitX64::EmitA32GetFpscrNZCV(A32EmitContext& ctx, IR::Inst* inst) {
const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
code.mov(result, dword[r15 + offsetof(A32JitState, fpsr_nzcv)]);
code.mov(result, dword[code.ABI_JIT_PTR + offsetof(A32JitState, fpsr_nzcv)]);
ctx.reg_alloc.DefineValue(inst, result);
}
@ -833,7 +833,7 @@ void A32EmitX64::EmitA32SetFpscrNZCV(A32EmitContext& ctx, IR::Inst* inst) {
code.mov(tmp, NZCV::x64_mask);
code.pext(tmp, value, tmp);
code.shl(tmp, 28);
code.mov(dword[r15 + offsetof(A32JitState, fpsr_nzcv)], tmp);
code.mov(dword[code.ABI_JIT_PTR + offsetof(A32JitState, fpsr_nzcv)], tmp);
return;
}
@ -843,7 +843,7 @@ void A32EmitX64::EmitA32SetFpscrNZCV(A32EmitContext& ctx, IR::Inst* inst) {
code.and_(value, NZCV::x64_mask);
code.imul(value, value, NZCV::from_x64_multiplier);
code.and_(value, NZCV::arm_mask);
code.mov(dword[r15 + offsetof(A32JitState, fpsr_nzcv)], value);
code.mov(dword[code.ABI_JIT_PTR + offsetof(A32JitState, fpsr_nzcv)], value);
}
static void EmitCoprocessorException() {
@ -1155,7 +1155,7 @@ void A32EmitX64::EmitSetUpperLocationDescriptor(IR::LocationDescriptor new_locat
}();
if (old_upper != new_upper) {
code.mov(dword[r15 + offsetof(A32JitState, upper_location_descriptor)], new_upper);
code.mov(dword[code.ABI_JIT_PTR + offsetof(A32JitState, upper_location_descriptor)], new_upper);
}
}
@ -1165,32 +1165,28 @@ void A32EmitX64::EmitTerminalImpl(IR::Term::LinkBlock terminal, IR::LocationDesc
if (!conf.HasOptimization(OptimizationFlag::BlockLinking) || is_single_step) {
code.mov(MJitStateReg(A32::Reg::PC), A32::LocationDescriptor{terminal.next}.PC());
code.ReturnFromRunCode();
return;
}
if (conf.enable_cycle_counting) {
code.cmp(qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, cycles_remaining)], 0);
patch_information[terminal.next].jg.push_back(code.getCurr());
if (const auto next_bb = GetBasicBlock(terminal.next)) {
EmitPatchJg(terminal.next, next_bb->entrypoint);
} else {
EmitPatchJg(terminal.next);
}
} else {
code.cmp(dword[r15 + offsetof(A32JitState, halt_reason)], 0);
patch_information[terminal.next].jz.push_back(code.getCurr());
if (const auto next_bb = GetBasicBlock(terminal.next)) {
EmitPatchJz(terminal.next, next_bb->entrypoint);
if (conf.enable_cycle_counting) {
code.cmp(qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, cycles_remaining)], 0);
patch_information[terminal.next].jg.push_back(code.getCurr());
if (const auto next_bb = GetBasicBlock(terminal.next)) {
EmitPatchJg(terminal.next, next_bb->entrypoint);
} else {
EmitPatchJg(terminal.next);
}
} else {
EmitPatchJz(terminal.next);
code.cmp(dword[code.ABI_JIT_PTR + offsetof(A32JitState, halt_reason)], 0);
patch_information[terminal.next].jz.push_back(code.getCurr());
if (const auto next_bb = GetBasicBlock(terminal.next)) {
EmitPatchJz(terminal.next, next_bb->entrypoint);
} else {
EmitPatchJz(terminal.next);
}
}
code.mov(MJitStateReg(A32::Reg::PC), A32::LocationDescriptor{terminal.next}.PC());
PushRSBHelper(rax, rbx, terminal.next);
code.ForceReturnFromRunCode();
}
code.mov(MJitStateReg(A32::Reg::PC), A32::LocationDescriptor{terminal.next}.PC());
PushRSBHelper(rax, rbx, terminal.next);
code.ForceReturnFromRunCode();
}
void A32EmitX64::EmitTerminalImpl(IR::Term::LinkBlockFast terminal, IR::LocationDescriptor initial_location, bool is_single_step) {
@ -1199,14 +1195,13 @@ void A32EmitX64::EmitTerminalImpl(IR::Term::LinkBlockFast terminal, IR::Location
if (!conf.HasOptimization(OptimizationFlag::BlockLinking) || is_single_step) {
code.mov(MJitStateReg(A32::Reg::PC), A32::LocationDescriptor{terminal.next}.PC());
code.ReturnFromRunCode();
return;
}
patch_information[terminal.next].jmp.push_back(code.getCurr());
if (const auto next_bb = GetBasicBlock(terminal.next)) {
EmitPatchJmp(terminal.next, next_bb->entrypoint);
} else {
EmitPatchJmp(terminal.next);
patch_information[terminal.next].jmp.push_back(code.getCurr());
if (const auto next_bb = GetBasicBlock(terminal.next)) {
EmitPatchJmp(terminal.next, next_bb->entrypoint);
} else {
EmitPatchJmp(terminal.next);
}
}
}
@ -1245,7 +1240,7 @@ void A32EmitX64::EmitTerminalImpl(IR::Term::CheckBit terminal, IR::LocationDescr
}
void A32EmitX64::EmitTerminalImpl(IR::Term::CheckHalt terminal, IR::LocationDescriptor initial_location, bool is_single_step) {
code.cmp(dword[r15 + offsetof(A32JitState, halt_reason)], 0);
code.cmp(dword[code.ABI_JIT_PTR + offsetof(A32JitState, halt_reason)], 0);
code.jne(code.GetForceReturnFromRunCodeAddress());
EmitTerminal(terminal.else_, initial_location, is_single_step);
}

View file

@ -168,7 +168,7 @@ void A32EmitX64::EmitA32WriteMemory64(A32EmitContext& ctx, IR::Inst* inst) {
}
void A32EmitX64::EmitA32ClearExclusive(A32EmitContext&, IR::Inst*) {
code.mov(code.byte[r15 + offsetof(A32JitState, exclusive_state)], u8(0));
code.mov(code.byte[code.ABI_JIT_PTR + offsetof(A32JitState, exclusive_state)], u8(0));
}
void A32EmitX64::EmitA32ExclusiveReadMemory8(A32EmitContext& ctx, IR::Inst* inst) {
@ -244,14 +244,14 @@ void A32EmitX64::EmitCheckMemoryAbort(A32EmitContext& ctx, IR::Inst* inst, Xbyak
const A32::LocationDescriptor current_location{IR::LocationDescriptor{inst->GetArg(0).GetU64()}};
code.test(dword[r15 + offsetof(A32JitState, halt_reason)], static_cast<u32>(HaltReason::MemoryAbort));
code.test(dword[code.ABI_JIT_PTR + offsetof(A32JitState, halt_reason)], static_cast<u32>(HaltReason::MemoryAbort));
if (end) {
code.jz(*end, code.T_NEAR);
} else {
code.jz(skip, code.T_NEAR);
}
EmitSetUpperLocationDescriptor(current_location, ctx.Location());
code.mov(dword[r15 + offsetof(A32JitState, Reg) + sizeof(u32) * 15], current_location.PC());
code.mov(dword[code.ABI_JIT_PTR + offsetof(A32JitState, Reg) + sizeof(u32) * 15], current_location.PC());
code.ForceReturnFromRunCode();
code.L(skip);
}
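For readers unfamiliar with the `exclusive_state` byte being cleared above: it models the ARM exclusive monitor used by load/store-exclusive pairs. A simplified, single-threaded sketch of the idea (illustrative only, not dynarmic's implementation):

```cpp
#include <cstdint>

// Simplified model of the exclusive monitor behind LDREX/STREX emulation.
struct ExclusiveMonitor {
    bool exclusive_state = false;  // do we currently hold a reservation?
    std::uint64_t marked_addr = 0;

    std::uint32_t LoadExclusive(const std::uint32_t* mem, std::uint64_t addr) {
        exclusive_state = true;    // take the reservation
        marked_addr = addr;
        return mem[addr / sizeof(std::uint32_t)];
    }

    bool StoreExclusive(std::uint32_t* mem, std::uint64_t addr, std::uint32_t value) {
        if (!exclusive_state || marked_addr != addr) {
            return false;          // reservation lost: the store fails
        }
        exclusive_state = false;
        mem[addr / sizeof(std::uint32_t)] = value;
        return true;
    }

    // What EmitA32ClearExclusive/EmitA64ClearExclusive model: for example,
    // the kernel executing ERET clears any outstanding reservation.
    void Clear() { exclusive_state = false; }
};
```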

View file

@ -80,12 +80,12 @@ A64EmitX64::BlockDescriptor A64EmitX64::Emit(IR::Block& block) noexcept {
const boost::container::static_vector<HostLoc, 28> gpr_order = [this] {
boost::container::static_vector<HostLoc, 28> gprs{any_gpr};
if (conf.page_table) {
gprs.erase(std::find(gprs.begin(), gprs.end(), HostLoc::R14));
}
if (conf.fastmem_pointer) {
gprs.erase(std::find(gprs.begin(), gprs.end(), HostLoc::R13));
}
if (conf.page_table) {
gprs.erase(std::find(gprs.begin(), gprs.end(), HostLoc::R14));
}
return gprs;
}();
@ -192,10 +192,10 @@ void A64EmitX64::GenTerminalHandlers() {
const auto calculate_location_descriptor = [this] {
// This calculation has to match up with A64::LocationDescriptor::UniqueHash
// TODO: Optimization is available here based on known state of fpcr.
code.mov(rbp, qword[r15 + offsetof(A64JitState, pc)]);
code.mov(rbp, qword[code.ABI_JIT_PTR + offsetof(A64JitState, pc)]);
code.mov(rcx, A64::LocationDescriptor::pc_mask);
code.and_(rcx, rbp);
code.mov(ebx, dword[r15 + offsetof(A64JitState, fpcr)]);
code.mov(ebx, dword[code.ABI_JIT_PTR + offsetof(A64JitState, fpcr)]);
code.and_(ebx, A64::LocationDescriptor::fpcr_mask);
code.shl(rbx, A64::LocationDescriptor::fpcr_shift);
code.or_(rbx, rcx);
@ -207,17 +207,17 @@ void A64EmitX64::GenTerminalHandlers() {
code.align();
terminal_handler_pop_rsb_hint = code.getCurr<const void*>();
calculate_location_descriptor();
code.mov(eax, dword[r15 + offsetof(A64JitState, rsb_ptr)]);
code.dec(eax);
code.mov(eax, dword[code.ABI_JIT_PTR + offsetof(A64JitState, rsb_ptr)]);
code.sub(eax, 1);
code.and_(eax, u32(A64JitState::RSBPtrMask));
code.mov(dword[r15 + offsetof(A64JitState, rsb_ptr)], eax);
code.cmp(rbx, qword[r15 + offsetof(A64JitState, rsb_location_descriptors) + rax * sizeof(u64)]);
code.mov(dword[code.ABI_JIT_PTR + offsetof(A64JitState, rsb_ptr)], eax);
code.cmp(rbx, qword[code.ABI_JIT_PTR + offsetof(A64JitState, rsb_location_descriptors) + rax * sizeof(u64)]);
if (conf.HasOptimization(OptimizationFlag::FastDispatch)) {
code.jne(rsb_cache_miss, code.T_NEAR);
} else {
code.jne(code.GetReturnFromRunCodeAddress());
}
code.mov(rax, qword[r15 + offsetof(A64JitState, rsb_codeptrs) + rax * sizeof(u64)]);
code.mov(rax, qword[code.ABI_JIT_PTR + offsetof(A64JitState, rsb_codeptrs) + rax * sizeof(u64)]);
code.jmp(rax);
PerfMapRegister(terminal_handler_pop_rsb_hint, code.getCurr(), "a64_terminal_handler_pop_rsb_hint");
@ -272,7 +272,7 @@ void A64EmitX64::EmitA64SetCheckBit(A64EmitContext& ctx, IR::Inst* inst) {
void A64EmitX64::EmitA64GetCFlag(A64EmitContext& ctx, IR::Inst* inst) {
const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
code.mov(result, dword[r15 + offsetof(A64JitState, cpsr_nzcv)]);
code.mov(result, dword[code.ABI_JIT_PTR + offsetof(A64JitState, cpsr_nzcv)]);
code.shr(result, NZCV::x64_c_flag_bit);
code.and_(result, 1);
ctx.reg_alloc.DefineValue(inst, result);
@ -281,7 +281,7 @@ void A64EmitX64::EmitA64GetCFlag(A64EmitContext& ctx, IR::Inst* inst) {
void A64EmitX64::EmitA64GetNZCVRaw(A64EmitContext& ctx, IR::Inst* inst) {
const Xbyak::Reg32 nzcv_raw = ctx.reg_alloc.ScratchGpr().cvt32();
code.mov(nzcv_raw, dword[r15 + offsetof(A64JitState, cpsr_nzcv)]);
code.mov(nzcv_raw, dword[code.ABI_JIT_PTR + offsetof(A64JitState, cpsr_nzcv)]);
if (code.HasHostFeature(HostFeature::FastBMI2)) {
const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32();
@ -310,20 +310,20 @@ void A64EmitX64::EmitA64SetNZCVRaw(A64EmitContext& ctx, IR::Inst* inst) {
code.imul(nzcv_raw, nzcv_raw, NZCV::to_x64_multiplier);
code.and_(nzcv_raw, NZCV::x64_mask);
}
code.mov(dword[r15 + offsetof(A64JitState, cpsr_nzcv)], nzcv_raw);
code.mov(dword[code.ABI_JIT_PTR + offsetof(A64JitState, cpsr_nzcv)], nzcv_raw);
}
void A64EmitX64::EmitA64SetNZCV(A64EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Reg32 to_store = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
code.mov(dword[r15 + offsetof(A64JitState, cpsr_nzcv)], to_store);
code.mov(dword[code.ABI_JIT_PTR + offsetof(A64JitState, cpsr_nzcv)], to_store);
}
void A64EmitX64::EmitA64GetW(A64EmitContext& ctx, IR::Inst* inst) {
const A64::Reg reg = inst->GetArg(0).GetA64RegRef();
const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
code.mov(result, dword[r15 + offsetof(A64JitState, reg) + sizeof(u64) * static_cast<size_t>(reg)]);
code.mov(result, dword[code.ABI_JIT_PTR + offsetof(A64JitState, reg) + sizeof(u64) * static_cast<size_t>(reg)]);
ctx.reg_alloc.DefineValue(inst, result);
}
@ -331,13 +331,13 @@ void A64EmitX64::EmitA64GetX(A64EmitContext& ctx, IR::Inst* inst) {
const A64::Reg reg = inst->GetArg(0).GetA64RegRef();
const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr();
code.mov(result, qword[r15 + offsetof(A64JitState, reg) + sizeof(u64) * static_cast<size_t>(reg)]);
code.mov(result, qword[code.ABI_JIT_PTR + offsetof(A64JitState, reg) + sizeof(u64) * static_cast<size_t>(reg)]);
ctx.reg_alloc.DefineValue(inst, result);
}
void A64EmitX64::EmitA64GetS(A64EmitContext& ctx, IR::Inst* inst) {
const A64::Vec vec = inst->GetArg(0).GetA64VecRef();
const auto addr = qword[r15 + offsetof(A64JitState, vec) + sizeof(u64) * 2 * static_cast<size_t>(vec)];
const auto addr = qword[code.ABI_JIT_PTR + offsetof(A64JitState, vec) + sizeof(u64) * 2 * static_cast<size_t>(vec)];
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
code.movd(result, addr);
@ -346,7 +346,7 @@ void A64EmitX64::EmitA64GetS(A64EmitContext& ctx, IR::Inst* inst) {
void A64EmitX64::EmitA64GetD(A64EmitContext& ctx, IR::Inst* inst) {
const A64::Vec vec = inst->GetArg(0).GetA64VecRef();
const auto addr = qword[r15 + offsetof(A64JitState, vec) + sizeof(u64) * 2 * static_cast<size_t>(vec)];
const auto addr = qword[code.ABI_JIT_PTR + offsetof(A64JitState, vec) + sizeof(u64) * 2 * static_cast<size_t>(vec)];
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
code.movq(result, addr);
@ -355,7 +355,7 @@ void A64EmitX64::EmitA64GetD(A64EmitContext& ctx, IR::Inst* inst) {
void A64EmitX64::EmitA64GetQ(A64EmitContext& ctx, IR::Inst* inst) {
const A64::Vec vec = inst->GetArg(0).GetA64VecRef();
const auto addr = xword[r15 + offsetof(A64JitState, vec) + sizeof(u64) * 2 * static_cast<size_t>(vec)];
const auto addr = xword[code.ABI_JIT_PTR + offsetof(A64JitState, vec) + sizeof(u64) * 2 * static_cast<size_t>(vec)];
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
code.movaps(result, addr);
@ -364,13 +364,13 @@ void A64EmitX64::EmitA64GetQ(A64EmitContext& ctx, IR::Inst* inst) {
void A64EmitX64::EmitA64GetSP(A64EmitContext& ctx, IR::Inst* inst) {
const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr();
code.mov(result, qword[r15 + offsetof(A64JitState, sp)]);
code.mov(result, qword[code.ABI_JIT_PTR + offsetof(A64JitState, sp)]);
ctx.reg_alloc.DefineValue(inst, result);
}
void A64EmitX64::EmitA64GetFPCR(A64EmitContext& ctx, IR::Inst* inst) {
const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
code.mov(result, dword[r15 + offsetof(A64JitState, fpcr)]);
code.mov(result, dword[code.ABI_JIT_PTR + offsetof(A64JitState, fpcr)]);
ctx.reg_alloc.DefineValue(inst, result);
}
@ -380,15 +380,15 @@ static u32 GetFPSRImpl(A64JitState* jit_state) {
void A64EmitX64::EmitA64GetFPSR(A64EmitContext& ctx, IR::Inst* inst) {
ctx.reg_alloc.HostCall(inst);
code.mov(code.ABI_PARAM1, code.r15);
code.stmxcsr(code.dword[code.r15 + offsetof(A64JitState, guest_MXCSR)]);
code.mov(code.ABI_PARAM1, code.ABI_JIT_PTR);
code.stmxcsr(code.dword[code.ABI_JIT_PTR + offsetof(A64JitState, guest_MXCSR)]);
code.CallFunction(GetFPSRImpl);
}
void A64EmitX64::EmitA64SetW(A64EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const A64::Reg reg = inst->GetArg(0).GetA64RegRef();
const auto addr = qword[r15 + offsetof(A64JitState, reg) + sizeof(u64) * static_cast<size_t>(reg)];
const auto addr = qword[code.ABI_JIT_PTR + offsetof(A64JitState, reg) + sizeof(u64) * static_cast<size_t>(reg)];
if (args[1].FitsInImmediateS32()) {
code.mov(addr, args[1].GetImmediateS32());
} else {
@ -402,7 +402,7 @@ void A64EmitX64::EmitA64SetW(A64EmitContext& ctx, IR::Inst* inst) {
void A64EmitX64::EmitA64SetX(A64EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const A64::Reg reg = inst->GetArg(0).GetA64RegRef();
const auto addr = qword[r15 + offsetof(A64JitState, reg) + sizeof(u64) * static_cast<size_t>(reg)];
const auto addr = qword[code.ABI_JIT_PTR + offsetof(A64JitState, reg) + sizeof(u64) * static_cast<size_t>(reg)];
if (args[1].FitsInImmediateS32()) {
code.mov(addr, args[1].GetImmediateS32());
} else if (args[1].IsInXmm()) {
@ -417,7 +417,7 @@ void A64EmitX64::EmitA64SetX(A64EmitContext& ctx, IR::Inst* inst) {
void A64EmitX64::EmitA64SetS(A64EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const A64::Vec vec = inst->GetArg(0).GetA64VecRef();
const auto addr = xword[r15 + offsetof(A64JitState, vec) + sizeof(u64) * 2 * static_cast<size_t>(vec)];
const auto addr = xword[code.ABI_JIT_PTR + offsetof(A64JitState, vec) + sizeof(u64) * 2 * static_cast<size_t>(vec)];
const Xbyak::Xmm to_store = ctx.reg_alloc.UseXmm(args[1]);
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
@ -430,7 +430,7 @@ void A64EmitX64::EmitA64SetS(A64EmitContext& ctx, IR::Inst* inst) {
void A64EmitX64::EmitA64SetD(A64EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const A64::Vec vec = inst->GetArg(0).GetA64VecRef();
const auto addr = xword[r15 + offsetof(A64JitState, vec) + sizeof(u64) * 2 * static_cast<size_t>(vec)];
const auto addr = xword[code.ABI_JIT_PTR + offsetof(A64JitState, vec) + sizeof(u64) * 2 * static_cast<size_t>(vec)];
const Xbyak::Xmm to_store = ctx.reg_alloc.UseScratchXmm(args[1]);
code.movq(to_store, to_store); // TODO: Remove when able
@ -440,7 +440,7 @@ void A64EmitX64::EmitA64SetD(A64EmitContext& ctx, IR::Inst* inst) {
void A64EmitX64::EmitA64SetQ(A64EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const A64::Vec vec = inst->GetArg(0).GetA64VecRef();
const auto addr = xword[r15 + offsetof(A64JitState, vec) + sizeof(u64) * 2 * static_cast<size_t>(vec)];
const auto addr = xword[code.ABI_JIT_PTR + offsetof(A64JitState, vec) + sizeof(u64) * 2 * static_cast<size_t>(vec)];
const Xbyak::Xmm to_store = ctx.reg_alloc.UseXmm(args[1]);
code.movaps(addr, to_store);
@ -448,7 +448,7 @@ void A64EmitX64::EmitA64SetQ(A64EmitContext& ctx, IR::Inst* inst) {
void A64EmitX64::EmitA64SetSP(A64EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const auto addr = qword[r15 + offsetof(A64JitState, sp)];
const auto addr = qword[code.ABI_JIT_PTR + offsetof(A64JitState, sp)];
if (args[0].FitsInImmediateS32()) {
code.mov(addr, args[0].GetImmediateS32());
} else if (args[0].IsInXmm()) {
@ -467,9 +467,9 @@ static void SetFPCRImpl(A64JitState* jit_state, u32 value) {
void A64EmitX64::EmitA64SetFPCR(A64EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
ctx.reg_alloc.HostCall(nullptr, {}, args[0]);
code.mov(code.ABI_PARAM1, code.r15);
code.mov(code.ABI_PARAM1, code.ABI_JIT_PTR);
code.CallFunction(SetFPCRImpl);
code.ldmxcsr(code.dword[code.r15 + offsetof(A64JitState, guest_MXCSR)]);
code.ldmxcsr(code.dword[code.ABI_JIT_PTR + offsetof(A64JitState, guest_MXCSR)]);
}
static void SetFPSRImpl(A64JitState* jit_state, u32 value) {
@ -479,14 +479,14 @@ static void SetFPSRImpl(A64JitState* jit_state, u32 value) {
void A64EmitX64::EmitA64SetFPSR(A64EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
ctx.reg_alloc.HostCall(nullptr, {}, args[0]);
code.mov(code.ABI_PARAM1, code.r15);
code.mov(code.ABI_PARAM1, code.ABI_JIT_PTR);
code.CallFunction(SetFPSRImpl);
code.ldmxcsr(code.dword[code.r15 + offsetof(A64JitState, guest_MXCSR)]);
code.ldmxcsr(code.dword[code.ABI_JIT_PTR + offsetof(A64JitState, guest_MXCSR)]);
}
void A64EmitX64::EmitA64SetPC(A64EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const auto addr = qword[r15 + offsetof(A64JitState, pc)];
const auto addr = qword[code.ABI_JIT_PTR + offsetof(A64JitState, pc)];
if (args[0].FitsInImmediateS32()) {
code.mov(addr, args[0].GetImmediateS32());
} else if (args[0].IsInXmm()) {
@ -507,7 +507,7 @@ void A64EmitX64::EmitA64CallSupervisor(A64EmitContext& ctx, IR::Inst* inst) {
code.mov(param[0], imm);
});
// The kernel would have to execute ERET to get here, which would clear exclusive state.
code.mov(code.byte[r15 + offsetof(A64JitState, exclusive_state)], u8(0));
code.mov(code.byte[code.ABI_JIT_PTR + offsetof(A64JitState, exclusive_state)], u8(0));
}
void A64EmitX64::EmitA64ExceptionRaised(A64EmitContext& ctx, IR::Inst* inst) {
@ -621,7 +621,7 @@ void A64EmitX64::EmitTerminalImpl(IR::Term::Interpret terminal, IR::LocationDesc
code.SwitchMxcsrOnExit();
Devirtualize<&A64::UserCallbacks::InterpreterFallback>(conf.callbacks).EmitCall(code, [&](RegList param) {
code.mov(param[0], A64::LocationDescriptor{terminal.next}.PC());
code.mov(qword[r15 + offsetof(A64JitState, pc)], param[0]);
code.mov(qword[code.ABI_JIT_PTR + offsetof(A64JitState, pc)], param[0]);
code.mov(param[1].cvt32(), terminal.num_instructions);
});
code.ReturnFromRunCode(true); // TODO: Check cycles
@ -632,61 +632,56 @@ void A64EmitX64::EmitTerminalImpl(IR::Term::ReturnToDispatch, IR::LocationDescri
}
void A64EmitX64::EmitTerminalImpl(IR::Term::LinkBlock terminal, IR::LocationDescriptor, bool is_single_step) {
if (!conf.HasOptimization(OptimizationFlag::BlockLinking) || is_single_step) {
// Used for patches and linking
if (conf.HasOptimization(OptimizationFlag::BlockLinking) && !is_single_step) {
if (conf.enable_cycle_counting) {
code.cmp(qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, cycles_remaining)], 0);
patch_information[terminal.next].jg.push_back(code.getCurr());
if (const auto next_bb = GetBasicBlock(terminal.next)) {
EmitPatchJg(terminal.next, next_bb->entrypoint);
} else {
EmitPatchJg(terminal.next);
}
} else {
code.cmp(dword[code.ABI_JIT_PTR + offsetof(A64JitState, halt_reason)], 0);
patch_information[terminal.next].jz.push_back(code.getCurr());
if (const auto next_bb = GetBasicBlock(terminal.next)) {
EmitPatchJz(terminal.next, next_bb->entrypoint);
} else {
EmitPatchJz(terminal.next);
}
}
code.mov(rax, A64::LocationDescriptor{terminal.next}.PC());
code.mov(qword[r15 + offsetof(A64JitState, pc)], rax);
code.ReturnFromRunCode();
return;
}
if (conf.enable_cycle_counting) {
code.cmp(qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, cycles_remaining)], 0);
patch_information[terminal.next].jg.push_back(code.getCurr());
if (const auto next_bb = GetBasicBlock(terminal.next)) {
EmitPatchJg(terminal.next, next_bb->entrypoint);
} else {
EmitPatchJg(terminal.next);
}
code.mov(qword[code.ABI_JIT_PTR + offsetof(A64JitState, pc)], rax);
code.ForceReturnFromRunCode();
} else {
code.cmp(dword[r15 + offsetof(A64JitState, halt_reason)], 0);
patch_information[terminal.next].jz.push_back(code.getCurr());
if (const auto next_bb = GetBasicBlock(terminal.next)) {
EmitPatchJz(terminal.next, next_bb->entrypoint);
} else {
EmitPatchJz(terminal.next);
}
code.mov(rax, A64::LocationDescriptor{terminal.next}.PC());
code.mov(qword[code.ABI_JIT_PTR + offsetof(A64JitState, pc)], rax);
code.ReturnFromRunCode();
}
code.mov(rax, A64::LocationDescriptor{terminal.next}.PC());
code.mov(qword[r15 + offsetof(A64JitState, pc)], rax);
code.ForceReturnFromRunCode();
}
void A64EmitX64::EmitTerminalImpl(IR::Term::LinkBlockFast terminal, IR::LocationDescriptor, bool is_single_step) {
if (!conf.HasOptimization(OptimizationFlag::BlockLinking) || is_single_step) {
code.mov(rax, A64::LocationDescriptor{terminal.next}.PC());
code.mov(qword[r15 + offsetof(A64JitState, pc)], rax);
code.ReturnFromRunCode();
return;
}
patch_information[terminal.next].jmp.push_back(code.getCurr());
if (auto next_bb = GetBasicBlock(terminal.next)) {
EmitPatchJmp(terminal.next, next_bb->entrypoint);
if (conf.HasOptimization(OptimizationFlag::BlockLinking) && !is_single_step) {
patch_information[terminal.next].jmp.push_back(code.getCurr());
if (auto next_bb = GetBasicBlock(terminal.next)) {
EmitPatchJmp(terminal.next, next_bb->entrypoint);
} else {
EmitPatchJmp(terminal.next);
}
} else {
EmitPatchJmp(terminal.next);
code.mov(rax, A64::LocationDescriptor{terminal.next}.PC());
code.mov(qword[code.ABI_JIT_PTR + offsetof(A64JitState, pc)], rax);
code.ReturnFromRunCode();
}
}
void A64EmitX64::EmitTerminalImpl(IR::Term::PopRSBHint, IR::LocationDescriptor, bool is_single_step) {
if (!conf.HasOptimization(OptimizationFlag::ReturnStackBuffer) || is_single_step) {
if (conf.HasOptimization(OptimizationFlag::ReturnStackBuffer) && !is_single_step) {
code.jmp(terminal_handler_pop_rsb_hint);
} else {
code.ReturnFromRunCode();
return;
}
code.jmp(terminal_handler_pop_rsb_hint);
}
void A64EmitX64::EmitTerminalImpl(IR::Term::FastDispatchHint, IR::LocationDescriptor, bool is_single_step) {
@ -723,7 +718,7 @@ void A64EmitX64::EmitTerminalImpl(IR::Term::CheckBit terminal, IR::LocationDescr
}
void A64EmitX64::EmitTerminalImpl(IR::Term::CheckHalt terminal, IR::LocationDescriptor initial_location, bool is_single_step) {
code.cmp(dword[r15 + offsetof(A64JitState, halt_reason)], 0);
code.cmp(dword[code.ABI_JIT_PTR + offsetof(A64JitState, halt_reason)], 0);
code.jne(code.GetForceReturnFromRunCodeAddress());
EmitTerminal(terminal.else_, initial_location, is_single_step);
}
@ -734,7 +729,7 @@ void A64EmitX64::EmitPatchJg(const IR::LocationDescriptor& target_desc, CodePtr
code.jg(target_code_ptr);
} else {
code.mov(rax, A64::LocationDescriptor{target_desc}.PC());
code.mov(qword[r15 + offsetof(A64JitState, pc)], rax);
code.mov(qword[code.ABI_JIT_PTR + offsetof(A64JitState, pc)], rax);
code.jg(code.GetReturnFromRunCodeAddress());
}
code.EnsurePatchLocationSize(patch_location, 23);
@ -746,7 +741,7 @@ void A64EmitX64::EmitPatchJz(const IR::LocationDescriptor& target_desc, CodePtr
code.jz(target_code_ptr);
} else {
code.mov(rax, A64::LocationDescriptor{target_desc}.PC());
code.mov(qword[r15 + offsetof(A64JitState, pc)], rax);
code.mov(qword[code.ABI_JIT_PTR + offsetof(A64JitState, pc)], rax);
code.jz(code.GetReturnFromRunCodeAddress());
}
code.EnsurePatchLocationSize(patch_location, 23);
@ -758,7 +753,7 @@ void A64EmitX64::EmitPatchJmp(const IR::LocationDescriptor& target_desc, CodePtr
code.jmp(target_code_ptr);
} else {
code.mov(rax, A64::LocationDescriptor{target_desc}.PC());
code.mov(qword[r15 + offsetof(A64JitState, pc)], rax);
code.mov(qword[code.ABI_JIT_PTR + offsetof(A64JitState, pc)], rax);
code.jmp(code.GetReturnFromRunCodeAddress());
}
code.EnsurePatchLocationSize(patch_location, 22);

View file

@ -127,10 +127,10 @@ protected:
BlockRangeInformation<u64> block_ranges;
std::array<FastDispatchEntry, fast_dispatch_table_size> fast_dispatch_table;
ankerl::unordered_dense::map<u64, FastmemPatchInfo> fastmem_patch_info;
std::map<std::tuple<bool, size_t, int, int>, void (*)()> read_fallbacks;
std::map<std::tuple<bool, size_t, int, int>, void (*)()> write_fallbacks;
std::map<std::tuple<bool, size_t, int, int>, void (*)()> exclusive_write_fallbacks;
std::set<DoNotFastmemMarker> do_not_fastmem;
ankerl::unordered_dense::map<std::tuple<bool, size_t, int, int>, void (*)()> read_fallbacks;
ankerl::unordered_dense::map<std::tuple<bool, size_t, int, int>, void (*)()> write_fallbacks;
ankerl::unordered_dense::map<std::tuple<bool, size_t, int, int>, void (*)()> exclusive_write_fallbacks;
ankerl::unordered_dense::set<DoNotFastmemMarker> do_not_fastmem;
const void* terminal_handler_pop_rsb_hint = nullptr;
const void* terminal_handler_fast_dispatch_hint = nullptr;
FastDispatchEntry& (*fast_dispatch_table_lookup)(u64) = nullptr;

View file

@ -324,7 +324,7 @@ void A64EmitX64::EmitA64WriteMemory128(A64EmitContext& ctx, IR::Inst* inst) {
}
void A64EmitX64::EmitA64ClearExclusive(A64EmitContext&, IR::Inst*) {
code.mov(code.byte[r15 + offsetof(A64JitState, exclusive_state)], u8(0));
code.mov(code.byte[code.ABI_JIT_PTR + offsetof(A64JitState, exclusive_state)], u8(0));
}
void A64EmitX64::EmitA64ExclusiveReadMemory8(A64EmitContext& ctx, IR::Inst* inst) {
@ -416,14 +416,14 @@ void A64EmitX64::EmitCheckMemoryAbort(A64EmitContext&, IR::Inst* inst, Xbyak::La
const A64::LocationDescriptor current_location{IR::LocationDescriptor{inst->GetArg(0).GetU64()}};
code.test(dword[r15 + offsetof(A64JitState, halt_reason)], static_cast<u32>(HaltReason::MemoryAbort));
code.test(dword[code.ABI_JIT_PTR + offsetof(A64JitState, halt_reason)], static_cast<u32>(HaltReason::MemoryAbort));
if (end) {
code.jz(*end, code.T_NEAR);
} else {
code.jz(skip, code.T_NEAR);
}
code.mov(rax, current_location.PC());
code.mov(qword[r15 + offsetof(A64JitState, pc)], rax);
code.mov(qword[code.ABI_JIT_PTR + offsetof(A64JitState, pc)], rax);
code.ForceReturnFromRunCode();
code.L(skip);
}

View file

@ -119,6 +119,20 @@ void ABI_PopCallerSaveRegistersAndAdjustStack(BlockOfCode& code, const std::size
ABI_PopRegistersAndAdjustStack(code, frame_size, ABI_ALL_CALLER_SAVE);
}
// Windows ABI registers do not follow the same allocation algorithm as Unix's
#ifdef _MSC_VER
void ABI_PushCallerSaveRegistersAndAdjustStackExcept(BlockOfCode& code, const HostLoc exception) {
std::vector<HostLoc> regs;
std::remove_copy(ABI_ALL_CALLER_SAVE.begin(), ABI_ALL_CALLER_SAVE.end(), std::back_inserter(regs), exception);
ABI_PushRegistersAndAdjustStack(code, 0, regs);
}
void ABI_PopCallerSaveRegistersAndAdjustStackExcept(BlockOfCode& code, const HostLoc exception) {
std::vector<HostLoc> regs;
std::remove_copy(ABI_ALL_CALLER_SAVE.begin(), ABI_ALL_CALLER_SAVE.end(), std::back_inserter(regs), exception);
ABI_PopRegistersAndAdjustStack(code, 0, regs);
}
#else
static consteval size_t ABI_AllCallerSaveSize() noexcept {
return ABI_ALL_CALLER_SAVE.max_size();
}
@ -166,24 +180,14 @@ alignas(64) static constinit std::array<HostLoc, ABI_AllCallerSaveSize() - 1> AB
};
void ABI_PushCallerSaveRegistersAndAdjustStackExcept(BlockOfCode& code, const HostLoc exception) {
#ifdef _MSC_VER
std::vector<HostLoc> regs;
std::remove_copy(ABI_ALL_CALLER_SAVE.begin(), ABI_ALL_CALLER_SAVE.end(), std::back_inserter(regs), exception);
ABI_PushRegistersAndAdjustStack(code, 0, regs);
#else
ASSUME(size_t(exception) < 32);
ABI_PushRegistersAndAdjustStack(code, 0, ABI_CALLER_SAVED_EXCEPT_TABLE[size_t(exception)]);
#endif
}
void ABI_PopCallerSaveRegistersAndAdjustStackExcept(BlockOfCode& code, const HostLoc exception) {
#ifdef _MSC_VER
std::vector<HostLoc> regs;
std::remove_copy(ABI_ALL_CALLER_SAVE.begin(), ABI_ALL_CALLER_SAVE.end(), std::back_inserter(regs), exception);
ABI_PopRegistersAndAdjustStack(code, 0, regs);
#else
ASSUME(size_t(exception) < 32);
ABI_PopRegistersAndAdjustStack(code, 0, ABI_CALLER_SAVED_EXCEPT_TABLE[size_t(exception)]);
#endif
}
#endif
} // namespace Dynarmic::Backend::X64
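The non-MSVC path above trades the runtime `std::remove_copy` into a `std::vector` for a compile-time lookup table indexed by the excluded register (the `ASSUME(size_t(exception) < 32)` guard keeps the index in range). A self-contained sketch of that table-building idea, using a simplified register set and hypothetical names:

```cpp
#include <array>
#include <cstddef>

enum class HostLoc : std::size_t { RAX, RCX, RDX, R8, R9, R10, R11, COUNT };

// The caller-saved set (illustrative subset).
constexpr std::array<HostLoc, 7> kCallerSave{
    HostLoc::RAX, HostLoc::RCX, HostLoc::RDX, HostLoc::R8,
    HostLoc::R9,  HostLoc::R10, HostLoc::R11};

// One row per possibly-excluded register: the caller-save set minus that
// register, computed entirely at compile time. Rows for registers that are
// not caller-saved are unused (the real code asserts the exception is valid).
consteval auto BuildExceptTable() {
    std::array<std::array<HostLoc, kCallerSave.size() - 1>,
               static_cast<std::size_t>(HostLoc::COUNT)> table{};
    for (std::size_t excl = 0; excl < table.size(); ++excl) {
        std::size_t out = 0;
        for (const HostLoc reg : kCallerSave) {
            if (static_cast<std::size_t>(reg) != excl && out < kCallerSave.size() - 1) {
                table[excl][out++] = reg;
            }
        }
    }
    return table;
}

constexpr auto kCallerSaveExceptTable = BuildExceptTable();
// Push/pop sites then index the table directly: no std::vector, no runtime filtering.
```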

View file

@ -17,6 +17,7 @@ namespace Dynarmic::Backend::X64 {
class BlockOfCode;
constexpr HostLoc ABI_JIT_PTR = HostLoc::R15;
#ifdef _WIN32
constexpr HostLoc ABI_RETURN = HostLoc::RAX;

View file

@ -36,6 +36,7 @@
namespace Dynarmic::Backend::X64 {
const Xbyak::Reg64 BlockOfCode::ABI_JIT_PTR = HostLocToReg64(Dynarmic::Backend::X64::ABI_JIT_PTR);
#ifdef _WIN32
const Xbyak::Reg64 BlockOfCode::ABI_RETURN = HostLocToReg64(Dynarmic::Backend::X64::ABI_RETURN);
const Xbyak::Reg64 BlockOfCode::ABI_PARAM1 = HostLocToReg64(Dynarmic::Backend::X64::ABI_PARAM1);
@ -322,8 +323,8 @@ void BlockOfCode::GenRunCode(std::function<void(BlockOfCode&)> rcp) {
// that the stack is appropriately aligned for CALLs.
ABI_PushCalleeSaveRegistersAndAdjustStack(*this, sizeof(StackLayout));
mov(r15, ABI_PARAM1);
mov(rbx, ABI_PARAM2); // save temporarily in non-volatile register
mov(ABI_JIT_PTR, ABI_PARAM1);
mov(rbx, ABI_PARAM2); // save temporarily in non-volatile register
if (cb.enable_cycle_counting) {
cb.GetTicksRemaining->EmitCall(*this);
@ -331,9 +332,11 @@ void BlockOfCode::GenRunCode(std::function<void(BlockOfCode&)> rcp) {
mov(qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, cycles_remaining)], ABI_RETURN);
}
// r14 = page table
// r13 = fastmem pointer
rcp(*this);
cmp(dword[r15 + jsi.offsetof_halt_reason], 0);
cmp(dword[ABI_JIT_PTR + jsi.offsetof_halt_reason], 0);
jne(return_to_caller_mxcsr_already_exited, T_NEAR);
SwitchMxcsrOnEntry();
@ -344,7 +347,7 @@ void BlockOfCode::GenRunCode(std::function<void(BlockOfCode&)> rcp) {
ABI_PushCalleeSaveRegistersAndAdjustStack(*this, sizeof(StackLayout));
mov(r15, ABI_PARAM1);
mov(ABI_JIT_PTR, ABI_PARAM1);
if (cb.enable_cycle_counting) {
mov(qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, cycles_to_run)], 1);
@ -353,10 +356,10 @@ void BlockOfCode::GenRunCode(std::function<void(BlockOfCode&)> rcp) {
rcp(*this);
cmp(dword[r15 + jsi.offsetof_halt_reason], 0);
cmp(dword[ABI_JIT_PTR + jsi.offsetof_halt_reason], 0);
jne(return_to_caller_mxcsr_already_exited, T_NEAR);
lock();
or_(dword[r15 + jsi.offsetof_halt_reason], static_cast<u32>(HaltReason::Step));
or_(dword[ABI_JIT_PTR + jsi.offsetof_halt_reason], static_cast<u32>(HaltReason::Step));
SwitchMxcsrOnEntry();
jmp(ABI_PARAM2);
@ -366,7 +369,7 @@ void BlockOfCode::GenRunCode(std::function<void(BlockOfCode&)> rcp) {
align();
return_from_run_code[0] = getCurr<const void*>();
cmp(dword[r15 + jsi.offsetof_halt_reason], 0);
cmp(dword[ABI_JIT_PTR + jsi.offsetof_halt_reason], 0);
jne(return_to_caller);
if (cb.enable_cycle_counting) {
cmp(qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, cycles_remaining)], 0);
@ -378,7 +381,7 @@ void BlockOfCode::GenRunCode(std::function<void(BlockOfCode&)> rcp) {
align();
return_from_run_code[MXCSR_ALREADY_EXITED] = getCurr<const void*>();
cmp(dword[r15 + jsi.offsetof_halt_reason], 0);
cmp(dword[ABI_JIT_PTR + jsi.offsetof_halt_reason], 0);
jne(return_to_caller_mxcsr_already_exited);
if (cb.enable_cycle_counting) {
cmp(qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, cycles_remaining)], 0);
@ -407,7 +410,7 @@ void BlockOfCode::GenRunCode(std::function<void(BlockOfCode&)> rcp) {
xor_(eax, eax);
lock();
xchg(dword[r15 + jsi.offsetof_halt_reason], eax);
xchg(dword[ABI_JIT_PTR + jsi.offsetof_halt_reason], eax);
ABI_PopCalleeSaveRegistersAndAdjustStack(*this, sizeof(StackLayout));
ret();
@ -417,22 +420,22 @@ void BlockOfCode::GenRunCode(std::function<void(BlockOfCode&)> rcp) {
void BlockOfCode::SwitchMxcsrOnEntry() {
stmxcsr(dword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, save_host_MXCSR)]);
ldmxcsr(dword[r15 + jsi.offsetof_guest_MXCSR]);
ldmxcsr(dword[ABI_JIT_PTR + jsi.offsetof_guest_MXCSR]);
}
void BlockOfCode::SwitchMxcsrOnExit() {
stmxcsr(dword[r15 + jsi.offsetof_guest_MXCSR]);
stmxcsr(dword[ABI_JIT_PTR + jsi.offsetof_guest_MXCSR]);
ldmxcsr(dword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, save_host_MXCSR)]);
}
void BlockOfCode::EnterStandardASIMD() {
stmxcsr(dword[r15 + jsi.offsetof_guest_MXCSR]);
ldmxcsr(dword[r15 + jsi.offsetof_asimd_MXCSR]);
stmxcsr(dword[ABI_JIT_PTR + jsi.offsetof_guest_MXCSR]);
ldmxcsr(dword[ABI_JIT_PTR + jsi.offsetof_asimd_MXCSR]);
}
void BlockOfCode::LeaveStandardASIMD() {
stmxcsr(dword[r15 + jsi.offsetof_asimd_MXCSR]);
ldmxcsr(dword[r15 + jsi.offsetof_guest_MXCSR]);
stmxcsr(dword[ABI_JIT_PTR + jsi.offsetof_asimd_MXCSR]);
ldmxcsr(dword[ABI_JIT_PTR + jsi.offsetof_guest_MXCSR]);
}
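// A minimal scalar sketch of what the MXCSR-swap pairs above do, assuming the
// <xmmintrin.h> intrinsics; dynarmic keeps the saved word in StackLayout rather
// than returning it, so this is an illustration, not the JIT's actual code.
#include <xmmintrin.h>
unsigned int SwapInGuestMxcsr(unsigned int guest_mxcsr) {
    const unsigned int host_mxcsr = _mm_getcsr();  // stmxcsr save_host_MXCSR
    _mm_setcsr(guest_mxcsr);                       // ldmxcsr guest_MXCSR
    return host_mxcsr;                             // restored by the exit pair
}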
void BlockOfCode::UpdateTicks() {

View file

@ -155,6 +155,7 @@ public:
void SetCodePtr(CodePtr code_ptr);
void EnsurePatchLocationSize(CodePtr begin, size_t size);
static const Xbyak::Reg64 ABI_JIT_PTR;
// ABI registers
#ifdef _WIN32
static const Xbyak::Reg64 ABI_RETURN;

View file

@ -91,19 +91,18 @@ void EmitX64::PushRSBHelper(Xbyak::Reg64 loc_desc_reg, Xbyak::Reg64 index_reg, I
? iter->second.entrypoint
: code.GetReturnFromRunCodeAddress();
code.mov(index_reg.cvt32(), dword[r15 + code.GetJitStateInfo().offsetof_rsb_ptr]);
code.mov(index_reg.cvt32(), dword[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_rsb_ptr]);
code.mov(loc_desc_reg, target.Value());
patch_information[target].mov_rcx.push_back(code.getCurr());
EmitPatchMovRcx(target_code_ptr);
code.mov(qword[r15 + index_reg * 8 + code.GetJitStateInfo().offsetof_rsb_location_descriptors], loc_desc_reg);
code.mov(qword[r15 + index_reg * 8 + code.GetJitStateInfo().offsetof_rsb_codeptrs], rcx);
code.add(index_reg.cvt32(), 1);
code.and_(index_reg.cvt32(), u32(code.GetJitStateInfo().rsb_ptr_mask));
code.mov(dword[r15 + code.GetJitStateInfo().offsetof_rsb_ptr], index_reg.cvt32());
code.mov(qword[code.ABI_JIT_PTR + index_reg * 8 + code.GetJitStateInfo().offsetof_rsb_location_descriptors], loc_desc_reg);
code.mov(qword[code.ABI_JIT_PTR + index_reg * 8 + code.GetJitStateInfo().offsetof_rsb_codeptrs], rcx);
// Byte size hack
DEBUG_ASSERT(code.GetJitStateInfo().rsb_ptr_mask <= 0xFF);
code.add(index_reg.cvt32(), 1); // flags trashed; single-byte encoding, Haswell doesn't care
code.and_(index_reg.cvt32(), u32(code.GetJitStateInfo().rsb_ptr_mask)); // trashes flags
// Results are ready; stores ordered least-needed first to give the OoO core some slack
code.mov(dword[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_rsb_ptr], index_reg.cvt32());
}
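// Hypothetical scalar model of the RSB push above (names assumed, not the
// dynarmic API): the return stack buffer is a small ring of (location
// descriptor, code pointer) pairs, with rsb_ptr_mask keeping the index in range.
#include <cstdint>
struct RsbModel {
    static constexpr uint32_t rsb_ptr_mask = 7;  // assumed 8-entry ring
    uint64_t location_descriptors[8];
    uint64_t codeptrs[8];
    uint32_t rsb_ptr = 0;
};
inline void PushRsb(RsbModel& rsb, uint64_t loc_desc, uint64_t code_ptr) {
    rsb.location_descriptors[rsb.rsb_ptr] = loc_desc;
    rsb.codeptrs[rsb.rsb_ptr] = code_ptr;
    rsb.rsb_ptr = (rsb.rsb_ptr + 1) & RsbModel::rsb_ptr_mask;  // the add + and above
}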
void EmitX64::EmitVerboseDebuggingOutput(RegAlloc& reg_alloc) {
@ -119,7 +118,7 @@ void EmitX64::EmitVerboseDebuggingOutput(RegAlloc& reg_alloc) {
code.movaps(xword[rsp + offsetof(RegisterData, xmms) + 2 * sizeof(u64) * i], Xbyak::Xmm{i});
}
code.lea(rax, ptr[rsp + sizeof(RegisterData) + offsetof(StackLayout, spill)]);
code.mov(xword[rsp + offsetof(RegisterData, spill)], rax);
code.mov(qword[rsp + offsetof(RegisterData, spill)], rax);
reg_alloc.EmitVerboseDebuggingOutput();
@ -285,7 +284,7 @@ void EmitX64::EmitAddCycles(size_t cycles) {
Xbyak::Label EmitX64::EmitCond(IR::Cond cond) {
Xbyak::Label pass;
code.mov(eax, dword[r15 + code.GetJitStateInfo().offsetof_cpsr_nzcv]);
code.mov(eax, dword[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_cpsr_nzcv]);
code.LoadRequiredFlagsForCondFromRax(cond);

View file

@ -18,24 +18,20 @@ namespace CRC32 = Common::Crypto::CRC32;
static void EmitCRC32Castagnoli(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, const int data_size) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasHostFeature(HostFeature::SSE42)) {
const Xbyak::Reg32 crc = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
const Xbyak::Reg value = ctx.reg_alloc.UseGpr(args[1]).changeBit(data_size);
if (data_size != 64) {
code.crc32(crc, value);
} else {
code.crc32(crc.cvt64(), value);
}
ctx.reg_alloc.DefineValue(inst, crc);
return;
} else {
ctx.reg_alloc.HostCall(inst, args[0], args[1], {});
code.mov(code.ABI_PARAM3.cvt32(), data_size / CHAR_BIT); //zext
code.CallFunction(&CRC32::ComputeCRC32Castagnoli);
}
ctx.reg_alloc.HostCall(inst, args[0], args[1], {});
code.mov(code.ABI_PARAM3, data_size / CHAR_BIT);
code.CallFunction(&CRC32::ComputeCRC32Castagnoli);
}
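// Sketch of what the SSE4.2 fast path computes for one 32-bit chunk, using the
// matching intrinsic from <nmmintrin.h>; the emitted crc32 instruction performs
// the same Castagnoli (CRC-32C) polynomial step.
#include <cstdint>
#include <nmmintrin.h>
inline uint32_t Crc32cStep32(uint32_t crc, uint32_t value) {
    return _mm_crc32_u32(crc, value);  // crc32 r32, r32
}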
static void EmitCRC32ISO(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, const int data_size) {
@ -69,10 +65,7 @@ static void EmitCRC32ISO(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, co
code.pextrd(crc, xmm_value, 2);
ctx.reg_alloc.DefineValue(inst, crc);
return;
}
if (code.HasHostFeature(HostFeature::PCLMULQDQ) && data_size == 32) {
} else if (code.HasHostFeature(HostFeature::PCLMULQDQ) && data_size == 32) {
const Xbyak::Reg32 crc = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
const Xbyak::Reg32 value = ctx.reg_alloc.UseGpr(args[1]).cvt32();
const Xbyak::Xmm xmm_value = ctx.reg_alloc.ScratchXmm();
@ -90,10 +83,7 @@ static void EmitCRC32ISO(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, co
code.pextrd(crc, xmm_value, 2);
ctx.reg_alloc.DefineValue(inst, crc);
return;
}
if (code.HasHostFeature(HostFeature::PCLMULQDQ) && data_size == 64) {
} else if (code.HasHostFeature(HostFeature::PCLMULQDQ) && data_size == 64) {
const Xbyak::Reg32 crc = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
const Xbyak::Reg64 value = ctx.reg_alloc.UseGpr(args[1]);
const Xbyak::Xmm xmm_value = ctx.reg_alloc.ScratchXmm();
@ -111,12 +101,11 @@ static void EmitCRC32ISO(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, co
code.pextrd(crc, xmm_value, 2);
ctx.reg_alloc.DefineValue(inst, crc);
return;
} else {
ctx.reg_alloc.HostCall(inst, args[0], args[1], {});
code.mov(code.ABI_PARAM3, data_size / CHAR_BIT);
code.CallFunction(&CRC32::ComputeCRC32ISO);
}
ctx.reg_alloc.HostCall(inst, args[0], args[1], {});
code.mov(code.ABI_PARAM3, data_size / CHAR_BIT);
code.CallFunction(&CRC32::ComputeCRC32ISO);
}
void EmitX64::EmitCRC32Castagnoli8(EmitContext& ctx, IR::Inst* inst) {

View file

@ -143,7 +143,7 @@ static void EmitConditionalSelect(BlockOfCode& code, EmitContext& ctx, IR::Inst*
const Xbyak::Reg then_ = ctx.reg_alloc.UseGpr(args[1]).changeBit(bitsize);
const Xbyak::Reg else_ = ctx.reg_alloc.UseScratchGpr(args[2]).changeBit(bitsize);
code.mov(nzcv, dword[r15 + code.GetJitStateInfo().offsetof_cpsr_nzcv]);
code.mov(nzcv, dword[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_cpsr_nzcv]);
code.LoadRequiredFlagsForCondFromRax(args[0].GetImmediateCond());
@ -909,11 +909,11 @@ static Xbyak::Reg8 DoCarry(RegAlloc& reg_alloc, Argument& carry_in, IR::Inst* ca
}
}
// AX holds the flags: AH from LAHF (SF/ZF/CF), AL from SETO (OF)
static Xbyak::Reg64 DoNZCV(BlockOfCode& code, RegAlloc& reg_alloc, IR::Inst* nzcv_out) {
if (!nzcv_out) {
return Xbyak::Reg64{-1};
}
const Xbyak::Reg64 nzcv = reg_alloc.ScratchGpr(HostLoc::RAX);
code.xor_(nzcv.cvt32(), nzcv.cvt32());
return nzcv;
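// Illustration of the flag encoding the comment above refers to, assuming the
// usual LAHF layout plus SETO in AL; the exact cpsr_nzcv packing is
// dynarmic-internal, so treat this as a sketch only.
#include <cstdint>
inline uint32_t NzcvFromLahfSeto(uint16_t ax) {
    const uint8_t ah = static_cast<uint8_t>(ax >> 8);  // SF:ZF:-:AF:-:PF:1:CF
    const uint8_t al = static_cast<uint8_t>(ax);       // OF from SETO
    const uint32_t n = (ah >> 7) & 1, z = (ah >> 6) & 1, c = ah & 1, v = al & 1;
    return (n << 31) | (z << 30) | (c << 29) | (v << 28);  // ARM NZCV bit layout
}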
@ -1168,7 +1168,7 @@ void EmitX64::EmitUnsignedDiv32(EmitContext& ctx, IR::Inst* inst) {
code.xor_(eax, eax);
code.test(divisor, divisor);
code.jz(end);
code.jz(end, code.T_NEAR);
code.mov(eax, dividend);
code.xor_(edx, edx);
code.div(divisor);
@ -1189,7 +1189,7 @@ void EmitX64::EmitUnsignedDiv64(EmitContext& ctx, IR::Inst* inst) {
code.xor_(eax, eax);
code.test(divisor, divisor);
code.jz(end);
code.jz(end, code.T_NEAR);
code.mov(rax, dividend);
code.xor_(edx, edx);
code.div(divisor);
@ -1568,14 +1568,14 @@ void EmitX64::EmitCountLeadingZeros32(EmitContext& ctx, IR::Inst* inst) {
} else {
const Xbyak::Reg32 source = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
const Xbyak::Reg32 temp = ctx.reg_alloc.ScratchGpr().cvt32();
// The result of a bsr of zero is undefined, but zf is set after it.
code.bsr(result, source);
code.mov(source, 0xFFFFFFFF);
code.cmovz(result, source);
code.neg(result);
code.add(result, 31);
code.mov(temp, 32);
code.xor_(result, 31);
code.test(source, source);
code.cmove(result, temp);
ctx.reg_alloc.DefineValue(inst, result);
}
}
@ -1592,14 +1592,14 @@ void EmitX64::EmitCountLeadingZeros64(EmitContext& ctx, IR::Inst* inst) {
} else {
const Xbyak::Reg64 source = ctx.reg_alloc.UseScratchGpr(args[0]).cvt64();
const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr().cvt64();
const Xbyak::Reg64 temp = ctx.reg_alloc.ScratchGpr().cvt64();
// The result of a bsr of zero is undefined, but zf is set after it.
code.bsr(result, source);
code.mov(source.cvt32(), 0xFFFFFFFF);
code.cmovz(result.cvt32(), source.cvt32());
code.neg(result.cvt32());
code.add(result.cvt32(), 63);
code.mov(temp.cvt32(), 64);
code.xor_(result.cvt32(), 63);
code.test(source, source);
code.cmove(result.cvt32(), temp.cvt32());
ctx.reg_alloc.DefineValue(inst, result);
}
}
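// Reference model for both rewrites above: for nonzero x, bsr gives the index
// of the highest set bit, and clz == bsr ^ 31 (or ^ 63 in the 64-bit case);
// the mov/test/cmove triple substitutes the bit width when the input is zero.
#include <cstdint>
inline uint32_t Clz32Model(uint32_t x) {
    if (x == 0)
        return 32;       // cmove picks temp, which holds 32
    uint32_t idx = 31;   // bsr: index of the highest set bit
    while (!(x & (1u << idx)))
        --idx;
    return idx ^ 31;     // the xor trick that replaced neg + add
}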

View file

@ -712,12 +712,12 @@ static void EmitFPMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
code.mov(code.ABI_PARAM4.cvt32(), ctx.FPCR().Value());
#ifdef _WIN32
code.lea(rsp, ptr[rsp - (16 + ABI_SHADOW_SPACE)]);
code.lea(rax, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]);
code.lea(rax, code.ptr[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_fpsr_exc]);
code.mov(qword[rsp + ABI_SHADOW_SPACE], rax);
code.CallFunction(fallback_fn);
code.add(rsp, 16 + ABI_SHADOW_SPACE);
#else
code.lea(code.ABI_PARAM5, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]);
code.lea(code.ABI_PARAM5, code.ptr[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_fpsr_exc]);
code.CallFunction(fallback_fn);
#endif
code.movq(result, code.ABI_RETURN);
@ -821,12 +821,12 @@ static void EmitFPMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
code.mov(code.ABI_PARAM4.cvt32(), ctx.FPCR().Value());
#ifdef _WIN32
ctx.reg_alloc.AllocStackSpace(16 + ABI_SHADOW_SPACE);
code.lea(rax, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]);
code.lea(rax, code.ptr[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_fpsr_exc]);
code.mov(qword[rsp + ABI_SHADOW_SPACE], rax);
code.CallFunction(fallback_fn);
ctx.reg_alloc.ReleaseStackSpace(16 + ABI_SHADOW_SPACE);
#else
code.lea(code.ABI_PARAM5, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]);
code.lea(code.ABI_PARAM5, code.ptr[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_fpsr_exc]);
code.CallFunction(fallback_fn);
#endif
}
@ -945,7 +945,7 @@ static void EmitFPRecipEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* i
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
ctx.reg_alloc.HostCall(inst, args[0]);
code.mov(code.ABI_PARAM2.cvt32(), ctx.FPCR().Value());
code.lea(code.ABI_PARAM3, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]);
code.lea(code.ABI_PARAM3, code.ptr[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_fpsr_exc]);
code.CallFunction(&FP::FPRecipEstimate<FPT>);
}
@ -968,7 +968,7 @@ static void EmitFPRecipExponent(BlockOfCode& code, EmitContext& ctx, IR::Inst* i
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
ctx.reg_alloc.HostCall(inst, args[0]);
code.mov(code.ABI_PARAM2.cvt32(), ctx.FPCR().Value());
code.lea(code.ABI_PARAM3, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]);
code.lea(code.ABI_PARAM3, code.ptr[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_fpsr_exc]);
code.CallFunction(&FP::FPRecipExponent<FPT>);
}
@ -1026,7 +1026,7 @@ static void EmitFPRecipStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst*
code.movq(code.ABI_PARAM1, operand1);
code.movq(code.ABI_PARAM2, operand2);
code.mov(code.ABI_PARAM3.cvt32(), ctx.FPCR().Value());
code.lea(code.ABI_PARAM4, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]);
code.lea(code.ABI_PARAM4, code.ptr[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_fpsr_exc]);
code.CallFunction(&FP::FPRecipStepFused<FPT>);
code.movq(result, code.ABI_RETURN);
ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx()));
@ -1055,7 +1055,7 @@ static void EmitFPRecipStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst*
ctx.reg_alloc.HostCall(inst, args[0], args[1]);
code.mov(code.ABI_PARAM3.cvt32(), ctx.FPCR().Value());
code.lea(code.ABI_PARAM4, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]);
code.lea(code.ABI_PARAM4, code.ptr[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_fpsr_exc]);
code.CallFunction(&FP::FPRecipStepFused<FPT>);
}
@ -1119,7 +1119,7 @@ static void EmitFPRound(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, siz
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
ctx.reg_alloc.HostCall(inst, args[0]);
code.lea(code.ABI_PARAM2, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]);
code.lea(code.ABI_PARAM2, code.ptr[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_fpsr_exc]);
code.mov(code.ABI_PARAM3.cvt32(), ctx.FPCR().Value());
code.CallFunction(lut.at(std::make_tuple(fsize, rounding_mode, exact)));
}
@ -1206,7 +1206,7 @@ static void EmitFPRSqrtEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* i
}
// a > 0 && a < 0x00800000;
code.dec(tmp);
code.sub(tmp, 1);
code.cmp(tmp, 0x007FFFFF);
code.jb(fallback, code.T_NEAR); //within -127,128
needs_fallback = true;
@ -1284,7 +1284,7 @@ static void EmitFPRSqrtEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* i
ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx()));
code.movq(code.ABI_PARAM1, operand);
code.mov(code.ABI_PARAM2.cvt32(), ctx.FPCR().Value());
code.lea(code.ABI_PARAM3, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]);
code.lea(code.ABI_PARAM3, code.ptr[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_fpsr_exc]);
code.CallFunction(&FP::FPRSqrtEstimate<FPT>);
code.movq(result, rax);
ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx()));
@ -1298,7 +1298,7 @@ static void EmitFPRSqrtEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* i
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
ctx.reg_alloc.HostCall(inst, args[0]);
code.mov(code.ABI_PARAM2.cvt32(), ctx.FPCR().Value());
code.lea(code.ABI_PARAM3, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]);
code.lea(code.ABI_PARAM3, code.ptr[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_fpsr_exc]);
code.CallFunction(&FP::FPRSqrtEstimate<FPT>);
}
}
@ -1368,7 +1368,7 @@ static void EmitFPRSqrtStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst*
code.movq(code.ABI_PARAM1, operand1);
code.movq(code.ABI_PARAM2, operand2);
code.mov(code.ABI_PARAM3.cvt32(), ctx.FPCR().Value());
code.lea(code.ABI_PARAM4, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]);
code.lea(code.ABI_PARAM4, code.ptr[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_fpsr_exc]);
code.CallFunction(&FP::FPRSqrtStepFused<FPT>);
code.movq(result, code.ABI_RETURN);
ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx()));
@ -1398,7 +1398,7 @@ static void EmitFPRSqrtStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst*
ctx.reg_alloc.HostCall(inst, args[0], args[1]);
code.mov(code.ABI_PARAM3.cvt32(), ctx.FPCR().Value());
code.lea(code.ABI_PARAM4, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]);
code.lea(code.ABI_PARAM4, code.ptr[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_fpsr_exc]);
code.CallFunction(&FP::FPRSqrtStepFused<FPT>);
}
@ -1511,7 +1511,7 @@ void EmitX64::EmitFPHalfToDouble(EmitContext& ctx, IR::Inst* inst) {
ctx.reg_alloc.HostCall(inst, args[0]);
code.mov(code.ABI_PARAM2.cvt32(), ctx.FPCR().Value());
code.mov(code.ABI_PARAM3.cvt32(), static_cast<u32>(rounding_mode));
code.lea(code.ABI_PARAM4, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]);
code.lea(code.ABI_PARAM4, code.ptr[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_fpsr_exc]);
code.CallFunction(&FP::FPConvert<u64, u16>);
}
@ -1535,7 +1535,7 @@ void EmitX64::EmitFPHalfToSingle(EmitContext& ctx, IR::Inst* inst) {
ctx.reg_alloc.HostCall(inst, args[0]);
code.mov(code.ABI_PARAM2.cvt32(), ctx.FPCR().Value());
code.mov(code.ABI_PARAM3.cvt32(), static_cast<u32>(rounding_mode));
code.lea(code.ABI_PARAM4, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]);
code.lea(code.ABI_PARAM4, code.ptr[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_fpsr_exc]);
code.CallFunction(&FP::FPConvert<u32, u16>);
}
@ -1556,7 +1556,7 @@ void EmitX64::EmitFPSingleToDouble(EmitContext& ctx, IR::Inst* inst) {
ctx.reg_alloc.HostCall(inst, args[0]);
code.mov(code.ABI_PARAM2.cvt32(), ctx.FPCR().Value());
code.mov(code.ABI_PARAM3.cvt32(), static_cast<u32>(rounding_mode));
code.lea(code.ABI_PARAM4, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]);
code.lea(code.ABI_PARAM4, code.ptr[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_fpsr_exc]);
code.CallFunction(&FP::FPConvert<u64, u32>);
}
}
@ -1581,7 +1581,7 @@ void EmitX64::EmitFPSingleToHalf(EmitContext& ctx, IR::Inst* inst) {
ctx.reg_alloc.HostCall(inst, args[0]);
code.mov(code.ABI_PARAM2.cvt32(), ctx.FPCR().Value());
code.mov(code.ABI_PARAM3.cvt32(), static_cast<u32>(rounding_mode));
code.lea(code.ABI_PARAM4, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]);
code.lea(code.ABI_PARAM4, code.ptr[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_fpsr_exc]);
code.CallFunction(&FP::FPConvert<u16, u32>);
}
@ -1595,7 +1595,7 @@ void EmitX64::EmitFPDoubleToHalf(EmitContext& ctx, IR::Inst* inst) {
ctx.reg_alloc.HostCall(inst, args[0]);
code.mov(code.ABI_PARAM2.cvt32(), ctx.FPCR().Value());
code.mov(code.ABI_PARAM3.cvt32(), static_cast<u32>(rounding_mode));
code.lea(code.ABI_PARAM4, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]);
code.lea(code.ABI_PARAM4, code.ptr[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_fpsr_exc]);
code.CallFunction(&FP::FPConvert<u16, u64>);
}
@ -1616,7 +1616,7 @@ void EmitX64::EmitFPDoubleToSingle(EmitContext& ctx, IR::Inst* inst) {
ctx.reg_alloc.HostCall(inst, args[0]);
code.mov(code.ABI_PARAM2.cvt32(), ctx.FPCR().Value());
code.mov(code.ABI_PARAM3.cvt32(), static_cast<u32>(rounding_mode));
code.lea(code.ABI_PARAM4, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]);
code.lea(code.ABI_PARAM4, code.ptr[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_fpsr_exc]);
code.CallFunction(&FP::FPConvert<u32, u64>);
}
}
@ -1757,7 +1757,7 @@ static void EmitFPToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
mp::cartesian_product<fbits_list, rounding_list>{});
ctx.reg_alloc.HostCall(inst, args[0]);
code.lea(code.ABI_PARAM2, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]);
code.lea(code.ABI_PARAM2, code.ptr[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_fpsr_exc]);
code.mov(code.ABI_PARAM3.cvt32(), ctx.FPCR().Value());
code.CallFunction(lut.at(std::make_tuple(fbits, rounding_mode)));
}

View file

@ -28,27 +28,24 @@ std::optional<AxxEmitX64::DoNotFastmemMarker> AxxEmitX64::ShouldFastmem(AxxEmitC
FakeCall AxxEmitX64::FastmemCallback(u64 rip_) {
const auto iter = fastmem_patch_info.find(rip_);
if (iter == fastmem_patch_info.end()) {
if (iter != fastmem_patch_info.end()) {
FakeCall result{
.call_rip = iter->second.callback,
.ret_rip = iter->second.resume_rip,
};
if (iter->second.recompile) {
const auto marker = iter->second.marker;
do_not_fastmem.insert(marker);
InvalidateBasicBlocks({std::get<0>(marker)});
}
return result;
} else {
fmt::print("dynarmic: Segfault happened within JITted code at rip = {:016x}\n", rip_);
fmt::print("Segfault wasn't at a fastmem patch location!\n");
fmt::print("Now dumping code.......\n\n");
Common::DumpDisassembledX64((void*)(rip_ & ~u64(0xFFF)), 0x1000);
ASSERT_FALSE("iter != fastmem_patch_info.end()");
}
FakeCall result{
.call_rip = iter->second.callback,
.ret_rip = iter->second.resume_rip,
};
if (iter->second.recompile) {
const auto marker = iter->second.marker;
do_not_fastmem.insert(marker);
InvalidateBasicBlocks({std::get<0>(marker)});
}
return result;
}
template<std::size_t bitsize, auto callback>
@ -95,7 +92,7 @@ void AxxEmitX64::EmitMemoryRead(AxxEmitContext& ctx, IR::Inst* inst) {
if (fastmem_marker) {
// Use fastmem
bool require_abort_handling;
bool require_abort_handling = false;
const auto src_ptr = EmitFastmemVAddr(code, ctx, *abort, vaddr, require_abort_handling);
const auto location = EmitReadMemoryMov<bitsize>(code, value_idx, src_ptr, ordered);
@ -182,7 +179,7 @@ void AxxEmitX64::EmitMemoryWrite(AxxEmitContext& ctx, IR::Inst* inst) {
if (fastmem_marker) {
// Use fastmem
bool require_abort_handling;
bool require_abort_handling = false;
const auto dest_ptr = EmitFastmemVAddr(code, ctx, *abort, vaddr, require_abort_handling);
const auto location = EmitWriteMemoryMov<bitsize>(code, dest_ptr, value_idx, ordered);
@ -230,7 +227,7 @@ void AxxEmitX64::EmitExclusiveReadMemory(AxxEmitContext& ctx, IR::Inst* inst) {
ctx.reg_alloc.HostCall(inst, {}, args[1]);
code.mov(code.byte[r15 + offsetof(AxxJitState, exclusive_state)], u8(1));
code.mov(code.byte[code.ABI_JIT_PTR + offsetof(AxxJitState, exclusive_state)], u8(1));
code.mov(code.ABI_PARAM1, reinterpret_cast<u64>(&conf));
if (ordered) {
code.mfence();
@ -248,7 +245,7 @@ void AxxEmitX64::EmitExclusiveReadMemory(AxxEmitContext& ctx, IR::Inst* inst) {
ctx.reg_alloc.EndOfAllocScope();
ctx.reg_alloc.HostCall(nullptr);
code.mov(code.byte[r15 + offsetof(AxxJitState, exclusive_state)], u8(1));
code.mov(code.byte[code.ABI_JIT_PTR + offsetof(AxxJitState, exclusive_state)], u8(1));
code.mov(code.ABI_PARAM1, reinterpret_cast<u64>(&conf));
ctx.reg_alloc.AllocStackSpace(16 + ABI_SHADOW_SPACE);
code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE]);
@ -288,9 +285,9 @@ void AxxEmitX64::EmitExclusiveWriteMemory(AxxEmitContext& ctx, IR::Inst* inst) {
Xbyak::Label end;
code.mov(code.ABI_RETURN, u32(1));
code.cmp(code.byte[r15 + offsetof(AxxJitState, exclusive_state)], u8(0));
code.cmp(code.byte[code.ABI_JIT_PTR + offsetof(AxxJitState, exclusive_state)], u8(0));
code.je(end);
code.mov(code.byte[r15 + offsetof(AxxJitState, exclusive_state)], u8(0));
code.mov(code.byte[code.ABI_JIT_PTR + offsetof(AxxJitState, exclusive_state)], u8(0));
code.mov(code.ABI_PARAM1, reinterpret_cast<u64>(&conf));
if constexpr (bitsize != 128) {
using T = mcl::unsigned_integer_of_size<bitsize>;
@ -358,7 +355,7 @@ void AxxEmitX64::EmitExclusiveReadMemoryInline(AxxEmitContext& ctx, IR::Inst* in
EmitExclusiveLock(code, conf, tmp, tmp2.cvt32());
code.mov(code.byte[r15 + offsetof(AxxJitState, exclusive_state)], u8(1));
code.mov(code.byte[code.ABI_JIT_PTR + offsetof(AxxJitState, exclusive_state)], u8(1));
code.mov(tmp, mcl::bit_cast<u64>(GetExclusiveMonitorAddressPointer(conf.global_monitor, conf.processor_id)));
code.mov(qword[tmp], vaddr);
@ -442,14 +439,14 @@ void AxxEmitX64::EmitExclusiveWriteMemoryInline(AxxEmitContext& ctx, IR::Inst* i
code.mov(tmp, mcl::bit_cast<u64>(GetExclusiveMonitorAddressPointer(conf.global_monitor, conf.processor_id)));
code.mov(status, u32(1));
code.cmp(code.byte[r15 + offsetof(AxxJitState, exclusive_state)], u8(0));
code.cmp(code.byte[code.ABI_JIT_PTR + offsetof(AxxJitState, exclusive_state)], u8(0));
code.je(*end, code.T_NEAR);
code.cmp(qword[tmp], vaddr);
code.jne(*end, code.T_NEAR);
EmitExclusiveTestAndClear(code, conf, vaddr, tmp, rax);
code.mov(code.byte[r15 + offsetof(AxxJitState, exclusive_state)], u8(0));
code.mov(code.byte[code.ABI_JIT_PTR + offsetof(AxxJitState, exclusive_state)], u8(0));
code.mov(tmp, mcl::bit_cast<u64>(GetExclusiveMonitorValuePointer(conf.global_monitor, conf.processor_id)));
if constexpr (bitsize == 128) {
@ -504,7 +501,6 @@ void AxxEmitX64::EmitExclusiveWriteMemoryInline(AxxEmitContext& ctx, IR::Inst* i
}
code.setnz(status.cvt8());
ctx.deferred_emits.emplace_back([=, this] {
code.L(*abort);
code.call(wrapped_fn);
@ -518,24 +514,21 @@ void AxxEmitX64::EmitExclusiveWriteMemoryInline(AxxEmitContext& ctx, IR::Inst* i
conf.recompile_on_exclusive_fastmem_failure,
});
code.cmp(al, 0);
code.xor_(status.cvt32(), status.cvt32()); //dep-break
code.test(code.al, code.al);
code.setz(status.cvt8());
code.movzx(status.cvt32(), status.cvt8());
code.jmp(*end, code.T_NEAR);
});
} else {
code.call(wrapped_fn);
code.cmp(al, 0);
code.xor_(status.cvt32(), status.cvt32()); //dep-break
code.test(code.al, code.al);
code.setz(status.cvt8());
code.movzx(status.cvt32(), status.cvt8());
}
code.L(*end);
EmitExclusiveUnlock(code, conf, tmp, eax);
ctx.reg_alloc.DefineValue(inst, status);
EmitCheckMemoryAbort(ctx, inst);
}

View file

@ -46,26 +46,25 @@ void EmitDetectMisalignedVAddr(BlockOfCode& code, EmitContext& ctx, size_t bitsi
code.test(vaddr, align_mask);
if (!ctx.conf.only_detect_misalignment_via_page_table_on_page_boundary) {
if (ctx.conf.only_detect_misalignment_via_page_table_on_page_boundary) {
const u32 page_align_mask = static_cast<u32>(page_size - 1) & ~align_mask;
SharedLabel detect_boundary = GenSharedLabel(), resume = GenSharedLabel();
code.jnz(*detect_boundary, code.T_NEAR);
code.L(*resume);
ctx.deferred_emits.emplace_back([=, &code] {
code.L(*detect_boundary);
code.mov(tmp, vaddr);
code.and_(tmp, page_align_mask);
code.cmp(tmp, page_align_mask);
code.jne(*resume, code.T_NEAR);
// NOTE: We expect to fall through into abort code here.
});
} else {
code.jnz(abort, code.T_NEAR);
return;
}
const u32 page_align_mask = static_cast<u32>(page_size - 1) & ~align_mask;
SharedLabel detect_boundary = GenSharedLabel(), resume = GenSharedLabel();
code.jnz(*detect_boundary, code.T_NEAR);
code.L(*resume);
ctx.deferred_emits.emplace_back([=, &code] {
code.L(*detect_boundary);
code.mov(tmp, vaddr);
code.and_(tmp, page_align_mask);
code.cmp(tmp, page_align_mask);
code.jne(*resume, code.T_NEAR);
// NOTE: We expect to fall through into abort code here.
});
}
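// Scalar model of the restructured check, assuming 4 KiB pages and a
// power-of-two access size: a misaligned access only needs the deferred slow
// path when it can straddle a page, i.e. when every page-offset bit above the
// alignment bits is set.
#include <cstddef>
#include <cstdint>
inline bool MisalignmentAborts(uint64_t vaddr, size_t access_size, bool only_on_page_boundary) {
    const uint64_t align_mask = access_size - 1;
    if ((vaddr & align_mask) == 0)
        return false;                                     // aligned: jnz not taken
    if (!only_on_page_boundary)
        return true;                                      // jnz straight to abort
    const uint64_t page_align_mask = (4096 - 1) & ~align_mask;
    return (vaddr & page_align_mask) == page_align_mask;  // deferred boundary test
}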
template<typename EmitContext>
@ -202,7 +201,7 @@ template<std::size_t bitsize>
const void* EmitReadMemoryMov(BlockOfCode& code, int value_idx, const Xbyak::RegExp& addr, bool ordered) {
if (ordered) {
if constexpr (bitsize != 128) {
code.xor_(Xbyak::Reg32{value_idx}, Xbyak::Reg32{value_idx});
code.xor_(Xbyak::Reg32(value_idx), Xbyak::Reg32(value_idx));
} else {
code.xor_(eax, eax);
code.xor_(ebx, ebx);
@ -214,59 +213,59 @@ const void* EmitReadMemoryMov(BlockOfCode& code, int value_idx, const Xbyak::Reg
switch (bitsize) {
case 8:
code.lock();
code.xadd(code.byte[addr], Xbyak::Reg32{value_idx}.cvt8());
code.xadd(code.byte[addr], Xbyak::Reg32(value_idx).cvt8());
break;
case 16:
code.lock();
code.xadd(word[addr], Xbyak::Reg16{value_idx});
code.xadd(word[addr], Xbyak::Reg64(value_idx).cvt16());
break;
case 32:
code.lock();
code.xadd(dword[addr], Xbyak::Reg32{value_idx});
code.xadd(dword[addr], Xbyak::Reg64(value_idx).cvt32());
break;
case 64:
code.lock();
code.xadd(qword[addr], Xbyak::Reg64{value_idx});
code.xadd(qword[addr], Xbyak::Reg64(value_idx));
break;
case 128:
code.lock();
code.cmpxchg16b(xword[addr]);
if (code.HasHostFeature(HostFeature::SSE41)) {
code.movq(Xbyak::Xmm{value_idx}, rax);
code.pinsrq(Xbyak::Xmm{value_idx}, rdx, 1);
code.movq(Xbyak::Xmm(value_idx), rax);
code.pinsrq(Xbyak::Xmm(value_idx), rdx, 1);
} else {
code.movq(Xbyak::Xmm{value_idx}, rax);
code.movq(Xbyak::Xmm(value_idx), rax);
code.movq(xmm0, rdx);
code.punpcklqdq(Xbyak::Xmm{value_idx}, xmm0);
code.punpcklqdq(Xbyak::Xmm(value_idx), xmm0);
}
break;
default:
ASSERT_FALSE("Invalid bitsize");
}
return fastmem_location;
} else {
const void* fastmem_location = code.getCurr();
switch (bitsize) {
case 8:
code.movzx(Xbyak::Reg64(value_idx).cvt32(), code.byte[addr]);
break;
case 16:
code.movzx(Xbyak::Reg64(value_idx).cvt32(), word[addr]);
break;
case 32:
code.mov(Xbyak::Reg64(value_idx).cvt32(), dword[addr]);
break;
case 64:
code.mov(Xbyak::Reg64(value_idx), qword[addr]);
break;
case 128:
code.movups(Xbyak::Xmm(value_idx), xword[addr]);
break;
default:
ASSERT_FALSE("Invalid bitsize");
}
return fastmem_location;
}
const void* fastmem_location = code.getCurr();
switch (bitsize) {
case 8:
code.movzx(Xbyak::Reg32{value_idx}, code.byte[addr]);
break;
case 16:
code.movzx(Xbyak::Reg32{value_idx}, word[addr]);
break;
case 32:
code.mov(Xbyak::Reg32{value_idx}, dword[addr]);
break;
case 64:
code.mov(Xbyak::Reg64{value_idx}, qword[addr]);
break;
case 128:
code.movups(Xbyak::Xmm{value_idx}, xword[addr]);
break;
default:
ASSERT_FALSE("Invalid bitsize");
}
return fastmem_location;
}
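// C++ analogue of the ordered read path above: "lock xadd" against a zeroed
// register is an atomic fetch_add(0), i.e. a full-barrier load, and the
// 128-bit case gets the same effect from lock cmpxchg16b with zeroed operands.
#include <atomic>
#include <cstdint>
inline uint64_t OrderedLoad64(std::atomic<uint64_t>& mem) {
    return mem.fetch_add(0, std::memory_order_seq_cst);  // sketch, not the JIT's code
}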
template<std::size_t bitsize>
@ -276,10 +275,10 @@ const void* EmitWriteMemoryMov(BlockOfCode& code, const Xbyak::RegExp& addr, int
code.xor_(eax, eax);
code.xor_(edx, edx);
if (code.HasHostFeature(HostFeature::SSE41)) {
code.movq(rbx, Xbyak::Xmm{value_idx});
code.pextrq(rcx, Xbyak::Xmm{value_idx}, 1);
code.movq(rbx, Xbyak::Xmm(value_idx));
code.pextrq(rcx, Xbyak::Xmm(value_idx), 1);
} else {
code.movaps(xmm0, Xbyak::Xmm{value_idx});
code.movaps(xmm0, Xbyak::Xmm(value_idx));
code.movq(rbx, xmm0);
code.punpckhqdq(xmm0, xmm0);
code.movq(rcx, xmm0);
@ -289,16 +288,16 @@ const void* EmitWriteMemoryMov(BlockOfCode& code, const Xbyak::RegExp& addr, int
const void* fastmem_location = code.getCurr();
switch (bitsize) {
case 8:
code.xchg(code.byte[addr], Xbyak::Reg64{value_idx}.cvt8());
code.xchg(code.byte[addr], Xbyak::Reg64(value_idx).cvt8());
break;
case 16:
code.xchg(word[addr], Xbyak::Reg16{value_idx});
code.xchg(word[addr], Xbyak::Reg64(value_idx).cvt16());
break;
case 32:
code.xchg(dword[addr], Xbyak::Reg32{value_idx});
code.xchg(dword[addr], Xbyak::Reg64(value_idx).cvt32());
break;
case 64:
code.xchg(qword[addr], Xbyak::Reg64{value_idx});
code.xchg(qword[addr], Xbyak::Reg64(value_idx));
break;
case 128: {
Xbyak::Label loop;
@ -312,29 +311,29 @@ const void* EmitWriteMemoryMov(BlockOfCode& code, const Xbyak::RegExp& addr, int
ASSERT_FALSE("Invalid bitsize");
}
return fastmem_location;
} else {
const void* fastmem_location = code.getCurr();
switch (bitsize) {
case 8:
code.mov(code.byte[addr], Xbyak::Reg64(value_idx).cvt8());
break;
case 16:
code.mov(word[addr], Xbyak::Reg64(value_idx).cvt16());
break;
case 32:
code.mov(dword[addr], Xbyak::Reg64(value_idx).cvt32());
break;
case 64:
code.mov(qword[addr], Xbyak::Reg64(value_idx));
break;
case 128:
code.movups(xword[addr], Xbyak::Xmm(value_idx));
break;
default:
ASSERT_FALSE("Invalid bitsize");
}
return fastmem_location;
}
const void* fastmem_location = code.getCurr();
switch (bitsize) {
case 8:
code.mov(code.byte[addr], Xbyak::Reg64{value_idx}.cvt8());
break;
case 16:
code.mov(word[addr], Xbyak::Reg16{value_idx});
break;
case 32:
code.mov(dword[addr], Xbyak::Reg32{value_idx});
break;
case 64:
code.mov(qword[addr], Xbyak::Reg64{value_idx});
break;
case 128:
code.movups(xword[addr], Xbyak::Xmm{value_idx});
break;
default:
ASSERT_FALSE("Invalid bitsize");
}
return fastmem_location;
}
template<typename UserConfig>

View file

@ -69,7 +69,7 @@ void EmitSignedSaturatedOp(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst)
ctx.reg_alloc.DefineValue(overflow_inst, overflow);
}
} else {
code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], overflow.cvt8());
code.or_(code.byte[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_fpsr_qc], overflow.cvt8());
}
ctx.reg_alloc.DefineValue(inst, result);
@ -98,7 +98,7 @@ void EmitUnsignedSaturatedOp(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst
const Xbyak::Reg overflow = ctx.reg_alloc.ScratchGpr();
code.setb(overflow.cvt8());
code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], overflow.cvt8());
code.or_(code.byte[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_fpsr_qc], overflow.cvt8());
ctx.reg_alloc.DefineValue(inst, addend);
}
@ -226,7 +226,7 @@ void EmitX64::EmitSignedSaturatedDoublingMultiplyReturnHigh16(EmitContext& ctx,
code.cmovns(y, tmp);
code.sets(tmp.cvt8());
code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], tmp.cvt8());
code.or_(code.byte[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_fpsr_qc], tmp.cvt8());
ctx.reg_alloc.DefineValue(inst, y);
}
@ -250,7 +250,7 @@ void EmitX64::EmitSignedSaturatedDoublingMultiplyReturnHigh32(EmitContext& ctx,
code.cmovns(y.cvt32(), tmp.cvt32());
code.sets(tmp.cvt8());
code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], tmp.cvt8());
code.or_(code.byte[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_fpsr_qc], tmp.cvt8());
ctx.reg_alloc.DefineValue(inst, y);
}

View file

@ -25,6 +25,7 @@
#include "dynarmic/backend/x64/constants.h"
#include "dynarmic/backend/x64/emit_x64.h"
#include "dynarmic/common/math_util.h"
#include "dynarmic/interface/optimization_flags.h"
#include "dynarmic/ir/basic_block.h"
#include "dynarmic/ir/microinstruction.h"
#include "dynarmic/ir/opcodes.h"
@ -109,7 +110,7 @@ static void EmitOneArgumentFallbackWithSaturation(BlockOfCode& code, EmitContext
ctx.reg_alloc.ReleaseStackSpace(stack_space + ABI_SHADOW_SPACE);
code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], code.ABI_RETURN.cvt8());
code.or_(code.byte[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_fpsr_qc], code.ABI_RETURN.cvt8());
ctx.reg_alloc.DefineValue(inst, result);
}
@ -137,7 +138,7 @@ static void EmitTwoArgumentFallbackWithSaturation(BlockOfCode& code, EmitContext
ctx.reg_alloc.ReleaseStackSpace(stack_space + ABI_SHADOW_SPACE);
code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], code.ABI_RETURN.cvt8());
code.or_(code.byte[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_fpsr_qc], code.ABI_RETURN.cvt8());
ctx.reg_alloc.DefineValue(inst, result);
}
@ -164,7 +165,7 @@ static void EmitTwoArgumentFallbackWithSaturationAndImmediate(BlockOfCode& code,
ctx.reg_alloc.ReleaseStackSpace(stack_space + ABI_SHADOW_SPACE);
code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], code.ABI_RETURN.cvt8());
code.or_(code.byte[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_fpsr_qc], code.ABI_RETURN.cvt8());
ctx.reg_alloc.DefineValue(inst, result);
}
@ -1009,10 +1010,7 @@ void EmitX64::EmitVectorCountLeadingZeros8(EmitContext& ctx, IR::Inst* inst) {
code.gf2p8affineqb(result, code.BConst<64>(xword, 0xaaccf0ff'00000000), 8);
ctx.reg_alloc.DefineValue(inst, result);
return;
}
if (code.HasHostFeature(HostFeature::SSSE3)) {
} else if (code.HasHostFeature(HostFeature::SSSE3)) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]);
@ -1034,10 +1032,9 @@ void EmitX64::EmitVectorCountLeadingZeros8(EmitContext& ctx, IR::Inst* inst) {
code.paddb(data, tmp1);
ctx.reg_alloc.DefineValue(inst, data);
return;
} else {
EmitOneArgumentFallback(code, ctx, inst, EmitVectorCountLeadingZeros<u8>);
}
EmitOneArgumentFallback(code, ctx, inst, EmitVectorCountLeadingZeros<u8>);
}
void EmitX64::EmitVectorCountLeadingZeros16(EmitContext& ctx, IR::Inst* inst) {
@ -1070,10 +1067,7 @@ void EmitX64::EmitVectorCountLeadingZeros16(EmitContext& ctx, IR::Inst* inst) {
code.vpshufb(result, result, data);
ctx.reg_alloc.DefineValue(inst, result);
return;
}
if (code.HasHostFeature(HostFeature::SSSE3)) {
} else if (code.HasHostFeature(HostFeature::SSSE3)) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]);
@ -1106,24 +1100,33 @@ void EmitX64::EmitVectorCountLeadingZeros16(EmitContext& ctx, IR::Inst* inst) {
code.pshufb(result, data);
ctx.reg_alloc.DefineValue(inst, result);
return;
} else {
EmitOneArgumentFallback(code, ctx, inst, EmitVectorCountLeadingZeros<u16>);
}
EmitOneArgumentFallback(code, ctx, inst, EmitVectorCountLeadingZeros<u16>);
}
void EmitX64::EmitVectorCountLeadingZeros32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512CD)) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]);
code.vplzcntd(data, data);
ctx.reg_alloc.DefineValue(inst, data);
return;
// See https://stackoverflow.com/questions/58823140/count-leading-zero-bits-for-each-element-in-avx2-vector-emulate-mm256-lzcnt-ep/58827596#58827596
} else if (code.HasHostFeature(HostFeature::AVX2)) {
const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm temp = ctx.reg_alloc.ScratchXmm();
code.vmovdqa(temp, data);
code.vpsrld(data, data, 8);
code.vpandn(data, data, temp);
code.vmovdqa(temp, code.Const(xword, 0x0000009E0000009E, 0x0000009E0000009E));
code.vcvtdq2ps(data, data);
code.vpsrld(data, data, 23);
code.vpsubusw(data, temp, data);
code.vpminsw(data, data, code.Const(xword, 0x0000002000000020, 0x0000002000000020));
ctx.reg_alloc.DefineValue(inst, data);
} else {
EmitOneArgumentFallback(code, ctx, inst, EmitVectorCountLeadingZeros<u32>);
}
EmitOneArgumentFallback(code, ctx, inst, EmitVectorCountLeadingZeros<u32>);
}
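// Scalar model of the AVX2 branch (per the linked answer), illustration only:
// masking with ~(x >> 8) stops the int-to-float conversion from rounding the
// exponent up, the biased exponent of float(x) is 127 + floor(log2(x)), and
// 0x9E (158) is the exponent of 2^31, so 158 - exponent == lzcnt(x) for
// nonzero x. The vector code uses a signed conversion, but the saturating
// subtract makes top-bit-set lanes come out the same as this unsigned model.
#include <cstdint>
#include <cstring>
inline uint32_t Lzcnt32ViaFloat(uint32_t x) {
    x &= ~(x >> 8);                                           // vpsrld + vpandn
    const float f = static_cast<float>(x);                    // vcvtdq2ps
    uint32_t bits;
    std::memcpy(&bits, &f, sizeof(bits));
    const uint32_t exponent = bits >> 23;                     // vpsrld by 23
    const uint32_t lz = exponent < 158 ? 158 - exponent : 0;  // vpsubusw saturates at 0
    return lz < 32 ? lz : 32;                                 // vpminsw clamps x == 0 to 32
}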
void EmitX64::EmitVectorDeinterleaveEven8(EmitContext& ctx, IR::Inst* inst) {
@ -3323,7 +3326,7 @@ void EmitX64::EmitVectorPolynomialMultiply8(EmitContext& ctx, IR::Inst* inst) {
code.paddb(mask, mask);
code.paddb(xmm_a, xmm_a);
code.pblendvb(result, alternate);
code.dec(counter);
code.sub(counter, 1);
code.jnz(loop);
ctx.reg_alloc.DefineValue(inst, result);
@ -3367,7 +3370,7 @@ void EmitX64::EmitVectorPolynomialMultiplyLong8(EmitContext& ctx, IR::Inst* inst
code.paddw(mask, mask);
code.paddw(xmm_a, xmm_a);
code.pblendvb(result, alternate);
code.dec(counter);
code.sub(counter, 1);
code.jnz(loop);
ctx.reg_alloc.DefineValue(inst, result);
@ -4258,7 +4261,7 @@ static void EmitVectorSignedSaturatedAbs(size_t esize, BlockOfCode& code, EmitCo
UNREACHABLE();
}
code.or_(code.dword[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], bit);
code.or_(code.dword[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_fpsr_qc], bit);
ctx.reg_alloc.DefineValue(inst, data);
}
@ -4393,7 +4396,7 @@ static void EmitVectorSignedSaturatedAccumulateUnsigned(BlockOfCode& code, EmitC
const Xbyak::Reg32 mask = ctx.reg_alloc.ScratchGpr().cvt32();
code.pmovmskb(mask, xmm0);
code.or_(code.dword[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], mask);
code.or_(code.dword[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_fpsr_qc], mask);
if (code.HasHostFeature(HostFeature::SSE41)) {
code.pblendvb(result, tmp);
@ -4479,7 +4482,7 @@ static void EmitVectorSignedSaturatedDoublingMultiply16(BlockOfCode& code, EmitC
const Xbyak::Reg32 bit = ctx.reg_alloc.ScratchGpr().cvt32();
code.pmovmskb(bit, upper_tmp);
code.or_(code.dword[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], bit);
code.or_(code.dword[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_fpsr_qc], bit);
ctx.reg_alloc.DefineValue(inst, result);
}
@ -4530,7 +4533,7 @@ void EmitVectorSignedSaturatedDoublingMultiply32(BlockOfCode& code, EmitContext&
code.vpcmpeqd(mask, result, code.Const(xword, 0x8000000080000000, 0x8000000080000000));
code.vpxor(result, result, mask);
code.pmovmskb(bit, mask);
code.or_(code.dword[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], bit);
code.or_(code.dword[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_fpsr_qc], bit);
ctx.reg_alloc.Release(mask);
ctx.reg_alloc.Release(bit);
@ -4586,7 +4589,7 @@ void EmitVectorSignedSaturatedDoublingMultiply32(BlockOfCode& code, EmitContext&
code.pcmpeqd(tmp, result);
code.pxor(result, tmp);
code.pmovmskb(bit, tmp);
code.or_(code.dword[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], bit);
code.or_(code.dword[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_fpsr_qc], bit);
ctx.reg_alloc.DefineValue(inst, result);
}
@ -4620,7 +4623,7 @@ void EmitX64::EmitVectorSignedSaturatedDoublingMultiplyLong16(EmitContext& ctx,
const Xbyak::Reg32 bit = ctx.reg_alloc.ScratchGpr().cvt32();
code.pmovmskb(bit, y);
code.or_(code.dword[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], bit);
code.or_(code.dword[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_fpsr_qc], bit);
ctx.reg_alloc.DefineValue(inst, x);
}
@ -4673,7 +4676,7 @@ void EmitX64::EmitVectorSignedSaturatedDoublingMultiplyLong32(EmitContext& ctx,
code.pxor(x, y);
code.pmovmskb(bit, y);
}
code.or_(code.dword[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], bit);
code.or_(code.dword[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_fpsr_qc], bit);
ctx.reg_alloc.DefineValue(inst, x);
}
@ -4712,7 +4715,7 @@ static void EmitVectorSignedSaturatedNarrowToSigned(size_t original_esize, Block
code.pcmpeqd(reconstructed, src);
code.movmskps(bit, reconstructed);
code.xor_(bit, 0b1111);
code.or_(code.dword[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], bit);
code.or_(code.dword[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_fpsr_qc], bit);
ctx.reg_alloc.DefineValue(inst, dest);
}
@ -4767,7 +4770,7 @@ static void EmitVectorSignedSaturatedNarrowToUnsigned(size_t original_esize, Blo
code.pcmpeqd(reconstructed, src);
code.movmskps(bit, reconstructed);
code.xor_(bit, 0b1111);
code.or_(code.dword[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], bit);
code.or_(code.dword[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_fpsr_qc], bit);
ctx.reg_alloc.DefineValue(inst, dest);
}
@ -4870,7 +4873,7 @@ static void EmitVectorSignedSaturatedNeg(size_t esize, BlockOfCode& code, EmitCo
// Check if any elements matched the mask prior to performing saturation. If so, set the Q bit.
const Xbyak::Reg32 bit = ctx.reg_alloc.ScratchGpr().cvt32();
code.pmovmskb(bit, tmp);
code.or_(code.dword[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], bit);
code.or_(code.dword[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_fpsr_qc], bit);
ctx.reg_alloc.DefineValue(inst, zero);
}
@ -5641,6 +5644,7 @@ static void EmitVectorUnsignedAbsoluteDifference(size_t esize, EmitContext& ctx,
break;
}
case 32:
// See https://stackoverflow.com/questions/3380785/compute-the-absolute-difference-between-unsigned-integers-using-sse/3527267#3527267
if (code.HasHostFeature(HostFeature::SSE41)) {
const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(args[1]);
@ -5652,16 +5656,33 @@ static void EmitVectorUnsignedAbsoluteDifference(size_t esize, EmitContext& ctx,
} else {
const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]);
code.movdqa(temp, code.Const(xword, 0x8000000080000000, 0x8000000080000000));
code.pxor(x, temp);
code.pxor(y, temp);
code.movdqa(temp, x);
code.psubd(temp, y);
code.pcmpgtd(y, x);
code.psrld(y, 1);
code.pxor(temp, y);
code.psubd(temp, y);
if (ctx.HasOptimization(OptimizationFlag::CodeSpeed)) {
// About 45 bytes
const Xbyak::Xmm temp_x = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm temp_y = ctx.reg_alloc.ScratchXmm();
code.pcmpeqd(temp, temp);
code.pslld(temp, 31);
code.movdqa(temp_x, x);
code.movdqa(temp_y, y);
code.paddd(temp_x, x);
code.paddd(temp_y, y);
code.pcmpgtd(temp_y, temp_x);
code.psubd(x, y);
code.pandn(temp, temp_y);
code.pxor(x, y);
code.psubd(x, y);
} else {
// Smaller code size - about 36 bytes
code.movdqa(temp, code.Const(xword, 0x8000000080000000, 0x8000000080000000));
code.pxor(x, temp);
code.pxor(y, temp);
code.movdqa(temp, x);
code.psubd(temp, y);
code.pcmpgtd(y, x);
code.psrld(y, 1);
code.pxor(temp, y);
code.psubd(temp, y);
}
}
break;
}
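// Lane-wise reference for the SSE2 paths above; pcmpgtd is a *signed* compare,
// so the smaller-code variant biases both inputs by 0x80000000 to map unsigned
// order onto signed order before the conditional negate. Scalar sketch:
#include <cstdint>
inline uint32_t AbsDiffU32(uint32_t x, uint32_t y) {
    const bool y_gt_x = static_cast<int32_t>(y ^ 0x80000000u)
                      > static_cast<int32_t>(x ^ 0x80000000u);  // pxor + pcmpgtd
    const uint32_t m = y_gt_x ? 0x7FFFFFFFu : 0;                // psrld by 1
    return ((x - y) ^ m) - m;                                   // pxor + psubd
}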
@ -5727,10 +5748,7 @@ void EmitX64::EmitVectorUnsignedMultiply32(EmitContext& ctx, IR::Inst* inst) {
code.vpmulld(result, x, y);
ctx.reg_alloc.DefineValue(lower_inst, result);
return;
}
if (code.HasHostFeature(HostFeature::AVX)) {
} else if (code.HasHostFeature(HostFeature::AVX)) {
const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]);
@ -5749,39 +5767,33 @@ void EmitX64::EmitVectorUnsignedMultiply32(EmitContext& ctx, IR::Inst* inst) {
code.shufps(result, x, 0b11011101);
ctx.reg_alloc.DefineValue(upper_inst, result);
return;
}
} else {
const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]);
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm upper_result = upper_inst ? ctx.reg_alloc.ScratchXmm() : Xbyak::Xmm{-1};
const Xbyak::Xmm lower_result = lower_inst ? ctx.reg_alloc.ScratchXmm() : Xbyak::Xmm{-1};
const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]);
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm upper_result = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm lower_result = ctx.reg_alloc.ScratchXmm();
// calculate unsigned multiply
code.movdqa(tmp, x);
code.pmuludq(tmp, y);
code.psrlq(x, 32);
code.psrlq(y, 32);
code.pmuludq(x, y);
// calculate unsigned multiply
code.movdqa(tmp, x);
code.pmuludq(tmp, y);
code.psrlq(x, 32);
code.psrlq(y, 32);
code.pmuludq(x, y);
// put everything into place
code.pcmpeqw(upper_result, upper_result);
code.pcmpeqw(lower_result, lower_result);
code.psllq(upper_result, 32);
code.psrlq(lower_result, 32);
code.pand(upper_result, x);
code.pand(lower_result, tmp);
code.psrlq(tmp, 32);
code.psllq(x, 32);
code.por(upper_result, tmp);
code.por(lower_result, x);
if (upper_inst) {
ctx.reg_alloc.DefineValue(upper_inst, upper_result);
}
if (lower_inst) {
ctx.reg_alloc.DefineValue(lower_inst, lower_result);
// put everything into place - only if needed
if (upper_inst) code.pcmpeqw(upper_result, upper_result);
if (lower_inst) code.pcmpeqw(lower_result, lower_result);
if (upper_inst) code.psllq(upper_result, 32);
if (lower_inst) code.psrlq(lower_result, 32);
if (upper_inst) code.pand(upper_result, x);
if (lower_inst) code.pand(lower_result, tmp);
if (upper_inst) code.psrlq(tmp, 32);
if (lower_inst) code.psllq(x, 32);
if (upper_inst) code.por(upper_result, tmp);
if (lower_inst) code.por(lower_result, x);
if (upper_inst) ctx.reg_alloc.DefineValue(upper_inst, upper_result);
if (lower_inst) ctx.reg_alloc.DefineValue(lower_inst, lower_result);
}
}
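// Per-lane reference for the pmuludq splitting above: one widening multiply,
// whose halves the pand/psrlq/psllq/por shuffle routes into the results — now
// emitted only when the corresponding pseudo-instruction is actually present.
#include <cstdint>
inline void UnsignedMultiply32Model(uint32_t x, uint32_t y, uint32_t& lower, uint32_t& upper) {
    const uint64_t product = static_cast<uint64_t>(x) * y;  // pmuludq
    lower = static_cast<uint32_t>(product);
    upper = static_cast<uint32_t>(product >> 32);
}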

View file

@ -450,7 +450,7 @@ void EmitTwoOpFallbackWithoutRegAlloc(BlockOfCode& code, EmitContext& ctx, Xbyak
code.lea(code.ABI_PARAM1, ptr[rsp + ABI_SHADOW_SPACE + 0 * 16]);
code.lea(code.ABI_PARAM2, ptr[rsp + ABI_SHADOW_SPACE + 1 * 16]);
code.mov(code.ABI_PARAM3.cvt32(), fpcr);
code.lea(code.ABI_PARAM4, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]);
code.lea(code.ABI_PARAM4, code.ptr[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_fpsr_exc]);
code.movaps(xword[code.ABI_PARAM2], arg1);
code.CallFunction(fn);
@ -487,7 +487,7 @@ void EmitThreeOpFallbackWithoutRegAlloc(BlockOfCode& code, EmitContext& ctx, Xby
code.lea(code.ABI_PARAM2, ptr[rsp + ABI_SHADOW_SPACE + 2 * 16]);
code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE + 3 * 16]);
code.mov(code.ABI_PARAM4.cvt32(), fpcr);
code.lea(rax, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]);
code.lea(rax, code.ptr[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_fpsr_exc]);
code.mov(qword[rsp + ABI_SHADOW_SPACE + 0], rax);
#else
constexpr u32 stack_space = 3 * 16;
@ -496,7 +496,7 @@ void EmitThreeOpFallbackWithoutRegAlloc(BlockOfCode& code, EmitContext& ctx, Xby
code.lea(code.ABI_PARAM2, ptr[rsp + ABI_SHADOW_SPACE + 1 * 16]);
code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE + 2 * 16]);
code.mov(code.ABI_PARAM4.cvt32(), fpcr);
code.lea(code.ABI_PARAM5, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]);
code.lea(code.ABI_PARAM5, code.ptr[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_fpsr_exc]);
#endif
code.movaps(xword[code.ABI_PARAM2], arg1);
@ -545,7 +545,7 @@ void EmitFourOpFallbackWithoutRegAlloc(BlockOfCode& code, EmitContext& ctx, Xbya
code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE + 3 * 16]);
code.lea(code.ABI_PARAM4, ptr[rsp + ABI_SHADOW_SPACE + 4 * 16]);
code.mov(qword[rsp + ABI_SHADOW_SPACE + 0], ctx.FPCR(fpcr_controlled).Value());
code.lea(rax, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]);
code.lea(rax, code.ptr[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_fpsr_exc]);
code.mov(qword[rsp + ABI_SHADOW_SPACE + 8], rax);
#else
constexpr u32 stack_space = 4 * 16;
@ -555,7 +555,7 @@ void EmitFourOpFallbackWithoutRegAlloc(BlockOfCode& code, EmitContext& ctx, Xbya
code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE + 2 * 16]);
code.lea(code.ABI_PARAM4, ptr[rsp + ABI_SHADOW_SPACE + 3 * 16]);
code.mov(code.ABI_PARAM5.cvt32(), ctx.FPCR(fpcr_controlled).Value());
code.lea(code.ABI_PARAM6, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]);
code.lea(code.ABI_PARAM6, code.ptr[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_fpsr_exc]);
#endif
if constexpr (load_previous_result == LoadPreviousResult::Yes) {

View file

@ -62,7 +62,7 @@ void EmitVectorSaturatedNative(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
code.test(overflow.cvt32(), overflow.cvt32());
}
code.setnz(overflow);
code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], overflow);
code.or_(code.byte[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_fpsr_qc], overflow);
ctx.reg_alloc.DefineValue(inst, result);
}
@ -104,7 +104,7 @@ void EmitVectorSignedSaturated(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
code.ktestb(k1, k1);
code.setnz(overflow);
code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], overflow);
code.or_(code.byte[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_fpsr_qc], overflow);
ctx.reg_alloc.DefineValue(inst, result);
return;
@ -160,7 +160,7 @@ void EmitVectorSignedSaturated(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
code.test(overflow.cvt32(), overflow.cvt32());
}
code.setnz(overflow);
code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], overflow);
code.or_(code.byte[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_fpsr_qc], overflow);
if (code.HasHostFeature(HostFeature::SSE41)) {
FCODE(blendvp)(result, tmp);
@ -204,7 +204,7 @@ void EmitVectorUnsignedSaturated(BlockOfCode& code, EmitContext& ctx, IR::Inst*
code.ktestb(k1, k1);
code.setnz(overflow);
code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], overflow);
code.or_(code.byte[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_fpsr_qc], overflow);
ctx.reg_alloc.DefineValue(inst, result);
return;
@ -263,7 +263,7 @@ void EmitVectorUnsignedSaturated(BlockOfCode& code, EmitContext& ctx, IR::Inst*
}
code.setnz(overflow);
code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], overflow);
code.or_(code.byte[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_fpsr_qc], overflow);
if constexpr (op == Op::Add) {
code.por(result, tmp);

View file

@ -78,16 +78,16 @@ inline bool HostLocIsFlag(HostLoc reg) {
inline HostLoc HostLocRegIdx(int idx) {
ASSERT(idx >= 0 && idx <= 15);
return static_cast<HostLoc>(idx);
return HostLoc(idx);
}
inline HostLoc HostLocXmmIdx(int idx) {
ASSERT(idx >= 0 && idx <= 15);
return static_cast<HostLoc>(static_cast<size_t>(HostLoc::XMM0) + idx);
return HostLoc(size_t(HostLoc::XMM0) + idx);
}
inline HostLoc HostLocSpill(size_t i) {
return static_cast<HostLoc>(static_cast<size_t>(HostLoc::FirstSpill) + i);
return HostLoc(size_t(HostLoc::FirstSpill) + i);
}
inline bool HostLocIsSpill(HostLoc reg) {
@ -109,6 +109,8 @@ inline size_t HostLocBitWidth(HostLoc loc) {
using HostLocList = std::initializer_list<HostLoc>;
// RSP is preserved for function calls
// R13 contains fastmem pointer if any
// R14 contains the pagetable pointer
// R15 contains the JitState pointer
const HostLocList any_gpr = {
HostLoc::RAX,
@ -125,12 +127,16 @@ const HostLocList any_gpr = {
HostLoc::R12,
HostLoc::R13,
HostLoc::R14,
//HostLoc::R15,
};
// XMM0 is reserved for use by instructions that implicitly use it as an argument
// XMM1 is used by the 128-bit memory accessors
// XMM2 is also used by those accessors (and other helpers)
// Basically, never use XMM0, XMM1 or XMM2; they're left for the register selector
const HostLocList any_xmm = {
HostLoc::XMM1,
HostLoc::XMM2,
//HostLoc::XMM1,
//HostLoc::XMM2,
HostLoc::XMM3,
HostLoc::XMM4,
HostLoc::XMM5,

View file

@ -431,13 +431,22 @@ HostLoc RegAlloc::SelectARegister(const boost::container::static_vector<HostLoc,
auto it_empty_candidate = desired_locations.cend();
for (auto it = desired_locations.cbegin(); it != desired_locations.cend(); it++) {
auto const& loc_info = LocInfo(*it);
DEBUG_ASSERT(*it != ABI_JIT_PTR);
// Abstain from using upper registers unless absolutely necessary
if (loc_info.IsLocked()) {
// skip, not suitable for allocation
// While R13 and R14 are technically available, we avoid allocating them
// at all costs: skipping them is, in theory, better than spilling all over
// the place, and it also fixes bugs seen under high register pressure
} else if (*it >= HostLoc::R13 && *it <= HostLoc::R15) {
// skip, do not touch
// Intel recommends reusing registers as soon as they're overwritable (DO NOT SPILL)
} else if (loc_info.IsEmpty()) {
it_empty_candidate = it;
break;
// No empty registers for some reason (very evil) - just do normal LRU
} else {
if (loc_info.lru_counter < min_lru_counter) {
if (loc_info.IsEmpty())
it_empty_candidate = it;
// Otherwise a "quasi"-LRU
min_lru_counter = loc_info.lru_counter;
if (*it >= HostLoc::R8 && *it <= HostLoc::R15) {
@ -448,9 +457,6 @@ HostLoc RegAlloc::SelectARegister(const boost::container::static_vector<HostLoc,
if (min_lru_counter == 0)
break; //early exit
}
// only if not assigned (i.e for failcase of all LRU=0)
if (it_empty_candidate == desired_locations.cend() && loc_info.IsEmpty())
it_empty_candidate = it;
}
}
// Final resolution goes as follows:
@ -521,11 +527,10 @@ void RegAlloc::Move(HostLoc to, HostLoc from) noexcept {
ASSERT(LocInfo(to).IsEmpty() && !LocInfo(from).IsLocked());
ASSERT(bit_width <= HostLocBitWidth(to));
ASSERT_MSG(!LocInfo(from).IsEmpty(), "Mov eliminated");
if (!LocInfo(from).IsEmpty()) {
EmitMove(bit_width, to, from);
LocInfo(to) = std::exchange(LocInfo(from), {});
}
EmitMove(bit_width, to, from);
LocInfo(to) = std::exchange(LocInfo(from), {});
}
void RegAlloc::CopyToScratch(size_t bit_width, HostLoc to, HostLoc from) noexcept {
@ -559,30 +564,44 @@ void RegAlloc::SpillRegister(HostLoc loc) noexcept {
ASSERT_MSG(HostLocIsRegister(loc), "Only registers can be spilled");
ASSERT_MSG(!LocInfo(loc).IsEmpty(), "There is no need to spill unoccupied registers");
ASSERT_MSG(!LocInfo(loc).IsLocked(), "Registers that have been allocated must not be spilt");
const HostLoc new_loc = FindFreeSpill();
auto const new_loc = FindFreeSpill(HostLocIsXMM(loc));
Move(new_loc, loc);
}
HostLoc RegAlloc::FindFreeSpill() const noexcept {
for (size_t i = static_cast<size_t>(HostLoc::FirstSpill); i < hostloc_info.size(); i++) {
const auto loc = static_cast<HostLoc>(i);
if (LocInfo(loc).IsEmpty()) {
return loc;
}
HostLoc RegAlloc::FindFreeSpill(bool is_xmm) const noexcept {
// Do not spill XMM into another XMM, silly
if (!is_xmm) {
// TODO(lizzie): Using lower (xmm0 and such) registers results in issues/crashes - INVESTIGATE WHY
// Intel recommends spilling GPRs onto XMM registers IF POSSIBLE
// TODO(lizzie): Issues on DBZ, theory: Scratch XMM not properly restored after a function call?
// Must sync with ABI registers (except XMM0, XMM1 and XMM2)
#ifdef _WIN32
for (size_t i = size_t(HostLoc::XMM5); i >= size_t(HostLoc::XMM3); --i)
if (const auto loc = HostLoc(i); LocInfo(loc).IsEmpty())
return loc;
#else
for (size_t i = size_t(HostLoc::XMM15); i >= size_t(HostLoc::XMM3); --i)
if (const auto loc = HostLoc(i); LocInfo(loc).IsEmpty())
return loc;
#endif
}
// Otherwise go to stack spilling
for (size_t i = size_t(HostLoc::FirstSpill); i < hostloc_info.size(); ++i)
if (const auto loc = HostLoc(i); LocInfo(loc).IsEmpty())
return loc;
ASSERT_FALSE("All spill locations are full");
}
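// Why a free XMM register works as a GPR spill target: a 64-bit value
// round-trips through XMM losslessly with one movq each way, avoiding the
// store/reload a StackLayout slot would cost (SSE2 intrinsic sketch, x86-64).
#include <cstdint>
#include <emmintrin.h>
inline uint64_t RoundTripThroughXmm(uint64_t value) {
    const __m128i spilled = _mm_cvtsi64_si128(static_cast<int64_t>(value));  // movq xmm, r64
    return static_cast<uint64_t>(_mm_cvtsi128_si64(spilled));                // movq r64, xmm
}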
inline static Xbyak::RegExp SpillToOpArg_Helper1(HostLoc loc, size_t reserved_stack_space) noexcept {
ASSERT(HostLocIsSpill(loc));
size_t i = static_cast<size_t>(loc) - static_cast<size_t>(HostLoc::FirstSpill);
ASSERT_MSG(i < SpillCount, "Spill index greater than number of available spill locations");
return Xbyak::util::rsp + reserved_stack_space + ABI_SHADOW_SPACE + offsetof(StackLayout, spill) + i * sizeof(StackLayout::spill[0]);
}
};
void RegAlloc::EmitMove(const size_t bit_width, const HostLoc to, const HostLoc from) noexcept {
auto const spill_to_op_arg_helper = [&](HostLoc loc, size_t reserved_stack_space) {
ASSERT(HostLocIsSpill(loc));
size_t i = size_t(loc) - size_t(HostLoc::FirstSpill);
ASSERT_MSG(i < SpillCount, "Spill index greater than number of available spill locations");
return Xbyak::util::rsp + reserved_stack_space + ABI_SHADOW_SPACE + offsetof(StackLayout, spill) + i * sizeof(StackLayout::spill[0]);
};
auto const spill_xmm_to_op = [&](const HostLoc loc) {
return Xbyak::util::xword[spill_to_op_arg_helper(loc, reserved_stack_space)];
};
if (HostLocIsXMM(to) && HostLocIsXMM(from)) {
MAYBE_AVX(movaps, HostLocToXmm(to), HostLocToXmm(from));
} else if (HostLocIsGPR(to) && HostLocIsGPR(from)) {
@ -607,7 +626,7 @@ void RegAlloc::EmitMove(const size_t bit_width, const HostLoc to, const HostLoc
MAYBE_AVX(movd, HostLocToReg64(to).cvt32(), HostLocToXmm(from));
}
} else if (HostLocIsXMM(to) && HostLocIsSpill(from)) {
const Xbyak::Address spill_addr = SpillToOpArg(from);
const Xbyak::Address spill_addr = spill_xmm_to_op(from);
ASSERT(spill_addr.getBit() >= bit_width);
switch (bit_width) {
case 128:
@ -625,7 +644,7 @@ void RegAlloc::EmitMove(const size_t bit_width, const HostLoc to, const HostLoc
UNREACHABLE();
}
} else if (HostLocIsSpill(to) && HostLocIsXMM(from)) {
const Xbyak::Address spill_addr = SpillToOpArg(to);
const Xbyak::Address spill_addr = spill_xmm_to_op(to);
ASSERT(spill_addr.getBit() >= bit_width);
switch (bit_width) {
case 128:
@ -645,16 +664,16 @@ void RegAlloc::EmitMove(const size_t bit_width, const HostLoc to, const HostLoc
} else if (HostLocIsGPR(to) && HostLocIsSpill(from)) {
ASSERT(bit_width != 128);
if (bit_width == 64) {
code->mov(HostLocToReg64(to), Xbyak::util::qword[SpillToOpArg_Helper1(from, reserved_stack_space)]);
code->mov(HostLocToReg64(to), Xbyak::util::qword[spill_to_op_arg_helper(from, reserved_stack_space)]);
} else {
code->mov(HostLocToReg64(to).cvt32(), Xbyak::util::dword[SpillToOpArg_Helper1(from, reserved_stack_space)]);
code->mov(HostLocToReg64(to).cvt32(), Xbyak::util::dword[spill_to_op_arg_helper(from, reserved_stack_space)]);
}
} else if (HostLocIsSpill(to) && HostLocIsGPR(from)) {
ASSERT(bit_width != 128);
if (bit_width == 64) {
code->mov(Xbyak::util::qword[SpillToOpArg_Helper1(to, reserved_stack_space)], HostLocToReg64(from));
code->mov(Xbyak::util::qword[spill_to_op_arg_helper(to, reserved_stack_space)], HostLocToReg64(from));
} else {
code->mov(Xbyak::util::dword[SpillToOpArg_Helper1(to, reserved_stack_space)], HostLocToReg64(from).cvt32());
code->mov(Xbyak::util::dword[spill_to_op_arg_helper(to, reserved_stack_space)], HostLocToReg64(from).cvt32());
}
} else {
ASSERT_FALSE("Invalid RegAlloc::EmitMove");
@ -671,8 +690,4 @@ void RegAlloc::EmitExchange(const HostLoc a, const HostLoc b) noexcept {
}
}
Xbyak::Address RegAlloc::SpillToOpArg(const HostLoc loc) noexcept {
return Xbyak::util::xword[SpillToOpArg_Helper1(loc, reserved_stack_space)];
}
} // namespace Dynarmic::Backend::X64
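A note on the spill strategy above: when a GPR has to be evicted, FindFreeSpill now prefers parking the value in a free high XMM register (XMM3 and up, walking downward) before falling back to a stack slot, since a register-to-register move avoids a store/load round-trip through memory. A minimal xbyak-style sketch of the idea (illustrative only, not the dynarmic API; register choices here are arbitrary):

    // Spill a 64-bit GPR into an XMM register instead of memory (sketch).
    // movq xmm, r64 / movq r64, xmm are plain register moves, so no stack
    // traffic is involved; xmm3 must stay unused while the value is parked.
    code.movq(Xbyak::util::xmm3, Xbyak::util::rax);  // spill: rax -> xmm3
    // ... rax is free for other uses here ...
    code.movq(Xbyak::util::rax, Xbyak::util::xmm3);  // reload: xmm3 -> rax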

View file

@ -22,6 +22,7 @@
#include "dynarmic/backend/x64/hostloc.h"
#include "dynarmic/backend/x64/stack_layout.h"
#include "dynarmic/backend/x64/oparg.h"
#include "dynarmic/backend/x64/abi.h"
#include "dynarmic/ir/cond.h"
#include "dynarmic/ir/microinstruction.h"
#include "dynarmic/ir/value.h"
@ -242,20 +243,19 @@ private:
void MoveOutOfTheWay(HostLoc reg) noexcept;
void SpillRegister(HostLoc loc) noexcept;
HostLoc FindFreeSpill() const noexcept;
HostLoc FindFreeSpill(bool is_xmm) const noexcept;
inline HostLocInfo& LocInfo(const HostLoc loc) noexcept {
ASSERT(loc != HostLoc::RSP && loc != HostLoc::R15);
ASSERT(loc != HostLoc::RSP && loc != ABI_JIT_PTR);
return hostloc_info[static_cast<size_t>(loc)];
}
inline const HostLocInfo& LocInfo(const HostLoc loc) const noexcept {
ASSERT(loc != HostLoc::RSP && loc != HostLoc::R15);
ASSERT(loc != HostLoc::RSP && loc != ABI_JIT_PTR);
return hostloc_info[static_cast<size_t>(loc)];
}
void EmitMove(const size_t bit_width, const HostLoc to, const HostLoc from) noexcept;
void EmitExchange(const HostLoc a, const HostLoc b) noexcept;
Xbyak::Address SpillToOpArg(const HostLoc loc) noexcept;
// data
alignas(64) boost::container::static_vector<HostLoc, 28> gpr_order;

View file

@ -22,7 +22,7 @@ void PrintVerboseDebuggingOutputLine(RegisterData& reg_data, HostLoc hostloc, si
} else if (HostLocIsXMM(hostloc)) {
return reg_data.xmms[HostLocToXmm(hostloc).getIdx()];
} else if (HostLocIsSpill(hostloc)) {
return (*reg_data.spill)[static_cast<size_t>(hostloc) - static_cast<size_t>(HostLoc::FirstSpill)];
return (*reg_data.spill)[size_t(hostloc) - size_t(HostLoc::FirstSpill)];
} else {
fmt::print("invalid hostloc! ");
return {0, 0};

View file

@ -22,7 +22,7 @@ template<typename... Ts>
}())
#define ASSERT(_a_) ASSERT_MSG(_a_, "")
#define UNREACHABLE() ASSERT(false, "unreachable")
#define UNREACHABLE() ASSERT_MSG(false, "unreachable")
#ifdef _DEBUG
#define DEBUG_ASSERT(_a_) ASSERT(_a_)
#define DEBUG_ASSERT_MSG(_a_, ...) ASSERT_MSG(_a_, __VA_ARGS__)
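The UNREACHABLE() change above is a correctness fix rather than a cosmetic one: as the surrounding definitions show, ASSERT(_a_) takes only a condition (it expands to ASSERT_MSG(_a_, "")), so the old expansion ASSERT(false, "unreachable") handed a message to a macro with no message parameter; ASSERT_MSG is the variadic form that accepts it.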

View file

@ -152,11 +152,9 @@ constexpr CRC32Table iso_table{
static u32 ComputeCRC32(const CRC32Table& table, u32 crc, const u64 value, int length) {
const auto* data = reinterpret_cast<const unsigned char*>(&value);
while (length-- > 0) {
crc = (crc >> 8) ^ table[(crc ^ (*data++)) & 0xFF];
}
return crc;
}
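For reference, the loop above is the standard byte-at-a-time, table-driven (reflected) CRC update. A self-contained sketch of one step, assuming any 256-entry CRC lookup table such as the iso_table mentioned in the hunk header:

    #include <cstdint>

    // One reflected CRC-32 step: fold the next byte into the low bits of the
    // register, then replace the low byte via a table lookup.
    uint32_t Crc32Update(const uint32_t table[256], uint32_t crc, uint8_t byte) {
        return (crc >> 8) ^ table[(crc ^ byte) & 0xFF];
    }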

View file

@ -16,15 +16,14 @@ namespace Dynarmic {
void EmitSpinLockLock(Xbyak::CodeGenerator& code, Xbyak::Reg64 ptr, Xbyak::Reg32 tmp) {
Xbyak::Label start, loop;
code.jmp(start);
code.jmp(start, code.T_NEAR);
code.L(loop);
code.pause();
code.L(start);
code.mov(tmp, 1);
code.lock();
code.xchg(code.dword[ptr], tmp);
/*code.lock();*/ code.xchg(code.dword[ptr], tmp);
code.test(tmp, tmp);
code.jnz(loop);
code.jnz(loop, code.T_NEAR);
}
void EmitSpinLockUnlock(Xbyak::CodeGenerator& code, Xbyak::Reg64 ptr, Xbyak::Reg32 tmp) {
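Two details in this hunk are worth spelling out: xchg with a memory operand is implicitly locked on x86, so the explicit lock prefix was redundant (hence it is now commented out), and the T_NEAR arguments force near-jump encodings. In portable C++, the emitted pause/xchg/test/jnz loop is roughly equivalent to this test-and-set sketch:

    #include <atomic>
    #include <immintrin.h>

    // Test-and-set spin lock, matching the emitted loop's behavior.
    void SpinLockLock(std::atomic<int>& flag) {
        while (flag.exchange(1, std::memory_order_acquire) != 0) {
            _mm_pause();  // same role as the emitted `pause` instruction
        }
    }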

View file

@ -109,13 +109,11 @@ bool TranslatorVisitor::arm_LDR_imm(Cond cond, bool P, bool U, bool W, Reg n, Re
if (t == Reg::PC) {
ir.LoadWritePC(data);
if (!P && W && n == Reg::R13) {
ir.SetTerm(IR::Term::PopRSBHint{});
} else {
ir.SetTerm(IR::Term::FastDispatchHint{});
}
return false;
}
@ -145,7 +143,11 @@ bool TranslatorVisitor::arm_LDR_reg(Cond cond, bool P, bool U, bool W, Reg n, Re
if (t == Reg::PC) {
ir.LoadWritePC(data);
ir.SetTerm(IR::Term::FastDispatchHint{});
if (!P && W && n == Reg::R13) {
ir.SetTerm(IR::Term::PopRSBHint{});
} else {
ir.SetTerm(IR::Term::FastDispatchHint{});
}
return false;
}
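For context on the LDR hunks above: a post-indexed load from R13 that writes the PC is the encoding of `pop {pc}`, i.e. a function return, so that pattern gets a PopRSBHint terminal (steering the JIT's return-stack-buffer prediction) while other PC writes fall back to the generic FastDispatchHint; this commit applies the same terminal selection to both the immediate and register forms.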

View file

@ -21,6 +21,7 @@ bool TranslatorVisitor::B_uncond(Imm<26> imm26) {
const s64 offset = concatenate(imm26, Imm<2>{0}).SignExtend<s64>();
const u64 target = ir.PC() + offset;
//ir.SetTerm(IR::Term::LinkBlockFast{ir.current_location->SetPC(target)});
ir.SetTerm(IR::Term::LinkBlock{ir.current_location->SetPC(target)});
return false;
}

View file

@ -32,6 +32,8 @@ enum class OptimizationFlag : std::uint32_t {
ConstProp = 0x00000010,
/// This enables miscellaneous safe IR optimizations.
MiscIROpt = 0x00000020,
/// Optimize for code speed rather than for code size (this serves well for tight loops)
CodeSpeed = 0x00000040,
/// This is an UNSAFE optimization that reduces accuracy of fused multiply-add operations.
/// This unfuses fused instructions to improve performance on host CPUs without FMA support.
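A usage sketch for the new flag (the config field name is assumed from dynarmic's UserConfig conventions and is not shown in this diff):

    // Opt a JIT instance into speed-over-size code generation (hypothetical
    // call site; `optimizations` is the usual dynarmic UserConfig member).
    Dynarmic::A32::UserConfig config{};
    config.optimizations = config.optimizations | Dynarmic::OptimizationFlag::CodeSpeed;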

View file

@ -86,11 +86,9 @@ static std::string TerminalToString(const Terminal& terminal_variant) noexcept {
}
std::string DumpBlock(const IR::Block& block) noexcept {
std::string ret;
ret += fmt::format("Block: location={}\n", block.Location());
ret += fmt::format("cycles={}", block.CycleCount());
ret += fmt::format(", entry_cond={}", A64::CondToString(block.GetCondition()));
std::string ret = fmt::format("Block: location={}-{}\n", block.Location(), block.EndLocation())
+ fmt::format("cycles={}", block.CycleCount())
+ fmt::format(", entry_cond={}", A64::CondToString(block.GetCondition()));
if (block.GetCondition() != Cond::AL) {
ret += fmt::format(", cond_fail={}", block.ConditionFailedLocation());
}
@ -116,6 +114,8 @@ std::string DumpBlock(const IR::Block& block) noexcept {
return fmt::format("#{:#x}", arg.GetU32());
case Type::U64:
return fmt::format("#{:#x}", arg.GetU64());
case Type::U128:
return fmt::format("#<u128 imm>");
case Type::A32Reg:
return A32::RegToString(arg.GetA32RegRef());
case Type::A32ExtReg:
@ -124,8 +124,18 @@ std::string DumpBlock(const IR::Block& block) noexcept {
return A64::RegToString(arg.GetA64RegRef());
case Type::A64Vec:
return A64::VecToString(arg.GetA64VecRef());
case Type::CoprocInfo:
return fmt::format("#<coproc>");
case Type::NZCVFlags:
return fmt::format("#<NZCV flags>");
case Type::Cond:
return fmt::format("#<cond={}>", A32::CondToString(arg.GetCond()));
case Type::Table:
return fmt::format("#<table>");
case Type::AccType:
return fmt::format("#<acc-type={}>", u32(arg.GetAccType()));
default:
return "<unknown immediate type>";
return fmt::format("<unknown immediate type {}>", arg.GetType());
}
};

View file

@ -19,7 +19,7 @@
namespace Dynarmic::IR {
enum class Opcode;
enum class Type;
enum class Type : u16;
constexpr size_t max_arg_count = 4;

View file

@ -16,12 +16,6 @@ namespace Dynarmic::IR {
namespace OpcodeInfo {
struct Meta {
std::vector<Type> arg_types;
const char* name;
Type type;
};
constexpr Type Void = Type::Void;
constexpr Type A32Reg = Type::A32Reg;
constexpr Type A32ExtReg = Type::A32ExtReg;
@ -40,10 +34,22 @@ constexpr Type Cond = Type::Cond;
constexpr Type Table = Type::Table;
constexpr Type AccType = Type::AccType;
alignas(64) static const std::array opcode_info{
#define OPCODE(name, type, ...) Meta{{__VA_ARGS__}, #name, type},
#define A32OPC(name, type, ...) Meta{{__VA_ARGS__}, #name, type},
#define A64OPC(name, type, ...) Meta{{__VA_ARGS__}, #name, type},
struct Meta {
std::array<Type, 4> arg_types;
Type type;
uint8_t count;
};
// Evil macro magic for the Intel C++ compiler
// Helper macro to force expansion of __VA_ARGS__ to satisfy the MSVC compiler.
#define PP_EXPAND(x) x
#define PP_NARGS(...) PP_EXPAND(PP_ARG_N(__VA_ARGS__, 5, 4, 3, 2, 1, 0))
#define PP_ARG_N(_1, _2, _3, _4, _5, N, ...) N
alignas(64) static const Meta opcode_info[] = {
#define OPCODE(name, type, ...) Meta{{__VA_ARGS__}, type, PP_EXPAND(PP_NARGS(__VA_ARGS__))},
#define A32OPC(name, type, ...) Meta{{__VA_ARGS__}, type, PP_EXPAND(PP_NARGS(__VA_ARGS__))},
#define A64OPC(name, type, ...) Meta{{__VA_ARGS__}, type, PP_EXPAND(PP_NARGS(__VA_ARGS__))},
#include "./opcodes.inc"
#undef OPCODE
#undef A32OPC
@ -54,22 +60,31 @@ alignas(64) static const std::array opcode_info{
/// @brief Get return type of an opcode
Type GetTypeOf(Opcode op) noexcept {
return OpcodeInfo::opcode_info.at(size_t(op)).type;
return OpcodeInfo::opcode_info[size_t(op)].type;
}
/// @brief Get the number of arguments an opcode accepts
size_t GetNumArgsOf(Opcode op) noexcept {
return OpcodeInfo::opcode_info.at(size_t(op)).arg_types.size();
return OpcodeInfo::opcode_info[size_t(op)].count;
}
/// @brief Get the required type of an argument of an opcode
Type GetArgTypeOf(Opcode op, size_t arg_index) noexcept {
return OpcodeInfo::opcode_info.at(size_t(op)).arg_types.at(arg_index);
return OpcodeInfo::opcode_info[size_t(op)].arg_types[arg_index];
}
/// @brief Get the name of an opcode.
std::string GetNameOf(Opcode op) noexcept {
return OpcodeInfo::opcode_info.at(size_t(op)).name;
std::string_view GetNameOf(Opcode op) noexcept {
static const std::string_view opcode_names[] = {
#define OPCODE(name, type, ...) #name,
#define A32OPC(name, type, ...) #name,
#define A64OPC(name, type, ...) #name,
#include "./opcodes.inc"
#undef OPCODE
#undef A32OPC
#undef A64OPC
};
return opcode_names[size_t(op)];
}
} // namespace Dynarmic::IR
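The PP_NARGS machinery above is the classic preprocessor argument-counting trick; a compact illustration of how it expands (the static_assert is an added check, not part of the commit):

    // PP_NARGS pushes the descending tail 5,4,3,2,1,0 behind the real
    // arguments, so PP_ARG_N's sixth parameter N always lands on the count:
    //   PP_NARGS(A)       -> PP_ARG_N(A, 5, 4, 3, 2, 1, 0)       -> 1
    //   PP_NARGS(A, B)    -> PP_ARG_N(A, B, 5, 4, 3, 2, 1, 0)    -> 2
    //   PP_NARGS(A, B, C) -> PP_ARG_N(A, B, C, 5, 4, 3, 2, 1, 0) -> 3
    static_assert(PP_EXPAND(PP_NARGS(int, int)) == 2, "argument counting");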

View file

@ -15,7 +15,7 @@
namespace Dynarmic::IR {
enum class Type;
enum class Type : u16;
/// @brief The Opcodes of our intermediate representation.
/// Type signatures for each opcode can be found in opcodes.inc
@ -35,7 +35,7 @@ constexpr size_t OpcodeCount = static_cast<size_t>(Opcode::NUM_OPCODE);
Type GetTypeOf(Opcode op) noexcept;
size_t GetNumArgsOf(Opcode op) noexcept;
Type GetArgTypeOf(Opcode op, size_t arg_index) noexcept;
std::string GetNameOf(Opcode op) noexcept;
std::string_view GetNameOf(Opcode op) noexcept;
/// @brief Determines whether or not this instruction performs an arithmetic shift.
constexpr bool IsArithmeticShift(const Opcode op) noexcept {

View file

@ -18,7 +18,7 @@ namespace Dynarmic::IR {
/**
* The intermediate representation is typed. These are the used by our IR.
*/
enum class Type {
enum class Type : u16 {
Void = 0,
A32Reg = 1 << 0,
A32ExtReg = 1 << 1,
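Pinning the underlying type here is what makes the forward declarations elsewhere in this commit legal: all declarations of an enum must agree on its underlying type, so once the definition narrows Type to u16, every opaque declaration (the `enum class Type : u16;` lines in the microinstruction and opcode headers above) has to spell out the same type. The narrower representation also shrinks each Meta entry in the opcode table.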

View file

@ -445,6 +445,9 @@ static void RunTestInstance(Dynarmic::A32::Jit& jit,
}
}
// TODO: Why the difference? QEMU what are you doing???
jit.Regs()[15] = uni.GetRegisters()[15];
REQUIRE(uni.GetRegisters() == jit.Regs());
REQUIRE(uni.GetExtRegs() == jit.ExtRegs());
REQUIRE((uni.GetCpsr() & 0xFFFFFDDF) == (jit.Cpsr() & 0xFFFFFDDF));

File diff suppressed because one or more lines are too long

View file

@ -8,7 +8,7 @@
#include <array>
#include <exception>
#include <map>
#include <unordered_map>
#include <catch2/catch_test_macros.hpp>
#include "dynarmic/common/common_types.h"
@ -23,7 +23,7 @@ namespace {
class MyEnvironment final : public A64::UserCallbacks {
public:
u64 ticks_left = 0;
std::map<u64, u8> memory{};
std::unordered_map<u64, u8> memory{};
u8 MemoryRead8(u64 vaddr) override {
return memory[vaddr];

File diff suppressed because one or more lines are too long

View file

@ -9,7 +9,7 @@
#pragma once
#include <array>
#include <map>
#include <unordered_map>
#include "dynarmic/common/assert.h"
#include "dynarmic/common/common_types.h"
@ -26,7 +26,7 @@ public:
u64 code_mem_start_address = 0;
std::vector<u32> code_mem;
std::map<u64, u8> modified_memory;
std::unordered_map<u64, u8> modified_memory;
std::vector<std::string> interrupts;
bool IsInCodeMem(u64 vaddr) const {
@ -133,6 +133,7 @@ class A64FastmemTestEnv final : public Dynarmic::A64::UserCallbacks {
public:
u64 ticks_left = 0;
char* backing_memory = nullptr;
bool ignore_invalid_insn = false;
explicit A64FastmemTestEnv(char* addr)
: backing_memory(addr) {}
@ -205,7 +206,7 @@ public:
return true;
}
void InterpreterFallback(u64 pc, size_t num_instructions) override { ASSERT_MSG(false, "InterpreterFallback({:016x}, {})", pc, num_instructions); }
void InterpreterFallback(u64 pc, size_t num_instructions) override { ASSERT_MSG(ignore_invalid_insn, "InterpreterFallback({:016x}, {})", pc, num_instructions); }
void CallSVC(std::uint32_t swi) override { ASSERT_MSG(false, "CallSVC({})", swi); }
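A usage sketch for the new escape hatch (test-side code assumed, not shown in this diff): since the fallback now asserts on the flag rather than on false, tests that may hit unimplemented encodings can opt in instead of hard-failing.

    // ASSERT_MSG(ignore_invalid_insn, ...) only fires while the flag is false.
    A64FastmemTestEnv env{backing_memory};  // backing_memory: hypothetical arena
    env.ignore_invalid_insn = true;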

View file

@ -29,6 +29,7 @@ if ("A64" IN_LIST DYNARMIC_FRONTENDS)
A64/fp_min_max.cpp
A64/misaligned_page_table.cpp
A64/test_invalidation.cpp
A64/real_world.cpp
A64/testenv.h
)
endif()

View file

@ -173,7 +173,7 @@ void A64Unicorn::InterruptHook(uc_engine* uc, u32 int_number, void* user_data) {
auto* this_ = static_cast<A64Unicorn*>(user_data);
u32 esr;
CHECKED(uc_reg_read(uc, UC_ARM64_REG_ESR, &esr));
//CHECKED(uc_reg_read(uc, UC_ARM64_REG_ESR_EL0, &esr));
auto ec = esr >> 26;
auto iss = esr & 0xFFFFFF;

View file

@ -77,6 +77,8 @@
<string name="frame_interpolation_description">يضمن تسليمًا سلسًا ومتناسقًا للإطارات من خلال مزامنة التوقيت بينها، مما يقلل من التقطيع وعدم انتظام الحركة. مثالي للألعاب التي تعاني من عدم استقرار في توقيت الإطارات أو تقطع دقيق أثناء اللعب.</string>
<string name="renderer_early_release_fences">إطلاق الأسوار مبكرًا</string>
<string name="renderer_early_release_fences_description">يساعد في إصلاح مشكلة 0 إطار في الثانية في ألعاب مثل DKCR:HD وSubnautica Below Zero وOri 2، ولكن قد يتسبب في تعطيل التحميل أو الأداء في ألعاب Unreal Engine.</string>
<string name="buffer_reorder_disable">تعطيل إعادة ترتيب المخزن المؤقت</string>
<string name="buffer_reorder_disable_description">عند التحديد، يتم تعطيل إعادة ترتيب تحميل الذاكرة المعينة مما يسمح بربط التحميلات برسومات محددة. قد يقلل الأداء في بعض الحالات.</string>
<string name="use_lru_cache">تمكين ذاكرة التخزين المؤقت LRU</string>
<string name="use_lru_cache_description">تمكين أو تعطيل ذاكرة التخزين المؤقت الأقل استخداماً مؤخراً (LRU) لتحسين الأداء عن طريق تقليل استخدام وحدة المعالجة المركزية. بعض الألعاب قد تواجه مشاكل معه، خاصةً TotK 1.2.1، لذا قم بتعطيله إذا لم تعمل اللعبة أو انهارت عشوائياً.</string>
<string name="dyna_state">الحالة الديناميكية الممتدة</string>

View file

@ -78,6 +78,8 @@
<string name="frame_interpolation_description">يضمن تسليمًا سلسًا ومتناسقًا للإطارات من خلال مزامنة التوقيت بينها، مما يقلل من التقطيع وعدم انتظام الحركة. مثالي للألعاب التي تعاني من عدم استقرار في توقيت الإطارات أو تقطع دقيق أثناء اللعب.</string>
<string name="renderer_early_release_fences">زێدەکردنی پەرستارەکان زووتر</string>
<string name="renderer_early_release_fences_description">یارمەتی دەدات لە چارەسەری 0 FPS لە یارییەکانی وەک DKCR:HD، Subnautica Below Zero و Ori 2، بەڵام ڕەنگە بارکردن یان کارایی لە یارییەکانی Unreal Engine تێکبدات.</string>
<string name="buffer_reorder_disable">ڕێکخستنەوەی بافر ناچالاک بکە</string>
<string name="buffer_reorder_disable_description">کە دیاریکرا، ڕێکخستنەوەی بارکردنی بیرگەی نەخشەکراو ناچالاک دەکات کە ڕێگەدەدات بارکردنەکان بە ڕەسمی دیاریکراو ببەسترێت. لە هەندێک حاڵەتدا کاراییمان دەکاتەوە.</string>
<string name="use_lru_cache">تمكين ذاكرة التخزين المؤقت LRU</string>
<string name="use_lru_cache_description">چالاک یان ناچالاککردنی کاشەی LRU، کارایی باشتر دەکات بە هەڵگرتنی بەکارهێنانی پرۆسەی CPU. هەندێک یاری کێشەی لەگەڵ هەیە، بەتایبەتی TotK 1.2.1، بۆیە بیخەوێنە ئەگەر یاریەکە نەگەڕێت یان بە هەڕەمەکی بشکێت.</string>
<string name="dyna_state">الحالة الديناميكية الممتدة</string>

View file

@ -77,6 +77,8 @@
<string name="frame_interpolation_description">Zajišťuje plynulé a konzistentní zobrazování snímků synchronizací jejich časování, čímž snižuje trhání a nerovnoměrné animace. Ideální pro hry, které trpí nestabilitou časování snímků nebo mikrotrháním během hraní.</string>
<string name="renderer_early_release_fences">Uvolnit ploty brzy</string>
<string name="renderer_early_release_fences_description">Pomáhá opravit 0 FPS v hrách jako DKCR:HD, Subnautica Below Zero a Ori 2, ale může narušit načítání nebo výkon v hrách na Unreal Engine.</string>
<string name="buffer_reorder_disable">Zakázat přeřazování vyrovnávací paměti</string>
<string name="buffer_reorder_disable_description">Při zaškrtnutí zakáže přeřazování nahrání mapované paměti, což umožňuje spojit nahrání s konkrétními vykresleními. V některých případech může snížit výkon.</string>
<string name="use_lru_cache">Povolit LRU mezipaměť</string>
<string name="use_lru_cache_description">Povolte nebo zakažte mezipaměť LRU, čímž zvýšíte výkon snížením využití procesoru CPU. Některé hry s ní mají problémy, zejména TotK 1.2.1, takže ji deaktivujte, pokud hra neběží nebo náhodně padá.</string>
<string name="dyna_state">Rozšířený dynamický stav</string>

View file

@ -78,6 +78,8 @@
<string name="frame_interpolation_description">Sorgt für eine gleichmäßige und konsistente Frame-Wiedergabe durch Synchronisierung der Frame-Zeiten, was Ruckeln und ungleichmäßige Animationen reduziert. Ideal für Spiele, die unter instabilen Frame-Zeiten oder Mikrorucklern leiden.</string>
<string name="renderer_early_release_fences">Zäune früher freigeben</string>
<string name="renderer_early_release_fences_description">Behebt 0 FPS in Spielen wie DKCR:HD, Subnautica Below Zero und Ori 2, kann aber Ladezeiten oder Performance in Unreal Engine-Spielen beeinträchtigen.</string>
<string name="buffer_reorder_disable">Puffer-Neuanordnung deaktivieren</string>
<string name="buffer_reorder_disable_description">Wenn aktiviert, wird die Neuanordnung von gemappten Speicher-Uploads deaktiviert, was die Zuordnung von Uploads zu bestimmten Zeichenvorgängen ermöglicht. Kann in einigen Fällen die Leistung verringern.</string>
<string name="use_lru_cache">LRU-Cache aktivieren</string>
<string name="use_lru_cache_description">Aktivieren oder deaktivieren Sie den LRU-Cache, um die Leistung durch Einsparung von CPU-Prozessorauslastung zu verbessern. Einige Spiele haben Probleme damit, insbesondere TotK 1.2.1, deaktivieren Sie es also, wenn das Spiel nicht startet oder zufällig abstürzt.</string>
<string name="dyna_state">Erweiterter dynamischer Status</string>

View file

@ -78,6 +78,8 @@
<string name="frame_interpolation_description">Garantiza una reproducción suave y consistente de fotogramas sincronizando sus tiempos, reduciendo el tartamudeo y animaciones irregulares. Ideal para juegos con problemas de sincronización de fotogramas o microtartamudeos.</string>
<string name="renderer_early_release_fences">Liberar vallas antes</string>
<string name="renderer_early_release_fences_description">Ayuda a solucionar 0 FPS en juegos como DKCR:HD, Subnautica Below Zero y Ori 2, pero puede afectar la carga o rendimiento en juegos de Unreal Engine.</string>
<string name="buffer_reorder_disable">Desactivar reordenamiento de búfer</string>
<string name="buffer_reorder_disable_description">Cuando está marcado, desactiva el reordenamiento de cargas de memoria mapeada, lo que permite asociar cargas con dibujos específicos. Puede reducir el rendimiento en algunos casos.</string>
<string name="use_lru_cache">Habilitar caché LRU</string>
<string name="use_lru_cache_description">Activa o desactiva la caché LRU, mejorando el rendimiento al ahorrar uso del proceso de la CPU. Algunos juegos tienen problemas con ella, notablemente TotK 1.2.1, así que desactívala si el juego no inicia o se cierra aleatoriamente.</string>
<string name="dyna_state">Estado dinámico extendido</string>

View file

@ -78,6 +78,8 @@
<string name="frame_interpolation_description">ارسال یکنواخت و پایدار فریم‌ها را با همگام‌سازی زمان بین آن‌ها تضمین می‌کند، که منجر به کاهش لرزش و انیمیشن‌های ناهموار می‌شود. برای بازی‌هایی که ناپایداری در زمان‌بندی فریم‌ها یا میکرو لرزش در حین بازی دارند ایده‌آل است</string>
<string name="renderer_early_release_fences">رهاسازی حصارها زودتر</string>
<string name="renderer_early_release_fences_description">به رفع مشکل 0 فریم بر ثانیه در بازی‌هایی مانند DKCR:HD، Subnautica Below Zero و Ori 2 کمک می‌کند، اما ممکن است بارگذاری یا عملکرد بازی‌های Unreal Engine را مختل کند.</string>
<string name="buffer_reorder_disable">غیرفعال کردن مرتب‌سازی مجدد بافر</string>
<string name="buffer_reorder_disable_description">در صورت انتخاب، مرتب‌سازی مجدد آپلودهای حافظه نگاشت‌شده غیرفعال می‌شود که امکان ارتباط آپلودها با ترسیمات خاص را فراهم می‌کند. ممکن است در برخی موارد عملکرد را کاهش دهد.</string>
<string name="use_lru_cache">فعال‌سازی حافظه نهان LRU</string>
<string name="use_lru_cache_description">حافظه پنهان LRU را فعال یا غیرفعال کنید تا با کاهش استفاده از پردازنده، عملکرد بهبود یابد. برخی بازی‌ها مانند TotK 1.2.1 با این ویژگی مشکل دارند، در صورت عدم راه‌اندازی یا قطعی تصادفی بازی، آن را غیرفعال کنید.</string>
<string name="dyna_state">حالت پویای گسترده</string>

View file

@ -78,6 +78,8 @@
<string name="frame_interpolation_description">Assure une diffusion fluide et régulière des frames en synchronisant leur timing, réduisant ainsi les saccades et les animations irrégulières. Idéal pour les jeux souffrant d`instabilité de timing des frames ou de micro-saccades pendant le jeu.</string>
<string name="renderer_early_release_fences">Libérer les barrières plus tôt</string>
<string name="renderer_early_release_fences_description">Résout les problèmes de 0 FPS dans des jeux comme DKCR:HD, Subnautica Below Zero et Ori 2, mais peut perturber le chargement ou les performances des jeux Unreal Engine.</string>
<string name="buffer_reorder_disable">Désactiver le réordonnancement du tampon</string>
<string name="buffer_reorder_disable_description">Lorsqu\'il est coché, désactive le réordonnancement des téléchargements de mémoire mappée, permettant d\'associer les téléchargements à des dessins spécifiques. Peut réduire les performances dans certains cas.</string>
<string name="use_lru_cache">Activer le cache LRU</string>
<string name="use_lru_cache_description">Active ou désactive le cache LRU pour améliorer les performances en réduisant l\'utilisation du processeur. Certains jeux comme TotK 1.2.1 ont des problèmes - désactivez-le si le jeu ne démarre pas ou plante aléatoirement.</string>
<string name="dyna_state">État dynamique étendu</string>

View file

@ -78,6 +78,8 @@
<string name="frame_interpolation_description">מבטיח אספקה חלקה ועקבית של פריימים על ידי סנכרון התזמון ביניהם, מפחית קפיצות ואנימציה לא אחידה. אידיאלי למשחקים עם בעיות בתזמון פריימים או מיקרו-קפיצות במהלך המשחק.</string>
<string name="renderer_early_release_fences">שחרר גדרות מוקדם</string>
<string name="renderer_early_release_fences_description">עוזר לתקן 0 FPS במשחקים כמו DKCR:HD, Subnautica Below Zero ו-Ori 2, אך עלול לפגוע בטעינה או בביצועים במשחקי Unreal Engine.</string>
<string name="buffer_reorder_disable">השבת סידור מחדש של חוצץ</string>
<string name="buffer_reorder_disable_description">כאשר מסומן, מבטל את סידור מחדש של העלאות זיכרון ממופה המאפשר לשייך העלאות עם ציורים ספציפיים. עלול להפחית ביצועים במקרים מסוימים.</string>
<string name="use_lru_cache">הפעלת מטמון LRU</string>
<string name="use_lru_cache_description">הפעל או השבת מטמון LRU לשיפור ביצועים על ידי חיסכון בשימוש במעבד. לחלק מהמשחקים כמו TotK 1.2.1 יש בעיות - השבת אם המשחק לא עולה או קורס באקראי.</string>
<string name="dyna_state">מצב דינמי מורחב</string>

View file

@ -78,6 +78,8 @@
<string name="frame_interpolation_description">Biztosítja a képkockák sima és egyenletes kézbesítését azok időzítésének szinkronizálásával, csökkentve a megakadásokat és egyenetlen animációkat. Ideális azokhoz a játékokhoz, amelyek képkocka-időzítési instabilitást vagy mikro-reccsenést tapasztalnak játék közben.</string>
<string name="renderer_early_release_fences">Korai kerítés-felszabadítás</string>
<string name="renderer_early_release_fences_description">Segít javítani a 0 FPS-t olyan játékokban, mint a DKCR:HD, Subnautica Below Zero és az Ori 2, de ronthatja az Unreal Engine játékok betöltését vagy teljesítményét.</string>
<string name="buffer_reorder_disable">Puffer újrarendezés letiltása</string>
<string name="buffer_reorder_disable_description">Ha be van jelölve, letiltja a leképezett memória feltöltéseinek újrarendezését, lehetővé téve a feltöltések összerendelését konkrét rajzolásokkal. Bizonyos esetekben csökkentheti a teljesítményt.</string>
<string name="use_lru_cache">LRU gyorsítótár engedélyezése</string>
<string name="use_lru_cache_description">Engedélyezi vagy letiltja az LRU gyorsítótárat, növelve a teljesítményt a CPU használat csökkentésével. Néhány játéknak (különösen a TotK 1.2.1-nek) problémája lehet vele - tiltsa le, ha a játék nem indul el vagy véletlenszerűen összeomlik.</string>
<string name="dyna_state">Kiterjesztett Dinamikus Állapot</string>

View file

@ -78,6 +78,8 @@
<string name="frame_interpolation_description">Memastikan pengiriman frame yang halus dan konsisten dengan menyinkronkan waktu antar frame, mengurangi stuttering dan animasi tidak rata. Ideal untuk game yang mengalami ketidakstabilan waktu frame atau micro-stutter selama gameplay.</string>
<string name="renderer_early_release_fences">Lepas Pagar Lebih Awal</string>
<string name="renderer_early_release_fences_description">Membantu memperbaiki 0 FPS di game seperti DKCR:HD, Subnautica Below Zero dan Ori 2, tapi mungkin mengganggu loading atau performa di game Unreal Engine.</string>
<string name="buffer_reorder_disable">Nonaktifkan Penyusunan Ulang Buffer</string>
<string name="buffer_reorder_disable_description">Ketika dicentang, menonaktifkan penyusunan ulang unggahan memori yang dipetakan yang memungkinkan mengaitkan unggahan dengan gambar tertentu. Dapat mengurangi kinerja dalam beberapa kasus.</string>
<string name="use_lru_cache">Aktifkan LRU Cache</string>
<string name="use_lru_cache_description">Aktifkan atau nonaktifkan cache LRU untuk meningkatkan performa dengan mengurangi penggunaan proses CPU. Beberapa game seperti TotK 1.2.1 memiliki masalah - nonaktifkan jika game tidak mau berjalan atau crash acak.</string>
<string name="dyna_state">Status Dinamis Ekstensi</string>

View file

@ -78,6 +78,8 @@
<string name="frame_interpolation_description">Garantisce una consegna fluida e costante dei fotogrammi sincronizzandone i tempi, riducendo scatti e animazioni irregolari. Ideale per giochi che presentano instabilità nei tempi dei fotogrammi o micro-scatti durante il gameplay.</string>
<string name="renderer_early_release_fences">Rilascia le barriere prima</string>
<string name="renderer_early_release_fences_description">Risolve problemi di 0 FPS in giochi come DKCR:HD, Subnautica Below Zero e Ori 2, ma potrebbe compromettere caricamento o prestazioni in giochi Unreal Engine.</string>
<string name="buffer_reorder_disable">Disabilita riordino buffer</string>
<string name="buffer_reorder_disable_description">Se selezionato, disabilita il riordino dei caricamenti di memoria mappata consentendo di associare i caricamenti a disegni specifici. Potrebbe ridurre le prestazioni in alcuni casi.</string>
<string name="use_lru_cache">Abilita cache LRU</string>
<string name="use_lru_cache_description">Abilita o disabilita la cache LRU per migliorare le prestazioni riducendo l\'uso della CPU. Alcuni giochi come TotK 1.2.1 hanno problemi - disabilitalo se il gioco non si avvia o crasha casualmente.</string>
<string name="dyna_state">Stato dinamico esteso</string>

View file

@ -78,6 +78,8 @@
<string name="frame_interpolation_description">フレーム間のタイミングを同期させることで、スムーズで一貫したフレーム配信を確保し、カクつきや不均一なアニメーションを軽減します。フレームタイミングの不安定さやマイクロスタッターが発生するゲームに最適です。</string>
<string name="renderer_early_release_fences">フェンスを早期に解放</string>
<string name="renderer_early_release_fences_description">DKCR:HD、Subnautica Below Zero、Ori 2などのゲームで0 FPSを修正しますが、Unreal Engineゲームの読み込みやパフォーマンスに影響する可能性があります。</string>
<string name="buffer_reorder_disable">バッファの再並べ替えを無効化</string>
<string name="buffer_reorder_disable_description">チェック時、マップされたメモリのアップロードの再並べ替えを無効化し、特定の描画に関連付けることができます。場合によってはパフォーマンスが低下する可能性があります。</string>
<string name="use_lru_cache">LRUキャッシュを有効化</string>
<string name="use_lru_cache_description">LRUキャッシュを有効/無効にし、CPUプロセスの使用を節約してパフォーマンスを向上させます。TotK 1.2.1など一部のゲームで問題が発生する可能性があるため、ゲームが起動しない場合やランダムにクラッシュする場合は無効にしてください。</string>
<string name="dyna_state">拡張ダイナミックステート</string>

View file

@ -78,6 +78,8 @@
<string name="frame_interpolation_description">프레임 간 타이밍을 동기화하여 부드럽고 일관된 프레임 전달을 보장하며, 끊김과 불균일한 애니메이션을 줄입니다. 프레임 타이밍 불안정이나 게임 플레이 중 미세 끊김이 발생하는 게임에 이상적입니다.</string>
<string name="renderer_early_release_fences">펜스 조기 해제</string>
<string name="renderer_early_release_fences_description">DKCR:HD, Subnautica Below Zero, Ori 2 등의 게임에서 0 FPS 현상을 해결하지만, Unreal Engine 게임의 로딩이나 성능에 문제를 일으킬 수 있습니다.</string>
<string name="buffer_reorder_disable">버퍼 재정렬 비활성화</string>
<string name="buffer_reorder_disable_description">체크 시, 매핑된 메모리 업로드의 재정렬을 비활성화하여 특정 그리기와 업로드를 연결할 수 있습니다. 경우에 따라 성능이 저하될 수 있습니다.</string>
<string name="use_lru_cache">LRU 캐시 사용</string>
<string name="use_lru_cache_description">LRU 캐시를 활성화 또는 비활성화하여 CPU 프로세스 사용을 절약하고 성능을 향상시킵니다. TotK 1.2.1을 포함한 일부 게임에서 문제가 발생할 수 있으므로 게임이 부팅되지 않거나 무작위로 충돌하는 경우 비활성화하세요.</string>
<string name="dyna_state">확장 동적 상태</string>

View file

@ -78,6 +78,8 @@
<string name="frame_interpolation_description">Sikrer jevn og konsekvent bildelevering ved å synkronisere tiden mellom bilder, noe som reduserer hakking og ujevn animasjon. Ideelt for spill som opplever ustabil bildetid eller mikro-hakk under spilling.</string>
<string name="renderer_early_release_fences">Frigjør gjerder tidlig</string>
<string name="renderer_early_release_fences_description">Løser 0 FPS i spill som DKCR:HD, Subnautica Below Zero og Ori 2, men kan forårsake problemer med lasting eller ytelse i Unreal Engine-spill.</string>
<string name="buffer_reorder_disable">Deaktiver bufferomorganisering</string>
<string name="buffer_reorder_disable_description">Når merket, deaktiveres omorganisering av kartlagt minneopplasting som tillater å knytte opplastinger til spesifikke tegninger. Kan redusere ytelsen i noen tilfeller.</string>
<string name="use_lru_cache">Aktiver LRU-mellomlager</string>
<string name="use_lru_cache_description">Aktiver eller deaktiver LRU-mellomlager for å forbedre ytelsen ved å spare CPU-prosessorbruk. Noen spill som TotK 1.2.1 har problemer med dette - deaktiver hvis spillet ikke starter eller krasjer tilfeldig.</string>
<string name="dyna_state">Utvidet dynamisk tilstand</string>

View file

@ -78,6 +78,8 @@
<string name="frame_interpolation_description">Zapewnia płynne i spójne wyświetlanie klatek poprzez synchronizację ich czasu, redukując zacinanie i nierówną animację. Idealne dla gier z niestabilnym czasem klatek lub mikro-zacinaniem podczas rozgrywki.</string>
<string name="renderer_early_release_fences">Wcześniejsze zwalnianie zabezpieczeń</string>
<string name="renderer_early_release_fences_description">Pomaga naprawić 0 FPS w grach takich jak DKCR:HD, Subnautica Below Zero i Ori 2, ale może zaburzyć ładowanie lub wydajność w grach Unreal Engine.</string>
<string name="buffer_reorder_disable">Wyłącz przestawianie bufora</string>
<string name="buffer_reorder_disable_description">Po zaznaczeniu wyłącza przestawianie załadowań zmapowanej pamięci, umożliwiając powiązanie załadowań z konkretnymi rysunkami. Może zmniejszyć wydajność w niektórych przypadkach.</string>
<string name="use_lru_cache">Włącz pamięć podręczną LRU</string>
<string name="use_lru_cache_description">Włącz lub wyłącz pamięć podręczną LRU, aby poprawić wydajność poprzez zmniejszenie użycia procesora. Niektóre gry, takie jak TotK 1.2.1, mogą mieć problemy - wyłącz, jeśli gra się nie uruchamia lub losowo zawiesza.</string>
<string name="dyna_state">Rozszerzony stan dynamiczny</string>

View file

@ -78,6 +78,8 @@
<string name="frame_interpolation_description">Garante entrega suave e consistente de quadros sincronizando seu tempo, reduzindo engasgos e animações irregulares. Ideal para jogos com instabilidade no tempo de quadros ou micro-engasgos durante a jogatina.</string>
<string name="renderer_early_release_fences">Liberar cercas antecipadamente</string>
<string name="renderer_early_release_fences_description">Ajuda a corrigir 0 FPS em jogos como DKCR:HD, Subnautica Below Zero e Ori 2, mas pode prejudicar carregamento ou desempenho em jogos Unreal Engine.</string>
<string name="buffer_reorder_disable">Desativar reorganização de buffer</string>
<string name="buffer_reorder_disable_description">Quando marcado, desativa a reorganização de carregamentos de memória mapeada que permite associar carregamentos a desenhos específicos. Pode reduzir o desempenho em alguns casos.</string>
<string name="use_lru_cache">Ativar cache LRU</string>
<string name="use_lru_cache_description">Ative ou desative o cache LRU para melhorar o desempenho economizando uso do processador. Alguns jogos como TotK 1.2.1 têm problemas - desative se o jogo não iniciar ou travar aleatoriamente.</string>
<string name="dyna_state">Estado Dinâmico Estendido</string>

View file

@ -78,6 +78,8 @@
<string name="frame_interpolation_description">Garante uma entrega suave e consistente de frames sincronizando o seu tempo, reduzindo engasgadelas e animações irregulares. Ideal para jogos que experienciam instabilidade no tempo de frames ou micro-engasgadelas durante o jogo.</string>
<string name="renderer_early_release_fences">Libertar barreiras antecipadamente</string>
<string name="renderer_early_release_fences_description">Ajuda a corrigir 0 FPS em jogos como DKCR:HD, Subnautica Below Zero e Ori 2, mas pode afetar carregamento ou desempenho em jogos Unreal Engine.</string>
<string name="buffer_reorder_disable">Desativar reordenação de buffer</string>
<string name="buffer_reorder_disable_description">Quando assinalado, desativa a reordenação de carregamentos de memória mapeada, permitindo associar carregamentos a desenhos específicos. Pode reduzir o desempenho nalguns casos.</string>
<string name="use_lru_cache">Ativar cache LRU</string>
<string name="use_lru_cache_description">Ative ou desative a cache LRU para melhorar desempenho poupando uso do processador. Alguns jogos como TotK 1.2.1 têm problemas - desative se o jogo não iniciar ou falhar aleatoriamente.</string>
<string name="dyna_state">Estado Dinâmico Estendido</string>

View file

@ -78,6 +78,8 @@
<string name="frame_interpolation_description">Обеспечивает плавную и стабильную подачу кадров за счет синхронизации их времени, уменьшая подтормаживания и неравномерную анимацию. Идеально для игр с нестабильным временем кадров или микро-подтормаживаниями во время игры.</string>
<string name="renderer_early_release_fences">Ранний релиз ограждений</string>
<string name="renderer_early_release_fences_description">Помогает исправить 0 FPS в играх типа DKCR:HD, Subnautica Below Zero и Ori 2, но может нарушить загрузку или производительность в играх на Unreal Engine.</string>
<string name="buffer_reorder_disable">Отключить переупорядочивание буфера</string>
<string name="buffer_reorder_disable_description">При включении отключает переупорядочивание загрузки отображенной памяти, позволяя связывать загрузки с конкретными отрисовками. В некоторых случаях может снизить производительность.</string>
<string name="use_lru_cache">Включить LRU-кеш</string>
<string name="use_lru_cache_description">Включите или отключите кэш LRU (наименее недавно использованный), повышая производительность за счёт снижения нагрузки на ЦП. Некоторые игры могут работать с ним некорректно (например, TotK 1.2.1), поэтому отключите, если игра не запускается или случайно вылетает.</string>
<string name="dyna_state">Расширенное динамическое состояние</string>

View file

@ -87,6 +87,8 @@
<string name="frame_interpolation_description">Осигурава глатку и доследан испоруку оквира синхронизацијом времена између оквира, смањење муцања и неуједначене анимације. Идеално за игре које доживљавају временски оквир нестабилност или микро-штитнике током играња.</string>
<string name="renderer_early_release_fences">Ranije oslobađanje ograda</string>
<string name="renderer_early_release_fences_description">Pomaže u popravci 0 FPS u igrama kao što su DKCR:HD, Subnautica Below Zero i Ori 2, ali može oštetiti učitavanje ili performanse u Unreal Engine igrama.</string>
<string name="buffer_reorder_disable">Онемогући преуређивање бафера</string>
<string name="buffer_reorder_disable_description">Када је означено, онемогућава преуређивање учитавања мапиране меморије што омогућава повезивање учитавања са одређеним цртањима. Може у неким случајевима смањити перформансе.</string>
<string name="use_auto_stub">Користите ауто-стуб</string>
<string name="use_auto_stub_description">Аутоматски угушите мрежне услуге и функције. То може побољшати компатибилност, али може проузроковати пад рушења и питања стабилности.</string>
<string name="uninstall_firmware">Деинсталирајте фирмвер</string>

View file

@ -78,7 +78,9 @@
<string name="frame_interpolation_description">Забезпечує плавну та стабільну подачу кадрів шляхом синхронізації їх часу, зменшуючи підвисання та нерівномірну анімацію. Ідеально для ігор з нестабільним часом кадрів або мікро-підвисаннями під час гри.</string>
<string name="renderer_early_release_fences">Release fences early</string>
<string name="renderer_early_release_fences_description">Це налаштування може бути необхідним для виправлення помилок 0FPS у деяких іграх (зокрема DKCR:HD, Subnautica та Ori 2). Водночас інші ігри, особливо створені на рушії Unreal Engine, можуть працювати некоректно або взагалі не запускатися.</string>
<string name="use_lru_cache">Увімкнути LRU-кеш</string>
<string name="buffer_reorder_disable">Вимкнути переупорядкування буфера</string>
<string name="buffer_reorder_disable_description">Якщо позначено, вимикає переупорядкування завантажень відображеної пам\'яті, що дозволяє пов\'язувати завантаження з конкретними малюваннями. Може знизити продуктивність у деяких випадках.</string>
<string name="use_lru_cache">Увімкнути LRU-кеш</string>
<string name="use_lru_cache_description">Увімкніть або вимкніть кеш LRU (Least Recently Used) для покращення продуктивності шляхом зменшення навантаження на CPU. Деякі ігри (зокрема TotK 1.2.1) можуть працювати некоректно - вимкніть, якщо гра не запускається або раптово вилітає.</string>
<string name="dyna_state">Розширений динамічний стан</string>
<string name="dyna_state_description">Активує функції Vulkan для покращення продуктивності, поліпшеня рендерингу та економії ресурсів під час створення конвеєрів (pipeline), зберігаючи низьке використання CPU/GPU. Ці розширення можуть підвищити температуру пристрою, а старі GPU серії A6XX можуть реагувати некоректно. Вимкніть для емуляції масштабованих форматів.</string>

View file

@ -78,6 +78,8 @@
<string name="frame_interpolation_description">Đảm bảo cung cấp khung hình mượt mà và ổn định bằng cách đồng bộ hóa thời gian giữa các khung hình, giảm giật lag và hoạt ảnh không đồng đều. Lý tưởng cho các trò chơi gặp vấn đề về thời gian khung hình không ổn định hoặc giật lag nhẹ trong khi chơi.</string>
<string name="renderer_early_release_fences">Giải phóng rào chắn sớm</string>
<string name="renderer_early_release_fences_description">Giúp sửa lỗi 0 FPS trong các trò chơi như DKCR:HD, Subnautica Below Zero và Ori 2, nhưng có thể ảnh hưởng đến tải hoặc hiệu suất trong trò chơi Unreal Engine.</string>
<string name="buffer_reorder_disable">Tắt sắp xếp lại bộ đệm</string>
<string name="buffer_reorder_disable_description">Khi được chọn, sẽ tắt tính năng sắp xếp lại các lần tải lên bộ nhớ đã ánh xạ, cho phép liên kết các lần tải lên với các bản vẽ cụ thể. Có thể làm giảm hiệu suất trong một số trường hợp.</string>
<string name="use_lru_cache">Bật bộ nhớ đệm LRU</string>
<string name="use_lru_cache_description">Bật hoặc tắt bộ nhớ đệm LRU để cải thiện hiệu suất bằng cách tiết kiệm quy trình sử dụng CPU. Một số trò chơi như TotK 1.2.1 có vấn đề - hãy tắt nếu trò chơi không khởi động hoặc bị treo ngẫu nhiên.</string>
<string name="dyna_state">Trạng thái động mở rộng</string>

View file

@ -77,6 +77,8 @@
<string name="frame_interpolation_description">通过同步帧间时间确保流畅一致的帧交付,减少卡顿和不均匀动画。适合存在帧时间不稳定或游戏过程中出现微卡顿的游戏。</string>
<string name="renderer_early_release_fences">提前释放围栏</string>
<string name="renderer_early_release_fences_description">可修复《大金刚国度:热带寒流》《深海迷航:零度之下》和《奥日2》等游戏中的0 FPS问题但可能影响Unreal Engine游戏的加载或性能。</string>
<string name="buffer_reorder_disable">禁用缓冲重排序</string>
<string name="buffer_reorder_disable_description">勾选时,禁用映射内存上传的重排序功能,允许将上传与特定绘制关联。在某些情况下可能会降低性能。</string>
<string name="use_lru_cache">启用LRU缓存</string>
<string name="use_lru_cache_description">启用或禁用LRU缓存通过节省CPU进程使用来提高性能。某些游戏可能存在问题特别是TotK 1.2.1,如果游戏无法启动或随机崩溃,请禁用此选项。</string>
<string name="dyna_state">扩展动态状态</string>

View file

@ -78,6 +78,8 @@
<string name="frame_interpolation_description">通過同步幀間時間確保流暢一致的幀交付,減少卡頓和不均勻動畫。適合存在幀時間不穩定或遊戲過程中出現微卡頓的遊戲。</string>
<string name="renderer_early_release_fences">提前釋放圍欄</string>
<string name="renderer_early_release_fences_description">可修復《大金剛國度:熱帶寒流》《深海迷航:零度之下》和《奧日2》等遊戲中的0 FPS問題但可能影響Unreal Engine遊戲的載入或效能。</string>
<string name="buffer_reorder_disable">停用緩衝區重新排序</string>
<string name="buffer_reorder_disable_description">勾選時,停用映射記憶體上傳的重新排序功能,允許將上傳與特定繪製關聯。某些情況下可能會降低效能。</string>
<string name="use_lru_cache">啟用LRU快取</string>
<string name="use_lru_cache_description">啟用或停用LRU快取透過節省CPU進程使用來提高效能。某些遊戲可能存在問題特別是TotK 1.2.1,如果遊戲無法啟動或隨機崩潰,請停用此選項。</string>
<string name="dyna_state">擴展動態狀態</string>

View file

@ -1,9 +1,16 @@
// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
// SPDX-License-Identifier: GPL-3.0-or-later
// SPDX-FileCopyrightText: 2014 Tony Wasserka
// SPDX-FileCopyrightText: 2014 Dolphin Emulator Project
// SPDX-License-Identifier: BSD-3-Clause AND GPL-2.0-or-later
#pragma once
#ifdef __ARM_NEON
#include <arm_neon.h>
#endif
#include <cmath>
#include <type_traits>
@ -641,6 +648,23 @@ template <typename T>
return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
}
template <>
[[nodiscard]] inline float Dot(const Vec4<float>& a, const Vec4<float>& b) {
#ifdef __ARM_NEON
float32x4_t va = vld1q_f32(&a.x);
float32x4_t vb = vld1q_f32(&b.x);
float32x4_t result = vmulq_f32(va, vb);
#if defined(__aarch64__) // Use vaddvq_f32 in ARMv8 architectures
return vaddvq_f32(result);
#else // Use manual addition for older architectures
float32x2_t sum2 = vadd_f32(vget_high_f32(result), vget_low_f32(result));
return vget_lane_f32(vpadd_f32(sum2, sum2), 0);
#endif
#else
return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
#endif
}
template <typename T>
[[nodiscard]] constexpr Vec3<decltype(T{} * T{} - T{} * T{})> Cross(const Vec3<T>& a,
const Vec3<T>& b) {
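Note that the NEON path relies on x, y, z, w being contiguous members, which vld1q_f32(&a.x) assumes. A quick equivalence check of the specialization against the generic scalar path (test scaffolding assumed, inside the surrounding Common namespace; products and sums of small integers are exact in float, so exact comparison is safe here):

    #include <cassert>

    // Both paths must agree: 1*5 + 2*6 + 3*7 + 4*8 = 70.
    void DotSmokeTest() {
        const Vec4<float> a{1.0f, 2.0f, 3.0f, 4.0f};
        const Vec4<float> b{5.0f, 6.0f, 7.0f, 8.0f};
        assert(Dot(a, b) == 70.0f);
    }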

View file

@ -47,6 +47,7 @@ constexpr std::bitset<32> BuildRegSet(std::initializer_list<Xbyak::Reg> regs) {
constexpr inline std::bitset<32> ABI_ALL_GPRS(0x0000FFFF);
constexpr inline std::bitset<32> ABI_ALL_XMMS(0xFFFF0000);
constexpr inline Xbyak::Reg ABI_JIT_REG = Xbyak::util::rbx;
#ifdef _WIN32
// Microsoft x64 ABI

View file

@ -136,6 +136,7 @@ public:
case Dynarmic::A64::Exception::SendEvent:
case Dynarmic::A64::Exception::SendEventLocal:
case Dynarmic::A64::Exception::Yield:
LOG_TRACE(Core_ARM, "ExceptionRaised(exception = {}, pc = {:08X}, code = {:08X})", static_cast<std::size_t>(exception), pc, m_memory.Read32(pc));
return;
case Dynarmic::A64::Exception::NoExecuteFault:
LOG_CRITICAL(Core_ARM, "Cannot execute instruction at unmapped address {:#016x}", pc);
@ -144,12 +145,10 @@ public:
default:
if (m_debugger_enabled) {
ReturnException(pc, InstructionBreakpoint);
return;
} else {
m_parent.LogBacktrace(m_process);
LOG_CRITICAL(Core_ARM, "ExceptionRaised(exception = {}, pc = {:08X}, code = {:08X})", static_cast<std::size_t>(exception), pc, m_memory.Read32(pc));
}
m_parent.LogBacktrace(m_process);
LOG_CRITICAL(Core_ARM, "ExceptionRaised(exception = {}, pc = {:08X}, code = {:08X})",
static_cast<std::size_t>(exception), pc, m_memory.Read32(pc));
}
}

View file

@ -289,10 +289,12 @@ struct System::Impl {
exit_locked = false;
exit_requested = false;
#if MICROPROFILE_ENABLED
microprofile_cpu[0] = MICROPROFILE_TOKEN(ARM_CPU0);
microprofile_cpu[1] = MICROPROFILE_TOKEN(ARM_CPU1);
microprofile_cpu[2] = MICROPROFILE_TOKEN(ARM_CPU2);
microprofile_cpu[3] = MICROPROFILE_TOKEN(ARM_CPU3);
#endif
if (Settings::values.enable_renderdoc_hotkey) {
renderdoc_api = std::make_unique<Tools::RenderdocAPI>();
@ -573,7 +575,9 @@ struct System::Impl {
std::stop_source stop_event;
std::array<u64, Core::Hardware::NUM_CPU_CORES> dynarmic_ticks{};
#if MICROPROFILE_ENABLED
std::array<MicroProfileToken, Core::Hardware::NUM_CPU_CORES> microprofile_cpu{};
#endif
std::array<Core::GPUDirtyMemoryManager, Core::Hardware::NUM_CPU_CORES>
gpu_dirty_memory_managers;
@ -952,6 +956,7 @@ void System::RegisterHostThread() {
impl->kernel.RegisterHostThread();
}
#if MICROPROFILE_ENABLED
void System::EnterCPUProfile() {
std::size_t core = impl->kernel.GetCurrentHostThreadID();
impl->dynarmic_ticks[core] = MicroProfileEnter(impl->microprofile_cpu[core]);
@ -961,6 +966,7 @@ void System::ExitCPUProfile() {
std::size_t core = impl->kernel.GetCurrentHostThreadID();
MicroProfileLeave(impl->microprofile_cpu[core], impl->dynarmic_ticks[core]);
}
#endif
bool System::IsMulticore() const {
return impl->is_multicore;

View file

@ -396,11 +396,13 @@ public:
/// Register a host thread as an auxiliary thread.
void RegisterHostThread();
#if MICROPROFILE_ENABLED
/// Enter CPU Microprofile
void EnterCPUProfile();
/// Exit CPU Microprofile
void ExitCPUProfile();
#endif
/// Tells if system is running on multicore.
[[nodiscard]] bool IsMulticore() const;
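The #if MICROPROFILE_ENABLED guards added throughout this commit compile the profiler out entirely when the new cmake option is off. One hedged alternative that would keep call sites free of preprocessor blocks is a RAII guard (sketch only; the type name is hypothetical, while EnterCPUProfile/ExitCPUProfile are the members declared above):

    #if MICROPROFILE_ENABLED
    struct CpuProfileScope {
        explicit CpuProfileScope(Core::System& system_) : system{system_} { system.EnterCPUProfile(); }
        ~CpuProfileScope() { system.ExitCPUProfile(); }
        Core::System& system;
    };
    #else
    struct CpuProfileScope {
        explicit CpuProfileScope(Core::System&) {}  // no-op when profiling is compiled out
    };
    #endif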

View file

@ -61,7 +61,9 @@ void CoreTiming::ThreadEntry(CoreTiming& instance) {
Common::SetCurrentThreadPriority(Common::ThreadPriority::High);
instance.on_thread_init();
instance.ThreadLoop();
#if MICROPROFILE_ENABLED
MicroProfileOnThreadExit();
#endif
}
void CoreTiming::Initialize(std::function<void()>&& on_thread_init_) {

View file

@ -201,7 +201,9 @@ void CpuManager::RunThread(std::stop_token token, std::size_t core) {
// Cleanup
SCOPE_EXIT {
data.host_context->Exit();
#if MICROPROFILE_ENABLED
MicroProfileOnThreadExit();
#endif
};
// Running

View file

@ -554,32 +554,31 @@ void GDBStub::HandleVCont(std::string_view command, std::vector<DebuggerAction>&
}
}
constexpr std::array<std::pair<const char*, Kernel::Svc::MemoryState>, 22> MemoryStateNames{{
{"----- Free ------", Kernel::Svc::MemoryState::Free},
{"Io ", Kernel::Svc::MemoryState::Io},
{"Static ", Kernel::Svc::MemoryState::Static},
{"Code ", Kernel::Svc::MemoryState::Code},
{"CodeData ", Kernel::Svc::MemoryState::CodeData},
{"Normal ", Kernel::Svc::MemoryState::Normal},
{"Shared ", Kernel::Svc::MemoryState::Shared},
{"AliasCode ", Kernel::Svc::MemoryState::AliasCode},
{"AliasCodeData ", Kernel::Svc::MemoryState::AliasCodeData},
{"Ipc ", Kernel::Svc::MemoryState::Ipc},
{"Stack ", Kernel::Svc::MemoryState::Stack},
{"ThreadLocal ", Kernel::Svc::MemoryState::ThreadLocal},
{"Transferred ", Kernel::Svc::MemoryState::Transferred},
{"SharedTransferred", Kernel::Svc::MemoryState::SharedTransferred},
{"SharedCode ", Kernel::Svc::MemoryState::SharedCode},
{"Inaccessible ", Kernel::Svc::MemoryState::Inaccessible},
{"NonSecureIpc ", Kernel::Svc::MemoryState::NonSecureIpc},
{"NonDeviceIpc ", Kernel::Svc::MemoryState::NonDeviceIpc},
{"Kernel ", Kernel::Svc::MemoryState::Kernel},
{"GeneratedCode ", Kernel::Svc::MemoryState::GeneratedCode},
{"CodeOut ", Kernel::Svc::MemoryState::CodeOut},
{"Coverage ", Kernel::Svc::MemoryState::Coverage},
}};
static constexpr const char* GetMemoryStateName(Kernel::Svc::MemoryState state) {
constexpr std::array<std::pair<const char*, Kernel::Svc::MemoryState>, 22> MemoryStateNames{{
{"----- Free ------", Kernel::Svc::MemoryState::Free},
{"Io ", Kernel::Svc::MemoryState::Io},
{"Static ", Kernel::Svc::MemoryState::Static},
{"Code ", Kernel::Svc::MemoryState::Code},
{"CodeData ", Kernel::Svc::MemoryState::CodeData},
{"Normal ", Kernel::Svc::MemoryState::Normal},
{"Shared ", Kernel::Svc::MemoryState::Shared},
{"AliasCode ", Kernel::Svc::MemoryState::AliasCode},
{"AliasCodeData ", Kernel::Svc::MemoryState::AliasCodeData},
{"Ipc ", Kernel::Svc::MemoryState::Ipc},
{"Stack ", Kernel::Svc::MemoryState::Stack},
{"ThreadLocal ", Kernel::Svc::MemoryState::ThreadLocal},
{"Transferred ", Kernel::Svc::MemoryState::Transferred},
{"SharedTransferred", Kernel::Svc::MemoryState::SharedTransferred},
{"SharedCode ", Kernel::Svc::MemoryState::SharedCode},
{"Inaccessible ", Kernel::Svc::MemoryState::Inaccessible},
{"NonSecureIpc ", Kernel::Svc::MemoryState::NonSecureIpc},
{"NonDeviceIpc ", Kernel::Svc::MemoryState::NonDeviceIpc},
{"Kernel ", Kernel::Svc::MemoryState::Kernel},
{"GeneratedCode ", Kernel::Svc::MemoryState::GeneratedCode},
{"CodeOut ", Kernel::Svc::MemoryState::CodeOut},
{"Coverage ", Kernel::Svc::MemoryState::Coverage},
}};
for (size_t i = 0; i < MemoryStateNames.size(); i++) {
if (std::get<1>(MemoryStateNames[i]) == state) {
return std::get<0>(MemoryStateNames[i]);
@ -611,13 +610,7 @@ void GDBStub::HandleRcmd(const std::vector<u8>& command) {
auto* process = GetProcess();
auto& page_table = process->GetPageTable();
const char* commands = "Commands:\n"
" get fastmem\n"
" get info\n"
" get mappings\n";
if (command_str == "get fastmem") {
if (command_str == "fastmem" || command_str == "get fastmem") {
if (Settings::IsFastmemEnabled()) {
const auto& impl = page_table.GetImpl();
const auto region = reinterpret_cast<uintptr_t>(impl.fastmem_arena);
@ -630,7 +623,7 @@ void GDBStub::HandleRcmd(const std::vector<u8>& command) {
} else {
reply = "Fastmem is not enabled.\n";
}
} else if (command_str == "get info") {
} else if (command_str == "info" || command_str == "get info") {
auto modules = Core::FindModules(process);
reply = fmt::format("Process: {:#x} ({})\n"
@ -648,8 +641,7 @@ void GDBStub::HandleRcmd(const std::vector<u8>& command) {
GetInteger(page_table.GetHeapRegionStart()),
GetInteger(page_table.GetHeapRegionStart()) + page_table.GetHeapRegionSize() - 1,
GetInteger(page_table.GetAliasCodeRegionStart()),
GetInteger(page_table.GetAliasCodeRegionStart()) + page_table.GetAliasCodeRegionSize() -
1,
GetInteger(page_table.GetAliasCodeRegionStart()) + page_table.GetAliasCodeRegionSize() - 1,
GetInteger(page_table.GetStackRegionStart()),
GetInteger(page_table.GetStackRegionStart()) + page_table.GetStackRegionSize() - 1);
@ -657,7 +649,7 @@ void GDBStub::HandleRcmd(const std::vector<u8>& command) {
reply += fmt::format(" {:#012x} - {:#012x} {}\n", vaddr,
GetInteger(Core::GetModuleEnd(process, vaddr)), name);
}
} else if (command_str == "get mappings") {
} else if (command_str == "mappings" || command_str == "get mappings") {
reply = "Mappings:\n";
VAddr cur_addr = 0;
@ -675,15 +667,11 @@ void GDBStub::HandleRcmd(const std::vector<u8>& command) {
std::numeric_limits<u64>::max()) {
const char* state = GetMemoryStateName(svc_mem_info.state);
const char* perm = GetMemoryPermissionString(svc_mem_info);
const char l = True(svc_mem_info.attribute & MemoryAttribute::Locked) ? 'L' : '-';
const char i =
True(svc_mem_info.attribute & MemoryAttribute::IpcLocked) ? 'I' : '-';
const char d =
True(svc_mem_info.attribute & MemoryAttribute::DeviceShared) ? 'D' : '-';
const char i = True(svc_mem_info.attribute & MemoryAttribute::IpcLocked) ? 'I' : '-';
const char d = True(svc_mem_info.attribute & MemoryAttribute::DeviceShared) ? 'D' : '-';
const char u = True(svc_mem_info.attribute & MemoryAttribute::Uncached) ? 'U' : '-';
const char p =
True(svc_mem_info.attribute & MemoryAttribute::PermissionLocked) ? 'P' : '-';
const char p = True(svc_mem_info.attribute & MemoryAttribute::PermissionLocked) ? 'P' : '-';
reply += fmt::format(
" {:#012x} - {:#012x} {} {} {}{}{}{}{} [{}, {}]\n", svc_mem_info.base_address,
@ -698,11 +686,8 @@ void GDBStub::HandleRcmd(const std::vector<u8>& command) {
cur_addr = next_address;
}
} else if (command_str == "help") {
reply = commands;
} else {
reply = "Unknown command.\n";
reply += commands;
reply += "Commands: fastmem, info, mappings\n";
}
std::span<const u8> reply_span{reinterpret_cast<u8*>(&reply.front()), reply.size()};

View file

@ -1278,6 +1278,7 @@ void KernelCore::ExceptionalExitApplication() {
SuspendEmulation(true);
}
#if MICROPROFILE_ENABLED
void KernelCore::EnterSVCProfile() {
impl->svc_ticks[CurrentPhysicalCoreIndex()] = MicroProfileEnter(MICROPROFILE_TOKEN(Kernel_SVC));
}
@ -1285,6 +1286,7 @@ void KernelCore::EnterSVCProfile() {
void KernelCore::ExitSVCProfile() {
MicroProfileLeave(MICROPROFILE_TOKEN(Kernel_SVC), impl->svc_ticks[CurrentPhysicalCoreIndex()]);
}
#endif
Init::KSlabResourceCounts& KernelCore::SlabResourceCounts() {
return impl->slab_resource_counts;

View file

@ -271,9 +271,11 @@ public:
bool IsShuttingDown() const;
#if MICROPROFILE_ENABLED
void EnterSVCProfile();
void ExitSVCProfile();
#endif
/// Workaround for single-core mode when preempting threads while idle.
bool IsPhantomModeForSingleCore() const;

View file

@ -27,7 +27,9 @@ void PhysicalCore::RunThread(Kernel::KThread* thread) {
interface->Initialize();
const auto EnterContext = [&]() {
#if MICROPROFILE_ENABLED
system.EnterCPUProfile();
#endif
// Lock the core context.
std::scoped_lock lk{m_guard};
@ -59,7 +61,9 @@ void PhysicalCore::RunThread(Kernel::KThread* thread) {
m_arm_interface = nullptr;
m_current_thread = nullptr;
#if MICROPROFILE_ENABLED
system.ExitCPUProfile();
#endif
};
while (true) {

View file

@ -4428,7 +4428,9 @@ void Call(Core::System& system, u32 imm) {
std::array<uint64_t, 8> args;
kernel.CurrentPhysicalCore().SaveSvcArguments(process, args);
#if MICROPROFILE_ENABLED
kernel.EnterSVCProfile();
#endif
if (process.Is64Bit()) {
Call64(system, imm, args);
@ -4436,7 +4438,9 @@ void Call(Core::System& system, u32 imm) {
Call32(system, imm, args);
}
#if MICROPROFILE_ENABLED
kernel.ExitSVCProfile();
#endif
kernel.CurrentPhysicalCore().LoadSvcArguments(process, args);
}

View file

@ -77,7 +77,7 @@ public:
void SignalFence(std::function<void()>&& func) {
bool delay_fence = Settings::IsGPULevelHigh();
#ifdef __ANDROID__
if (!delay_fence && !Settings::values.early_release_fences.GetValue()) {
if (!delay_fence && Settings::values.early_release_fences.GetValue()) {
TryReleasePendingFences<false>();
}
#else
@ -89,7 +89,7 @@ public:
CommitAsyncFlushes();
TFence new_fence = CreateFence(!should_flush);
#ifdef __ANDROID__
if (delay_fence && !Settings::values.early_release_fences.GetValue()) {
if (delay_fence && Settings::values.early_release_fences.GetValue()) {
guard.lock();
}
#else
@ -110,7 +110,7 @@ public:
rasterizer.FlushCommands();
}
#ifdef __ANDROID__
if (delay_fence && !Settings::values.early_release_fences.GetValue()) {
if (delay_fence && Settings::values.early_release_fences.GetValue()) {
guard.unlock();
cv.notify_all();
}
@ -219,9 +219,11 @@ private:
MicroProfileOnThreadCreate(name.c_str());
// Cleanup
#if MICROPROFILE_ENABLED
SCOPE_EXIT {
MicroProfileOnThreadExit();
};
#endif
Common::SetCurrentThreadName(name.c_str());
Common::SetCurrentThreadPriority(Common::ThreadPriority::High);

View file

@ -23,9 +23,11 @@ static void RunThread(std::stop_token stop_token, Core::System& system,
Tegra::Control::Scheduler& scheduler, SynchState& state) {
std::string name = "GPU";
MicroProfileOnThreadCreate(name.c_str());
#if MICROPROFILE_ENABLED
SCOPE_EXIT {
MicroProfileOnThreadExit();
};
#endif
Common::SetCurrentThreadName(name.c_str());
Common::SetCurrentThreadPriority(Common::ThreadPriority::Critical);

View file

@ -1472,7 +1472,7 @@ void TextureCacheRuntime::CopyImageMSAA(Image& dst, Image& src,
if (msaa_copy_pass) {
return msaa_copy_pass->CopyImage(dst, src, copies, msaa_to_non_msaa);
}
UNIMPLEMENTED_MSG("Copying images with different samples is not supported.");
LOG_WARNING(Render_Vulkan, "Copying images with different samples is not supported.");
}
u64 TextureCacheRuntime::GetDeviceLocalMemory() const {
@ -1548,54 +1548,94 @@ void Image::UploadMemory(VkBuffer buffer, VkDeviceSize offset,
// Handle MSAA upload if necessary
/* WARNING, TODO: This code relies on several hacks and is fundamentally ugly;
tropic didn't want to touch it for a long time, so it needs a rewrite from someone better than me at Vulkan. */
if (info.num_samples > 1 && runtime->CanUploadMSAA()) {
// Only use MSAA copy pass for color formats
// TODO: Depth/stencil formats need special handling
if (aspect_mask == VK_IMAGE_ASPECT_COLOR_BIT) {
// Create a temporary non-MSAA image to upload the data first
ImageInfo temp_info = info;
temp_info.num_samples = 1;
if (info.num_samples > 1) {
// Create a temporary non-MSAA image to upload the data first
ImageInfo temp_info = info;
temp_info.num_samples = 1;
// Create image with same usage flags as the target image to avoid validation errors
VkImageCreateInfo temp_ci = MakeImageCreateInfo(runtime->device, temp_info);
temp_ci.usage |= VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT;
vk::Image temp_image = runtime->memory_allocator.CreateImage(temp_ci);
// Create image with same usage flags as the target image to avoid validation errors
VkImageCreateInfo image_ci = MakeImageCreateInfo(runtime->device, temp_info);
image_ci.usage = original_image.UsageFlags();
vk::Image temp_image = runtime->memory_allocator.CreateImage(image_ci);
auto vk_buffer_image_copies = TransformBufferImageCopies(copies, offset, aspect_mask);
// Upload to the temporary non-MSAA image
scheduler->RequestOutsideRenderPassOperationContext();
auto vk_copies = TransformBufferImageCopies(copies, offset, aspect_mask);
const VkBuffer src_buffer = buffer;
const VkImage temp_vk_image = *temp_image;
const VkImageAspectFlags vk_aspect_mask = aspect_mask;
scheduler->Record([src_buffer, temp_vk_image, vk_aspect_mask, vk_copies](vk::CommandBuffer cmdbuf) {
CopyBufferToImage(cmdbuf, src_buffer, temp_vk_image, vk_aspect_mask, false, vk_copies);
boost::container::small_vector<VkImageBlit, 16> blit_regions;
blit_regions.reserve(copies.size());
for (const auto& copy : copies) {
blit_regions.emplace_back(VkImageBlit{
.srcSubresource = MakeImageSubresourceLayers(copy.image_subresource, aspect_mask),
.srcOffsets = {{copy.image_offset.x, copy.image_offset.y, copy.image_offset.z},
{static_cast<s32>(copy.image_offset.x + copy.image_extent.width),
static_cast<s32>(copy.image_offset.y + copy.image_extent.height),
static_cast<s32>(copy.image_offset.z + copy.image_extent.depth)}},
.dstSubresource = MakeImageSubresourceLayers(copy.image_subresource, aspect_mask),
.dstOffsets = {{copy.image_offset.x, copy.image_offset.y, copy.image_offset.z},
{static_cast<s32>(copy.image_offset.x + copy.image_extent.width),
static_cast<s32>(copy.image_offset.y + copy.image_extent.height),
static_cast<s32>(copy.image_offset.z + copy.image_extent.depth)}},
});
// Use MSAACopyPass to convert from non-MSAA to MSAA
std::vector<VideoCommon::ImageCopy> image_copies;
for (const auto& copy : copies) {
VideoCommon::ImageCopy image_copy;
image_copy.src_offset = {0, 0, 0}; // Use zero offset for source
image_copy.dst_offset = copy.image_offset;
image_copy.src_subresource = copy.image_subresource;
image_copy.dst_subresource = copy.image_subresource;
image_copy.extent = copy.image_extent;
image_copies.push_back(image_copy);
}
// Wrap the temporary allocation in an Image so MSAACopyPass can consume it
Image temp_wrapper(*runtime, temp_info, 0, 0);
temp_wrapper.original_image = std::move(temp_image);
temp_wrapper.current_image = &Image::original_image;
temp_wrapper.aspect_mask = aspect_mask;
temp_wrapper.initialized = true;
// Use MSAACopyPass to convert from non-MSAA to MSAA
runtime->msaa_copy_pass->CopyImage(*this, temp_wrapper, image_copies, false);
std::exchange(initialized, true);
return;
}
// For depth/stencil formats, fall back to regular upload
const VkImage dst_vk_image = Handle();
const bool is_initialized = std::exchange(initialized, true);
scheduler->RequestOutsideRenderPassOperationContext();
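// The staging image is moved into the command lambda so it outlives this scope
// and stays valid until the recorded work has executed.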
scheduler->Record([=, temp_image = std::move(temp_image)](vk::CommandBuffer cmdbuf) {
// Upload to the temporary non-MSAA image
CopyBufferToImage(cmdbuf, buffer, *temp_image, aspect_mask, false,
vk_buffer_image_copies);
// Transition layouts for blit
const VkAccessFlags src_access_mask =
is_initialized
? (VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_COLOR_ATTACHMENT_READ_BIT)
: VK_ACCESS_NONE;
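// Barrier 1: make the staging image's transfer writes available and move it to
// TRANSFER_SRC_OPTIMAL for the blit. Barrier 2: move the MSAA destination to
// TRANSFER_DST_OPTIMAL, discarding old contents (UNDEFINED oldLayout) when the
// image was never initialized.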
const std::array<VkImageMemoryBarrier, 2> pre_blit_barriers{
VkImageMemoryBarrier{
.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT,
.oldLayout = VK_IMAGE_LAYOUT_GENERAL,
.newLayout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
.image = *temp_image,
.subresourceRange = {aspect_mask, 0, VK_REMAINING_MIP_LEVELS, 0,
VK_REMAINING_ARRAY_LAYERS},
},
VkImageMemoryBarrier{
.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
.srcAccessMask = src_access_mask,
.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
.oldLayout =
is_initialized ? VK_IMAGE_LAYOUT_GENERAL : VK_IMAGE_LAYOUT_UNDEFINED,
.newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
.image = dst_vk_image,
.subresourceRange = {aspect_mask, 0, VK_REMAINING_MIP_LEVELS, 0,
VK_REMAINING_ARRAY_LAYERS},
}};
cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
0, {}, {}, pre_blit_barriers);
// Blit from temporary to MSAA image
cmdbuf.BlitImage(*temp_image, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, dst_vk_image,
VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, blit_regions,
VK_FILTER_NEAREST);
// Transition destination image to general layout
const VkImageMemoryBarrier post_blit_barrier{
.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_SHADER_READ_BIT |
VK_ACCESS_COLOR_ATTACHMENT_READ_BIT |
VK_ACCESS_TRANSFER_READ_BIT,
.oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
.newLayout = VK_IMAGE_LAYOUT_GENERAL,
.image = dst_vk_image,
.subresourceRange = {aspect_mask, 0, VK_REMAINING_MIP_LEVELS, 0,
VK_REMAINING_ARRAY_LAYERS},
};
cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT,
VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, post_blit_barrier);
});
} else {
// Regular non-MSAA upload
scheduler->RequestOutsideRenderPassOperationContext();
@ -1606,7 +1646,8 @@ void Image::UploadMemory(VkBuffer buffer, VkDeviceSize offset,
const bool is_initialized = std::exchange(initialized, true);
scheduler->Record([src_buffer, vk_image, vk_aspect_mask, is_initialized,
vk_copies](vk::CommandBuffer cmdbuf) {
CopyBufferToImage(cmdbuf, src_buffer, vk_image, vk_aspect_mask, is_initialized, vk_copies);
CopyBufferToImage(cmdbuf, src_buffer, vk_image, vk_aspect_mask, is_initialized,
vk_copies);
});
}
@ -1638,102 +1679,100 @@ void Image::DownloadMemory(std::span<VkBuffer> buffers_span, std::span<size_t> o
}
// Reuse the MSAA upload approach, but in the download direction
if (info.num_samples > 1 && runtime->msaa_copy_pass) {
// TODO: Depth/stencil formats need special handling
if (aspect_mask == VK_IMAGE_ASPECT_COLOR_BIT) {
ImageInfo temp_info = info;
temp_info.num_samples = 1;
if (info.num_samples > 1) {
ImageInfo temp_info = info;
temp_info.num_samples = 1;
VkImageCreateInfo image_ci = MakeImageCreateInfo(runtime->device, temp_info);
image_ci.usage = original_image.UsageFlags();
vk::Image temp_image = runtime->memory_allocator.CreateImage(image_ci);
VkImageCreateInfo temp_ci = MakeImageCreateInfo(runtime->device, temp_info);
temp_ci.usage |= VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT;
vk::Image temp_image = runtime->memory_allocator.CreateImage(temp_ci);
Image temp_wrapper(*runtime, temp_info, 0, 0);
temp_wrapper.original_image = std::move(temp_image);
temp_wrapper.current_image = &Image::original_image;
temp_wrapper.aspect_mask = aspect_mask;
temp_wrapper.initialized = true;
boost::container::small_vector<VkImageBlit, 16> blit_regions;
blit_regions.reserve(copies.size());
for (const auto& copy : copies) {
blit_regions.emplace_back(VkImageBlit{
.srcSubresource = MakeImageSubresourceLayers(copy.image_subresource, aspect_mask),
.srcOffsets = {{copy.image_offset.x, copy.image_offset.y, copy.image_offset.z},
{static_cast<s32>(copy.image_offset.x + copy.image_extent.width),
static_cast<s32>(copy.image_offset.y + copy.image_extent.height),
static_cast<s32>(copy.image_offset.z + copy.image_extent.depth)}},
.dstSubresource = MakeImageSubresourceLayers(copy.image_subresource, aspect_mask),
.dstOffsets = {{copy.image_offset.x, copy.image_offset.y, copy.image_offset.z},
{static_cast<s32>(copy.image_offset.x + copy.image_extent.width),
static_cast<s32>(copy.image_offset.y + copy.image_extent.height),
static_cast<s32>(copy.image_offset.z + copy.image_extent.depth)}},
});
}
std::vector<VideoCommon::ImageCopy> image_copies;
for (const auto& copy : copies) {
VideoCommon::ImageCopy image_copy;
image_copy.src_offset = copy.image_offset;
image_copy.dst_offset = copy.image_offset;
image_copy.src_subresource = copy.image_subresource;
image_copy.dst_subresource = copy.image_subresource;
image_copy.extent = copy.image_extent;
image_copies.push_back(image_copy);
}
boost::container::small_vector<VkBuffer, 8> buffers_vector{};
boost::container::small_vector<boost::container::small_vector<VkBufferImageCopy, 16>, 8>
vk_copies;
for (size_t index = 0; index < buffers_span.size(); index++) {
buffers_vector.emplace_back(buffers_span[index]);
vk_copies.emplace_back(
TransformBufferImageCopies(copies, offsets_span[index], aspect_mask));
}
runtime->msaa_copy_pass->CopyImage(temp_wrapper, *this, image_copies, true);
const VkImage src_vk_image = Handle();
boost::container::small_vector<VkBuffer, 8> buffers_vector{};
boost::container::small_vector<boost::container::small_vector<VkBufferImageCopy, 16>, 8>
vk_copies;
for (size_t index = 0; index < buffers_span.size(); index++) {
buffers_vector.emplace_back(buffers_span[index]);
vk_copies.emplace_back(
TransformBufferImageCopies(copies, offsets_span[index], aspect_mask));
}
scheduler->RequestOutsideRenderPassOperationContext();
scheduler->Record([buffers = std::move(buffers_vector), image = *temp_wrapper.original_image,
aspect_mask_ = aspect_mask, vk_copies](vk::CommandBuffer cmdbuf) {
const VkImageMemoryBarrier read_barrier{
scheduler->RequestOutsideRenderPassOperationContext();
scheduler->Record([=, temp_image = std::move(temp_image),
buffers = std::move(buffers_vector)](vk::CommandBuffer cmdbuf) {
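// Mirror of the upload path: move the source image to TRANSFER_SRC_OPTIMAL,
// blit it into the single-sample staging image, then copy the staging image
// out to the readback buffers.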
const std::array<VkImageMemoryBarrier, 2> pre_blit_barriers{
VkImageMemoryBarrier{
.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
.pNext = nullptr,
.srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT,
.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT,
.oldLayout = VK_IMAGE_LAYOUT_GENERAL,
.newLayout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.image = image,
.subresourceRange{
.aspectMask = aspect_mask_,
.baseMipLevel = 0,
.levelCount = VK_REMAINING_MIP_LEVELS,
.baseArrayLayer = 0,
.layerCount = VK_REMAINING_ARRAY_LAYERS,
},
};
cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
0, read_barrier);
for (size_t index = 0; index < buffers.size(); index++) {
cmdbuf.CopyImageToBuffer(image, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, buffers[index],
vk_copies[index]);
}
const VkMemoryBarrier memory_write_barrier{
.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
.pNext = nullptr,
.srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT,
.dstAccessMask = VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT,
};
const VkImageMemoryBarrier image_write_barrier{
.image = src_vk_image,
.subresourceRange = {aspect_mask, 0, VK_REMAINING_MIP_LEVELS, 0,
VK_REMAINING_ARRAY_LAYERS},
},
VkImageMemoryBarrier{
.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
.pNext = nullptr,
.srcAccessMask = 0,
.dstAccessMask = VK_ACCESS_MEMORY_WRITE_BIT,
.oldLayout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
.newLayout = VK_IMAGE_LAYOUT_GENERAL,
.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.image = image,
.subresourceRange{
.aspectMask = aspect_mask_,
.baseMipLevel = 0,
.levelCount = VK_REMAINING_MIP_LEVELS,
.baseArrayLayer = 0,
.layerCount = VK_REMAINING_ARRAY_LAYERS,
},
};
cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
0, memory_write_barrier, nullptr, image_write_barrier);
});
return;
}
.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
.oldLayout = VK_IMAGE_LAYOUT_UNDEFINED,
.newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
.image = *temp_image,
.subresourceRange = {aspect_mask, 0, VK_REMAINING_MIP_LEVELS, 0,
VK_REMAINING_ARRAY_LAYERS},
}};
cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
VK_PIPELINE_STAGE_TRANSFER_BIT, 0, {}, {}, pre_blit_barriers);
cmdbuf.BlitImage(src_vk_image, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, *temp_image,
VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, blit_regions,
VK_FILTER_NEAREST);
const VkImageMemoryBarrier post_blit_barrier{
.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT,
.oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
.newLayout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
.image = *temp_image,
.subresourceRange = {aspect_mask, 0, VK_REMAINING_MIP_LEVELS, 0,
VK_REMAINING_ARRAY_LAYERS},
};
cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
0, post_blit_barrier);
for (size_t index = 0; index < buffers.size(); index++) {
cmdbuf.CopyImageToBuffer(*temp_image, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
buffers[index], vk_copies[index]);
}
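// Make the transfer writes visible to the host before the staging buffers are read back.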
const VkMemoryBarrier memory_write_barrier{
.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
.pNext = nullptr,
.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_HOST_READ_BIT,
};
cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_HOST_BIT, 0,
memory_write_barrier, {}, {});
});
} else {
boost::container::small_vector<VkBuffer, 8> buffers_vector{};
boost::container::small_vector<boost::container::small_vector<VkBufferImageCopy, 16>, 8>
@ -1764,12 +1803,12 @@ void Image::DownloadMemory(std::span<VkBuffer> buffers_span, std::span<size_t> o
.layerCount = VK_REMAINING_ARRAY_LAYERS,
},
};
cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
0, read_barrier);
cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
VK_PIPELINE_STAGE_TRANSFER_BIT, 0, read_barrier);
for (size_t index = 0; index < buffers.size(); index++) {
cmdbuf.CopyImageToBuffer(image, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, buffers[index],
vk_copies[index]);
cmdbuf.CopyImageToBuffer(image, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
buffers[index], vk_copies[index]);
}
const VkMemoryBarrier memory_write_barrier{
@ -1796,8 +1835,9 @@ void Image::DownloadMemory(std::span<VkBuffer> buffers_span, std::span<size_t> o
.layerCount = VK_REMAINING_ARRAY_LAYERS,
},
};
cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
0, memory_write_barrier, nullptr, image_write_barrier);
cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT,
VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, memory_write_barrier,
nullptr, image_write_barrier);
});
}
@ -1993,6 +2033,11 @@ ImageView::ImageView(TextureCacheRuntime& runtime, const VideoCommon::ImageViewI
std::ranges::transform(swizzle, swizzle.begin(), ConvertGreenRed);
}
}
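// Views that can be bound as storage images must use the identity swizzle
// (Vulkan forbids component remapping on storage image descriptors), so any
// emulated swizzle is dropped here.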
if ((image.UsageFlags() & VK_IMAGE_USAGE_STORAGE_BIT) != 0) {
swizzle = {SwizzleSource::R, SwizzleSource::G, SwizzleSource::B, SwizzleSource::A};
}
const auto format_info = MaxwellToVK::SurfaceFormat(*device, FormatType::Optimal, true, format);
const VkImageViewUsageCreateInfo image_view_usage{
.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_USAGE_CREATE_INFO,

View file

@ -1,6 +1,11 @@
// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
// SPDX-License-Identifier: GPL-3.0-or-later
// SPDX-FileCopyrightText: 2015 Citra Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#if MICROPROFILE_ENABLED
#include <QAction>
#include <QLayout>
#include <QMouseEvent>
@ -14,7 +19,7 @@
// Include the implementation of the UI in this file. This isn't in microprofile.cpp because the
// non-Qt frontends don't need it (and don't implement the UI drawing hooks either).
#if MICROPROFILE_ENABLED
#define MICROPROFILEUI_IMPL 1
#include "common/microprofileui.h"
@ -43,8 +48,6 @@ private:
qreal x_scale = 1.0, y_scale = 1.0;
};
#endif
MicroProfileDialog::MicroProfileDialog(QWidget* parent) : QWidget(parent, Qt::Dialog) {
setObjectName(QStringLiteral("MicroProfile"));
setWindowTitle(tr("&MicroProfile"));
@ -52,8 +55,6 @@ MicroProfileDialog::MicroProfileDialog(QWidget* parent) : QWidget(parent, Qt::Di
// Enable the maximize button
setWindowFlags(windowFlags() | Qt::WindowMaximizeButtonHint);
#if MICROPROFILE_ENABLED
MicroProfileWidget* widget = new MicroProfileWidget(this);
QLayout* layout = new QVBoxLayout(this);
@ -66,7 +67,6 @@ MicroProfileDialog::MicroProfileDialog(QWidget* parent) : QWidget(parent, Qt::Di
setFocusProxy(widget);
widget->setFocusPolicy(Qt::StrongFocus);
widget->setFocus();
#endif
}
QAction* MicroProfileDialog::toggleViewAction() {
@ -94,8 +94,6 @@ void MicroProfileDialog::hideEvent(QHideEvent* ev) {
QWidget::hideEvent(ev);
}
#if MICROPROFILE_ENABLED
/// There's no way to pass a user pointer to MicroProfile, so this variable is used to make the
/// QPainter available inside the drawing callbacks.
static QPainter* mp_painter = nullptr;

View file

@ -1,8 +1,13 @@
// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
// SPDX-License-Identifier: GPL-3.0-or-later
// SPDX-FileCopyrightText: 2015 Citra Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#pragma once
#if MICROPROFILE_ENABLED
#include <QWidget>
class QAction;
@ -25,3 +30,4 @@ protected:
private:
QAction* toggle_view_action = nullptr;
};
#endif

View file

@ -1348,6 +1348,11 @@ void GMainWindow::InitializeDebugWidgets() {
microProfileDialog = new MicroProfileDialog(this);
microProfileDialog->hide();
debug_menu->addAction(microProfileDialog->toggleViewAction());
#else
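// MicroProfile compiled out: add a permanently disabled placeholder action,
// presumably to keep the Debug menu layout consistent across builds.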
auto micro_profile_stub = new QAction(tr("MicroProfile (unavailable)"), this);
micro_profile_stub->setEnabled(false);
micro_profile_stub->setChecked(false);
debug_menu->addAction(micro_profile_stub);
#endif
waitTreeWidget = new WaitTreeWidget(*system, this);
@ -5630,10 +5635,13 @@ int main(int argc, char* argv[]) {
#endif
Common::DetachedTasks detached_tasks;
#if MICROPROFILE_ENABLED
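// Register the frontend thread with MicroProfile and guarantee a matching
// MicroProfileShutdown when main() unwinds.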
MicroProfileOnThreadCreate("Frontend");
SCOPE_EXIT {
MicroProfileShutdown();
};
#endif
Common::ConfigureNvidiaEnvironmentFlags();

View file

@ -43,7 +43,9 @@ class GameList;
class GImageInfo;
class GRenderWindow;
class LoadingScreen;
#if MICROPROFILE_ENABLED
class MicroProfileDialog;
#endif
class OverlayDialog;
class ProfilerWidget;
class ControllerDialog;
@ -565,7 +567,9 @@ private:
// Debugger panes
ProfilerWidget* profilerWidget;
#if MICROPROFILE_ENABLED
MicroProfileDialog* microProfileDialog;
#endif
WaitTreeWidget* waitTreeWidget;
ControllerDialog* controller_dialog;

View file

@ -4,9 +4,6 @@
// SPDX-FileCopyrightText: 2014 Citra Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
// SPDX-License-Identifier: GPL-3.0-or-later
#include <chrono>
#include <iostream>
#include <memory>
@ -338,10 +335,12 @@ int main(int argc, char** argv) {
LocalFree(argv_w);
#endif
#if MICROPROFILE_ENABLED
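// Same pattern as the Qt frontend: register the emulation thread and shut
// MicroProfile down on scope exit.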
MicroProfileOnThreadCreate("EmuThread");
SCOPE_EXIT {
MicroProfileShutdown();
};
#endif
Common::ConfigureNvidiaEnvironmentFlags();