From 5d4fff33292e309701331325e62647cafa9b5f51 Mon Sep 17 00:00:00 2001 From: lizzie Date: Thu, 2 Oct 2025 20:51:26 +0000 Subject: [PATCH 1/3] [dynarmic] allow better dtrace diagnostics for code - do not clobber %rbp and save frame pointer Signed-off-by: lizzie --- .gitignore | 2 + .../src/dynarmic/backend/x64/a32_emit_x64.cpp | 27 ++-- .../dynarmic/backend/x64/a32_interface.cpp | 4 + .../src/dynarmic/backend/x64/a64_emit_x64.cpp | 28 ++-- src/dynarmic/src/dynarmic/backend/x64/abi.cpp | 10 +- .../dynarmic/backend/x64/block_of_code.cpp | 4 +- src/dynarmic/tests/print_info.cpp | 23 ++- tools/dtrace-tool.pl | 131 ++++++++++++++++++ tools/dtrace-tool.sh | 42 ------ 9 files changed, 194 insertions(+), 77 deletions(-) create mode 100755 tools/dtrace-tool.pl delete mode 100755 tools/dtrace-tool.sh diff --git a/.gitignore b/.gitignore index 2b342e5145..525eec2326 100644 --- a/.gitignore +++ b/.gitignore @@ -30,6 +30,8 @@ CMakeLists.txt.user* # *nix related # Common convention for backup or temporary files *~ +*.core +dtrace-out/ # Visual Studio CMake settings CMakeSettings.json diff --git a/src/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.cpp b/src/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.cpp index fb306336cf..65cbbb354a 100644 --- a/src/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.cpp +++ b/src/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.cpp @@ -217,13 +217,13 @@ void A32EmitX64::ClearFastDispatchTable() { } void A32EmitX64::GenTerminalHandlers() { - // PC ends up in ebp, location_descriptor ends up in rbx + // PC ends up in edi, location_descriptor ends up in rbx const auto calculate_location_descriptor = [this] { // This calculation has to match up with IREmitter::PushRSB code.mov(ebx, dword[code.ABI_JIT_PTR + offsetof(A32JitState, upper_location_descriptor)]); code.shl(rbx, 32); code.mov(ecx, MJitStateReg(A32::Reg::PC)); - code.mov(ebp, ecx); + code.mov(edi, ecx); code.or_(rbx, rcx); }; @@ -238,7 +238,7 @@ void A32EmitX64::GenTerminalHandlers() { code.mov(dword[code.ABI_JIT_PTR + offsetof(A32JitState, rsb_ptr)], eax); code.cmp(rbx, qword[code.ABI_JIT_PTR + offsetof(A32JitState, rsb_location_descriptors) + rax * sizeof(u64)]); if (conf.HasOptimization(OptimizationFlag::FastDispatch)) { - code.jne(rsb_cache_miss); + code.jne(rsb_cache_miss, code.T_NEAR); } else { code.jne(code.GetReturnFromRunCodeAddress()); } @@ -251,20 +251,21 @@ void A32EmitX64::GenTerminalHandlers() { terminal_handler_fast_dispatch_hint = code.getCurr(); calculate_location_descriptor(); code.L(rsb_cache_miss); - code.mov(r12, reinterpret_cast(fast_dispatch_table.data())); - code.mov(rbp, rbx); + code.mov(r8, reinterpret_cast(fast_dispatch_table.data())); + //code.mov(r12d, MJitStateReg(A32::Reg::PC)); + code.mov(r12, rbx); if (code.HasHostFeature(HostFeature::SSE42)) { - code.crc32(rbp, r12); + code.crc32(r12, r8); } - code.and_(ebp, fast_dispatch_table_mask); - code.lea(rbp, ptr[r12 + rbp]); - code.cmp(rbx, qword[rbp + offsetof(FastDispatchEntry, location_descriptor)]); - code.jne(fast_dispatch_cache_miss); - code.jmp(ptr[rbp + offsetof(FastDispatchEntry, code_ptr)]); + code.and_(r12d, fast_dispatch_table_mask); + code.lea(r12, ptr[r8 + r12]); + code.cmp(rbx, qword[r12 + offsetof(FastDispatchEntry, location_descriptor)]); + code.jne(fast_dispatch_cache_miss, code.T_NEAR); + code.jmp(ptr[r12 + offsetof(FastDispatchEntry, code_ptr)]); code.L(fast_dispatch_cache_miss); - code.mov(qword[rbp + offsetof(FastDispatchEntry, location_descriptor)], rbx); + code.mov(qword[r12 + offsetof(FastDispatchEntry, location_descriptor)], rbx); code.LookupBlock(); - code.mov(ptr[rbp + offsetof(FastDispatchEntry, code_ptr)], rax); + code.mov(ptr[r12 + offsetof(FastDispatchEntry, code_ptr)], rax); code.jmp(rax); PerfMapRegister(terminal_handler_fast_dispatch_hint, code.getCurr(), "a32_terminal_handler_fast_dispatch_hint"); diff --git a/src/dynarmic/src/dynarmic/backend/x64/a32_interface.cpp b/src/dynarmic/src/dynarmic/backend/x64/a32_interface.cpp index 382eb70f3f..3253bc4f72 100644 --- a/src/dynarmic/src/dynarmic/backend/x64/a32_interface.cpp +++ b/src/dynarmic/src/dynarmic/backend/x64/a32_interface.cpp @@ -331,4 +331,8 @@ void Jit::DumpDisassembly() const { impl->DumpDisassembly(); } +std::vector Jit::Disassemble() const { + return impl->Disassemble(); +} + } // namespace Dynarmic::A32 diff --git a/src/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.cpp b/src/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.cpp index 1e673338a8..045386342d 100644 --- a/src/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.cpp +++ b/src/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.cpp @@ -188,13 +188,14 @@ void A64EmitX64::ClearFastDispatchTable() { } void A64EmitX64::GenTerminalHandlers() { - // PC ends up in rbp, location_descriptor ends up in rbx + // PC ends up in rcx, location_descriptor ends up in rbx + static_assert(std::find(ABI_ALL_CALLEE_SAVE.begin(), ABI_ALL_CALLEE_SAVE.end(), HostLoc::R12)); const auto calculate_location_descriptor = [this] { // This calculation has to match up with A64::LocationDescriptor::UniqueHash // TODO: Optimization is available here based on known state of fpcr. - code.mov(rbp, qword[code.ABI_JIT_PTR + offsetof(A64JitState, pc)]); + code.mov(rdi, qword[code.ABI_JIT_PTR + offsetof(A64JitState, pc)]); code.mov(rcx, A64::LocationDescriptor::pc_mask); - code.and_(rcx, rbp); + code.and_(rcx, rdi); code.mov(ebx, dword[code.ABI_JIT_PTR + offsetof(A64JitState, fpcr)]); code.and_(ebx, A64::LocationDescriptor::fpcr_mask); code.shl(rbx, A64::LocationDescriptor::fpcr_shift); @@ -226,20 +227,21 @@ void A64EmitX64::GenTerminalHandlers() { terminal_handler_fast_dispatch_hint = code.getCurr(); calculate_location_descriptor(); code.L(rsb_cache_miss); - code.mov(r12, reinterpret_cast(fast_dispatch_table.data())); - code.mov(rbp, rbx); + code.mov(r8, reinterpret_cast(fast_dispatch_table.data())); + //code.mov(r12, qword[code.ABI_JIT_PTR + offsetof(A64JitState, pc)]); + code.mov(r12, rbx); if (code.HasHostFeature(HostFeature::SSE42)) { - code.crc32(rbp, r12); + code.crc32(r12, r8); } - code.and_(ebp, fast_dispatch_table_mask); - code.lea(rbp, ptr[r12 + rbp]); - code.cmp(rbx, qword[rbp + offsetof(FastDispatchEntry, location_descriptor)]); - code.jne(fast_dispatch_cache_miss); - code.jmp(ptr[rbp + offsetof(FastDispatchEntry, code_ptr)]); + code.and_(r12d, fast_dispatch_table_mask); + code.lea(r12, ptr[r8 + r12]); + code.cmp(rbx, qword[r12 + offsetof(FastDispatchEntry, location_descriptor)]); + code.jne(fast_dispatch_cache_miss, code.T_NEAR); + code.jmp(ptr[r12 + offsetof(FastDispatchEntry, code_ptr)]); code.L(fast_dispatch_cache_miss); - code.mov(qword[rbp + offsetof(FastDispatchEntry, location_descriptor)], rbx); + code.mov(qword[r12 + offsetof(FastDispatchEntry, location_descriptor)], rbx); code.LookupBlock(); - code.mov(ptr[rbp + offsetof(FastDispatchEntry, code_ptr)], rax); + code.mov(ptr[r12 + offsetof(FastDispatchEntry, code_ptr)], rax); code.jmp(rax); PerfMapRegister(terminal_handler_fast_dispatch_hint, code.getCurr(), "a64_terminal_handler_fast_dispatch_hint"); diff --git a/src/dynarmic/src/dynarmic/backend/x64/abi.cpp b/src/dynarmic/src/dynarmic/backend/x64/abi.cpp index a9bbab3d10..1c8f662b2e 100644 --- a/src/dynarmic/src/dynarmic/backend/x64/abi.cpp +++ b/src/dynarmic/src/dynarmic/backend/x64/abi.cpp @@ -29,7 +29,8 @@ static_assert(ABI_SHADOW_SPACE <= 32); static FrameInfo CalculateFrameInfo(const size_t num_gprs, const size_t num_xmms, size_t frame_size) { // We are initially 8 byte aligned because the return value is pushed onto an aligned stack after a call. - const size_t rsp_alignment = (num_gprs % 2 == 0) ? 8 : 0; + // (It's an extra GPR save due to %rbp) + const size_t rsp_alignment = ((num_gprs + 1) % 2 == 0) ? 8 : 0; const size_t total_xmm_size = num_xmms * XMM_SIZE; if (frame_size & 0xF) { frame_size += 0x10 - (frame_size & 0xF); @@ -49,6 +50,10 @@ void ABI_PushRegistersAndAdjustStack(BlockOfCode& code, const size_t frame_size, const size_t num_xmms = std::count_if(regs.begin(), regs.end(), HostLocIsXMM); const FrameInfo frame_info = CalculateFrameInfo(num_gprs, num_xmms, frame_size); + if (true) { + code.push(rbp); + code.mov(rbp, rsp); + } for (auto const gpr : regs) if (HostLocIsGPR(gpr)) code.push(HostLocToReg64(gpr)); @@ -91,6 +96,9 @@ void ABI_PopRegistersAndAdjustStack(BlockOfCode& code, const size_t frame_size, for (auto const gpr : mcl::iterator::reverse(regs)) if (HostLocIsGPR(gpr)) code.pop(HostLocToReg64(gpr)); + if (true) { + code.pop(rbp); + } } void ABI_PushCalleeSaveRegistersAndAdjustStack(BlockOfCode& code, const std::size_t frame_size) { diff --git a/src/dynarmic/src/dynarmic/backend/x64/block_of_code.cpp b/src/dynarmic/src/dynarmic/backend/x64/block_of_code.cpp index d5d5f089ff..7e459df2d7 100644 --- a/src/dynarmic/src/dynarmic/backend/x64/block_of_code.cpp +++ b/src/dynarmic/src/dynarmic/backend/x64/block_of_code.cpp @@ -364,8 +364,7 @@ void BlockOfCode::GenRunCode(std::function rcp) { cmp(dword[ABI_JIT_PTR + jsi.offsetof_halt_reason], 0); jne(return_to_caller_mxcsr_already_exited, T_NEAR); - lock(); - or_(dword[ABI_JIT_PTR + jsi.offsetof_halt_reason], static_cast(HaltReason::Step)); + lock(); or_(dword[ABI_JIT_PTR + jsi.offsetof_halt_reason], u32(HaltReason::Step)); SwitchMxcsrOnEntry(); jmp(ABI_PARAM2); @@ -415,7 +414,6 @@ void BlockOfCode::GenRunCode(std::function rcp) { } xor_(eax, eax); - lock(); xchg(dword[ABI_JIT_PTR + jsi.offsetof_halt_reason], eax); ABI_PopCalleeSaveRegistersAndAdjustStack(*this, sizeof(StackLayout)); diff --git a/src/dynarmic/tests/print_info.cpp b/src/dynarmic/tests/print_info.cpp index 3d1268f467..1851664771 100644 --- a/src/dynarmic/tests/print_info.cpp +++ b/src/dynarmic/tests/print_info.cpp @@ -37,6 +37,9 @@ #include "dynarmic/ir/basic_block.h" #include "dynarmic/ir/opt_passes.h" +#include "./A32/testenv.h" +#include "./A64/testenv.h" + using namespace Dynarmic; std::string_view GetNameOfA32Instruction(u32 instruction) { @@ -65,7 +68,10 @@ void PrintA32Instruction(u32 instruction) { fmt::print("should_continue: {}\n\n", should_continue); fmt::print("IR:\n"); fmt::print("{}\n", IR::DumpBlock(ir_block)); - Optimization::Optimize(ir_block, A32::UserConfig{}, {}); + ArmTestEnv jit_env{}; + Dynarmic::A32::UserConfig jit_user_config{}; + jit_user_config.callbacks = &jit_env; + Optimization::Optimize(ir_block, jit_user_config, {}); fmt::print("Optimized IR:\n"); fmt::print("{}\n", IR::DumpBlock(ir_block)); } @@ -80,7 +86,10 @@ void PrintA64Instruction(u32 instruction) { fmt::print("should_continue: {}\n\n", should_continue); fmt::print("IR:\n"); fmt::print("{}\n", IR::DumpBlock(ir_block)); - Optimization::Optimize(ir_block, A64::UserConfig{}, {}); + A64TestEnv jit_env{}; + Dynarmic::A64::UserConfig jit_user_config{}; + jit_user_config.callbacks = &jit_env; + Optimization::Optimize(ir_block, jit_user_config, {}); fmt::print("Optimized IR:\n"); fmt::print("{}\n", IR::DumpBlock(ir_block)); } @@ -98,7 +107,10 @@ void PrintThumbInstruction(u32 instruction) { fmt::print("should_continue: {}\n\n", should_continue); fmt::print("IR:\n"); fmt::print("{}\n", IR::DumpBlock(ir_block)); - Optimization::Optimize(ir_block, A32::UserConfig{}, {}); + ThumbTestEnv jit_env{}; + Dynarmic::A32::UserConfig jit_user_config{}; + jit_user_config.callbacks = &jit_env; + Optimization::Optimize(ir_block, jit_user_config, {}); fmt::print("Optimized IR:\n"); fmt::print("{}\n", IR::DumpBlock(ir_block)); } @@ -219,7 +231,7 @@ void ExecuteA32Instruction(u32 instruction) { *(iter->second) = *value; fmt::print("> {} = 0x{:08x}\n", reg_name, *value); } - } else if (reg_name == "mem" || reg_name == "memory") { + } else if (reg_name.starts_with("m")) { fmt::print("address: "); if (const auto address = get_value()) { fmt::print("value: "); @@ -228,7 +240,7 @@ void ExecuteA32Instruction(u32 instruction) { fmt::print("> mem[0x{:08x}] = 0x{:08x}\n", *address, *value); } } - } else if (reg_name == "end") { + } else if (reg_name == "exit" || reg_name == "end" || reg_name.starts_with("q")) { break; } } @@ -244,6 +256,7 @@ void ExecuteA32Instruction(u32 instruction) { env.MemoryWrite32(initial_pc + 4, 0xEAFFFFFE); // B +0 cpu.Run(); + fmt::print("{}", fmt::join(cpu.Disassemble(), "\n")); fmt::print("Registers modified:\n"); for (size_t i = 0; i < regs.size(); ++i) { diff --git a/tools/dtrace-tool.pl b/tools/dtrace-tool.pl new file mode 100755 index 0000000000..a3fbcc2d8a --- /dev/null +++ b/tools/dtrace-tool.pl @@ -0,0 +1,131 @@ +#!/usr/bin/perl +# SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project +# SPDX-License-Identifier: GPL-3.0-or-later +# Basic script to run dtrace sampling over the program (requires Flamegraph) +# Usage is either running as: ./dtrace-tool.sh pid (then input the pid of the process) +# Or just run directly with: ./dtrace-tool.sh +use strict; +use warnings; +use POSIX qw(strftime); + +my $input; +my $sampling_hz = '4000'; +my $sampling_time = '5'; +my $sampling_pid = `pgrep eden`; +my $sampling_program = 'eden'; +my $sampling_type = 3; + +sub dtrace_ask_params { + my $is_ok = 'Y'; + do { + print "Sampling HZ [" . $sampling_hz . "]: "; + chomp($input = ); + $sampling_hz = $input || $sampling_hz; + + print "Sampling time [" . $sampling_time . "]: "; + chomp($input = ); + $sampling_time = $input || $sampling_time; + + print "Sampling pid [" . $sampling_pid . "]: "; + chomp($input = ); + $sampling_pid = $input || $sampling_pid; + + print "Are these settings correct?: [" . $is_ok . "]\n"; + print "HZ = " . $sampling_hz . "\nTime = " . $sampling_time . "\nPID = " . $sampling_pid . "\n"; + chomp($input = ); + $is_ok = $input || $is_ok; + } while ($is_ok eq 'n'); +} + +sub dtrace_probe_profiling { + if ($sampling_type eq 0) { + return " +profile-".$sampling_hz." /pid == ".$sampling_pid." && arg0/ { + @[stack(100)] = count(); +} +profile-".$sampling_hz." /pid == ".$sampling_pid." && arg1/ { + @[ustack(100)] = count(); +} +tick-".$sampling_time."s { + exit(0); +}"; + } elsif ($sampling_type eq 1) { + return " +syscall:::entry /pid == ".$sampling_pid."/ { + \@traces[ustack(100)] = count(); +} +tick-".$sampling_time."s { + exit(0); +}"; + } elsif ($sampling_type eq 2) { + return " +profile-".$sampling_hz." /pid == ".$sampling_pid." && arg0/ { + @[stringof(curthread->td_name), stack(100)] = count(); +} +profile-".$sampling_hz." /pid == ".$sampling_pid." && arg1/ { + @[stringof(curthread->td_name), ustack(100)] = count(); +} +tick-".$sampling_time."s { + exit(0); +}"; + } elsif ($sampling_type eq 3) { + return " +io::start /pid == ".$sampling_pid."/ { + @[ustack(100)] = count(); +} +tick-".$sampling_time."s { + exit(0); +}"; + } else { + die "idk"; + } +} + +sub dtrace_generate { + my @date = (localtime(time))[5, 4, 3, 2, 1, 0]; + $date[0] += 1900; + $date[1]++; + my $fmt_date = sprintf "%4d-%02d-%02d_%02d-%02d-%02d", @date; + my $trace_dir = "dtrace-out"; + my $trace_file = $trace_dir . "/" . $fmt_date . ".user_stacks"; + my $trace_fold = $trace_dir . "/" . $fmt_date . ".fold"; + my $trace_svg = $trace_dir . "/" . $fmt_date . ".svg"; + my $trace_probe = dtrace_probe_profiling; + + print $trace_probe . "\n"; + system "sudo", "dtrace", "-Z", "-n", $trace_probe, "-o", $trace_file; + die "$!" if $?; + + open (my $trace_fold_handle, ">", $trace_fold) or die "$!"; + #run ["perl", "../FlameGraph/stackcollapse.pl", $trace_file], ">", \my $fold_output; + my $fold_output = `perl ../FlameGraph/stackcollapse.pl $trace_file`; + print $trace_fold_handle $fold_output; + + open (my $trace_svg_handle, ">", $trace_svg) or die "$!"; + #run ["perl", "../FlameGraph/flamegraph.pl", $trace_fold], ">", \my $svg_output; + my $svg_output = `perl ../FlameGraph/flamegraph.pl $trace_fold`; + print $trace_svg_handle $svg_output; + + system "sudo", "chmod", "0666", $trace_file; +} + +foreach my $i (0 .. $#ARGV) { + if ($ARGV[$i] eq '-h') { + print "Usage: $0\n"; + printf "%-20s%s\n", "-p", "Prompt for parameters"; + printf "%-20s%s\n", "-g", "Generate dtrace output"; + printf "%-20s%s\n", "-s", "Continously generate output until Ctrl^C"; + printf "%-20s%s\n", "-", "Select dtrace type"; + } elsif ($ARGV[$i] eq '-g') { + dtrace_generate; + } elsif ($ARGV[$i] eq '-s') { + while (1) { + dtrace_generate; + } + } elsif ($ARGV[$i] eq '-p') { + dtrace_ask_params; + } else { + $sampling_type = substr $ARGV[$i], 1; + print "Select: ".$sampling_type."\n"; + } +} diff --git a/tools/dtrace-tool.sh b/tools/dtrace-tool.sh deleted file mode 100755 index a8cc4c7bad..0000000000 --- a/tools/dtrace-tool.sh +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/local/bin/bash -ex -# SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project -# SPDX-License-Identifier: GPL-3.0-or-later -# Basic script to run dtrace sampling over the program (requires Flamegraph) -# Usage is either running as: ./dtrace-tool.sh pid (then input the pid of the process) -# Or just run directly with: ./dtrace-tool.sh -FLAMEGRAPH_DIR=".." -function fail { - printf '%s\n' "$1" >&2 - exit "${2-1}" -} -[ -f $FLAMEGRAPH_DIR/FlameGraph/stackcollapse.pl ] || fail 'Where is flamegraph?' -#[ which dtrace ] || fail 'Needs DTrace installed' -read -p "Sampling Hz [800]: " TRACE_CFG_HZ -if [ -z "${TRACE_CFG_HZ}" ]; then - TRACE_CFG_HZ=800 -fi -read -p "Sampling time [5] sec: " TRACE_CFG_TIME -if [ -z "${TRACE_CFG_TIME}" ]; then - TRACE_CFG_TIME=5 -fi -TRACE_FILE=dtrace-out.user_stacks -TRACE_FOLD=dtrace-out.fold -TRACE_SVG=dtrace-out.svg -ps -if [[ $1 = 'pid' ]]; then - read -p "PID: " TRACE_CFG_PID - sudo echo 'Sudo!' -else - [[ -f $1 && $1 ]] || fail 'Usage: ./tools/dtrace-profile.sh ' - echo "Executing: '$@'" - sudo echo 'Sudo!' - "$@" & - TRACE_CFG_PID=$! -fi -TRACE_PROBE="profile-${TRACE_CFG_HZ} /pid == ${TRACE_CFG_PID} && arg1/ { @[ustack()] = count(); } tick-${TRACE_CFG_TIME}s { exit(0); }" -rm -- $TRACE_SVG || echo 'Skip' -sudo dtrace -x ustackframes=100 -Z -n "$TRACE_PROBE" -o $TRACE_FILE 2>/dev/null || exit -perl $FLAMEGRAPH_DIR/FlameGraph/stackcollapse.pl $TRACE_FILE > $TRACE_FOLD || exit -perl $FLAMEGRAPH_DIR/FlameGraph/flamegraph.pl $TRACE_FOLD > $TRACE_SVG || exit -sudo chmod 0666 $TRACE_FILE -rm -- $TRACE_FILE $TRACE_FOLD \ No newline at end of file From 268918aeced185684d246a2eaaf8ffce3fb795b1 Mon Sep 17 00:00:00 2001 From: Ribbit Date: Sat, 4 Oct 2025 23:58:08 +0200 Subject: [PATCH 2/3] [vk] Implement Shader Read Barrier (#2671) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adding the shader read barrier keeps every render/compute/transfer write visible before the image is sampled, so it prevents the “read-before-writes-finish” hazards. Without it you can get random stale frames, flickering post process passes, partially updated HUD textures, and corrupted depth-to-color conversions especially in scenes that render into an offscreen image and immediately feed that image to a shader (reflections, bloom, dynamic resolution, depth visualizers, etc.). This fix makes those R2T chains deterministic again across all Vulkan drivers. Co-authored-by: Ribbit Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/2671 Reviewed-by: MaranBr Reviewed-by: crueter Co-authored-by: Ribbit Co-committed-by: Ribbit --- src/video_core/engines/maxwell_dma.cpp | 2 + src/video_core/renderer_vulkan/blit_image.cpp | 83 +++++++++++++++++-- src/video_core/renderer_vulkan/blit_image.h | 13 +-- .../renderer_vulkan/vk_texture_cache.cpp | 17 ++-- 4 files changed, 95 insertions(+), 20 deletions(-) diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp index 52ca9bbdb6..e2aa6c7e49 100644 --- a/src/video_core/engines/maxwell_dma.cpp +++ b/src/video_core/engines/maxwell_dma.cpp @@ -156,6 +156,8 @@ void MaxwellDMA::Launch() { } void MaxwellDMA::CopyBlockLinearToPitch() { + + u32 bytes_per_pixel = 1; DMA::ImageOperand src_operand; src_operand.bytes_per_pixel = bytes_per_pixel; diff --git a/src/video_core/renderer_vulkan/blit_image.cpp b/src/video_core/renderer_vulkan/blit_image.cpp index 7bfcd6503b..68543bdd48 100644 --- a/src/video_core/renderer_vulkan/blit_image.cpp +++ b/src/video_core/renderer_vulkan/blit_image.cpp @@ -46,6 +46,38 @@ namespace Vulkan { using VideoCommon::ImageViewType; namespace { + +[[nodiscard]] VkImageAspectFlags AspectMaskFromFormat(VideoCore::Surface::PixelFormat format) { + using VideoCore::Surface::SurfaceType; + switch (VideoCore::Surface::GetFormatType(format)) { + case SurfaceType::ColorTexture: + return VK_IMAGE_ASPECT_COLOR_BIT; + case SurfaceType::Depth: + return VK_IMAGE_ASPECT_DEPTH_BIT; + case SurfaceType::Stencil: + return VK_IMAGE_ASPECT_STENCIL_BIT; + case SurfaceType::DepthStencil: + return VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT; + default: + return VK_IMAGE_ASPECT_COLOR_BIT; + } +} + +[[nodiscard]] VkImageSubresourceRange SubresourceRangeFromView(const ImageView& image_view) { + auto range = image_view.range; + if ((image_view.flags & VideoCommon::ImageViewFlagBits::Slice) != VideoCommon::ImageViewFlagBits{}) { + range.base.layer = 0; + range.extent.layers = 1; + } + return VkImageSubresourceRange{ + .aspectMask = AspectMaskFromFormat(image_view.format), + .baseMipLevel = static_cast(range.base.level), + .levelCount = static_cast(range.extent.levels), + .baseArrayLayer = static_cast(range.base.layer), + .layerCount = static_cast(range.extent.layers), + }; +} + struct PushConstants { std::array tex_scale; std::array tex_offset; @@ -417,6 +449,40 @@ void TransitionImageLayout(vk::CommandBuffer& cmdbuf, VkImage image, VkImageLayo 0, barrier); } +void RecordShaderReadBarrier(Scheduler& scheduler, const ImageView& image_view) { + const VkImage image = image_view.ImageHandle(); + const VkImageSubresourceRange subresource_range = SubresourceRangeFromView(image_view); + scheduler.RequestOutsideRenderPassOperationContext(); + scheduler.Record([image, subresource_range](vk::CommandBuffer cmdbuf) { + const VkImageMemoryBarrier barrier{ + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | + VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | + VK_ACCESS_SHADER_WRITE_BIT | + VK_ACCESS_TRANSFER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_SHADER_READ_BIT, + .oldLayout = VK_IMAGE_LAYOUT_GENERAL, + .newLayout = VK_IMAGE_LAYOUT_GENERAL, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = image, + .subresourceRange = subresource_range, + }; + cmdbuf.PipelineBarrier( + VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT | + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT | + VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | + VK_PIPELINE_STAGE_TRANSFER_BIT | + VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | + VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT, + VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + 0, + barrier); + }); +} + void BeginRenderPass(vk::CommandBuffer& cmdbuf, const Framebuffer* framebuffer) { const VkRenderPass render_pass = framebuffer->RenderPass(); const VkFramebuffer framebuffer_handle = framebuffer->Handle(); @@ -484,7 +550,7 @@ BlitImageHelper::BlitImageHelper(const Device& device_, Scheduler& scheduler_, BlitImageHelper::~BlitImageHelper() = default; -void BlitImageHelper::BlitColor(const Framebuffer* dst_framebuffer, VkImageView src_view, +void BlitImageHelper::BlitColor(const Framebuffer* dst_framebuffer, const ImageView& src_image_view, const Region2D& dst_region, const Region2D& src_region, Tegra::Engines::Fermi2D::Filter filter, Tegra::Engines::Fermi2D::Operation operation) { @@ -496,10 +562,12 @@ void BlitImageHelper::BlitColor(const Framebuffer* dst_framebuffer, VkImageView const VkPipelineLayout layout = *one_texture_pipeline_layout; const VkSampler sampler = is_linear ? *linear_sampler : *nearest_sampler; const VkPipeline pipeline = FindOrEmplaceColorPipeline(key); + const VkImageView src_view = src_image_view.Handle(Shader::TextureType::Color2D); + + RecordShaderReadBarrier(scheduler, src_image_view); scheduler.RequestRenderpass(dst_framebuffer); scheduler.Record([this, dst_region, src_region, pipeline, layout, sampler, src_view](vk::CommandBuffer cmdbuf) { - // TODO: Barriers const VkDescriptorSet descriptor_set = one_texture_descriptor_allocator.Commit(); UpdateOneTextureDescriptorSet(device, descriptor_set, sampler, src_view); cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline); @@ -538,7 +606,7 @@ void BlitImageHelper::BlitColor(const Framebuffer* dst_framebuffer, VkImageView } void BlitImageHelper::BlitDepthStencil(const Framebuffer* dst_framebuffer, - VkImageView src_depth_view, VkImageView src_stencil_view, + ImageView& src_image_view, const Region2D& dst_region, const Region2D& src_region, Tegra::Engines::Fermi2D::Filter filter, Tegra::Engines::Fermi2D::Operation operation) { @@ -554,10 +622,13 @@ void BlitImageHelper::BlitDepthStencil(const Framebuffer* dst_framebuffer, const VkPipelineLayout layout = *two_textures_pipeline_layout; const VkSampler sampler = *nearest_sampler; const VkPipeline pipeline = FindOrEmplaceDepthStencilPipeline(key); + const VkImageView src_depth_view = src_image_view.DepthView(); + const VkImageView src_stencil_view = src_image_view.StencilView(); + + RecordShaderReadBarrier(scheduler, src_image_view); scheduler.RequestRenderpass(dst_framebuffer); scheduler.Record([dst_region, src_region, pipeline, layout, sampler, src_depth_view, src_stencil_view, this](vk::CommandBuffer cmdbuf) { - // TODO: Barriers const VkDescriptorSet descriptor_set = two_textures_descriptor_allocator.Commit(); UpdateTwoTexturesDescriptorSet(device, descriptor_set, sampler, src_depth_view, src_stencil_view); @@ -692,6 +763,7 @@ void BlitImageHelper::Convert(VkPipeline pipeline, const Framebuffer* dst_frameb const VkSampler sampler = *nearest_sampler; const VkExtent2D extent = GetConversionExtent(src_image_view); + RecordShaderReadBarrier(scheduler, src_image_view); scheduler.RequestRenderpass(dst_framebuffer); scheduler.Record([pipeline, layout, sampler, src_view, extent, this](vk::CommandBuffer cmdbuf) { const VkOffset2D offset{ @@ -717,7 +789,6 @@ void BlitImageHelper::Convert(VkPipeline pipeline, const Framebuffer* dst_frameb const VkDescriptorSet descriptor_set = one_texture_descriptor_allocator.Commit(); UpdateOneTextureDescriptorSet(device, descriptor_set, sampler, src_view); - // TODO: Barriers cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline); cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_GRAPHICS, layout, 0, descriptor_set, nullptr); @@ -737,6 +808,7 @@ void BlitImageHelper::ConvertDepthStencil(VkPipeline pipeline, const Framebuffer const VkSampler sampler = *nearest_sampler; const VkExtent2D extent = GetConversionExtent(src_image_view); + RecordShaderReadBarrier(scheduler, src_image_view); scheduler.RequestRenderpass(dst_framebuffer); scheduler.Record([pipeline, layout, sampler, src_depth_view, src_stencil_view, extent, this](vk::CommandBuffer cmdbuf) { @@ -763,7 +835,6 @@ void BlitImageHelper::ConvertDepthStencil(VkPipeline pipeline, const Framebuffer const VkDescriptorSet descriptor_set = two_textures_descriptor_allocator.Commit(); UpdateTwoTexturesDescriptorSet(device, descriptor_set, sampler, src_depth_view, src_stencil_view); - // TODO: Barriers cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline); cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_GRAPHICS, layout, 0, descriptor_set, nullptr); diff --git a/src/video_core/renderer_vulkan/blit_image.h b/src/video_core/renderer_vulkan/blit_image.h index 3d400be6a9..bdb8cce883 100644 --- a/src/video_core/renderer_vulkan/blit_image.h +++ b/src/video_core/renderer_vulkan/blit_image.h @@ -1,4 +1,7 @@ -// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project +// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + +// SPDX-FileCopyrightText: Copyright 2021 yuzu Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later #pragma once @@ -43,7 +46,7 @@ public: StateTracker& state_tracker, DescriptorPool& descriptor_pool); ~BlitImageHelper(); - void BlitColor(const Framebuffer* dst_framebuffer, VkImageView src_image_view, + void BlitColor(const Framebuffer* dst_framebuffer, const ImageView& src_image_view, const Region2D& dst_region, const Region2D& src_region, Tegra::Engines::Fermi2D::Filter filter, Tegra::Engines::Fermi2D::Operation operation); @@ -52,9 +55,9 @@ public: VkImage src_image, VkSampler src_sampler, const Region2D& dst_region, const Region2D& src_region, const Extent3D& src_size); - void BlitDepthStencil(const Framebuffer* dst_framebuffer, VkImageView src_depth_view, - VkImageView src_stencil_view, const Region2D& dst_region, - const Region2D& src_region, Tegra::Engines::Fermi2D::Filter filter, + void BlitDepthStencil(const Framebuffer* dst_framebuffer, ImageView& src_image_view, + const Region2D& dst_region, const Region2D& src_region, + Tegra::Engines::Fermi2D::Filter filter, Tegra::Engines::Fermi2D::Operation operation); void ConvertD32ToR32(const Framebuffer* dst_framebuffer, const ImageView& src_image_view); diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index 8d1d609a35..575651905e 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -1086,8 +1086,8 @@ void TextureCacheRuntime::BlitImage(Framebuffer* dst_framebuffer, ImageView& dst return; } if (aspect_mask == VK_IMAGE_ASPECT_COLOR_BIT && !is_src_msaa && !is_dst_msaa) { - blit_image_helper.BlitColor(dst_framebuffer, src.Handle(Shader::TextureType::Color2D), - dst_region, src_region, filter, operation); + blit_image_helper.BlitColor(dst_framebuffer, src, dst_region, src_region, filter, + operation); return; } ASSERT(src.format == dst.format); @@ -1106,8 +1106,8 @@ void TextureCacheRuntime::BlitImage(Framebuffer* dst_framebuffer, ImageView& dst }(); if (!can_blit_depth_stencil) { UNIMPLEMENTED_IF(is_src_msaa || is_dst_msaa); - blit_image_helper.BlitDepthStencil(dst_framebuffer, src.DepthView(), src.StencilView(), - dst_region, src_region, filter, operation); + blit_image_helper.BlitDepthStencil(dst_framebuffer, src, dst_region, src_region, + filter, operation); return; } } @@ -1968,18 +1968,17 @@ bool Image::BlitScaleHelper(bool scale_up) { blit_framebuffer = std::make_unique(*runtime, view_ptr, nullptr, extent, scale_up); } - const auto color_view = blit_view->Handle(Shader::TextureType::Color2D); - runtime->blit_image_helper.BlitColor(blit_framebuffer.get(), color_view, dst_region, + runtime->blit_image_helper.BlitColor(blit_framebuffer.get(), *blit_view, dst_region, src_region, operation, BLIT_OPERATION); } else if (aspect_mask == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) { if (!blit_framebuffer) { blit_framebuffer = std::make_unique(*runtime, nullptr, view_ptr, extent, scale_up); } - runtime->blit_image_helper.BlitDepthStencil(blit_framebuffer.get(), blit_view->DepthView(), - blit_view->StencilView(), dst_region, - src_region, operation, BLIT_OPERATION); + runtime->blit_image_helper.BlitDepthStencil(blit_framebuffer.get(), *blit_view, + dst_region, src_region, operation, + BLIT_OPERATION); } else { // TODO: Use helper blits where applicable flags &= ~ImageFlagBits::Rescaled; From 8031bd06858ead943ff37680426ec448df7c9727 Mon Sep 17 00:00:00 2001 From: lizzie Date: Thu, 2 Oct 2025 20:51:26 +0000 Subject: [PATCH 3/3] [dynarmic] allow better dtrace diagnostics for code - do not clobber %rbp and save frame pointer Signed-off-by: lizzie --- .gitignore | 2 + .../src/dynarmic/backend/x64/a32_emit_x64.cpp | 27 ++-- .../dynarmic/backend/x64/a32_interface.cpp | 4 + .../src/dynarmic/backend/x64/a64_emit_x64.cpp | 28 ++-- src/dynarmic/src/dynarmic/backend/x64/abi.cpp | 10 +- .../dynarmic/backend/x64/block_of_code.cpp | 4 +- src/dynarmic/tests/print_info.cpp | 23 ++- tools/dtrace-tool.pl | 131 ++++++++++++++++++ tools/dtrace-tool.sh | 42 ------ 9 files changed, 194 insertions(+), 77 deletions(-) create mode 100755 tools/dtrace-tool.pl delete mode 100755 tools/dtrace-tool.sh diff --git a/.gitignore b/.gitignore index 2b342e5145..525eec2326 100644 --- a/.gitignore +++ b/.gitignore @@ -30,6 +30,8 @@ CMakeLists.txt.user* # *nix related # Common convention for backup or temporary files *~ +*.core +dtrace-out/ # Visual Studio CMake settings CMakeSettings.json diff --git a/src/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.cpp b/src/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.cpp index fb306336cf..65cbbb354a 100644 --- a/src/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.cpp +++ b/src/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.cpp @@ -217,13 +217,13 @@ void A32EmitX64::ClearFastDispatchTable() { } void A32EmitX64::GenTerminalHandlers() { - // PC ends up in ebp, location_descriptor ends up in rbx + // PC ends up in edi, location_descriptor ends up in rbx const auto calculate_location_descriptor = [this] { // This calculation has to match up with IREmitter::PushRSB code.mov(ebx, dword[code.ABI_JIT_PTR + offsetof(A32JitState, upper_location_descriptor)]); code.shl(rbx, 32); code.mov(ecx, MJitStateReg(A32::Reg::PC)); - code.mov(ebp, ecx); + code.mov(edi, ecx); code.or_(rbx, rcx); }; @@ -238,7 +238,7 @@ void A32EmitX64::GenTerminalHandlers() { code.mov(dword[code.ABI_JIT_PTR + offsetof(A32JitState, rsb_ptr)], eax); code.cmp(rbx, qword[code.ABI_JIT_PTR + offsetof(A32JitState, rsb_location_descriptors) + rax * sizeof(u64)]); if (conf.HasOptimization(OptimizationFlag::FastDispatch)) { - code.jne(rsb_cache_miss); + code.jne(rsb_cache_miss, code.T_NEAR); } else { code.jne(code.GetReturnFromRunCodeAddress()); } @@ -251,20 +251,21 @@ void A32EmitX64::GenTerminalHandlers() { terminal_handler_fast_dispatch_hint = code.getCurr(); calculate_location_descriptor(); code.L(rsb_cache_miss); - code.mov(r12, reinterpret_cast(fast_dispatch_table.data())); - code.mov(rbp, rbx); + code.mov(r8, reinterpret_cast(fast_dispatch_table.data())); + //code.mov(r12d, MJitStateReg(A32::Reg::PC)); + code.mov(r12, rbx); if (code.HasHostFeature(HostFeature::SSE42)) { - code.crc32(rbp, r12); + code.crc32(r12, r8); } - code.and_(ebp, fast_dispatch_table_mask); - code.lea(rbp, ptr[r12 + rbp]); - code.cmp(rbx, qword[rbp + offsetof(FastDispatchEntry, location_descriptor)]); - code.jne(fast_dispatch_cache_miss); - code.jmp(ptr[rbp + offsetof(FastDispatchEntry, code_ptr)]); + code.and_(r12d, fast_dispatch_table_mask); + code.lea(r12, ptr[r8 + r12]); + code.cmp(rbx, qword[r12 + offsetof(FastDispatchEntry, location_descriptor)]); + code.jne(fast_dispatch_cache_miss, code.T_NEAR); + code.jmp(ptr[r12 + offsetof(FastDispatchEntry, code_ptr)]); code.L(fast_dispatch_cache_miss); - code.mov(qword[rbp + offsetof(FastDispatchEntry, location_descriptor)], rbx); + code.mov(qword[r12 + offsetof(FastDispatchEntry, location_descriptor)], rbx); code.LookupBlock(); - code.mov(ptr[rbp + offsetof(FastDispatchEntry, code_ptr)], rax); + code.mov(ptr[r12 + offsetof(FastDispatchEntry, code_ptr)], rax); code.jmp(rax); PerfMapRegister(terminal_handler_fast_dispatch_hint, code.getCurr(), "a32_terminal_handler_fast_dispatch_hint"); diff --git a/src/dynarmic/src/dynarmic/backend/x64/a32_interface.cpp b/src/dynarmic/src/dynarmic/backend/x64/a32_interface.cpp index 382eb70f3f..3253bc4f72 100644 --- a/src/dynarmic/src/dynarmic/backend/x64/a32_interface.cpp +++ b/src/dynarmic/src/dynarmic/backend/x64/a32_interface.cpp @@ -331,4 +331,8 @@ void Jit::DumpDisassembly() const { impl->DumpDisassembly(); } +std::vector Jit::Disassemble() const { + return impl->Disassemble(); +} + } // namespace Dynarmic::A32 diff --git a/src/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.cpp b/src/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.cpp index 1e673338a8..045386342d 100644 --- a/src/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.cpp +++ b/src/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.cpp @@ -188,13 +188,14 @@ void A64EmitX64::ClearFastDispatchTable() { } void A64EmitX64::GenTerminalHandlers() { - // PC ends up in rbp, location_descriptor ends up in rbx + // PC ends up in rcx, location_descriptor ends up in rbx + static_assert(std::find(ABI_ALL_CALLEE_SAVE.begin(), ABI_ALL_CALLEE_SAVE.end(), HostLoc::R12)); const auto calculate_location_descriptor = [this] { // This calculation has to match up with A64::LocationDescriptor::UniqueHash // TODO: Optimization is available here based on known state of fpcr. - code.mov(rbp, qword[code.ABI_JIT_PTR + offsetof(A64JitState, pc)]); + code.mov(rdi, qword[code.ABI_JIT_PTR + offsetof(A64JitState, pc)]); code.mov(rcx, A64::LocationDescriptor::pc_mask); - code.and_(rcx, rbp); + code.and_(rcx, rdi); code.mov(ebx, dword[code.ABI_JIT_PTR + offsetof(A64JitState, fpcr)]); code.and_(ebx, A64::LocationDescriptor::fpcr_mask); code.shl(rbx, A64::LocationDescriptor::fpcr_shift); @@ -226,20 +227,21 @@ void A64EmitX64::GenTerminalHandlers() { terminal_handler_fast_dispatch_hint = code.getCurr(); calculate_location_descriptor(); code.L(rsb_cache_miss); - code.mov(r12, reinterpret_cast(fast_dispatch_table.data())); - code.mov(rbp, rbx); + code.mov(r8, reinterpret_cast(fast_dispatch_table.data())); + //code.mov(r12, qword[code.ABI_JIT_PTR + offsetof(A64JitState, pc)]); + code.mov(r12, rbx); if (code.HasHostFeature(HostFeature::SSE42)) { - code.crc32(rbp, r12); + code.crc32(r12, r8); } - code.and_(ebp, fast_dispatch_table_mask); - code.lea(rbp, ptr[r12 + rbp]); - code.cmp(rbx, qword[rbp + offsetof(FastDispatchEntry, location_descriptor)]); - code.jne(fast_dispatch_cache_miss); - code.jmp(ptr[rbp + offsetof(FastDispatchEntry, code_ptr)]); + code.and_(r12d, fast_dispatch_table_mask); + code.lea(r12, ptr[r8 + r12]); + code.cmp(rbx, qword[r12 + offsetof(FastDispatchEntry, location_descriptor)]); + code.jne(fast_dispatch_cache_miss, code.T_NEAR); + code.jmp(ptr[r12 + offsetof(FastDispatchEntry, code_ptr)]); code.L(fast_dispatch_cache_miss); - code.mov(qword[rbp + offsetof(FastDispatchEntry, location_descriptor)], rbx); + code.mov(qword[r12 + offsetof(FastDispatchEntry, location_descriptor)], rbx); code.LookupBlock(); - code.mov(ptr[rbp + offsetof(FastDispatchEntry, code_ptr)], rax); + code.mov(ptr[r12 + offsetof(FastDispatchEntry, code_ptr)], rax); code.jmp(rax); PerfMapRegister(terminal_handler_fast_dispatch_hint, code.getCurr(), "a64_terminal_handler_fast_dispatch_hint"); diff --git a/src/dynarmic/src/dynarmic/backend/x64/abi.cpp b/src/dynarmic/src/dynarmic/backend/x64/abi.cpp index a9bbab3d10..1c8f662b2e 100644 --- a/src/dynarmic/src/dynarmic/backend/x64/abi.cpp +++ b/src/dynarmic/src/dynarmic/backend/x64/abi.cpp @@ -29,7 +29,8 @@ static_assert(ABI_SHADOW_SPACE <= 32); static FrameInfo CalculateFrameInfo(const size_t num_gprs, const size_t num_xmms, size_t frame_size) { // We are initially 8 byte aligned because the return value is pushed onto an aligned stack after a call. - const size_t rsp_alignment = (num_gprs % 2 == 0) ? 8 : 0; + // (It's an extra GPR save due to %rbp) + const size_t rsp_alignment = ((num_gprs + 1) % 2 == 0) ? 8 : 0; const size_t total_xmm_size = num_xmms * XMM_SIZE; if (frame_size & 0xF) { frame_size += 0x10 - (frame_size & 0xF); @@ -49,6 +50,10 @@ void ABI_PushRegistersAndAdjustStack(BlockOfCode& code, const size_t frame_size, const size_t num_xmms = std::count_if(regs.begin(), regs.end(), HostLocIsXMM); const FrameInfo frame_info = CalculateFrameInfo(num_gprs, num_xmms, frame_size); + if (true) { + code.push(rbp); + code.mov(rbp, rsp); + } for (auto const gpr : regs) if (HostLocIsGPR(gpr)) code.push(HostLocToReg64(gpr)); @@ -91,6 +96,9 @@ void ABI_PopRegistersAndAdjustStack(BlockOfCode& code, const size_t frame_size, for (auto const gpr : mcl::iterator::reverse(regs)) if (HostLocIsGPR(gpr)) code.pop(HostLocToReg64(gpr)); + if (true) { + code.pop(rbp); + } } void ABI_PushCalleeSaveRegistersAndAdjustStack(BlockOfCode& code, const std::size_t frame_size) { diff --git a/src/dynarmic/src/dynarmic/backend/x64/block_of_code.cpp b/src/dynarmic/src/dynarmic/backend/x64/block_of_code.cpp index d5d5f089ff..7e459df2d7 100644 --- a/src/dynarmic/src/dynarmic/backend/x64/block_of_code.cpp +++ b/src/dynarmic/src/dynarmic/backend/x64/block_of_code.cpp @@ -364,8 +364,7 @@ void BlockOfCode::GenRunCode(std::function rcp) { cmp(dword[ABI_JIT_PTR + jsi.offsetof_halt_reason], 0); jne(return_to_caller_mxcsr_already_exited, T_NEAR); - lock(); - or_(dword[ABI_JIT_PTR + jsi.offsetof_halt_reason], static_cast(HaltReason::Step)); + lock(); or_(dword[ABI_JIT_PTR + jsi.offsetof_halt_reason], u32(HaltReason::Step)); SwitchMxcsrOnEntry(); jmp(ABI_PARAM2); @@ -415,7 +414,6 @@ void BlockOfCode::GenRunCode(std::function rcp) { } xor_(eax, eax); - lock(); xchg(dword[ABI_JIT_PTR + jsi.offsetof_halt_reason], eax); ABI_PopCalleeSaveRegistersAndAdjustStack(*this, sizeof(StackLayout)); diff --git a/src/dynarmic/tests/print_info.cpp b/src/dynarmic/tests/print_info.cpp index 3d1268f467..1851664771 100644 --- a/src/dynarmic/tests/print_info.cpp +++ b/src/dynarmic/tests/print_info.cpp @@ -37,6 +37,9 @@ #include "dynarmic/ir/basic_block.h" #include "dynarmic/ir/opt_passes.h" +#include "./A32/testenv.h" +#include "./A64/testenv.h" + using namespace Dynarmic; std::string_view GetNameOfA32Instruction(u32 instruction) { @@ -65,7 +68,10 @@ void PrintA32Instruction(u32 instruction) { fmt::print("should_continue: {}\n\n", should_continue); fmt::print("IR:\n"); fmt::print("{}\n", IR::DumpBlock(ir_block)); - Optimization::Optimize(ir_block, A32::UserConfig{}, {}); + ArmTestEnv jit_env{}; + Dynarmic::A32::UserConfig jit_user_config{}; + jit_user_config.callbacks = &jit_env; + Optimization::Optimize(ir_block, jit_user_config, {}); fmt::print("Optimized IR:\n"); fmt::print("{}\n", IR::DumpBlock(ir_block)); } @@ -80,7 +86,10 @@ void PrintA64Instruction(u32 instruction) { fmt::print("should_continue: {}\n\n", should_continue); fmt::print("IR:\n"); fmt::print("{}\n", IR::DumpBlock(ir_block)); - Optimization::Optimize(ir_block, A64::UserConfig{}, {}); + A64TestEnv jit_env{}; + Dynarmic::A64::UserConfig jit_user_config{}; + jit_user_config.callbacks = &jit_env; + Optimization::Optimize(ir_block, jit_user_config, {}); fmt::print("Optimized IR:\n"); fmt::print("{}\n", IR::DumpBlock(ir_block)); } @@ -98,7 +107,10 @@ void PrintThumbInstruction(u32 instruction) { fmt::print("should_continue: {}\n\n", should_continue); fmt::print("IR:\n"); fmt::print("{}\n", IR::DumpBlock(ir_block)); - Optimization::Optimize(ir_block, A32::UserConfig{}, {}); + ThumbTestEnv jit_env{}; + Dynarmic::A32::UserConfig jit_user_config{}; + jit_user_config.callbacks = &jit_env; + Optimization::Optimize(ir_block, jit_user_config, {}); fmt::print("Optimized IR:\n"); fmt::print("{}\n", IR::DumpBlock(ir_block)); } @@ -219,7 +231,7 @@ void ExecuteA32Instruction(u32 instruction) { *(iter->second) = *value; fmt::print("> {} = 0x{:08x}\n", reg_name, *value); } - } else if (reg_name == "mem" || reg_name == "memory") { + } else if (reg_name.starts_with("m")) { fmt::print("address: "); if (const auto address = get_value()) { fmt::print("value: "); @@ -228,7 +240,7 @@ void ExecuteA32Instruction(u32 instruction) { fmt::print("> mem[0x{:08x}] = 0x{:08x}\n", *address, *value); } } - } else if (reg_name == "end") { + } else if (reg_name == "exit" || reg_name == "end" || reg_name.starts_with("q")) { break; } } @@ -244,6 +256,7 @@ void ExecuteA32Instruction(u32 instruction) { env.MemoryWrite32(initial_pc + 4, 0xEAFFFFFE); // B +0 cpu.Run(); + fmt::print("{}", fmt::join(cpu.Disassemble(), "\n")); fmt::print("Registers modified:\n"); for (size_t i = 0; i < regs.size(); ++i) { diff --git a/tools/dtrace-tool.pl b/tools/dtrace-tool.pl new file mode 100755 index 0000000000..a3fbcc2d8a --- /dev/null +++ b/tools/dtrace-tool.pl @@ -0,0 +1,131 @@ +#!/usr/bin/perl +# SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project +# SPDX-License-Identifier: GPL-3.0-or-later +# Basic script to run dtrace sampling over the program (requires Flamegraph) +# Usage is either running as: ./dtrace-tool.sh pid (then input the pid of the process) +# Or just run directly with: ./dtrace-tool.sh +use strict; +use warnings; +use POSIX qw(strftime); + +my $input; +my $sampling_hz = '4000'; +my $sampling_time = '5'; +my $sampling_pid = `pgrep eden`; +my $sampling_program = 'eden'; +my $sampling_type = 3; + +sub dtrace_ask_params { + my $is_ok = 'Y'; + do { + print "Sampling HZ [" . $sampling_hz . "]: "; + chomp($input = ); + $sampling_hz = $input || $sampling_hz; + + print "Sampling time [" . $sampling_time . "]: "; + chomp($input = ); + $sampling_time = $input || $sampling_time; + + print "Sampling pid [" . $sampling_pid . "]: "; + chomp($input = ); + $sampling_pid = $input || $sampling_pid; + + print "Are these settings correct?: [" . $is_ok . "]\n"; + print "HZ = " . $sampling_hz . "\nTime = " . $sampling_time . "\nPID = " . $sampling_pid . "\n"; + chomp($input = ); + $is_ok = $input || $is_ok; + } while ($is_ok eq 'n'); +} + +sub dtrace_probe_profiling { + if ($sampling_type eq 0) { + return " +profile-".$sampling_hz." /pid == ".$sampling_pid." && arg0/ { + @[stack(100)] = count(); +} +profile-".$sampling_hz." /pid == ".$sampling_pid." && arg1/ { + @[ustack(100)] = count(); +} +tick-".$sampling_time."s { + exit(0); +}"; + } elsif ($sampling_type eq 1) { + return " +syscall:::entry /pid == ".$sampling_pid."/ { + \@traces[ustack(100)] = count(); +} +tick-".$sampling_time."s { + exit(0); +}"; + } elsif ($sampling_type eq 2) { + return " +profile-".$sampling_hz." /pid == ".$sampling_pid." && arg0/ { + @[stringof(curthread->td_name), stack(100)] = count(); +} +profile-".$sampling_hz." /pid == ".$sampling_pid." && arg1/ { + @[stringof(curthread->td_name), ustack(100)] = count(); +} +tick-".$sampling_time."s { + exit(0); +}"; + } elsif ($sampling_type eq 3) { + return " +io::start /pid == ".$sampling_pid."/ { + @[ustack(100)] = count(); +} +tick-".$sampling_time."s { + exit(0); +}"; + } else { + die "idk"; + } +} + +sub dtrace_generate { + my @date = (localtime(time))[5, 4, 3, 2, 1, 0]; + $date[0] += 1900; + $date[1]++; + my $fmt_date = sprintf "%4d-%02d-%02d_%02d-%02d-%02d", @date; + my $trace_dir = "dtrace-out"; + my $trace_file = $trace_dir . "/" . $fmt_date . ".user_stacks"; + my $trace_fold = $trace_dir . "/" . $fmt_date . ".fold"; + my $trace_svg = $trace_dir . "/" . $fmt_date . ".svg"; + my $trace_probe = dtrace_probe_profiling; + + print $trace_probe . "\n"; + system "sudo", "dtrace", "-Z", "-n", $trace_probe, "-o", $trace_file; + die "$!" if $?; + + open (my $trace_fold_handle, ">", $trace_fold) or die "$!"; + #run ["perl", "../FlameGraph/stackcollapse.pl", $trace_file], ">", \my $fold_output; + my $fold_output = `perl ../FlameGraph/stackcollapse.pl $trace_file`; + print $trace_fold_handle $fold_output; + + open (my $trace_svg_handle, ">", $trace_svg) or die "$!"; + #run ["perl", "../FlameGraph/flamegraph.pl", $trace_fold], ">", \my $svg_output; + my $svg_output = `perl ../FlameGraph/flamegraph.pl $trace_fold`; + print $trace_svg_handle $svg_output; + + system "sudo", "chmod", "0666", $trace_file; +} + +foreach my $i (0 .. $#ARGV) { + if ($ARGV[$i] eq '-h') { + print "Usage: $0\n"; + printf "%-20s%s\n", "-p", "Prompt for parameters"; + printf "%-20s%s\n", "-g", "Generate dtrace output"; + printf "%-20s%s\n", "-s", "Continously generate output until Ctrl^C"; + printf "%-20s%s\n", "-", "Select dtrace type"; + } elsif ($ARGV[$i] eq '-g') { + dtrace_generate; + } elsif ($ARGV[$i] eq '-s') { + while (1) { + dtrace_generate; + } + } elsif ($ARGV[$i] eq '-p') { + dtrace_ask_params; + } else { + $sampling_type = substr $ARGV[$i], 1; + print "Select: ".$sampling_type."\n"; + } +} diff --git a/tools/dtrace-tool.sh b/tools/dtrace-tool.sh deleted file mode 100755 index a8cc4c7bad..0000000000 --- a/tools/dtrace-tool.sh +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/local/bin/bash -ex -# SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project -# SPDX-License-Identifier: GPL-3.0-or-later -# Basic script to run dtrace sampling over the program (requires Flamegraph) -# Usage is either running as: ./dtrace-tool.sh pid (then input the pid of the process) -# Or just run directly with: ./dtrace-tool.sh -FLAMEGRAPH_DIR=".." -function fail { - printf '%s\n' "$1" >&2 - exit "${2-1}" -} -[ -f $FLAMEGRAPH_DIR/FlameGraph/stackcollapse.pl ] || fail 'Where is flamegraph?' -#[ which dtrace ] || fail 'Needs DTrace installed' -read -p "Sampling Hz [800]: " TRACE_CFG_HZ -if [ -z "${TRACE_CFG_HZ}" ]; then - TRACE_CFG_HZ=800 -fi -read -p "Sampling time [5] sec: " TRACE_CFG_TIME -if [ -z "${TRACE_CFG_TIME}" ]; then - TRACE_CFG_TIME=5 -fi -TRACE_FILE=dtrace-out.user_stacks -TRACE_FOLD=dtrace-out.fold -TRACE_SVG=dtrace-out.svg -ps -if [[ $1 = 'pid' ]]; then - read -p "PID: " TRACE_CFG_PID - sudo echo 'Sudo!' -else - [[ -f $1 && $1 ]] || fail 'Usage: ./tools/dtrace-profile.sh ' - echo "Executing: '$@'" - sudo echo 'Sudo!' - "$@" & - TRACE_CFG_PID=$! -fi -TRACE_PROBE="profile-${TRACE_CFG_HZ} /pid == ${TRACE_CFG_PID} && arg1/ { @[ustack()] = count(); } tick-${TRACE_CFG_TIME}s { exit(0); }" -rm -- $TRACE_SVG || echo 'Skip' -sudo dtrace -x ustackframes=100 -Z -n "$TRACE_PROBE" -o $TRACE_FILE 2>/dev/null || exit -perl $FLAMEGRAPH_DIR/FlameGraph/stackcollapse.pl $TRACE_FILE > $TRACE_FOLD || exit -perl $FLAMEGRAPH_DIR/FlameGraph/flamegraph.pl $TRACE_FOLD > $TRACE_SVG || exit -sudo chmod 0666 $TRACE_FILE -rm -- $TRACE_FILE $TRACE_FOLD \ No newline at end of file