Compare commits

...

2 commits

Author SHA1 Message Date
8031bd0685 [dynarmic] allow better dtrace diagnostics for code - do not clobber %rbp and save frame pointer
All checks were successful
eden-license / license-header (pull_request) Successful in 20s
Signed-off-by: lizzie <lizzie@eden-emu.dev>
2025-10-04 23:59:16 +02:00
268918aece
[vk] Implement Shader Read Barrier (#2671)
Adding the shader read barrier keeps every render/compute/transfer write visible before the image is sampled, so it prevents the “read-before-writes-finish” hazards. Without it you can get random stale frames, flickering post process passes, partially updated HUD textures, and corrupted depth-to-color conversions especially in scenes that render into an offscreen image and immediately feed that image to a shader (reflections, bloom, dynamic resolution, depth visualizers, etc.). This fix makes those R2T chains deterministic again across all Vulkan drivers.

Co-authored-by: Ribbit <ribbit@placeholder.com>
Reviewed-on: #2671
Reviewed-by: MaranBr <maranbr@eden-emu.dev>
Reviewed-by: crueter <crueter@eden-emu.dev>
Co-authored-by: Ribbit <ribbit@eden-emu.dev>
Co-committed-by: Ribbit <ribbit@eden-emu.dev>
2025-10-04 23:58:08 +02:00
13 changed files with 289 additions and 97 deletions

2
.gitignore vendored
View file

@ -30,6 +30,8 @@ CMakeLists.txt.user*
# *nix related
# Common convention for backup or temporary files
*~
*.core
dtrace-out/
# Visual Studio CMake settings
CMakeSettings.json

View file

@ -217,13 +217,13 @@ void A32EmitX64::ClearFastDispatchTable() {
}
void A32EmitX64::GenTerminalHandlers() {
// PC ends up in ebp, location_descriptor ends up in rbx
// PC ends up in edi, location_descriptor ends up in rbx
const auto calculate_location_descriptor = [this] {
// This calculation has to match up with IREmitter::PushRSB
code.mov(ebx, dword[code.ABI_JIT_PTR + offsetof(A32JitState, upper_location_descriptor)]);
code.shl(rbx, 32);
code.mov(ecx, MJitStateReg(A32::Reg::PC));
code.mov(ebp, ecx);
code.mov(edi, ecx);
code.or_(rbx, rcx);
};
@ -238,7 +238,7 @@ void A32EmitX64::GenTerminalHandlers() {
code.mov(dword[code.ABI_JIT_PTR + offsetof(A32JitState, rsb_ptr)], eax);
code.cmp(rbx, qword[code.ABI_JIT_PTR + offsetof(A32JitState, rsb_location_descriptors) + rax * sizeof(u64)]);
if (conf.HasOptimization(OptimizationFlag::FastDispatch)) {
code.jne(rsb_cache_miss);
code.jne(rsb_cache_miss, code.T_NEAR);
} else {
code.jne(code.GetReturnFromRunCodeAddress());
}
@ -251,20 +251,21 @@ void A32EmitX64::GenTerminalHandlers() {
terminal_handler_fast_dispatch_hint = code.getCurr<const void*>();
calculate_location_descriptor();
code.L(rsb_cache_miss);
code.mov(r12, reinterpret_cast<u64>(fast_dispatch_table.data()));
code.mov(rbp, rbx);
code.mov(r8, reinterpret_cast<u64>(fast_dispatch_table.data()));
//code.mov(r12d, MJitStateReg(A32::Reg::PC));
code.mov(r12, rbx);
if (code.HasHostFeature(HostFeature::SSE42)) {
code.crc32(rbp, r12);
code.crc32(r12, r8);
}
code.and_(ebp, fast_dispatch_table_mask);
code.lea(rbp, ptr[r12 + rbp]);
code.cmp(rbx, qword[rbp + offsetof(FastDispatchEntry, location_descriptor)]);
code.jne(fast_dispatch_cache_miss);
code.jmp(ptr[rbp + offsetof(FastDispatchEntry, code_ptr)]);
code.and_(r12d, fast_dispatch_table_mask);
code.lea(r12, ptr[r8 + r12]);
code.cmp(rbx, qword[r12 + offsetof(FastDispatchEntry, location_descriptor)]);
code.jne(fast_dispatch_cache_miss, code.T_NEAR);
code.jmp(ptr[r12 + offsetof(FastDispatchEntry, code_ptr)]);
code.L(fast_dispatch_cache_miss);
code.mov(qword[rbp + offsetof(FastDispatchEntry, location_descriptor)], rbx);
code.mov(qword[r12 + offsetof(FastDispatchEntry, location_descriptor)], rbx);
code.LookupBlock();
code.mov(ptr[rbp + offsetof(FastDispatchEntry, code_ptr)], rax);
code.mov(ptr[r12 + offsetof(FastDispatchEntry, code_ptr)], rax);
code.jmp(rax);
PerfMapRegister(terminal_handler_fast_dispatch_hint, code.getCurr(), "a32_terminal_handler_fast_dispatch_hint");

View file

@ -331,4 +331,8 @@ void Jit::DumpDisassembly() const {
impl->DumpDisassembly();
}
std::vector<std::string> Jit::Disassemble() const {
return impl->Disassemble();
}
} // namespace Dynarmic::A32

View file

@ -188,13 +188,14 @@ void A64EmitX64::ClearFastDispatchTable() {
}
void A64EmitX64::GenTerminalHandlers() {
// PC ends up in rbp, location_descriptor ends up in rbx
// PC ends up in rcx, location_descriptor ends up in rbx
static_assert(std::find(ABI_ALL_CALLEE_SAVE.begin(), ABI_ALL_CALLEE_SAVE.end(), HostLoc::R12));
const auto calculate_location_descriptor = [this] {
// This calculation has to match up with A64::LocationDescriptor::UniqueHash
// TODO: Optimization is available here based on known state of fpcr.
code.mov(rbp, qword[code.ABI_JIT_PTR + offsetof(A64JitState, pc)]);
code.mov(rdi, qword[code.ABI_JIT_PTR + offsetof(A64JitState, pc)]);
code.mov(rcx, A64::LocationDescriptor::pc_mask);
code.and_(rcx, rbp);
code.and_(rcx, rdi);
code.mov(ebx, dword[code.ABI_JIT_PTR + offsetof(A64JitState, fpcr)]);
code.and_(ebx, A64::LocationDescriptor::fpcr_mask);
code.shl(rbx, A64::LocationDescriptor::fpcr_shift);
@ -226,20 +227,21 @@ void A64EmitX64::GenTerminalHandlers() {
terminal_handler_fast_dispatch_hint = code.getCurr<const void*>();
calculate_location_descriptor();
code.L(rsb_cache_miss);
code.mov(r12, reinterpret_cast<u64>(fast_dispatch_table.data()));
code.mov(rbp, rbx);
code.mov(r8, reinterpret_cast<u64>(fast_dispatch_table.data()));
//code.mov(r12, qword[code.ABI_JIT_PTR + offsetof(A64JitState, pc)]);
code.mov(r12, rbx);
if (code.HasHostFeature(HostFeature::SSE42)) {
code.crc32(rbp, r12);
code.crc32(r12, r8);
}
code.and_(ebp, fast_dispatch_table_mask);
code.lea(rbp, ptr[r12 + rbp]);
code.cmp(rbx, qword[rbp + offsetof(FastDispatchEntry, location_descriptor)]);
code.jne(fast_dispatch_cache_miss);
code.jmp(ptr[rbp + offsetof(FastDispatchEntry, code_ptr)]);
code.and_(r12d, fast_dispatch_table_mask);
code.lea(r12, ptr[r8 + r12]);
code.cmp(rbx, qword[r12 + offsetof(FastDispatchEntry, location_descriptor)]);
code.jne(fast_dispatch_cache_miss, code.T_NEAR);
code.jmp(ptr[r12 + offsetof(FastDispatchEntry, code_ptr)]);
code.L(fast_dispatch_cache_miss);
code.mov(qword[rbp + offsetof(FastDispatchEntry, location_descriptor)], rbx);
code.mov(qword[r12 + offsetof(FastDispatchEntry, location_descriptor)], rbx);
code.LookupBlock();
code.mov(ptr[rbp + offsetof(FastDispatchEntry, code_ptr)], rax);
code.mov(ptr[r12 + offsetof(FastDispatchEntry, code_ptr)], rax);
code.jmp(rax);
PerfMapRegister(terminal_handler_fast_dispatch_hint, code.getCurr(), "a64_terminal_handler_fast_dispatch_hint");

View file

@ -29,7 +29,8 @@ static_assert(ABI_SHADOW_SPACE <= 32);
static FrameInfo CalculateFrameInfo(const size_t num_gprs, const size_t num_xmms, size_t frame_size) {
// We are initially 8 byte aligned because the return value is pushed onto an aligned stack after a call.
const size_t rsp_alignment = (num_gprs % 2 == 0) ? 8 : 0;
// (It's an extra GPR save due to %rbp)
const size_t rsp_alignment = ((num_gprs + 1) % 2 == 0) ? 8 : 0;
const size_t total_xmm_size = num_xmms * XMM_SIZE;
if (frame_size & 0xF) {
frame_size += 0x10 - (frame_size & 0xF);
@ -49,6 +50,10 @@ void ABI_PushRegistersAndAdjustStack(BlockOfCode& code, const size_t frame_size,
const size_t num_xmms = std::count_if(regs.begin(), regs.end(), HostLocIsXMM);
const FrameInfo frame_info = CalculateFrameInfo(num_gprs, num_xmms, frame_size);
if (true) {
code.push(rbp);
code.mov(rbp, rsp);
}
for (auto const gpr : regs)
if (HostLocIsGPR(gpr))
code.push(HostLocToReg64(gpr));
@ -91,6 +96,9 @@ void ABI_PopRegistersAndAdjustStack(BlockOfCode& code, const size_t frame_size,
for (auto const gpr : mcl::iterator::reverse(regs))
if (HostLocIsGPR(gpr))
code.pop(HostLocToReg64(gpr));
if (true) {
code.pop(rbp);
}
}
void ABI_PushCalleeSaveRegistersAndAdjustStack(BlockOfCode& code, const std::size_t frame_size) {

View file

@ -364,8 +364,7 @@ void BlockOfCode::GenRunCode(std::function<void(BlockOfCode&)> rcp) {
cmp(dword[ABI_JIT_PTR + jsi.offsetof_halt_reason], 0);
jne(return_to_caller_mxcsr_already_exited, T_NEAR);
lock();
or_(dword[ABI_JIT_PTR + jsi.offsetof_halt_reason], static_cast<u32>(HaltReason::Step));
lock(); or_(dword[ABI_JIT_PTR + jsi.offsetof_halt_reason], u32(HaltReason::Step));
SwitchMxcsrOnEntry();
jmp(ABI_PARAM2);
@ -415,7 +414,6 @@ void BlockOfCode::GenRunCode(std::function<void(BlockOfCode&)> rcp) {
}
xor_(eax, eax);
lock();
xchg(dword[ABI_JIT_PTR + jsi.offsetof_halt_reason], eax);
ABI_PopCalleeSaveRegistersAndAdjustStack(*this, sizeof(StackLayout));

View file

@ -37,6 +37,9 @@
#include "dynarmic/ir/basic_block.h"
#include "dynarmic/ir/opt_passes.h"
#include "./A32/testenv.h"
#include "./A64/testenv.h"
using namespace Dynarmic;
std::string_view GetNameOfA32Instruction(u32 instruction) {
@ -65,7 +68,10 @@ void PrintA32Instruction(u32 instruction) {
fmt::print("should_continue: {}\n\n", should_continue);
fmt::print("IR:\n");
fmt::print("{}\n", IR::DumpBlock(ir_block));
Optimization::Optimize(ir_block, A32::UserConfig{}, {});
ArmTestEnv jit_env{};
Dynarmic::A32::UserConfig jit_user_config{};
jit_user_config.callbacks = &jit_env;
Optimization::Optimize(ir_block, jit_user_config, {});
fmt::print("Optimized IR:\n");
fmt::print("{}\n", IR::DumpBlock(ir_block));
}
@ -80,7 +86,10 @@ void PrintA64Instruction(u32 instruction) {
fmt::print("should_continue: {}\n\n", should_continue);
fmt::print("IR:\n");
fmt::print("{}\n", IR::DumpBlock(ir_block));
Optimization::Optimize(ir_block, A64::UserConfig{}, {});
A64TestEnv jit_env{};
Dynarmic::A64::UserConfig jit_user_config{};
jit_user_config.callbacks = &jit_env;
Optimization::Optimize(ir_block, jit_user_config, {});
fmt::print("Optimized IR:\n");
fmt::print("{}\n", IR::DumpBlock(ir_block));
}
@ -98,7 +107,10 @@ void PrintThumbInstruction(u32 instruction) {
fmt::print("should_continue: {}\n\n", should_continue);
fmt::print("IR:\n");
fmt::print("{}\n", IR::DumpBlock(ir_block));
Optimization::Optimize(ir_block, A32::UserConfig{}, {});
ThumbTestEnv jit_env{};
Dynarmic::A32::UserConfig jit_user_config{};
jit_user_config.callbacks = &jit_env;
Optimization::Optimize(ir_block, jit_user_config, {});
fmt::print("Optimized IR:\n");
fmt::print("{}\n", IR::DumpBlock(ir_block));
}
@ -219,7 +231,7 @@ void ExecuteA32Instruction(u32 instruction) {
*(iter->second) = *value;
fmt::print("> {} = 0x{:08x}\n", reg_name, *value);
}
} else if (reg_name == "mem" || reg_name == "memory") {
} else if (reg_name.starts_with("m")) {
fmt::print("address: ");
if (const auto address = get_value()) {
fmt::print("value: ");
@ -228,7 +240,7 @@ void ExecuteA32Instruction(u32 instruction) {
fmt::print("> mem[0x{:08x}] = 0x{:08x}\n", *address, *value);
}
}
} else if (reg_name == "end") {
} else if (reg_name == "exit" || reg_name == "end" || reg_name.starts_with("q")) {
break;
}
}
@ -244,6 +256,7 @@ void ExecuteA32Instruction(u32 instruction) {
env.MemoryWrite32(initial_pc + 4, 0xEAFFFFFE); // B +0
cpu.Run();
fmt::print("{}", fmt::join(cpu.Disassemble(), "\n"));
fmt::print("Registers modified:\n");
for (size_t i = 0; i < regs.size(); ++i) {

View file

@ -156,6 +156,8 @@ void MaxwellDMA::Launch() {
}
void MaxwellDMA::CopyBlockLinearToPitch() {
u32 bytes_per_pixel = 1;
DMA::ImageOperand src_operand;
src_operand.bytes_per_pixel = bytes_per_pixel;

View file

@ -46,6 +46,38 @@ namespace Vulkan {
using VideoCommon::ImageViewType;
namespace {
[[nodiscard]] VkImageAspectFlags AspectMaskFromFormat(VideoCore::Surface::PixelFormat format) {
using VideoCore::Surface::SurfaceType;
switch (VideoCore::Surface::GetFormatType(format)) {
case SurfaceType::ColorTexture:
return VK_IMAGE_ASPECT_COLOR_BIT;
case SurfaceType::Depth:
return VK_IMAGE_ASPECT_DEPTH_BIT;
case SurfaceType::Stencil:
return VK_IMAGE_ASPECT_STENCIL_BIT;
case SurfaceType::DepthStencil:
return VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT;
default:
return VK_IMAGE_ASPECT_COLOR_BIT;
}
}
[[nodiscard]] VkImageSubresourceRange SubresourceRangeFromView(const ImageView& image_view) {
auto range = image_view.range;
if ((image_view.flags & VideoCommon::ImageViewFlagBits::Slice) != VideoCommon::ImageViewFlagBits{}) {
range.base.layer = 0;
range.extent.layers = 1;
}
return VkImageSubresourceRange{
.aspectMask = AspectMaskFromFormat(image_view.format),
.baseMipLevel = static_cast<u32>(range.base.level),
.levelCount = static_cast<u32>(range.extent.levels),
.baseArrayLayer = static_cast<u32>(range.base.layer),
.layerCount = static_cast<u32>(range.extent.layers),
};
}
struct PushConstants {
std::array<float, 2> tex_scale;
std::array<float, 2> tex_offset;
@ -417,6 +449,40 @@ void TransitionImageLayout(vk::CommandBuffer& cmdbuf, VkImage image, VkImageLayo
0, barrier);
}
void RecordShaderReadBarrier(Scheduler& scheduler, const ImageView& image_view) {
const VkImage image = image_view.ImageHandle();
const VkImageSubresourceRange subresource_range = SubresourceRangeFromView(image_view);
scheduler.RequestOutsideRenderPassOperationContext();
scheduler.Record([image, subresource_range](vk::CommandBuffer cmdbuf) {
const VkImageMemoryBarrier barrier{
.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
.pNext = nullptr,
.srcAccessMask = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT |
VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT |
VK_ACCESS_SHADER_WRITE_BIT |
VK_ACCESS_TRANSFER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_SHADER_READ_BIT,
.oldLayout = VK_IMAGE_LAYOUT_GENERAL,
.newLayout = VK_IMAGE_LAYOUT_GENERAL,
.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.image = image,
.subresourceRange = subresource_range,
};
cmdbuf.PipelineBarrier(
VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT |
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT |
VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT |
VK_PIPELINE_STAGE_TRANSFER_BIT |
VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT |
VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT,
VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT |
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
0,
barrier);
});
}
void BeginRenderPass(vk::CommandBuffer& cmdbuf, const Framebuffer* framebuffer) {
const VkRenderPass render_pass = framebuffer->RenderPass();
const VkFramebuffer framebuffer_handle = framebuffer->Handle();
@ -484,7 +550,7 @@ BlitImageHelper::BlitImageHelper(const Device& device_, Scheduler& scheduler_,
BlitImageHelper::~BlitImageHelper() = default;
void BlitImageHelper::BlitColor(const Framebuffer* dst_framebuffer, VkImageView src_view,
void BlitImageHelper::BlitColor(const Framebuffer* dst_framebuffer, const ImageView& src_image_view,
const Region2D& dst_region, const Region2D& src_region,
Tegra::Engines::Fermi2D::Filter filter,
Tegra::Engines::Fermi2D::Operation operation) {
@ -496,10 +562,12 @@ void BlitImageHelper::BlitColor(const Framebuffer* dst_framebuffer, VkImageView
const VkPipelineLayout layout = *one_texture_pipeline_layout;
const VkSampler sampler = is_linear ? *linear_sampler : *nearest_sampler;
const VkPipeline pipeline = FindOrEmplaceColorPipeline(key);
const VkImageView src_view = src_image_view.Handle(Shader::TextureType::Color2D);
RecordShaderReadBarrier(scheduler, src_image_view);
scheduler.RequestRenderpass(dst_framebuffer);
scheduler.Record([this, dst_region, src_region, pipeline, layout, sampler,
src_view](vk::CommandBuffer cmdbuf) {
// TODO: Barriers
const VkDescriptorSet descriptor_set = one_texture_descriptor_allocator.Commit();
UpdateOneTextureDescriptorSet(device, descriptor_set, sampler, src_view);
cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline);
@ -538,7 +606,7 @@ void BlitImageHelper::BlitColor(const Framebuffer* dst_framebuffer, VkImageView
}
void BlitImageHelper::BlitDepthStencil(const Framebuffer* dst_framebuffer,
VkImageView src_depth_view, VkImageView src_stencil_view,
ImageView& src_image_view,
const Region2D& dst_region, const Region2D& src_region,
Tegra::Engines::Fermi2D::Filter filter,
Tegra::Engines::Fermi2D::Operation operation) {
@ -554,10 +622,13 @@ void BlitImageHelper::BlitDepthStencil(const Framebuffer* dst_framebuffer,
const VkPipelineLayout layout = *two_textures_pipeline_layout;
const VkSampler sampler = *nearest_sampler;
const VkPipeline pipeline = FindOrEmplaceDepthStencilPipeline(key);
const VkImageView src_depth_view = src_image_view.DepthView();
const VkImageView src_stencil_view = src_image_view.StencilView();
RecordShaderReadBarrier(scheduler, src_image_view);
scheduler.RequestRenderpass(dst_framebuffer);
scheduler.Record([dst_region, src_region, pipeline, layout, sampler, src_depth_view,
src_stencil_view, this](vk::CommandBuffer cmdbuf) {
// TODO: Barriers
const VkDescriptorSet descriptor_set = two_textures_descriptor_allocator.Commit();
UpdateTwoTexturesDescriptorSet(device, descriptor_set, sampler, src_depth_view,
src_stencil_view);
@ -692,6 +763,7 @@ void BlitImageHelper::Convert(VkPipeline pipeline, const Framebuffer* dst_frameb
const VkSampler sampler = *nearest_sampler;
const VkExtent2D extent = GetConversionExtent(src_image_view);
RecordShaderReadBarrier(scheduler, src_image_view);
scheduler.RequestRenderpass(dst_framebuffer);
scheduler.Record([pipeline, layout, sampler, src_view, extent, this](vk::CommandBuffer cmdbuf) {
const VkOffset2D offset{
@ -717,7 +789,6 @@ void BlitImageHelper::Convert(VkPipeline pipeline, const Framebuffer* dst_frameb
const VkDescriptorSet descriptor_set = one_texture_descriptor_allocator.Commit();
UpdateOneTextureDescriptorSet(device, descriptor_set, sampler, src_view);
// TODO: Barriers
cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline);
cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_GRAPHICS, layout, 0, descriptor_set,
nullptr);
@ -737,6 +808,7 @@ void BlitImageHelper::ConvertDepthStencil(VkPipeline pipeline, const Framebuffer
const VkSampler sampler = *nearest_sampler;
const VkExtent2D extent = GetConversionExtent(src_image_view);
RecordShaderReadBarrier(scheduler, src_image_view);
scheduler.RequestRenderpass(dst_framebuffer);
scheduler.Record([pipeline, layout, sampler, src_depth_view, src_stencil_view, extent,
this](vk::CommandBuffer cmdbuf) {
@ -763,7 +835,6 @@ void BlitImageHelper::ConvertDepthStencil(VkPipeline pipeline, const Framebuffer
const VkDescriptorSet descriptor_set = two_textures_descriptor_allocator.Commit();
UpdateTwoTexturesDescriptorSet(device, descriptor_set, sampler, src_depth_view,
src_stencil_view);
// TODO: Barriers
cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline);
cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_GRAPHICS, layout, 0, descriptor_set,
nullptr);

View file

@ -1,4 +1,7 @@
// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project
// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
// SPDX-License-Identifier: GPL-3.0-or-later
// SPDX-FileCopyrightText: Copyright 2021 yuzu Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#pragma once
@ -43,7 +46,7 @@ public:
StateTracker& state_tracker, DescriptorPool& descriptor_pool);
~BlitImageHelper();
void BlitColor(const Framebuffer* dst_framebuffer, VkImageView src_image_view,
void BlitColor(const Framebuffer* dst_framebuffer, const ImageView& src_image_view,
const Region2D& dst_region, const Region2D& src_region,
Tegra::Engines::Fermi2D::Filter filter,
Tegra::Engines::Fermi2D::Operation operation);
@ -52,9 +55,9 @@ public:
VkImage src_image, VkSampler src_sampler, const Region2D& dst_region,
const Region2D& src_region, const Extent3D& src_size);
void BlitDepthStencil(const Framebuffer* dst_framebuffer, VkImageView src_depth_view,
VkImageView src_stencil_view, const Region2D& dst_region,
const Region2D& src_region, Tegra::Engines::Fermi2D::Filter filter,
void BlitDepthStencil(const Framebuffer* dst_framebuffer, ImageView& src_image_view,
const Region2D& dst_region, const Region2D& src_region,
Tegra::Engines::Fermi2D::Filter filter,
Tegra::Engines::Fermi2D::Operation operation);
void ConvertD32ToR32(const Framebuffer* dst_framebuffer, const ImageView& src_image_view);

View file

@ -1086,8 +1086,8 @@ void TextureCacheRuntime::BlitImage(Framebuffer* dst_framebuffer, ImageView& dst
return;
}
if (aspect_mask == VK_IMAGE_ASPECT_COLOR_BIT && !is_src_msaa && !is_dst_msaa) {
blit_image_helper.BlitColor(dst_framebuffer, src.Handle(Shader::TextureType::Color2D),
dst_region, src_region, filter, operation);
blit_image_helper.BlitColor(dst_framebuffer, src, dst_region, src_region, filter,
operation);
return;
}
ASSERT(src.format == dst.format);
@ -1106,8 +1106,8 @@ void TextureCacheRuntime::BlitImage(Framebuffer* dst_framebuffer, ImageView& dst
}();
if (!can_blit_depth_stencil) {
UNIMPLEMENTED_IF(is_src_msaa || is_dst_msaa);
blit_image_helper.BlitDepthStencil(dst_framebuffer, src.DepthView(), src.StencilView(),
dst_region, src_region, filter, operation);
blit_image_helper.BlitDepthStencil(dst_framebuffer, src, dst_region, src_region,
filter, operation);
return;
}
}
@ -1968,18 +1968,17 @@ bool Image::BlitScaleHelper(bool scale_up) {
blit_framebuffer =
std::make_unique<Framebuffer>(*runtime, view_ptr, nullptr, extent, scale_up);
}
const auto color_view = blit_view->Handle(Shader::TextureType::Color2D);
runtime->blit_image_helper.BlitColor(blit_framebuffer.get(), color_view, dst_region,
runtime->blit_image_helper.BlitColor(blit_framebuffer.get(), *blit_view, dst_region,
src_region, operation, BLIT_OPERATION);
} else if (aspect_mask == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
if (!blit_framebuffer) {
blit_framebuffer =
std::make_unique<Framebuffer>(*runtime, nullptr, view_ptr, extent, scale_up);
}
runtime->blit_image_helper.BlitDepthStencil(blit_framebuffer.get(), blit_view->DepthView(),
blit_view->StencilView(), dst_region,
src_region, operation, BLIT_OPERATION);
runtime->blit_image_helper.BlitDepthStencil(blit_framebuffer.get(), *blit_view,
dst_region, src_region, operation,
BLIT_OPERATION);
} else {
// TODO: Use helper blits where applicable
flags &= ~ImageFlagBits::Rescaled;

131
tools/dtrace-tool.pl Executable file
View file

@ -0,0 +1,131 @@
#!/usr/bin/perl
# SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
# SPDX-License-Identifier: GPL-3.0-or-later
# Basic script to run dtrace sampling over the program (requires Flamegraph)
# Usage is either running as: ./dtrace-tool.sh pid (then input the pid of the process)
# Or just run directly with: ./dtrace-tool.sh <command>
use strict;
use warnings;
use POSIX qw(strftime);
my $input;
my $sampling_hz = '4000';
my $sampling_time = '5';
my $sampling_pid = `pgrep eden`;
my $sampling_program = 'eden';
my $sampling_type = 3;
sub dtrace_ask_params {
my $is_ok = 'Y';
do {
print "Sampling HZ [" . $sampling_hz . "]: ";
chomp($input = <STDIN>);
$sampling_hz = $input || $sampling_hz;
print "Sampling time [" . $sampling_time . "]: ";
chomp($input = <STDIN>);
$sampling_time = $input || $sampling_time;
print "Sampling pid [" . $sampling_pid . "]: ";
chomp($input = <STDIN>);
$sampling_pid = $input || $sampling_pid;
print "Are these settings correct?: [" . $is_ok . "]\n";
print "HZ = " . $sampling_hz . "\nTime = " . $sampling_time . "\nPID = " . $sampling_pid . "\n";
chomp($input = <STDIN>);
$is_ok = $input || $is_ok;
} while ($is_ok eq 'n');
}
sub dtrace_probe_profiling {
if ($sampling_type eq 0) {
return "
profile-".$sampling_hz." /pid == ".$sampling_pid." && arg0/ {
@[stack(100)] = count();
}
profile-".$sampling_hz." /pid == ".$sampling_pid." && arg1/ {
@[ustack(100)] = count();
}
tick-".$sampling_time."s {
exit(0);
}";
} elsif ($sampling_type eq 1) {
return "
syscall:::entry /pid == ".$sampling_pid."/ {
\@traces[ustack(100)] = count();
}
tick-".$sampling_time."s {
exit(0);
}";
} elsif ($sampling_type eq 2) {
return "
profile-".$sampling_hz." /pid == ".$sampling_pid." && arg0/ {
@[stringof(curthread->td_name), stack(100)] = count();
}
profile-".$sampling_hz." /pid == ".$sampling_pid." && arg1/ {
@[stringof(curthread->td_name), ustack(100)] = count();
}
tick-".$sampling_time."s {
exit(0);
}";
} elsif ($sampling_type eq 3) {
return "
io::start /pid == ".$sampling_pid."/ {
@[ustack(100)] = count();
}
tick-".$sampling_time."s {
exit(0);
}";
} else {
die "idk";
}
}
sub dtrace_generate {
my @date = (localtime(time))[5, 4, 3, 2, 1, 0];
$date[0] += 1900;
$date[1]++;
my $fmt_date = sprintf "%4d-%02d-%02d_%02d-%02d-%02d", @date;
my $trace_dir = "dtrace-out";
my $trace_file = $trace_dir . "/" . $fmt_date . ".user_stacks";
my $trace_fold = $trace_dir . "/" . $fmt_date . ".fold";
my $trace_svg = $trace_dir . "/" . $fmt_date . ".svg";
my $trace_probe = dtrace_probe_profiling;
print $trace_probe . "\n";
system "sudo", "dtrace", "-Z", "-n", $trace_probe, "-o", $trace_file;
die "$!" if $?;
open (my $trace_fold_handle, ">", $trace_fold) or die "$!";
#run ["perl", "../FlameGraph/stackcollapse.pl", $trace_file], ">", \my $fold_output;
my $fold_output = `perl ../FlameGraph/stackcollapse.pl $trace_file`;
print $trace_fold_handle $fold_output;
open (my $trace_svg_handle, ">", $trace_svg) or die "$!";
#run ["perl", "../FlameGraph/flamegraph.pl", $trace_fold], ">", \my $svg_output;
my $svg_output = `perl ../FlameGraph/flamegraph.pl $trace_fold`;
print $trace_svg_handle $svg_output;
system "sudo", "chmod", "0666", $trace_file;
}
foreach my $i (0 .. $#ARGV) {
if ($ARGV[$i] eq '-h') {
print "Usage: $0\n";
printf "%-20s%s\n", "-p", "Prompt for parameters";
printf "%-20s%s\n", "-g", "Generate dtrace output";
printf "%-20s%s\n", "-s", "Continously generate output until Ctrl^C";
printf "%-20s%s\n", "-<n>", "Select dtrace type";
} elsif ($ARGV[$i] eq '-g') {
dtrace_generate;
} elsif ($ARGV[$i] eq '-s') {
while (1) {
dtrace_generate;
}
} elsif ($ARGV[$i] eq '-p') {
dtrace_ask_params;
} else {
$sampling_type = substr $ARGV[$i], 1;
print "Select: ".$sampling_type."\n";
}
}

View file

@ -1,42 +0,0 @@
#!/usr/local/bin/bash -ex
# SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
# SPDX-License-Identifier: GPL-3.0-or-later
# Basic script to run dtrace sampling over the program (requires Flamegraph)
# Usage is either running as: ./dtrace-tool.sh pid (then input the pid of the process)
# Or just run directly with: ./dtrace-tool.sh <command>
FLAMEGRAPH_DIR=".."
function fail {
printf '%s\n' "$1" >&2
exit "${2-1}"
}
[ -f $FLAMEGRAPH_DIR/FlameGraph/stackcollapse.pl ] || fail 'Where is flamegraph?'
#[ which dtrace ] || fail 'Needs DTrace installed'
read -p "Sampling Hz [800]: " TRACE_CFG_HZ
if [ -z "${TRACE_CFG_HZ}" ]; then
TRACE_CFG_HZ=800
fi
read -p "Sampling time [5] sec: " TRACE_CFG_TIME
if [ -z "${TRACE_CFG_TIME}" ]; then
TRACE_CFG_TIME=5
fi
TRACE_FILE=dtrace-out.user_stacks
TRACE_FOLD=dtrace-out.fold
TRACE_SVG=dtrace-out.svg
ps
if [[ $1 = 'pid' ]]; then
read -p "PID: " TRACE_CFG_PID
sudo echo 'Sudo!'
else
[[ -f $1 && $1 ]] || fail 'Usage: ./tools/dtrace-profile.sh <path to program>'
echo "Executing: '$@'"
sudo echo 'Sudo!'
"$@" &
TRACE_CFG_PID=$!
fi
TRACE_PROBE="profile-${TRACE_CFG_HZ} /pid == ${TRACE_CFG_PID} && arg1/ { @[ustack()] = count(); } tick-${TRACE_CFG_TIME}s { exit(0); }"
rm -- $TRACE_SVG || echo 'Skip'
sudo dtrace -x ustackframes=100 -Z -n "$TRACE_PROBE" -o $TRACE_FILE 2>/dev/null || exit
perl $FLAMEGRAPH_DIR/FlameGraph/stackcollapse.pl $TRACE_FILE > $TRACE_FOLD || exit
perl $FLAMEGRAPH_DIR/FlameGraph/flamegraph.pl $TRACE_FOLD > $TRACE_SVG || exit
sudo chmod 0666 $TRACE_FILE
rm -- $TRACE_FILE $TRACE_FOLD