From 1ea9091e134176fa5263cac886c9044085fc36b8 Mon Sep 17 00:00:00 2001
From: Ribbit
Date: Mon, 6 Oct 2025 00:31:59 -0700
Subject: [PATCH 1/3] [vk] Experimental StreamBuffer Changes

---
 .../vk_staging_buffer_pool.cpp | 60 ++++++++++++++-----
 1 file changed, 46 insertions(+), 14 deletions(-)

diff --git a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp
index 08513d1534..ecc4f77dc7 100644
--- a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp
+++ b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp
@@ -33,19 +33,29 @@ constexpr VkDeviceSize MAX_STREAM_BUFFER_SIZE = 128_MiB;
 size_t GetStreamBufferSize(const Device& device) {
     VkDeviceSize size{0};
     if (device.HasDebuggingToolAttached()) {
-        ForEachDeviceLocalHostVisibleHeap(device, [&size](size_t index, VkMemoryHeap& heap) {
+        bool found_heap = false;
+        ForEachDeviceLocalHostVisibleHeap(device, [&size, &found_heap](size_t /*index*/, VkMemoryHeap& heap) {
             size = (std::max)(size, heap.size);
+            found_heap = true;
         });
-        // If rebar is not supported, cut the max heap size to 40%. This will allow 2 captures to be
-        // loaded at the same time in RenderDoc. If rebar is supported, this shouldn't be an issue
-        // as the heap will be much larger.
-        if (size <= 256_MiB) {
+        // If no suitable heap was found fall back to the default cap to avoid creating a zero-sized stream buffer.
+        if (!found_heap) {
+            size = MAX_STREAM_BUFFER_SIZE;
+        } else if (size <= 256_MiB) {
+            // If rebar is not supported, cut the max heap size to 40%. This will allow 2 captures to be
+            // loaded at the same time in RenderDoc. If rebar is supported, this shouldn't be an issue
+            // as the heap will be much larger.
             size = size * 40 / 100;
         }
     } else {
         size = MAX_STREAM_BUFFER_SIZE;
     }
-    return (std::min)(Common::AlignUp(size, MAX_ALIGNMENT), MAX_STREAM_BUFFER_SIZE);
+
+    // Clamp to the configured maximum, align up for safety, and ensure a sane minimum so
+    // region_size (stream_buffer_size / NUM_SYNCS) never becomes zero.
+    const VkDeviceSize aligned = (std::min)(Common::AlignUp(size, MAX_ALIGNMENT), MAX_STREAM_BUFFER_SIZE);
+    const VkDeviceSize min_size = MAX_ALIGNMENT * StagingBufferPool::NUM_SYNCS;
+    return static_cast<size_t>((std::max)(aligned, min_size));
 }
 
 } // Anonymous namespace
@@ -106,31 +116,53 @@ void StagingBufferPool::TickFrame() {
 }
 
 StagingBufferRef StagingBufferPool::GetStreamBuffer(size_t size) {
-    if (AreRegionsActive(Region(free_iterator) + 1,
-                         (std::min)(Region(iterator + size) + 1, NUM_SYNCS))) {
+    const size_t aligned_size = Common::AlignUp(size, MAX_ALIGNMENT);
+    const bool wraps = iterator + size >= stream_buffer_size;
+    const size_t new_iterator =
+        wraps ? aligned_size : Common::AlignUp(iterator + size, MAX_ALIGNMENT);
+    const size_t begin_region = wraps ? 0 : Region(iterator);
+    const size_t last_byte = new_iterator == 0 ? 0 : new_iterator - 1;
+    const size_t end_region = (std::min)(Region(last_byte) + 1, NUM_SYNCS);
+    const size_t guard_begin = (std::min)(Region(free_iterator) + 1, NUM_SYNCS);
+
+    if (!wraps) {
+        if (guard_begin < end_region && AreRegionsActive(guard_begin, end_region)) {
+            // Avoid waiting for the previous usages to be free
+            return GetStagingBuffer(size, MemoryUsage::Upload);
+        }
+    } else if (guard_begin < NUM_SYNCS && AreRegionsActive(guard_begin, NUM_SYNCS)) {
         // Avoid waiting for the previous usages to be free
         return GetStagingBuffer(size, MemoryUsage::Upload);
     }
+
     const u64 current_tick = scheduler.CurrentTick();
     std::fill(sync_ticks.begin() + Region(used_iterator), sync_ticks.begin() + Region(iterator),
               current_tick);
     used_iterator = iterator;
-    free_iterator = (std::max)(free_iterator, iterator + size);
 
-    if (iterator + size >= stream_buffer_size) {
+    if (wraps) {
         std::fill(sync_ticks.begin() + Region(used_iterator), sync_ticks.begin() + NUM_SYNCS,
                   current_tick);
         used_iterator = 0;
         iterator = 0;
         free_iterator = size;
-
-        if (AreRegionsActive(0, Region(size) + 1)) {
+        const size_t head_last_byte = aligned_size == 0 ? 0 : aligned_size - 1;
+        const size_t head_end_region = (std::min)(Region(head_last_byte) + 1, NUM_SYNCS);
+        if (AreRegionsActive(0, head_end_region)) {
             // Avoid waiting for the previous usages to be free
             return GetStagingBuffer(size, MemoryUsage::Upload);
         }
     }
-    const size_t offset = iterator;
-    iterator = Common::AlignUp(iterator + size, MAX_ALIGNMENT);
+
+    std::fill(sync_ticks.begin() + begin_region, sync_ticks.begin() + end_region, current_tick);
+
+    const size_t offset = wraps ? 0 : iterator;
+    iterator = new_iterator;
+
+    if (!wraps) {
+        free_iterator = (std::max)(free_iterator, offset + size);
+    }
+
     return StagingBufferRef{
         .buffer = *stream_buffer,
        .offset = static_cast<VkDeviceSize>(offset),
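A note on the region bookkeeping above: the stream buffer is carved into
NUM_SYNCS equal regions, and every region an allocation touches is stamped with
the current scheduler tick so later allocations can wait on exactly those
regions. The subtle part is that end_region is now derived from the last byte
actually written (new_iterator - 1) rather than from the one-past-the-end
offset, so an allocation that ends exactly on a region boundary no longer
stamps or waits on a region it never touches. A minimal standalone sketch of
that arithmetic, with made-up values for NUM_SYNCS, the buffer size and
MAX_ALIGNMENT (the real constants live in vk_staging_buffer_pool.cpp):

    // Standalone model of the Region()/end_region math; the constants below
    // are assumptions for illustration only.
    #include <algorithm>
    #include <cstddef>
    #include <cstdio>

    constexpr size_t NUM_SYNCS = 16;                // assumed sync slot count
    constexpr size_t STREAM_BUFFER_SIZE = 1 << 20;  // assumed 1 MiB stream buffer
    constexpr size_t REGION_SIZE = STREAM_BUFFER_SIZE / NUM_SYNCS;
    constexpr size_t MAX_ALIGNMENT = 256;           // assumed copy alignment

    constexpr size_t Region(size_t offset) { return offset / REGION_SIZE; }
    constexpr size_t AlignUp(size_t value, size_t align) {
        return (value + align - 1) / align * align;
    }

    int main() {
        const size_t iterator = 3 * REGION_SIZE; // current write cursor
        const size_t size = REGION_SIZE;         // requested allocation

        const size_t new_iterator = AlignUp(iterator + size, MAX_ALIGNMENT);
        // One-past-the-end lands exactly on the next region boundary...
        const size_t naive_end = Region(new_iterator) + 1;
        // ...but nothing is written there, so clamp to the region holding the
        // last byte the allocation actually touches.
        const size_t last_byte = new_iterator == 0 ? 0 : new_iterator - 1;
        const size_t end_region = std::min(Region(last_byte) + 1, NUM_SYNCS);

        std::printf("stamps regions [%zu, %zu) instead of [%zu, %zu)\n",
                    Region(iterator), end_region, Region(iterator), naive_end);
    }

With these numbers the allocation stamps and guards regions [3, 4) instead of
[3, 5), which keeps back-to-back stream allocations from serializing on a
region they never reach.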
From 5d4cfe195b9d221a0b7c7195dd0fd0f8d38b1838 Mon Sep 17 00:00:00 2001
From: Ribbit
Date: Mon, 6 Oct 2025 17:39:32 +0200
Subject: [PATCH 2/3] [vk] Fix Vulkan Upload & Present Barriers for Spec
 Compliance (#2681)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The barrier before the CPU-upload copy was using VK_PIPELINE_STAGE_HOST_BIT.
Validation rules only allow HOST as the source stage if you’re also specifying
host-side access flags; inside a command buffer the GPU isn’t executing “host
work,” so pairing that stage with the usual image layout transition is
technically invalid. Switching the source stage to
VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT keeps the ordering guarantee we need and
satisfies the spec, while the actual host visibility is still handled correctly
by the preceding vmaFlushAllocation.
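For reference, the intended ordering looks roughly like this when written
against the raw Vulkan C API instead of the vk::CommandBuffer wrapper; the
handles, the UNDEFINED old layout and the copy region are placeholders, only
the stage and access choices mirror the change:

    #include <vulkan/vulkan.h>

    void RecordRawImageUpload(VkCommandBuffer cmdbuf, VkBuffer staging_buffer,
                              VkImage raw_image, const VkBufferImageCopy& copy) {
        // Host writes to the staging allocation are flushed with
        // vmaFlushAllocation before this command buffer is submitted, and
        // vkQueueSubmit makes flushed host writes available to the device, so
        // no HOST source stage is required inside the command buffer.
        VkImageMemoryBarrier to_transfer_dst{};
        to_transfer_dst.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
        to_transfer_dst.srcAccessMask = 0;
        to_transfer_dst.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
        to_transfer_dst.oldLayout = VK_IMAGE_LAYOUT_UNDEFINED; // contents are overwritten
        to_transfer_dst.newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
        to_transfer_dst.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        to_transfer_dst.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        to_transfer_dst.image = raw_image;
        to_transfer_dst.subresourceRange = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1};

        // TOP_OF_PIPE as the source stage: nothing recorded earlier has to
        // finish, the barrier only orders the layout transition against the
        // transfer below.
        vkCmdPipelineBarrier(cmdbuf, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
                             VK_PIPELINE_STAGE_TRANSFER_BIT, 0, 0, nullptr,
                             0, nullptr, 1, &to_transfer_dst);

        vkCmdCopyBufferToImage(cmdbuf, staging_buffer, raw_image,
                               VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, &copy);
    }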
Co-authored-by: Ribbit
Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/2681
Reviewed-by: MaranBr
Reviewed-by: Shinmegumi
Co-authored-by: Ribbit
Co-committed-by: Ribbit
---
 src/video_core/renderer_vulkan/present/layer.cpp      | 2 +-
 src/video_core/renderer_vulkan/vk_present_manager.cpp | 4 ++--
 src/video_core/renderer_vulkan/vk_texture_cache.cpp   | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/video_core/renderer_vulkan/present/layer.cpp b/src/video_core/renderer_vulkan/present/layer.cpp
index 5676dfe62a..fee19a69c2 100644
--- a/src/video_core/renderer_vulkan/present/layer.cpp
+++ b/src/video_core/renderer_vulkan/present/layer.cpp
@@ -332,7 +332,7 @@ void Layer::UpdateRawImage(const Tegra::FramebufferConfig& framebuffer, size_t i
         write_barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT;
         write_barrier.oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
 
-        cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, 0,
+        cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, 0,
                                read_barrier);
         cmdbuf.CopyBufferToImage(*buffer, image, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, copy);
         cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT,
diff --git a/src/video_core/renderer_vulkan/vk_present_manager.cpp b/src/video_core/renderer_vulkan/vk_present_manager.cpp
index 0b29ad1389..161f6c8b9f 100644
--- a/src/video_core/renderer_vulkan/vk_present_manager.cpp
+++ b/src/video_core/renderer_vulkan/vk_present_manager.cpp
@@ -412,7 +412,7 @@ void PresentManager::CopyToSwapchainImpl(Frame* frame) {
         .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
         .pNext = nullptr,
         .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
-        .dstAccessMask = VK_ACCESS_MEMORY_READ_BIT,
+        .dstAccessMask = 0,
         .oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
         .newLayout = VK_IMAGE_LAYOUT_PRESENT_SRC_KHR,
         .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
@@ -460,7 +460,7 @@ void PresentManager::CopyToSwapchainImpl(Frame* frame) {
                           MakeImageCopy(frame->width, frame->height, extent.width, extent.height));
     }
 
-    cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT, {},
+    cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, {},
                            {}, {}, post_barriers);
 
     cmdbuf.End();
diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp
index 575651905e..50a73ea76d 100644
--- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp
@@ -1068,7 +1068,7 @@ void TextureCacheRuntime::ReinterpretImage(Image& dst, Image& src,
 
         cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
                                0, READ_BARRIER, {}, middle_out_barrier);
-        cmdbuf.CopyBufferToImage(copy_buffer, dst_image, VK_IMAGE_LAYOUT_GENERAL, vk_out_copies);
+        cmdbuf.CopyBufferToImage(copy_buffer, dst_image, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, vk_out_copies);
         cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
                                0, {}, {}, post_barriers);
     });
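The vk_present_manager.cpp half of this change follows the usual swapchain
hand-off pattern: the presentation engine's reads are synchronized by the
semaphore handed to vkQueuePresentKHR, not by this barrier, so the transition
to PRESENT_SRC_KHR needs no destination access mask, and BOTTOM_OF_PIPE as the
destination stage only states that no later work in this queue depends on the
image. A raw-Vulkan sketch of that pattern with placeholder handles (not the
project's wrapper code):

    #include <vulkan/vulkan.h>

    void RecordPresentTransition(VkCommandBuffer cmdbuf, VkImage swapchain_image) {
        VkImageMemoryBarrier to_present{};
        to_present.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
        to_present.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; // the copy/blit above
        to_present.dstAccessMask = 0; // presentation is not a device queue access
        to_present.oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
        to_present.newLayout = VK_IMAGE_LAYOUT_PRESENT_SRC_KHR;
        to_present.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        to_present.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        to_present.image = swapchain_image;
        to_present.subresourceRange = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1};

        vkCmdPipelineBarrier(cmdbuf, VK_PIPELINE_STAGE_TRANSFER_BIT,
                             VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, 0, 0, nullptr,
                             0, nullptr, 1, &to_present);
    }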
From 0a1faf9f5d5f4426f3f0add4cb493a7abef36e14 Mon Sep 17 00:00:00 2001
From: Ribbit
Date: Mon, 6 Oct 2025 00:31:59 -0700
Subject: [PATCH 3/3] [vk] Experimental StreamBuffer Changes

---
 .../vk_staging_buffer_pool.cpp | 60 ++++++++++++++-----
 1 file changed, 46 insertions(+), 14 deletions(-)

diff --git a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp
index 08513d1534..ecc4f77dc7 100644
--- a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp
+++ b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp
@@ -33,19 +33,29 @@ constexpr VkDeviceSize MAX_STREAM_BUFFER_SIZE = 128_MiB;
 size_t GetStreamBufferSize(const Device& device) {
     VkDeviceSize size{0};
     if (device.HasDebuggingToolAttached()) {
-        ForEachDeviceLocalHostVisibleHeap(device, [&size](size_t index, VkMemoryHeap& heap) {
+        bool found_heap = false;
+        ForEachDeviceLocalHostVisibleHeap(device, [&size, &found_heap](size_t /*index*/, VkMemoryHeap& heap) {
             size = (std::max)(size, heap.size);
+            found_heap = true;
         });
-        // If rebar is not supported, cut the max heap size to 40%. This will allow 2 captures to be
-        // loaded at the same time in RenderDoc. If rebar is supported, this shouldn't be an issue
-        // as the heap will be much larger.
-        if (size <= 256_MiB) {
+        // If no suitable heap was found fall back to the default cap to avoid creating a zero-sized stream buffer.
+        if (!found_heap) {
+            size = MAX_STREAM_BUFFER_SIZE;
+        } else if (size <= 256_MiB) {
+            // If rebar is not supported, cut the max heap size to 40%. This will allow 2 captures to be
+            // loaded at the same time in RenderDoc. If rebar is supported, this shouldn't be an issue
+            // as the heap will be much larger.
             size = size * 40 / 100;
         }
     } else {
         size = MAX_STREAM_BUFFER_SIZE;
     }
-    return (std::min)(Common::AlignUp(size, MAX_ALIGNMENT), MAX_STREAM_BUFFER_SIZE);
+
+    // Clamp to the configured maximum, align up for safety, and ensure a sane minimum so
+    // region_size (stream_buffer_size / NUM_SYNCS) never becomes zero.
+    const VkDeviceSize aligned = (std::min)(Common::AlignUp(size, MAX_ALIGNMENT), MAX_STREAM_BUFFER_SIZE);
+    const VkDeviceSize min_size = MAX_ALIGNMENT * StagingBufferPool::NUM_SYNCS;
+    return static_cast<size_t>((std::max)(aligned, min_size));
 }
 
 } // Anonymous namespace
@@ -106,31 +116,53 @@ void StagingBufferPool::TickFrame() {
 }
 
 StagingBufferRef StagingBufferPool::GetStreamBuffer(size_t size) {
-    if (AreRegionsActive(Region(free_iterator) + 1,
-                         (std::min)(Region(iterator + size) + 1, NUM_SYNCS))) {
+    const size_t aligned_size = Common::AlignUp(size, MAX_ALIGNMENT);
+    const bool wraps = iterator + size >= stream_buffer_size;
+    const size_t new_iterator =
+        wraps ? aligned_size : Common::AlignUp(iterator + size, MAX_ALIGNMENT);
+    const size_t begin_region = wraps ? 0 : Region(iterator);
+    const size_t last_byte = new_iterator == 0 ? 0 : new_iterator - 1;
+    const size_t end_region = (std::min)(Region(last_byte) + 1, NUM_SYNCS);
+    const size_t guard_begin = (std::min)(Region(free_iterator) + 1, NUM_SYNCS);
+
+    if (!wraps) {
+        if (guard_begin < end_region && AreRegionsActive(guard_begin, end_region)) {
+            // Avoid waiting for the previous usages to be free
+            return GetStagingBuffer(size, MemoryUsage::Upload);
+        }
+    } else if (guard_begin < NUM_SYNCS && AreRegionsActive(guard_begin, NUM_SYNCS)) {
        // Avoid waiting for the previous usages to be free
         return GetStagingBuffer(size, MemoryUsage::Upload);
     }
+
     const u64 current_tick = scheduler.CurrentTick();
     std::fill(sync_ticks.begin() + Region(used_iterator), sync_ticks.begin() + Region(iterator),
               current_tick);
     used_iterator = iterator;
-    free_iterator = (std::max)(free_iterator, iterator + size);
 
-    if (iterator + size >= stream_buffer_size) {
+    if (wraps) {
         std::fill(sync_ticks.begin() + Region(used_iterator), sync_ticks.begin() + NUM_SYNCS,
                   current_tick);
         used_iterator = 0;
         iterator = 0;
         free_iterator = size;
-
-        if (AreRegionsActive(0, Region(size) + 1)) {
+        const size_t head_last_byte = aligned_size == 0 ? 0 : aligned_size - 1;
+        const size_t head_end_region = (std::min)(Region(head_last_byte) + 1, NUM_SYNCS);
+        if (AreRegionsActive(0, head_end_region)) {
             // Avoid waiting for the previous usages to be free
             return GetStagingBuffer(size, MemoryUsage::Upload);
         }
     }
-    const size_t offset = iterator;
-    iterator = Common::AlignUp(iterator + size, MAX_ALIGNMENT);
+
+    std::fill(sync_ticks.begin() + begin_region, sync_ticks.begin() + end_region, current_tick);
+
+    const size_t offset = wraps ? 0 : iterator;
+    iterator = new_iterator;
+
+    if (!wraps) {
+        free_iterator = (std::max)(free_iterator, offset + size);
+    }
+
     return StagingBufferRef{
         .buffer = *stream_buffer,
         .offset = static_cast<VkDeviceSize>(offset),
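To make the sizing rules in GetStreamBufferSize concrete, here is a small
self-contained model of the three interesting cases: no DEVICE_LOCAL |
HOST_VISIBLE heap found, a small (no-ReBAR) heap cut to 40%, and a large ReBAR
heap clamped to 128 MiB. The MAX_ALIGNMENT and NUM_SYNCS values below are
assumptions for the example; the real constants are defined in
vk_staging_buffer_pool.cpp:

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    constexpr uint64_t MiB = 1024ull * 1024ull;
    constexpr uint64_t MAX_STREAM_BUFFER_SIZE = 128 * MiB;
    constexpr uint64_t MAX_ALIGNMENT = 256; // assumed
    constexpr uint64_t NUM_SYNCS = 16;      // assumed

    constexpr uint64_t AlignUp(uint64_t value, uint64_t align) {
        return (value + align - 1) / align * align;
    }

    uint64_t StreamBufferSize(bool debugger_attached, bool found_heap, uint64_t heap_size) {
        uint64_t size = MAX_STREAM_BUFFER_SIZE;
        if (debugger_attached) {
            if (!found_heap) {
                size = MAX_STREAM_BUFFER_SIZE; // no DEVICE_LOCAL | HOST_VISIBLE heap
            } else if (heap_size <= 256 * MiB) {
                size = heap_size * 40 / 100;   // leave room for RenderDoc captures
            } else {
                size = heap_size;              // ReBAR-sized heap, clamped below
            }
        }
        const uint64_t aligned = std::min(AlignUp(size, MAX_ALIGNMENT), MAX_STREAM_BUFFER_SIZE);
        // Floor at MAX_ALIGNMENT * NUM_SYNCS so region_size never becomes zero.
        return std::max(aligned, MAX_ALIGNMENT * NUM_SYNCS);
    }

    int main() {
        std::printf("no heap found: %llu MiB\n",
                    static_cast<unsigned long long>(StreamBufferSize(true, false, 0) / MiB));
        std::printf("192 MiB heap:  %llu MiB\n",
                    static_cast<unsigned long long>(StreamBufferSize(true, true, 192 * MiB) / MiB));
        std::printf("8 GiB heap:    %llu MiB\n",
                    static_cast<unsigned long long>(StreamBufferSize(true, true, 8192 * MiB) / MiB));
    }

With a debugger attached this prints 128, 76 and 128 MiB respectively; without
one the function always returns the 128 MiB default.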