Compare commits

2 commits

Author SHA1 Message Date
Ribbit
0a1faf9f5d [vk] Experimental StreamBuffer Changes
2025-10-06 18:26:43 +02:00
5d4cfe195b
[vk] Fix Vulkan Upload & Present Barriers for Spec Compliance (#2681)
The barrier before the CPU-upload copy was using VK_PIPELINE_STAGE_HOST_BIT. Validation rules only allow HOST as the source stage if you’re also specifying host-side access flags; inside a command buffer the GPU isn’t executing “host work,” so pairing that stage with the usual image layout transition is technically invalid.

Switching the source stage to VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT keeps the ordering guarantee we need and satisfies the spec, while the actual host visibility is still handled correctly by the preceding vmaFlushAllocation.
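
As an illustration only (not the emulator's actual code; allocator, allocation, cmd, and image are placeholder names), the corrected ordering against the raw Vulkan and VMA APIs looks roughly like this:

#include <vk_mem_alloc.h>
#include <vulkan/vulkan.h>

// Sketch: make the host write visible, then transition the image for the transfer copy.
// TOP_OF_PIPE carries no access flags, so srcAccessMask stays 0; host visibility is
// already handled by the flush (or by HOST_COHERENT memory).
void PrepareUpload(VmaAllocator allocator, VmaAllocation allocation, VkCommandBuffer cmd,
                   VkImage image) {
    vmaFlushAllocation(allocator, allocation, 0, VK_WHOLE_SIZE);

    const VkImageMemoryBarrier to_transfer{
        .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
        .pNext = nullptr,
        .srcAccessMask = 0,
        .dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
        .oldLayout = VK_IMAGE_LAYOUT_UNDEFINED,
        .newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
        .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .image = image,
        .subresourceRange = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1},
    };
    vkCmdPipelineBarrier(cmd, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
                         0, 0, nullptr, 0, nullptr, 1, &to_transfer);
    // vkCmdCopyBufferToImage(...) follows here.
}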

Co-authored-by: Ribbit <ribbit@placeholder.com>
Reviewed-on: #2681
Reviewed-by: MaranBr <maranbr@eden-emu.dev>
Reviewed-by: Shinmegumi <shinmegumi@eden-emu.dev>
Co-authored-by: Ribbit <ribbit@eden-emu.dev>
Co-committed-by: Ribbit <ribbit@eden-emu.dev>
2025-10-06 17:39:32 +02:00
4 changed files with 50 additions and 18 deletions

@@ -332,7 +332,7 @@ void Layer::UpdateRawImage(const Tegra::FramebufferConfig& framebuffer, size_t i
     write_barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT;
     write_barrier.oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
-    cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, 0,
+    cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, 0,
                            read_barrier);
     cmdbuf.CopyBufferToImage(*buffer, image, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, copy);
     cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT,

@@ -412,7 +412,7 @@ void PresentManager::CopyToSwapchainImpl(Frame* frame) {
         .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
         .pNext = nullptr,
         .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
-        .dstAccessMask = VK_ACCESS_MEMORY_READ_BIT,
+        .dstAccessMask = 0,
         .oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
         .newLayout = VK_IMAGE_LAYOUT_PRESENT_SRC_KHR,
         .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
@@ -460,7 +460,7 @@ void PresentManager::CopyToSwapchainImpl(Frame* frame) {
                           MakeImageCopy(frame->width, frame->height, extent.width, extent.height));
     }
-    cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT, {},
+    cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, {},
                            {}, {}, post_barriers);
     cmdbuf.End();
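
For reference, the intent of the two present-manager changes above, sketched against the raw Vulkan API (cmd and swapchain_image are placeholder names, the barrier fields are illustrative): the presentation engine is synchronized by the semaphore handed to vkQueuePresentKHR, so the release barrier only needs the layout transition, with no destination access mask and BOTTOM_OF_PIPE as the destination stage.

#include <vulkan/vulkan.h>

// Sketch: release a swapchain image for presentation after a transfer copy.
void ReleaseForPresent(VkCommandBuffer cmd, VkImage swapchain_image) {
    const VkImageMemoryBarrier present_barrier{
        .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
        .pNext = nullptr,
        .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
        // No destination access: the presentation engine's access is ordered by the
        // present semaphore, not by this barrier.
        .dstAccessMask = 0,
        .oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
        .newLayout = VK_IMAGE_LAYOUT_PRESENT_SRC_KHR,
        .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .image = swapchain_image,
        .subresourceRange = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1},
    };
    // BOTTOM_OF_PIPE as dstStageMask: nothing later in this command buffer touches the
    // image, so no further execution dependency is needed inside the command buffer.
    vkCmdPipelineBarrier(cmd, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT,
                         0, 0, nullptr, 0, nullptr, 1, &present_barrier);
}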

@@ -33,19 +33,29 @@ constexpr VkDeviceSize MAX_STREAM_BUFFER_SIZE = 128_MiB;
 size_t GetStreamBufferSize(const Device& device) {
     VkDeviceSize size{0};
     if (device.HasDebuggingToolAttached()) {
-        ForEachDeviceLocalHostVisibleHeap(device, [&size](size_t index, VkMemoryHeap& heap) {
+        bool found_heap = false;
+        ForEachDeviceLocalHostVisibleHeap(device, [&size, &found_heap](size_t /*index*/, VkMemoryHeap& heap) {
             size = (std::max)(size, heap.size);
+            found_heap = true;
         });
-        // If rebar is not supported, cut the max heap size to 40%. This will allow 2 captures to be
-        // loaded at the same time in RenderDoc. If rebar is supported, this shouldn't be an issue
-        // as the heap will be much larger.
-        if (size <= 256_MiB) {
+        // If no suitable heap was found fall back to the default cap to avoid creating a zero-sized stream buffer.
+        if (!found_heap) {
+            size = MAX_STREAM_BUFFER_SIZE;
+        } else if (size <= 256_MiB) {
+            // If rebar is not supported, cut the max heap size to 40%. This will allow 2 captures to be
+            // loaded at the same time in RenderDoc. If rebar is supported, this shouldn't be an issue
+            // as the heap will be much larger.
             size = size * 40 / 100;
         }
     } else {
         size = MAX_STREAM_BUFFER_SIZE;
     }
-    return (std::min)(Common::AlignUp(size, MAX_ALIGNMENT), MAX_STREAM_BUFFER_SIZE);
+    // Clamp to the configured maximum, align up for safety, and ensure a sane minimum so
+    // region_size (stream_buffer_size / NUM_SYNCS) never becomes zero.
+    const VkDeviceSize aligned = (std::min)(Common::AlignUp(size, MAX_ALIGNMENT), MAX_STREAM_BUFFER_SIZE);
+    const VkDeviceSize min_size = MAX_ALIGNMENT * StagingBufferPool::NUM_SYNCS;
+    return static_cast<size_t>((std::max)(aligned, min_size));
 }
 } // Anonymous namespace
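
To illustrate the sizing math above with a standalone sketch (the constants here are assumptions, not the emulator's actual values): any result below MAX_ALIGNMENT * NUM_SYNCS would make stream_buffer_size / NUM_SYNCS truncate to zero, so the return value is floored at that product.

#include <algorithm>
#include <cassert>
#include <cstdint>

// Assumed values for illustration; the real constants live in the emulator.
constexpr uint64_t MAX_ALIGNMENT = 256;
constexpr uint64_t NUM_SYNCS = 16;
constexpr uint64_t MAX_STREAM_BUFFER_SIZE = 128ull << 20; // 128 MiB

constexpr uint64_t AlignUp(uint64_t value, uint64_t alignment) {
    return (value + alignment - 1) / alignment * alignment;
}

constexpr uint64_t ClampStreamBufferSize(uint64_t size) {
    const uint64_t aligned = std::min(AlignUp(size, MAX_ALIGNMENT), MAX_STREAM_BUFFER_SIZE);
    const uint64_t min_size = MAX_ALIGNMENT * NUM_SYNCS;
    return std::max(aligned, min_size);
}

int main() {
    // A zero-sized heap query no longer yields a zero-sized stream buffer...
    assert(ClampStreamBufferSize(0) / NUM_SYNCS > 0);
    // ...and large heaps are still capped at the configured maximum.
    assert(ClampStreamBufferSize(8ull << 30) == MAX_STREAM_BUFFER_SIZE);
    return 0;
}
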
@@ -106,31 +116,53 @@ void StagingBufferPool::TickFrame() {
 }
 StagingBufferRef StagingBufferPool::GetStreamBuffer(size_t size) {
-    if (AreRegionsActive(Region(free_iterator) + 1,
-                         (std::min)(Region(iterator + size) + 1, NUM_SYNCS))) {
+    const size_t aligned_size = Common::AlignUp(size, MAX_ALIGNMENT);
+    const bool wraps = iterator + size >= stream_buffer_size;
+    const size_t new_iterator =
+        wraps ? aligned_size : Common::AlignUp(iterator + size, MAX_ALIGNMENT);
+    const size_t begin_region = wraps ? 0 : Region(iterator);
+    const size_t last_byte = new_iterator == 0 ? 0 : new_iterator - 1;
+    const size_t end_region = (std::min)(Region(last_byte) + 1, NUM_SYNCS);
+    const size_t guard_begin = (std::min)(Region(free_iterator) + 1, NUM_SYNCS);
+    if (!wraps) {
+        if (guard_begin < end_region && AreRegionsActive(guard_begin, end_region)) {
            // Avoid waiting for the previous usages to be free
            return GetStagingBuffer(size, MemoryUsage::Upload);
        }
+    } else if (guard_begin < NUM_SYNCS && AreRegionsActive(guard_begin, NUM_SYNCS)) {
+        // Avoid waiting for the previous usages to be free
+        return GetStagingBuffer(size, MemoryUsage::Upload);
+    }
     const u64 current_tick = scheduler.CurrentTick();
     std::fill(sync_ticks.begin() + Region(used_iterator), sync_ticks.begin() + Region(iterator),
               current_tick);
     used_iterator = iterator;
     free_iterator = (std::max)(free_iterator, iterator + size);
-    if (iterator + size >= stream_buffer_size) {
+    if (wraps) {
         std::fill(sync_ticks.begin() + Region(used_iterator), sync_ticks.begin() + NUM_SYNCS,
                   current_tick);
         used_iterator = 0;
-        iterator = 0;
-        free_iterator = size;
-        if (AreRegionsActive(0, Region(size) + 1)) {
+        const size_t head_last_byte = aligned_size == 0 ? 0 : aligned_size - 1;
+        const size_t head_end_region = (std::min)(Region(head_last_byte) + 1, NUM_SYNCS);
+        if (AreRegionsActive(0, head_end_region)) {
             // Avoid waiting for the previous usages to be free
             return GetStagingBuffer(size, MemoryUsage::Upload);
         }
     }
-    const size_t offset = iterator;
-    iterator = Common::AlignUp(iterator + size, MAX_ALIGNMENT);
+    std::fill(sync_ticks.begin() + begin_region, sync_ticks.begin() + end_region, current_tick);
+    const size_t offset = wraps ? 0 : iterator;
+    iterator = new_iterator;
+    if (!wraps) {
+        free_iterator = (std::max)(free_iterator, offset + size);
+    }
     return StagingBufferRef{
         .buffer = *stream_buffer,
         .offset = static_cast<VkDeviceSize>(offset),
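
A reduced, hypothetical sketch of the region-index math the rewritten GetStreamBuffer relies on (small assumed sizes; the real constants differ): computing the end region from the last occupied byte rather than from new_iterator keeps an allocation that ends exactly on a region boundary from claiming the next region.

#include <algorithm>
#include <cassert>
#include <cstddef>

// Illustration only: region-index math for the stream-buffer ring, with assumed sizes.
constexpr size_t NUM_SYNCS = 4;
constexpr size_t STREAM_BUFFER_SIZE = 1024;
constexpr size_t REGION_SIZE = STREAM_BUFFER_SIZE / NUM_SYNCS; // 256 bytes per sync region

constexpr size_t Region(size_t offset) {
    return offset / REGION_SIZE;
}

int main() {
    // An allocation occupying [0, 256) ends exactly on a region boundary.
    const size_t new_iterator = 256;
    // Region(new_iterator) + 1 would claim regions [0, 2); the last-byte form claims
    // only region 0, which is what the allocation actually touches.
    const size_t last_byte = new_iterator == 0 ? 0 : new_iterator - 1;
    const size_t end_region = std::min(Region(last_byte) + 1, NUM_SYNCS);
    assert(end_region == 1);

    // A wrapping request restarts at offset 0 and marks only the head regions it covers.
    const size_t aligned_size = 512;
    const size_t head_end_region = std::min(Region(aligned_size - 1) + 1, NUM_SYNCS);
    assert(head_end_region == 2);
    return 0;
}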

@@ -1068,7 +1068,7 @@ void TextureCacheRuntime::ReinterpretImage(Image& dst, Image& src,
         cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
                                0, READ_BARRIER, {}, middle_out_barrier);
-        cmdbuf.CopyBufferToImage(copy_buffer, dst_image, VK_IMAGE_LAYOUT_GENERAL, vk_out_copies);
+        cmdbuf.CopyBufferToImage(copy_buffer, dst_image, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, vk_out_copies);
         cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
                                0, {}, {}, post_barriers);
     });