From 17556bf64bf2acd34f2e0182c3021ab41bdad09f Mon Sep 17 00:00:00 2001 From: Ribbit Date: Wed, 8 Oct 2025 18:38:43 -0700 Subject: [PATCH 1/2] [vk] Fix Vulkan streaming ring alignment and flushes --- .../renderer_vulkan/renderer_vulkan.cpp | 4 +- .../renderer_vulkan/vk_buffer_cache.cpp | 9 +++ .../renderer_vulkan/vk_buffer_cache.h | 17 +++++ .../renderer_vulkan/vk_query_cache.cpp | 6 ++ .../renderer_vulkan/vk_scheduler.cpp | 11 ++- src/video_core/renderer_vulkan/vk_scheduler.h | 23 +++++- .../vk_staging_buffer_pool.cpp | 73 +++++++++++++++++-- .../renderer_vulkan/vk_staging_buffer_pool.h | 12 +++ src/video_core/vulkan_common/vulkan_device.h | 10 +++ .../vulkan_common/vulkan_wrapper.cpp | 6 +- src/video_core/vulkan_common/vulkan_wrapper.h | 7 ++ 11 files changed, 165 insertions(+), 13 deletions(-) diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.cpp b/src/video_core/renderer_vulkan/renderer_vulkan.cpp index e6e72cdca7..3ef606c4d2 100644 --- a/src/video_core/renderer_vulkan/renderer_vulkan.cpp +++ b/src/video_core/renderer_vulkan/renderer_vulkan.cpp @@ -166,7 +166,7 @@ try if (Settings::values.renderer_force_max_clock.GetValue() && device.ShouldBoostClocks()) { turbo_mode.emplace(instance, dld); - scheduler.RegisterOnSubmit([this] { turbo_mode->QueueSubmitted(); }); + scheduler.AddOnSubmit([this] { turbo_mode->QueueSubmitted(); }); } Report(); @@ -176,7 +176,7 @@ try } RendererVulkan::~RendererVulkan() { - scheduler.RegisterOnSubmit([] {}); + scheduler.RegisterOnSubmit(std::function{}); void(device.GetLogical().WaitIdle()); } diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp index 55565e3d79..2c807b9c69 100644 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp @@ -179,6 +179,11 @@ public: }(); u8* staging_data = host_visible ? 
buffer.Mapped().data() : staging.mapped_span.data(); +#ifdef YUZU_DEBUG + if (!host_visible) { + ASSERT(staging.mapped_span.size() >= size_bytes); + } +#endif const size_t quad_size = bytes_per_index * 6; for (u32 first = 0; first < num_first_offset_copies; ++first) { @@ -514,6 +519,10 @@ void BufferCacheRuntime::BindIndexBuffer(PrimitiveTopology topology, IndexFormat ReserveNullBuffer(); vk_buffer = *null_buffer; } +#ifdef YUZU_DEBUG + const size_t bytes_per_index = BytesPerIndex(vk_index_type); + ASSERT(bytes_per_index == 0 || (vk_offset % bytes_per_index) == 0); +#endif scheduler.Record([vk_buffer, vk_offset, vk_index_type](vk::CommandBuffer cmdbuf) { cmdbuf.BindIndexBuffer(vk_buffer, vk_offset, vk_index_type); }); diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.h b/src/video_core/renderer_vulkan/vk_buffer_cache.h index efe960258c..784e1f7c5c 100644 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.h +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.h @@ -128,20 +128,37 @@ public: [[maybe_unused]] u32 binding_index, u32 size) { const StagingBufferRef ref = staging_pool.Request(size, MemoryUsage::Upload); BindBuffer(ref.buffer, static_cast(ref.offset), size); +#ifdef YUZU_DEBUG + ASSERT(ref.mapped_span.size() >= size); + const VkDeviceSize ubo_align = device.GetUniformBufferAlignment(); + ASSERT(ubo_align == 0 || (ref.offset % ubo_align) == 0); +#endif return ref.mapped_span; } void BindUniformBuffer(VkBuffer buffer, u32 offset, u32 size) { +#ifdef YUZU_DEBUG + const VkDeviceSize ubo_align = device.GetUniformBufferAlignment(); + ASSERT(ubo_align == 0 || (offset % ubo_align) == 0); +#endif BindBuffer(buffer, offset, size); } void BindStorageBuffer(VkBuffer buffer, u32 offset, u32 size, [[maybe_unused]] bool is_written) { +#ifdef YUZU_DEBUG + const VkDeviceSize ssbo_align = device.GetStorageBufferAlignment(); + ASSERT(ssbo_align == 0 || (offset % ssbo_align) == 0); +#endif BindBuffer(buffer, offset, size); } void BindTextureBuffer(Buffer& buffer, u32 offset, u32 size, VideoCore::Surface::PixelFormat format) { +#ifdef YUZU_DEBUG + const VkDeviceSize texel_align = device.GetTexelBufferAlignment(); + ASSERT(texel_align == 0 || (offset % texel_align) == 0); +#endif guest_descriptor_queue.AddTexelBuffer(buffer.View(offset, size, format)); } diff --git a/src/video_core/renderer_vulkan/vk_query_cache.cpp b/src/video_core/renderer_vulkan/vk_query_cache.cpp index 89e0b1114e..31cc05c2e1 100644 --- a/src/video_core/renderer_vulkan/vk_query_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_query_cache.cpp @@ -854,6 +854,9 @@ public: for (auto q : flushed_queries) { auto* query = GetQuery(q); u32 result = 0; +#ifdef YUZU_DEBUG + ASSERT(staging_ref.mapped_span.size() >= offset_base + sizeof(u32)); +#endif std::memcpy(&result, staging_ref.mapped_span.data() + offset_base, sizeof(u32)); query->value = static_cast(result); query->flags |= VideoCommon::QueryFlagBits::IsFinalValueSynced; @@ -1567,6 +1570,9 @@ void QueryCacheRuntime::SyncValues(std::span values, VkBuffer ba impl->little_cache[which_copy].first, .size = values[i].size, }); +#ifdef YUZU_DEBUG + ASSERT(ref.mapped_span.size() >= accumulated_size + values[i].size); +#endif std::memcpy(ref.mapped_span.data() + accumulated_size, &values[i].value, values[i].size); accumulated_size += values[i].size; diff --git a/src/video_core/renderer_vulkan/vk_scheduler.cpp b/src/video_core/renderer_vulkan/vk_scheduler.cpp index d109d22cab..06b4d34078 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.cpp +++ 
b/src/video_core/renderer_vulkan/vk_scheduler.cpp
@@ -15,6 +15,7 @@
 #include "video_core/renderer_vulkan/vk_command_pool.h"
 #include "video_core/renderer_vulkan/vk_master_semaphore.h"
 #include "video_core/renderer_vulkan/vk_scheduler.h"
+#include "video_core/renderer_vulkan/vk_staging_buffer_pool.h"
 #include "video_core/renderer_vulkan/vk_state_tracker.h"
 #include "video_core/renderer_vulkan/vk_texture_cache.h"
 #include "video_core/vulkan_common/vulkan_device.h"
@@ -233,8 +234,14 @@ u64 Scheduler::SubmitExecution(VkSemaphore signal_semaphore, VkSemaphore wait_se
     upload_cmdbuf.End();
     cmdbuf.End();
 
-    if (on_submit) {
-        on_submit();
+    if (staging_buffer_pool) {
+        staging_buffer_pool->FlushStream();
+    }
+
+    for (const auto& callback : on_submit_callbacks) {
+        if (callback) {
+            callback();
+        }
     }
 
     std::scoped_lock lock{submit_mutex};
diff --git a/src/video_core/renderer_vulkan/vk_scheduler.h b/src/video_core/renderer_vulkan/vk_scheduler.h
index 54ab8ba52b..50d05e57f1 100644
--- a/src/video_core/renderer_vulkan/vk_scheduler.h
+++ b/src/video_core/renderer_vulkan/vk_scheduler.h
@@ -10,6 +10,7 @@
 #include <memory>
 #include <thread>
 #include <utility>
+#include <vector>
 
 #include "common/alignment.h"
 #include "common/common_types.h"
@@ -29,6 +30,7 @@ class Device;
 class Framebuffer;
 class GraphicsPipeline;
 class StateTracker;
+class StagingBufferPool;
 
 struct QueryCacheParams;
 
@@ -73,9 +75,23 @@ public:
         query_cache = &query_cache_;
     }
 
-    // Registers a callback to perform on queue submission.
+    void SetStagingBufferPool(StagingBufferPool* pool) {
+        staging_buffer_pool = pool;
+    }
+
+    // Registers a callback to perform on queue submission, replacing existing callbacks.
     void RegisterOnSubmit(std::function<void()>&& func) {
-        on_submit = std::move(func);
+        on_submit_callbacks.clear();
+        if (func) {
+            on_submit_callbacks.emplace_back(std::move(func));
+        }
+    }
+
+    // Adds an additional callback to perform on queue submission.
+    void AddOnSubmit(std::function<void()>&& func) {
+        if (func) {
+            on_submit_callbacks.emplace_back(std::move(func));
+        }
     }
 
     /// Send work to a separate thread.
@@ -237,12 +253,13 @@ private:
     std::unique_ptr<CommandPool> command_pool;
 
     VideoCommon::QueryCacheBase<QueryCacheParams>* query_cache = nullptr;
+    StagingBufferPool* staging_buffer_pool = nullptr;
 
     vk::CommandBuffer current_cmdbuf;
     vk::CommandBuffer current_upload_cmdbuf;
 
     std::unique_ptr<CommandChunk> chunk;
-    std::function<void()> on_submit;
+    std::vector<std::function<void()>> on_submit_callbacks;
 
     State state;
 
diff --git a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp
index 0fbe707b04..49a0f33805 100644
--- a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp
+++ b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp
@@ -30,6 +30,11 @@ constexpr VkDeviceSize MIN_STREAM_ALIGNMENT = 256;
 // Stream buffer size in bytes
 constexpr VkDeviceSize MAX_STREAM_BUFFER_SIZE = 128_MiB;
 
+VkDeviceSize GetStreamAlignment(const Device& device) {
+    return (std::max)({device.GetUniformBufferAlignment(), device.GetStorageBufferAlignment(),
+                       device.GetTexelBufferAlignment(), MIN_STREAM_ALIGNMENT});
+}
+
 size_t GetStreamBufferSize(const Device& device, VkDeviceSize alignment) {
     VkDeviceSize size{0};
     if (device.HasDebuggingToolAttached()) {
@@ -63,8 +68,7 @@ size_t GetStreamBufferSize(const Device& device, VkDeviceSize alignment) {
 StagingBufferPool::StagingBufferPool(const Device& device_, MemoryAllocator& memory_allocator_,
                                      Scheduler& scheduler_)
     : device{device_}, memory_allocator{memory_allocator_}, scheduler{scheduler_},
-      stream_alignment{std::max(device_.GetUniformBufferAlignment(),
-                                MIN_STREAM_ALIGNMENT)},
+      stream_alignment{GetStreamAlignment(device_)},
       stream_buffer_size{GetStreamBufferSize(device_, stream_alignment)},
       region_size{stream_buffer_size / StagingBufferPool::NUM_SYNCS} {
     VkBufferCreateInfo stream_ci = {
@@ -87,9 +91,18 @@ StagingBufferPool::StagingBufferPool(const Device& device_, MemoryAllocator& mem
     }
     stream_pointer = stream_buffer.Mapped();
     ASSERT_MSG(!stream_pointer.empty(), "Stream buffer must be host visible!");
+    stream_is_coherent = stream_buffer.IsHostCoherent();
+    non_coherent_atom_size = std::max(device.GetNonCoherentAtomSize(),
+                                      static_cast<VkDeviceSize>(1));
+    dirty_begin = stream_buffer_size;
+    dirty_end = 0;
+    stream_dirty = false;
+    scheduler.SetStagingBufferPool(this);
 }
 
-StagingBufferPool::~StagingBufferPool() = default;
+StagingBufferPool::~StagingBufferPool() {
+    scheduler.SetStagingBufferPool(nullptr);
+}
 
 StagingBufferRef StagingBufferPool::Request(size_t size, MemoryUsage usage, bool deferred) {
     if (!deferred && usage == MemoryUsage::Upload && size <= region_size) {
@@ -121,9 +134,10 @@ void StagingBufferPool::TickFrame() {
 StagingBufferRef StagingBufferPool::GetStreamBuffer(size_t size) {
     const size_t alignment = static_cast<size_t>(stream_alignment);
     const size_t aligned_size = Common::AlignUp(size, alignment);
-    const bool wraps = iterator + size >= stream_buffer_size;
+    const size_t capacity = static_cast<size_t>(stream_buffer_size);
+    const bool wraps = iterator + aligned_size > capacity;
     const size_t new_iterator =
-        wraps ? aligned_size : Common::AlignUp(iterator + size, alignment);
+        wraps ? aligned_size : Common::AlignUp(iterator + aligned_size, alignment);
     const size_t begin_region = wraps ? 0 : Region(iterator);
     const size_t last_byte = new_iterator == 0 ? 0 : new_iterator - 1;
     const size_t end_region = (std::min)(Region(last_byte) + 1, NUM_SYNCS);
@@ -167,6 +181,8 @@ StagingBufferRef StagingBufferPool::GetStreamBuffer(size_t size) {
         free_iterator = (std::max)(free_iterator, offset + aligned_size);
     }
 
+    TrackStreamWrite(static_cast<VkDeviceSize>(offset), static_cast<VkDeviceSize>(aligned_size));
+
     return StagingBufferRef{
         .buffer = *stream_buffer,
         .offset = static_cast<VkDeviceSize>(offset),
@@ -177,6 +193,53 @@ StagingBufferRef StagingBufferPool::GetStreamBuffer(size_t size) {
     };
 }
 
+void StagingBufferPool::TrackStreamWrite(VkDeviceSize offset, VkDeviceSize size) {
+    if (stream_is_coherent || size == 0) {
+        return;
+    }
+    const VkDeviceSize clamped_offset = (std::min)(offset, stream_buffer_size);
+    const VkDeviceSize clamped_end = (std::min)(clamped_offset + size, stream_buffer_size);
+    std::scoped_lock lock{stream_mutex};
+    if (!stream_dirty) {
+        dirty_begin = clamped_offset;
+        dirty_end = clamped_end;
+        stream_dirty = true;
+        return;
+    }
+    dirty_begin = (std::min)(dirty_begin, clamped_offset);
+    dirty_end = (std::max)(dirty_end, clamped_end);
+}
+
+void StagingBufferPool::FlushStream() {
+    if (stream_is_coherent) {
+        return;
+    }
+
+    VkDeviceSize flush_begin = 0;
+    VkDeviceSize flush_end = 0;
+    {
+        std::scoped_lock lock{stream_mutex};
+        if (!stream_dirty) {
+            return;
+        }
+        flush_begin = dirty_begin;
+        flush_end = dirty_end;
+        stream_dirty = false;
+        dirty_begin = stream_buffer_size;
+        dirty_end = 0;
+    }
+
+    if (flush_begin >= flush_end) {
+        return;
+    }
+
+    const VkDeviceSize atom = non_coherent_atom_size;
+    const VkDeviceSize aligned_begin = Common::AlignDown(flush_begin, atom);
+    const VkDeviceSize aligned_end = Common::AlignUp(flush_end, atom);
+    const VkDeviceSize flush_size = aligned_end - aligned_begin;
+    stream_buffer.FlushRange(aligned_begin, flush_size);
+}
+
 bool StagingBufferPool::AreRegionsActive(size_t region_begin, size_t region_end) const {
     const u64 gpu_tick = scheduler.GetMasterSemaphore().KnownGpuTick();
     return std::any_of(sync_ticks.begin() + region_begin, sync_ticks.begin() + region_end,
diff --git a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h
index 5c40ca069f..9284578975 100644
--- a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h
+++ b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h
@@ -7,6 +7,7 @@
 #pragma once
 
 #include <climits>
+#include <mutex>
 #include <vector>
 
 #include "common/common_types.h"
@@ -30,6 +31,8 @@ struct StagingBufferRef {
 
 class StagingBufferPool {
 public:
+    friend class Scheduler;
+
     static constexpr size_t NUM_SYNCS = 16;
 
     explicit StagingBufferPool(const Device& device, MemoryAllocator& memory_allocator,
@@ -83,6 +86,9 @@ private:
     StagingBufferRef GetStreamBuffer(size_t size);
 
+    void TrackStreamWrite(VkDeviceSize offset, VkDeviceSize size);
+    void FlushStream();
+
     bool AreRegionsActive(size_t region_begin, size_t region_end) const;
 
     StagingBufferRef GetStagingBuffer(size_t size, MemoryUsage usage, bool deferred = false);
@@ -110,6 +116,12 @@ private:
     std::span<u8> stream_pointer;
     VkDeviceSize stream_buffer_size;
    VkDeviceSize region_size;
+    bool stream_is_coherent = true;
+    VkDeviceSize non_coherent_atom_size = 1;
+    VkDeviceSize dirty_begin = 0;
+    VkDeviceSize dirty_end = 0;
+    bool stream_dirty = false;
+    std::mutex stream_mutex;
 
     size_t iterator = 0;
     size_t used_iterator = 0;
diff --git a/src/video_core/vulkan_common/vulkan_device.h b/src/video_core/vulkan_common/vulkan_device.h
index cb13f28523..312d25a449 100644
--- a/src/video_core/vulkan_common/vulkan_device.h
+++ b/src/video_core/vulkan_common/vulkan_device.h
@@ -295,6 +295,16 @@ public:
         return properties.properties.limits.minStorageBufferOffsetAlignment;
     }
 
+    /// Returns texel buffer alignment requirement.
+    VkDeviceSize GetTexelBufferAlignment() const {
+        return properties.properties.limits.minTexelBufferOffsetAlignment;
+    }
+
+    /// Returns the non-coherent atom size for memory flushes.
+    VkDeviceSize GetNonCoherentAtomSize() const {
+        return properties.properties.limits.nonCoherentAtomSize;
+    }
+
     /// Returns the maximum range for storage buffers.
     VkDeviceSize GetMaxStorageBufferRange() const {
         return properties.properties.limits.maxStorageBufferRange;
diff --git a/src/video_core/vulkan_common/vulkan_wrapper.cpp b/src/video_core/vulkan_common/vulkan_wrapper.cpp
index b77d01711a..f8fbc0c206 100644
--- a/src/video_core/vulkan_common/vulkan_wrapper.cpp
+++ b/src/video_core/vulkan_common/vulkan_wrapper.cpp
@@ -499,8 +499,12 @@ void Image::Release() const noexcept {
 }
 
 void Buffer::Flush() const {
+    FlushRange(0, VK_WHOLE_SIZE);
+}
+
+void Buffer::FlushRange(VkDeviceSize offset, VkDeviceSize size) const {
     if (!is_coherent) {
-        vmaFlushAllocation(allocator, allocation, 0, VK_WHOLE_SIZE);
+        vmaFlushAllocation(allocator, allocation, offset, size);
     }
 }
diff --git a/src/video_core/vulkan_common/vulkan_wrapper.h b/src/video_core/vulkan_common/vulkan_wrapper.h
index 39396b3279..625dc32fa8 100644
--- a/src/video_core/vulkan_common/vulkan_wrapper.h
+++ b/src/video_core/vulkan_common/vulkan_wrapper.h
@@ -772,8 +772,15 @@ public:
         return !mapped.empty();
     }
 
+    /// Returns true if the buffer memory is host coherent.
+    bool IsHostCoherent() const noexcept {
+        return is_coherent;
+    }
+
     void Flush() const;
 
+    void FlushRange(VkDeviceSize offset, VkDeviceSize size) const;
+
     void Invalidate() const;
 
     void SetObjectNameEXT(const char* name) const;

From daa5f94915311f21551000767b1ccdbfc2396992 Mon Sep 17 00:00:00 2001
From: Ribbit
Date: Wed, 8 Oct 2025 20:09:32 -0700
Subject: [PATCH 2/2] [vk] Flush and invalidate non-coherent staging buffer mappings

---
 src/video_core/buffer_cache/buffer_cache.h    | 46 +++++++++++++++-
 .../renderer_vulkan/vk_buffer_cache.cpp       |  1 +
 .../renderer_vulkan/vk_query_cache.cpp        |  5 ++
 .../vk_staging_buffer_pool.cpp                | 16 ++++--
 .../renderer_vulkan/vk_staging_buffer_pool.h  | 52 ++++++++++++++++++-
 src/video_core/texture_cache/texture_cache.h  | 51 +++++++++++++++++-
 .../vulkan_common/vulkan_wrapper.cpp          |  6 ++-
 src/video_core/vulkan_common/vulkan_wrapper.h |  2 +
 8 files changed, 170 insertions(+), 9 deletions(-)

diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index 388c8034c5..b5c8bd1996 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -9,6 +9,8 @@
 #include <algorithm>
 #include <memory>
 #include <numeric>
+#include <type_traits>
+#include <utility>
 
 #include "common/range_sets.inc"
 #include "video_core/buffer_cache/buffer_cache_base.h"
@@ -19,6 +21,43 @@ namespace VideoCommon {
 
 using Core::DEVICE_PAGESIZE;
 
+namespace staging_detail {
+template <typename T, typename = void>
+struct has_flush_range : std::false_type {};
+template <typename T>
+struct has_flush_range<
+    T, std::void_t<decltype(std::declval<T&>().FlushRange(size_t{}, size_t{}))>> : std::true_type {};
+
+template <typename T, typename = void>
+struct has_invalidate_range : std::false_type {};
+template <typename T>
+struct has_invalidate_range<
+    T, std::void_t<decltype(std::declval<T&>().InvalidateRange(size_t{}, size_t{}))>>
+    : std::true_type {};
+} // namespace staging_detail
+
+template <typename Ref>
+inline void StagingFlushRange(Ref& ref, size_t offset, size_t size) {
+    if constexpr (staging_detail::has_flush_range<Ref>::value) {
+        ref.FlushRange(offset, size);
+    } else {
+        (void)ref;
+        (void)offset;
+        (void)size;
+    }
+}
+
+template <typename Ref>
+inline void StagingInvalidateRange(Ref& ref, size_t offset, size_t size) {
+    if constexpr (staging_detail::has_invalidate_range<Ref>::value) {
+        ref.InvalidateRange(offset, size);
+    } else {
+        (void)ref;
+        (void)offset;
+        (void)size;
+    }
+}
+
 template <class P>
 BufferCache<P>::BufferCache(Tegra::MaxwellDeviceMemoryManager& device_memory_, Runtime& runtime_)
     : runtime{runtime_}, device_memory{device_memory_}, memory_tracker{device_memory} {
@@ -633,6 +672,7 @@ void BufferCache

::PopAsyncBuffers() { u8* base = async_buffer->mapped_span.data(); const size_t base_offset = async_buffer->offset; for (const auto& copy : downloads) { + StagingInvalidateRange(*async_buffer, copy.dst_offset, copy.size); const DAddr device_addr = static_cast(copy.src_offset); const u64 dst_offset = copy.dst_offset - base_offset; const u8* read_mapped_memory = base + dst_offset; @@ -696,6 +736,7 @@ void BufferCache

::BindHostIndexBuffer() { {BufferCopy{.src_offset = upload_staging.offset, .dst_offset = 0, .size = size}}}; std::memcpy(upload_staging.mapped_span.data(), draw_state.inline_index_draw_indexes.data(), size); + StagingFlushRange(upload_staging, upload_staging.offset, size); runtime.CopyBuffer(buffer, upload_staging.buffer, copies, true); } else { buffer.ImmediateUpload(0, draw_state.inline_index_draw_indexes); @@ -1519,7 +1560,7 @@ template void BufferCache

::MappedUploadMemory([[maybe_unused]] Buffer& buffer, [[maybe_unused]] u64 total_size_bytes, [[maybe_unused]] std::span copies) { - if constexpr (USE_MEMORY_MAPS) { + if constexpr (USE_MEMORY_MAPS) { auto upload_staging = runtime.UploadStagingBuffer(total_size_bytes); const std::span staging_pointer = upload_staging.mapped_span; for (BufferCopy& copy : copies) { @@ -1530,6 +1571,7 @@ void BufferCache

::MappedUploadMemory([[maybe_unused]] Buffer& buffer, // Apply the staging offset copy.src_offset += upload_staging.offset; } + StagingFlushRange(upload_staging, upload_staging.offset, total_size_bytes); const bool can_reorder = runtime.CanReorderUpload(buffer, copies); runtime.CopyBuffer(buffer, upload_staging.buffer, copies, true, can_reorder); } @@ -1572,6 +1614,7 @@ void BufferCache

::InlineMemoryImplementation(DAddr dest_address, size_t copy_ }}; u8* const src_pointer = upload_staging.mapped_span.data(); std::memcpy(src_pointer, inlined_buffer.data(), copy_size); + StagingFlushRange(upload_staging, upload_staging.offset, copy_size); const bool can_reorder = runtime.CanReorderUpload(buffer, copies); runtime.CopyBuffer(buffer, upload_staging.buffer, copies, true, can_reorder); } else { @@ -1626,6 +1669,7 @@ void BufferCache

::DownloadBufferMemory(Buffer& buffer, DAddr device_addr, u64 } runtime.CopyBuffer(download_staging.buffer, buffer, copies_span, true); runtime.Finish(); + StagingInvalidateRange(download_staging, download_staging.offset, total_size_bytes); for (const BufferCopy& copy : copies) { const DAddr copy_device_addr = buffer.CpuAddr() + copy.src_offset; // Undo the modified offset diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp index 2c807b9c69..45033ba6d4 100644 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp @@ -194,6 +194,7 @@ public: } if (!host_visible) { + staging.FlushRange(staging.offset, static_cast(size_bytes)); scheduler.RequestOutsideRenderPassOperationContext(); scheduler.Record([src_buffer = staging.buffer, src_offset = staging.offset, dst_buffer = *buffer, size_bytes](vk::CommandBuffer cmdbuf) { diff --git a/src/video_core/renderer_vulkan/vk_query_cache.cpp b/src/video_core/renderer_vulkan/vk_query_cache.cpp index 31cc05c2e1..32efe87b28 100644 --- a/src/video_core/renderer_vulkan/vk_query_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_query_cache.cpp @@ -850,6 +850,10 @@ public: pending_flush_sets.pop_front(); } + const VkDeviceSize read_size = + static_cast(flushed_queries.size() * TFBQueryBank::QUERY_SIZE); + staging_ref.InvalidateRange(staging_ref.offset, read_size); + size_t offset_base = staging_ref.offset; for (auto q : flushed_queries) { auto* query = GetQuery(q); @@ -1577,6 +1581,7 @@ void QueryCacheRuntime::SyncValues(std::span values, VkBuffer ba values[i].size); accumulated_size += values[i].size; } + ref.FlushRange(ref.offset, static_cast(accumulated_size)); src_buffer = ref.buffer; } else { for (size_t i = 0; i < values.size(); i++) { diff --git a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp index 49a0f33805..88da60b190 100644 --- a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp +++ b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp @@ -5,6 +5,7 @@ // SPDX-License-Identifier: GPL-3.0-or-later #include +#include #include #include @@ -187,9 +188,12 @@ StagingBufferRef StagingBufferPool::GetStreamBuffer(size_t size) { .buffer = *stream_buffer, .offset = static_cast(offset), .mapped_span = stream_pointer.subspan(offset, size), - .usage{}, - .log2_level{}, - .index{}, + .usage = MemoryUsage::Upload, + .log2_level = 0, + .index = 0, + .owner = &stream_buffer, + .atom_size = non_coherent_atom_size, + .is_coherent = stream_is_coherent, }; } @@ -301,15 +305,19 @@ StagingBufferRef StagingBufferPool::CreateStagingBuffer(size_t size, MemoryUsage ++buffer_index; buffer.SetObjectNameEXT(fmt::format("Staging Buffer {}", buffer_index).c_str()); } + const bool is_coherent = buffer.IsHostCoherent(); const std::span mapped_span = buffer.Mapped(); + auto buffer_ptr = std::make_unique(std::move(buffer)); StagingBuffer& entry = GetCache(usage)[log2].entries.emplace_back(StagingBuffer{ - .buffer = std::move(buffer), + .buffer = std::move(buffer_ptr), .mapped_span = mapped_span, .usage = usage, .log2_level = log2, .index = unique_ids++, .tick = deferred ? (std::numeric_limits::max)() : scheduler.CurrentTick(), .deferred = deferred, + .is_coherent = is_coherent, + .atom_size = is_coherent ? 
1 : non_coherent_atom_size, }); return entry.Ref(); } diff --git a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h index 9284578975..8bd325c51f 100644 --- a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h +++ b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h @@ -8,9 +8,11 @@ #include #include +#include #include #include "common/common_types.h" +#include "common/alignment.h" #include "video_core/vulkan_common/vulkan_memory_allocator.h" #include "video_core/vulkan_common/vulkan_wrapper.h" @@ -27,6 +29,47 @@ struct StagingBufferRef { MemoryUsage usage; u32 log2_level; u64 index; + const vk::Buffer* owner = nullptr; + VkDeviceSize atom_size = 1; + bool is_coherent = true; + + void FlushRange(VkDeviceSize range_offset, VkDeviceSize size) const { + if (!owner || is_coherent || size == 0) { + return; + } + if (size == VK_WHOLE_SIZE) { + owner->FlushRange(range_offset, size); + return; + } + const VkDeviceSize atom = atom_size ? atom_size : 1; + const VkDeviceSize range_end = range_offset + size; + if (range_end < range_offset) { + owner->FlushRange(range_offset, size); + return; + } + const VkDeviceSize aligned_begin = Common::AlignDown(range_offset, atom); + const VkDeviceSize aligned_end = Common::AlignUp(range_end, atom); + owner->FlushRange(aligned_begin, aligned_end - aligned_begin); + } + + void InvalidateRange(VkDeviceSize range_offset, VkDeviceSize size) const { + if (!owner || is_coherent || size == 0) { + return; + } + if (size == VK_WHOLE_SIZE) { + owner->InvalidateRange(range_offset, size); + return; + } + const VkDeviceSize atom = atom_size ? atom_size : 1; + const VkDeviceSize range_end = range_offset + size; + if (range_end < range_offset) { + owner->InvalidateRange(range_offset, size); + return; + } + const VkDeviceSize aligned_begin = Common::AlignDown(range_offset, atom); + const VkDeviceSize aligned_end = Common::AlignUp(range_end, atom); + owner->InvalidateRange(aligned_begin, aligned_end - aligned_begin); + } }; class StagingBufferPool { @@ -55,22 +98,27 @@ private: }; struct StagingBuffer { - vk::Buffer buffer; + std::unique_ptr buffer; std::span mapped_span; MemoryUsage usage; u32 log2_level; u64 index; u64 tick = 0; bool deferred{}; + bool is_coherent = true; + VkDeviceSize atom_size = 1; StagingBufferRef Ref() const noexcept { return { - .buffer = *buffer, + .buffer = buffer ? 
**buffer : VkBuffer{}, .offset = 0, .mapped_span = mapped_span, .usage = usage, .log2_level = log2_level, .index = index, + .owner = buffer.get(), + .atom_size = atom_size, + .is_coherent = is_coherent, }; } }; diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index e5d559b591..d855c5f16b 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -7,6 +7,8 @@ #pragma once #include +#include +#include #include #include "common/alignment.h" @@ -30,6 +32,42 @@ using VideoCore::Surface::PixelFormat; using VideoCore::Surface::SurfaceType; using namespace Common::Literals; +namespace staging_detail { +template +struct has_flush_range : std::false_type {}; +template +struct has_flush_range< + T, std::void_t().FlushRange(size_t{}, size_t{}))>> : std::true_type {}; +template +struct has_invalidate_range : std::false_type {}; +template +struct has_invalidate_range< + T, std::void_t().InvalidateRange(size_t{}, size_t{}))>> + : std::true_type {}; +} // namespace staging_detail + +template +inline void StagingFlushRange(Ref& ref, size_t offset, size_t size) { + if constexpr (staging_detail::has_flush_range::value) { + ref.FlushRange(offset, size); + } else { + (void)ref; + (void)offset; + (void)size; + } +} + +template +inline void StagingInvalidateRange(Ref& ref, size_t offset, size_t size) { + if constexpr (staging_detail::has_invalidate_range::value) { + ref.InvalidateRange(offset, size); + } else { + (void)ref; + (void)offset; + (void)size; + } +} + template TextureCache

::TextureCache(Runtime& runtime_, Tegra::MaxwellDeviceMemoryManager& device_memory_) : runtime{runtime_}, device_memory{device_memory_} { @@ -111,6 +149,7 @@ void TextureCache

::RunGarbageCollector() { const auto copies = FullDownloadCopies(image.info); image.DownloadMemory(map, copies); runtime.Finish(); + StagingInvalidateRange(map, map.offset, image.unswizzled_size_bytes); SwizzleImage(*gpu_memory, image.gpu_addr, image.info, copies, map.mapped_span, swizzle_data_buffer); } @@ -567,6 +606,7 @@ void TextureCache

::DownloadMemory(DAddr cpu_addr, size_t size) { const auto copies = FullDownloadCopies(image.info); image.DownloadMemory(map, copies); runtime.Finish(); + StagingInvalidateRange(map, map.offset, image.unswizzled_size_bytes); SwizzleImage(*gpu_memory, image.gpu_addr, image.info, copies, map.mapped_span, swizzle_data_buffer); } @@ -863,13 +903,17 @@ void TextureCache

::PopAsyncFlushes() { if (download_info.is_swizzle) { const ImageBase& image = slot_images[download_info.object_id]; const auto copies = FullDownloadCopies(image.info); - download_buffer.offset -= Common::AlignUp(image.unswizzled_size_bytes, 64); + const size_t aligned_size = + Common::AlignUp(image.unswizzled_size_bytes, static_cast(64)); + download_buffer.offset -= aligned_size; + StagingInvalidateRange(download_buffer, download_buffer.offset, aligned_size); std::span download_span = download_buffer.mapped_span.subspan(download_buffer.offset); SwizzleImage(*gpu_memory, image.gpu_addr, image.info, copies, download_span, swizzle_data_buffer); } else { const BufferDownload& buffer_info = slot_buffer_downloads[download_info.object_id]; + StagingInvalidateRange(download_buffer, download_buffer.offset, buffer_info.size); std::span download_span = download_buffer.mapped_span.subspan(download_buffer.offset); gpu_memory->WriteBlockUnsafe(buffer_info.address, download_span.data(), @@ -907,6 +951,7 @@ void TextureCache

::PopAsyncFlushes() { } // Wait for downloads to finish runtime.Finish(); + StagingInvalidateRange(download_map, original_offset, total_size_bytes); download_map.offset = original_offset; std::span download_span = download_map.mapped_span; for (const PendingDownload& download_info : download_ids) { @@ -1081,6 +1126,7 @@ void TextureCache

::UploadImageContents(Image& image, StagingBuffer& staging) if (True(image.flags & ImageFlagBits::AcceleratedUpload)) { gpu_memory->ReadBlock(gpu_addr, mapped_span.data(), mapped_span.size_bytes(), VideoCommon::CacheType::NoTextureCache); + StagingFlushRange(staging, staging.offset, mapped_span.size_bytes()); const auto uploads = FullUploadSwizzles(image.info); runtime.AccelerateImageUpload(image, staging, uploads); return; @@ -1094,10 +1140,12 @@ void TextureCache

::UploadImageContents(Image& image, StagingBuffer& staging) auto copies = UnswizzleImage(*gpu_memory, gpu_addr, image.info, swizzle_data, unswizzle_data_buffer); ConvertImage(unswizzle_data_buffer, image.info, mapped_span, copies); + StagingFlushRange(staging, staging.offset, mapped_span.size_bytes()); image.UploadMemory(staging, copies); } else { const auto copies = UnswizzleImage(*gpu_memory, gpu_addr, image.info, swizzle_data, mapped_span); + StagingFlushRange(staging, staging.offset, mapped_span.size_bytes()); image.UploadMemory(staging, copies); } } @@ -1329,6 +1377,7 @@ void TextureCache

::TickAsyncDecode() { auto staging = runtime.UploadStagingBuffer(MapSizeBytes(image)); std::memcpy(staging.mapped_span.data(), async_decode->decoded_data.data(), async_decode->decoded_data.size()); + StagingFlushRange(staging, staging.offset, async_decode->decoded_data.size()); image.UploadMemory(staging, async_decode->copies); image.flags &= ~ImageFlagBits::IsDecoding; has_uploads = true; diff --git a/src/video_core/vulkan_common/vulkan_wrapper.cpp b/src/video_core/vulkan_common/vulkan_wrapper.cpp index f8fbc0c206..77534776cf 100644 --- a/src/video_core/vulkan_common/vulkan_wrapper.cpp +++ b/src/video_core/vulkan_common/vulkan_wrapper.cpp @@ -509,8 +509,12 @@ void Buffer::FlushRange(VkDeviceSize offset, VkDeviceSize size) const { } void Buffer::Invalidate() const { + InvalidateRange(0, VK_WHOLE_SIZE); +} + +void Buffer::InvalidateRange(VkDeviceSize offset, VkDeviceSize size) const { if (!is_coherent) { - vmaInvalidateAllocation(allocator, allocation, 0, VK_WHOLE_SIZE); + vmaInvalidateAllocation(allocator, allocation, offset, size); } } diff --git a/src/video_core/vulkan_common/vulkan_wrapper.h b/src/video_core/vulkan_common/vulkan_wrapper.h index 625dc32fa8..7541a08e7f 100644 --- a/src/video_core/vulkan_common/vulkan_wrapper.h +++ b/src/video_core/vulkan_common/vulkan_wrapper.h @@ -783,6 +783,8 @@ public: void Invalidate() const; + void InvalidateRange(VkDeviceSize offset, VkDeviceSize size) const; + void SetObjectNameEXT(const char* name) const; private:
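
Note on the ranges introduced above: on non-coherent memory, Vulkan only accepts flush/invalidate ranges whose offset and size are multiples of VkPhysicalDeviceLimits::nonCoherentAtomSize (or that reach the end of the allocation), which is why FlushStream and StagingBufferRef::FlushRange round the dirty byte range outward before handing it to VMA. Below is a minimal standalone sketch of that rounding; it is not part of the patch, the helper names (AlignDown, AlignUp, ComputeFlushSpan) are illustrative, and VkDeviceSize is stood in by a plain typedef.

    #include <cassert>
    #include <cstdint>

    using VkDeviceSize = std::uint64_t;

    constexpr VkDeviceSize AlignDown(VkDeviceSize value, VkDeviceSize align) {
        return value - (value % align);
    }

    constexpr VkDeviceSize AlignUp(VkDeviceSize value, VkDeviceSize align) {
        return AlignDown(value + align - 1, align);
    }

    struct FlushSpan {
        VkDeviceSize offset;
        VkDeviceSize size;
    };

    // Round a dirty byte range [dirty_begin, dirty_end) outward to whole atoms.
    // Overshooting the end by less than one atom is acceptable because the
    // allocator (vmaFlushAllocation in the patch) clamps the range to the
    // size of the allocation.
    FlushSpan ComputeFlushSpan(VkDeviceSize dirty_begin, VkDeviceSize dirty_end,
                               VkDeviceSize atom) {
        assert(atom != 0 && dirty_begin <= dirty_end);
        const VkDeviceSize begin = AlignDown(dirty_begin, atom);
        const VkDeviceSize end = AlignUp(dirty_end, atom);
        return FlushSpan{begin, end - begin};
    }

    int main() {
        // Bytes [300, 700) dirtied with a 256-byte atom flush as [256, 768).
        const FlushSpan span = ComputeFlushSpan(300, 700, 256);
        assert(span.offset == 256 && span.size == 512);
        return 0;
    }

TrackStreamWrite only merges dirty extents under the mutex; the single rounded flush happens once per submit in FlushStream, so the common host-coherent path pays no extra cost.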