[vk] Fix Vulkan streaming ring alignment and flushes
Some checks failed
eden-license / license-header (pull_request) Failing after 24s

This commit is contained in:
Ribbit 2025-10-08 18:38:43 -07:00 committed by crueter
parent 954c17c18a
commit 8dfe0283fb
11 changed files with 165 additions and 13 deletions

View file

@ -166,7 +166,7 @@ try
if (Settings::values.renderer_force_max_clock.GetValue() && device.ShouldBoostClocks()) { if (Settings::values.renderer_force_max_clock.GetValue() && device.ShouldBoostClocks()) {
turbo_mode.emplace(instance, dld); turbo_mode.emplace(instance, dld);
scheduler.RegisterOnSubmit([this] { turbo_mode->QueueSubmitted(); }); scheduler.AddOnSubmit([this] { turbo_mode->QueueSubmitted(); });
} }
Report(); Report();
@ -176,7 +176,7 @@ try
} }
RendererVulkan::~RendererVulkan() { RendererVulkan::~RendererVulkan() {
scheduler.RegisterOnSubmit([] {}); scheduler.RegisterOnSubmit(std::function<void()>{});
void(device.GetLogical().WaitIdle()); void(device.GetLogical().WaitIdle());
} }

View file

@ -179,6 +179,11 @@ public:
}(); }();
u8* staging_data = host_visible ? buffer.Mapped().data() : staging.mapped_span.data(); u8* staging_data = host_visible ? buffer.Mapped().data() : staging.mapped_span.data();
#ifdef YUZU_DEBUG
if (!host_visible) {
ASSERT(staging.mapped_span.size() >= size_bytes);
}
#endif
const size_t quad_size = bytes_per_index * 6; const size_t quad_size = bytes_per_index * 6;
for (u32 first = 0; first < num_first_offset_copies; ++first) { for (u32 first = 0; first < num_first_offset_copies; ++first) {
@ -514,6 +519,10 @@ void BufferCacheRuntime::BindIndexBuffer(PrimitiveTopology topology, IndexFormat
ReserveNullBuffer(); ReserveNullBuffer();
vk_buffer = *null_buffer; vk_buffer = *null_buffer;
} }
#ifdef YUZU_DEBUG
const size_t bytes_per_index = BytesPerIndex(vk_index_type);
ASSERT(bytes_per_index == 0 || (vk_offset % bytes_per_index) == 0);
#endif
scheduler.Record([vk_buffer, vk_offset, vk_index_type](vk::CommandBuffer cmdbuf) { scheduler.Record([vk_buffer, vk_offset, vk_index_type](vk::CommandBuffer cmdbuf) {
cmdbuf.BindIndexBuffer(vk_buffer, vk_offset, vk_index_type); cmdbuf.BindIndexBuffer(vk_buffer, vk_offset, vk_index_type);
}); });

View file

@ -128,20 +128,37 @@ public:
[[maybe_unused]] u32 binding_index, u32 size) { [[maybe_unused]] u32 binding_index, u32 size) {
const StagingBufferRef ref = staging_pool.Request(size, MemoryUsage::Upload); const StagingBufferRef ref = staging_pool.Request(size, MemoryUsage::Upload);
BindBuffer(ref.buffer, static_cast<u32>(ref.offset), size); BindBuffer(ref.buffer, static_cast<u32>(ref.offset), size);
#ifdef YUZU_DEBUG
ASSERT(ref.mapped_span.size() >= size);
const VkDeviceSize ubo_align = device.GetUniformBufferAlignment();
ASSERT(ubo_align == 0 || (ref.offset % ubo_align) == 0);
#endif
return ref.mapped_span; return ref.mapped_span;
} }
void BindUniformBuffer(VkBuffer buffer, u32 offset, u32 size) { void BindUniformBuffer(VkBuffer buffer, u32 offset, u32 size) {
#ifdef YUZU_DEBUG
const VkDeviceSize ubo_align = device.GetUniformBufferAlignment();
ASSERT(ubo_align == 0 || (offset % ubo_align) == 0);
#endif
BindBuffer(buffer, offset, size); BindBuffer(buffer, offset, size);
} }
void BindStorageBuffer(VkBuffer buffer, u32 offset, u32 size, void BindStorageBuffer(VkBuffer buffer, u32 offset, u32 size,
[[maybe_unused]] bool is_written) { [[maybe_unused]] bool is_written) {
#ifdef YUZU_DEBUG
const VkDeviceSize ssbo_align = device.GetStorageBufferAlignment();
ASSERT(ssbo_align == 0 || (offset % ssbo_align) == 0);
#endif
BindBuffer(buffer, offset, size); BindBuffer(buffer, offset, size);
} }
void BindTextureBuffer(Buffer& buffer, u32 offset, u32 size, void BindTextureBuffer(Buffer& buffer, u32 offset, u32 size,
VideoCore::Surface::PixelFormat format) { VideoCore::Surface::PixelFormat format) {
#ifdef YUZU_DEBUG
const VkDeviceSize texel_align = device.GetTexelBufferAlignment();
ASSERT(texel_align == 0 || (offset % texel_align) == 0);
#endif
guest_descriptor_queue.AddTexelBuffer(buffer.View(offset, size, format)); guest_descriptor_queue.AddTexelBuffer(buffer.View(offset, size, format));
} }

View file

@ -854,6 +854,9 @@ public:
for (auto q : flushed_queries) { for (auto q : flushed_queries) {
auto* query = GetQuery(q); auto* query = GetQuery(q);
u32 result = 0; u32 result = 0;
#ifdef YUZU_DEBUG
ASSERT(staging_ref.mapped_span.size() >= offset_base + sizeof(u32));
#endif
std::memcpy(&result, staging_ref.mapped_span.data() + offset_base, sizeof(u32)); std::memcpy(&result, staging_ref.mapped_span.data() + offset_base, sizeof(u32));
query->value = static_cast<u64>(result); query->value = static_cast<u64>(result);
query->flags |= VideoCommon::QueryFlagBits::IsFinalValueSynced; query->flags |= VideoCommon::QueryFlagBits::IsFinalValueSynced;
@ -1567,6 +1570,9 @@ void QueryCacheRuntime::SyncValues(std::span<SyncValuesType> values, VkBuffer ba
impl->little_cache[which_copy].first, impl->little_cache[which_copy].first,
.size = values[i].size, .size = values[i].size,
}); });
#ifdef YUZU_DEBUG
ASSERT(ref.mapped_span.size() >= accumulated_size + values[i].size);
#endif
std::memcpy(ref.mapped_span.data() + accumulated_size, &values[i].value, std::memcpy(ref.mapped_span.data() + accumulated_size, &values[i].value,
values[i].size); values[i].size);
accumulated_size += values[i].size; accumulated_size += values[i].size;

View file

@ -15,6 +15,7 @@
#include "video_core/renderer_vulkan/vk_command_pool.h" #include "video_core/renderer_vulkan/vk_command_pool.h"
#include "video_core/renderer_vulkan/vk_master_semaphore.h" #include "video_core/renderer_vulkan/vk_master_semaphore.h"
#include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_scheduler.h"
#include "video_core/renderer_vulkan/vk_staging_buffer_pool.h"
#include "video_core/renderer_vulkan/vk_state_tracker.h" #include "video_core/renderer_vulkan/vk_state_tracker.h"
#include "video_core/renderer_vulkan/vk_texture_cache.h" #include "video_core/renderer_vulkan/vk_texture_cache.h"
#include "video_core/vulkan_common/vulkan_device.h" #include "video_core/vulkan_common/vulkan_device.h"
@ -233,8 +234,14 @@ u64 Scheduler::SubmitExecution(VkSemaphore signal_semaphore, VkSemaphore wait_se
upload_cmdbuf.End(); upload_cmdbuf.End();
cmdbuf.End(); cmdbuf.End();
if (on_submit) { if (staging_buffer_pool) {
on_submit(); staging_buffer_pool->FlushStream();
}
for (const auto& callback : on_submit_callbacks) {
if (callback) {
callback();
}
} }
std::scoped_lock lock{submit_mutex}; std::scoped_lock lock{submit_mutex};

View file

@ -10,6 +10,7 @@
#include <thread> #include <thread>
#include <utility> #include <utility>
#include <queue> #include <queue>
#include <vector>
#include "common/alignment.h" #include "common/alignment.h"
#include "common/common_types.h" #include "common/common_types.h"
@ -29,6 +30,7 @@ class Device;
class Framebuffer; class Framebuffer;
class GraphicsPipeline; class GraphicsPipeline;
class StateTracker; class StateTracker;
class StagingBufferPool;
struct QueryCacheParams; struct QueryCacheParams;
@ -73,9 +75,23 @@ public:
query_cache = &query_cache_; query_cache = &query_cache_;
} }
// Registers a callback to perform on queue submission. void SetStagingBufferPool(StagingBufferPool* pool) {
staging_buffer_pool = pool;
}
// Registers a callback to perform on queue submission, replacing existing callbacks.
void RegisterOnSubmit(std::function<void()>&& func) { void RegisterOnSubmit(std::function<void()>&& func) {
on_submit = std::move(func); on_submit_callbacks.clear();
if (func) {
on_submit_callbacks.emplace_back(std::move(func));
}
}
// Adds an additional callback to perform on queue submission.
void AddOnSubmit(std::function<void()>&& func) {
if (func) {
on_submit_callbacks.emplace_back(std::move(func));
}
} }
/// Send work to a separate thread. /// Send work to a separate thread.
@ -237,12 +253,13 @@ private:
std::unique_ptr<CommandPool> command_pool; std::unique_ptr<CommandPool> command_pool;
VideoCommon::QueryCacheBase<QueryCacheParams>* query_cache = nullptr; VideoCommon::QueryCacheBase<QueryCacheParams>* query_cache = nullptr;
StagingBufferPool* staging_buffer_pool = nullptr;
vk::CommandBuffer current_cmdbuf; vk::CommandBuffer current_cmdbuf;
vk::CommandBuffer current_upload_cmdbuf; vk::CommandBuffer current_upload_cmdbuf;
std::unique_ptr<CommandChunk> chunk; std::unique_ptr<CommandChunk> chunk;
std::function<void()> on_submit; std::vector<std::function<void()>> on_submit_callbacks;
State state; State state;

View file

@ -30,6 +30,11 @@ constexpr VkDeviceSize MIN_STREAM_ALIGNMENT = 256;
// Stream buffer size in bytes // Stream buffer size in bytes
constexpr VkDeviceSize MAX_STREAM_BUFFER_SIZE = 128_MiB; constexpr VkDeviceSize MAX_STREAM_BUFFER_SIZE = 128_MiB;
/// Computes the offset alignment used for stream buffer allocations: the largest of the device's
/// uniform, storage and texel buffer alignments, and never less than MIN_STREAM_ALIGNMENT.
VkDeviceSize GetStreamAlignment(const Device& device) {
    // Start from the floor and raise it to each device-reported requirement in turn.
    VkDeviceSize alignment = MIN_STREAM_ALIGNMENT;
    alignment = (std::max)(alignment, device.GetUniformBufferAlignment());
    alignment = (std::max)(alignment, device.GetStorageBufferAlignment());
    alignment = (std::max)(alignment, device.GetTexelBufferAlignment());
    return alignment;
}
size_t GetStreamBufferSize(const Device& device, VkDeviceSize alignment) { size_t GetStreamBufferSize(const Device& device, VkDeviceSize alignment) {
VkDeviceSize size{0}; VkDeviceSize size{0};
if (device.HasDebuggingToolAttached()) { if (device.HasDebuggingToolAttached()) {
@ -63,8 +68,7 @@ size_t GetStreamBufferSize(const Device& device, VkDeviceSize alignment) {
StagingBufferPool::StagingBufferPool(const Device& device_, MemoryAllocator& memory_allocator_, StagingBufferPool::StagingBufferPool(const Device& device_, MemoryAllocator& memory_allocator_,
Scheduler& scheduler_) Scheduler& scheduler_)
: device{device_}, memory_allocator{memory_allocator_}, scheduler{scheduler_}, : device{device_}, memory_allocator{memory_allocator_}, scheduler{scheduler_},
stream_alignment{std::max<VkDeviceSize>(device_.GetUniformBufferAlignment(), stream_alignment{GetStreamAlignment(device_)},
MIN_STREAM_ALIGNMENT)},
stream_buffer_size{GetStreamBufferSize(device_, stream_alignment)}, stream_buffer_size{GetStreamBufferSize(device_, stream_alignment)},
region_size{stream_buffer_size / StagingBufferPool::NUM_SYNCS} { region_size{stream_buffer_size / StagingBufferPool::NUM_SYNCS} {
VkBufferCreateInfo stream_ci = { VkBufferCreateInfo stream_ci = {
@ -87,9 +91,18 @@ StagingBufferPool::StagingBufferPool(const Device& device_, MemoryAllocator& mem
} }
stream_pointer = stream_buffer.Mapped(); stream_pointer = stream_buffer.Mapped();
ASSERT_MSG(!stream_pointer.empty(), "Stream buffer must be host visible!"); ASSERT_MSG(!stream_pointer.empty(), "Stream buffer must be host visible!");
stream_is_coherent = stream_buffer.IsHostCoherent();
non_coherent_atom_size = std::max<VkDeviceSize>(device.GetNonCoherentAtomSize(),
static_cast<VkDeviceSize>(1));
dirty_begin = stream_buffer_size;
dirty_end = 0;
stream_dirty = false;
scheduler.SetStagingBufferPool(this);
} }
StagingBufferPool::~StagingBufferPool() = default; StagingBufferPool::~StagingBufferPool() {
scheduler.SetStagingBufferPool(nullptr);
}
StagingBufferRef StagingBufferPool::Request(size_t size, MemoryUsage usage, bool deferred) { StagingBufferRef StagingBufferPool::Request(size_t size, MemoryUsage usage, bool deferred) {
if (!deferred && usage == MemoryUsage::Upload && size <= region_size) { if (!deferred && usage == MemoryUsage::Upload && size <= region_size) {
@ -121,9 +134,10 @@ void StagingBufferPool::TickFrame() {
StagingBufferRef StagingBufferPool::GetStreamBuffer(size_t size) { StagingBufferRef StagingBufferPool::GetStreamBuffer(size_t size) {
const size_t alignment = static_cast<size_t>(stream_alignment); const size_t alignment = static_cast<size_t>(stream_alignment);
const size_t aligned_size = Common::AlignUp(size, alignment); const size_t aligned_size = Common::AlignUp(size, alignment);
const bool wraps = iterator + size >= stream_buffer_size; const size_t capacity = static_cast<size_t>(stream_buffer_size);
const bool wraps = iterator + aligned_size > capacity;
const size_t new_iterator = const size_t new_iterator =
wraps ? aligned_size : Common::AlignUp(iterator + size, alignment); wraps ? aligned_size : Common::AlignUp(iterator + aligned_size, alignment);
const size_t begin_region = wraps ? 0 : Region(iterator); const size_t begin_region = wraps ? 0 : Region(iterator);
const size_t last_byte = new_iterator == 0 ? 0 : new_iterator - 1; const size_t last_byte = new_iterator == 0 ? 0 : new_iterator - 1;
const size_t end_region = (std::min)(Region(last_byte) + 1, NUM_SYNCS); const size_t end_region = (std::min)(Region(last_byte) + 1, NUM_SYNCS);
@ -167,6 +181,8 @@ StagingBufferRef StagingBufferPool::GetStreamBuffer(size_t size) {
free_iterator = (std::max)(free_iterator, offset + aligned_size); free_iterator = (std::max)(free_iterator, offset + aligned_size);
} }
TrackStreamWrite(static_cast<VkDeviceSize>(offset), static_cast<VkDeviceSize>(aligned_size));
return StagingBufferRef{ return StagingBufferRef{
.buffer = *stream_buffer, .buffer = *stream_buffer,
.offset = static_cast<VkDeviceSize>(offset), .offset = static_cast<VkDeviceSize>(offset),
@ -177,6 +193,53 @@ StagingBufferRef StagingBufferPool::GetStreamBuffer(size_t size) {
}; };
} }
/// Records that [offset, offset + size) of the stream buffer has been handed out for writing, so
/// that FlushStream can later flush it. Nothing is recorded for coherent memory or empty writes.
/// The range is merged into a single [dirty_begin, dirty_end) interval guarded by stream_mutex.
void StagingBufferPool::TrackStreamWrite(VkDeviceSize offset, VkDeviceSize size) {
    const bool needs_tracking = !stream_is_coherent && size != 0;
    if (!needs_tracking) {
        return;
    }

    // Clamp both ends of the written range to the extent of the stream buffer.
    const VkDeviceSize write_begin = (std::min)(offset, stream_buffer_size);
    const VkDeviceSize write_end = (std::min)(write_begin + size, stream_buffer_size);

    std::scoped_lock lock{stream_mutex};
    if (stream_dirty) {
        // Grow the existing interval so it also covers this write.
        dirty_begin = (std::min)(dirty_begin, write_begin);
        dirty_end = (std::max)(dirty_end, write_end);
    } else {
        // First write since the last flush: the interval is exactly this write.
        dirty_begin = write_begin;
        dirty_end = write_end;
        stream_dirty = true;
    }
}
/// Flushes the stream buffer range accumulated by TrackStreamWrite and resets the tracking state.
/// Does nothing when the stream memory is host coherent or when no write has been tracked since
/// the previous flush.
void StagingBufferPool::FlushStream() {
    if (stream_is_coherent) {
        return;
    }

    VkDeviceSize flush_begin = 0;
    VkDeviceSize flush_end = 0;
    {
        // Snapshot and reset the dirty interval under the lock; the flush itself runs unlocked.
        std::scoped_lock lock{stream_mutex};
        if (!stream_dirty) {
            return;
        }
        flush_begin = dirty_begin;
        flush_end = dirty_end;
        stream_dirty = false;
        // Same "empty" values the constructor initializes the interval with.
        dirty_begin = stream_buffer_size;
        dirty_end = 0;
    }
    if (flush_begin >= flush_end) {
        return;
    }

    // Expand the range to non-coherent atom boundaries.
    const VkDeviceSize atom = non_coherent_atom_size;
    const VkDeviceSize aligned_begin = Common::AlignDown(flush_begin, atom);
    const VkDeviceSize rounded_end = Common::AlignUp(flush_end, atom);
    // Rounding up must not push the end past the stream buffer: if the buffer size is not a
    // multiple of the atom, the rounded end would describe a range outside the buffer.
    // flush_end never exceeds stream_buffer_size, so the clamped range still covers every
    // tracked byte.
    const VkDeviceSize aligned_end = (std::min)(rounded_end, stream_buffer_size);
    const VkDeviceSize flush_size = aligned_end - aligned_begin;
    stream_buffer.FlushRange(aligned_begin, flush_size);
}
bool StagingBufferPool::AreRegionsActive(size_t region_begin, size_t region_end) const { bool StagingBufferPool::AreRegionsActive(size_t region_begin, size_t region_end) const {
const u64 gpu_tick = scheduler.GetMasterSemaphore().KnownGpuTick(); const u64 gpu_tick = scheduler.GetMasterSemaphore().KnownGpuTick();
return std::any_of(sync_ticks.begin() + region_begin, sync_ticks.begin() + region_end, return std::any_of(sync_ticks.begin() + region_begin, sync_ticks.begin() + region_end,

View file

@ -7,6 +7,7 @@
#pragma once #pragma once
#include <climits> #include <climits>
#include <mutex>
#include <vector> #include <vector>
#include "common/common_types.h" #include "common/common_types.h"
@ -30,6 +31,8 @@ struct StagingBufferRef {
class StagingBufferPool { class StagingBufferPool {
public: public:
friend class Scheduler;
static constexpr size_t NUM_SYNCS = 16; static constexpr size_t NUM_SYNCS = 16;
explicit StagingBufferPool(const Device& device, MemoryAllocator& memory_allocator, explicit StagingBufferPool(const Device& device, MemoryAllocator& memory_allocator,
@ -83,6 +86,9 @@ private:
StagingBufferRef GetStreamBuffer(size_t size); StagingBufferRef GetStreamBuffer(size_t size);
void TrackStreamWrite(VkDeviceSize offset, VkDeviceSize size);
void FlushStream();
bool AreRegionsActive(size_t region_begin, size_t region_end) const; bool AreRegionsActive(size_t region_begin, size_t region_end) const;
StagingBufferRef GetStagingBuffer(size_t size, MemoryUsage usage, bool deferred = false); StagingBufferRef GetStagingBuffer(size_t size, MemoryUsage usage, bool deferred = false);
@ -110,6 +116,12 @@ private:
std::span<u8> stream_pointer; std::span<u8> stream_pointer;
VkDeviceSize stream_buffer_size; VkDeviceSize stream_buffer_size;
VkDeviceSize region_size; VkDeviceSize region_size;
bool stream_is_coherent = true;
VkDeviceSize non_coherent_atom_size = 1;
VkDeviceSize dirty_begin = 0;
VkDeviceSize dirty_end = 0;
bool stream_dirty = false;
std::mutex stream_mutex;
size_t iterator = 0; size_t iterator = 0;
size_t used_iterator = 0; size_t used_iterator = 0;

View file

@ -295,6 +295,16 @@ public:
return properties.properties.limits.minStorageBufferOffsetAlignment; return properties.properties.limits.minStorageBufferOffsetAlignment;
} }
/// Returns the device's `minTexelBufferOffsetAlignment` limit, i.e. the alignment required for
/// texel buffer offsets.
VkDeviceSize GetTexelBufferAlignment() const {
    const auto& limits = properties.properties.limits;
    return limits.minTexelBufferOffsetAlignment;
}
/// Returns the device's `nonCoherentAtomSize` limit, used when sizing memory flushes.
VkDeviceSize GetNonCoherentAtomSize() const {
    const auto& limits = properties.properties.limits;
    return limits.nonCoherentAtomSize;
}
/// Returns the maximum range for storage buffers. /// Returns the maximum range for storage buffers.
VkDeviceSize GetMaxStorageBufferRange() const { VkDeviceSize GetMaxStorageBufferRange() const {
return properties.properties.limits.maxStorageBufferRange; return properties.properties.limits.maxStorageBufferRange;

View file

@ -499,8 +499,12 @@ void Image::Release() const noexcept {
} }
void Buffer::Flush() const { void Buffer::Flush() const {
FlushRange(0, VK_WHOLE_SIZE);
}
void Buffer::FlushRange(VkDeviceSize offset, VkDeviceSize size) const {
if (!is_coherent) { if (!is_coherent) {
vmaFlushAllocation(allocator, allocation, 0, VK_WHOLE_SIZE); vmaFlushAllocation(allocator, allocation, offset, size);
} }
} }

View file

@ -772,8 +772,15 @@ public:
return !mapped.empty(); return !mapped.empty();
} }
/// Returns true if the buffer memory is host coherent.
bool IsHostCoherent() const noexcept {
return is_coherent;
}
void Flush() const; void Flush() const;
void FlushRange(VkDeviceSize offset, VkDeviceSize size) const;
void Invalidate() const; void Invalidate() const;
void SetObjectNameEXT(const char* name) const; void SetObjectNameEXT(const char* name) const;