[vk] Fast UBO: fix tracking, resize heuristics, add debug guard (#2695)
Co-authored-by: Ribbit <ribbit@placeholder.com>
Reviewed-on: #2695
Reviewed-by: CamilleLaVey <camillelavey99@gmail.com>
Co-authored-by: Ribbit <ribbit@eden-emu.dev>
Co-committed-by: Ribbit <ribbit@eden-emu.dev>
parent db65f10768
commit 8078990b9b
6 changed files with 76 additions and 33 deletions
@@ -1,3 +1,6 @@
+// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
+
 // SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later
 
@@ -109,6 +112,9 @@ public:
 
     void ReadBlock(DAddr address, void* dest_pointer, size_t size);
     void ReadBlockUnsafe(DAddr address, void* dest_pointer, size_t size);
+#ifdef YUZU_DEBUG
+    bool ReadBlockFastChecked(DAddr address, void* dest_pointer, size_t size);
+#endif
     void WriteBlock(DAddr address, const void* src_pointer, size_t size);
     void WriteBlockUnsafe(DAddr address, const void* src_pointer, size_t size);
 
@@ -1,3 +1,6 @@
+// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
+
 // SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later
 
@@ -467,6 +470,29 @@ void DeviceMemoryManager<Traits>::ReadBlockUnsafe(DAddr address, void* dest_poin
         });
 }
 
+#ifdef YUZU_DEBUG
+template <typename Traits>
+bool DeviceMemoryManager<Traits>::ReadBlockFastChecked(DAddr address, void* dest_pointer,
+                                                       size_t size) {
+    bool success = true;
+    WalkBlock(
+        address, size,
+        [&](size_t copy_amount, DAddr current_vaddr) {
+            LOG_CRITICAL(Render, "DeviceMemory OOB/unmapped: addr=0x{:x} size={}", current_vaddr,
+                         size);
+            std::memset(dest_pointer, 0, copy_amount);
+            success = false;
+        },
+        [&](size_t copy_amount, const u8* const src_ptr) {
+            std::memcpy(dest_pointer, src_ptr, copy_amount);
+        },
+        [&](const std::size_t copy_amount) {
+            dest_pointer = static_cast<u8*>(dest_pointer) + copy_amount;
+        });
+    return success;
+}
+#endif
+
 template <typename Traits>
 void DeviceMemoryManager<Traits>::WriteBlockUnsafe(DAddr address, const void* src_pointer,
                                                    size_t size) {
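Note: below is a minimal, self-contained sketch of the read pattern this debug guard enables; the Memory struct and UploadUniform helper are illustrative stand-ins, not the emulator's real DeviceMemoryManager API. Debug builds take the checked path and assert on an out-of-bounds or unmapped source, release builds keep the original zero-overhead unsafe read.

#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

// Illustrative stand-in for the checked/unsafe read pair added above.
struct Memory {
    std::vector<std::uint8_t> backing = std::vector<std::uint8_t>(1024, 0xAB);

    // Mirrors ReadBlockUnsafe: copies without reporting bad ranges.
    void ReadBlockUnsafe(std::size_t addr, void* dest, std::size_t size) {
        std::memcpy(dest, backing.data() + addr, size);
    }

    // Mirrors ReadBlockFastChecked: zero-fills and returns false on a bad range.
    bool ReadBlockFastChecked(std::size_t addr, void* dest, std::size_t size) {
        if (addr + size > backing.size()) {
            std::memset(dest, 0, size);
            return false;
        }
        std::memcpy(dest, backing.data() + addr, size);
        return true;
    }
};

// Hypothetical caller showing the intended debug/release split.
void UploadUniform(Memory& mem, std::size_t addr, void* staging, std::size_t size) {
#ifdef YUZU_DEBUG
    // Debug: catch OOB/unmapped uploads loudly instead of streaming garbage.
    const bool ok = mem.ReadBlockFastChecked(addr, staging, size);
    assert(ok && "DeviceMemory OOB/unmapped");
#else
    // Release: identical fast path as before the change.
    mem.ReadBlockUnsafe(addr, staging, size);
#endif
}

int main() {
    Memory mem;
    std::uint8_t scratch[64];
    UploadUniform(mem, 0, scratch, sizeof(scratch));
    return 0;
}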
@@ -386,11 +386,10 @@ void BufferCache<P>::BindHostComputeBuffers() {
 template <class P>
 void BufferCache<P>::SetUniformBuffersState(const std::array<u32, NUM_STAGES>& mask,
                                             const UniformBufferSizes* sizes) {
-    if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) {
-        if (channel_state->enabled_uniform_buffer_masks != mask) {
-            if constexpr (IS_OPENGL) {
-                channel_state->fast_bound_uniform_buffers.fill(0);
-            }
+    const bool mask_changed = channel_state->enabled_uniform_buffer_masks != mask;
+    if (mask_changed) {
+        channel_state->fast_bound_uniform_buffers.fill(0);
+        if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) {
             channel_state->dirty_uniform_buffers.fill(~u32{0});
             channel_state->uniform_buffer_binding_sizes.fill({});
         }
@@ -806,7 +805,7 @@ void BufferCache<P>::BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32
                 channel_state->uniform_buffer_binding_sizes[stage][binding_index] != size;
             if (should_fast_bind) {
                 // We only have to bind when the currently bound buffer is not the fast version
-                channel_state->fast_bound_uniform_buffers[stage] |= 1U << binding_index;
+                channel_state->fast_bound_uniform_buffers[stage] |= 1u << binding_index;
                 channel_state->uniform_buffer_binding_sizes[stage][binding_index] = size;
                 runtime.BindFastUniformBuffer(stage, binding_index, size);
             }
@@ -815,13 +814,22 @@ void BufferCache<P>::BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32
                 return;
             }
         }
-        if constexpr (IS_OPENGL) {
-            channel_state->fast_bound_uniform_buffers[stage] |= 1U << binding_index;
-            channel_state->uniform_buffer_binding_sizes[stage][binding_index] = size;
-        }
+        channel_state->fast_bound_uniform_buffers[stage] |= 1u << binding_index;
+        channel_state->uniform_buffer_binding_sizes[stage][binding_index] = size;
         // Stream buffer path to avoid stalling on non-Nvidia drivers or Vulkan
         const std::span<u8> span = runtime.BindMappedUniformBuffer(stage, binding_index, size);
+#ifdef YUZU_DEBUG
+        ASSERT(binding_index < NUM_GRAPHICS_UNIFORM_BUFFERS);
+        ASSERT(span.size() >= size && "UBO stream span too small");
+        if (!device_memory.ReadBlockFastChecked(device_addr, span.data(), size)) {
+            LOG_CRITICAL(Render, "DeviceMemory OOB/unmapped: addr=0x{:x} size={}", device_addr, size);
+            channel_state->fast_bound_uniform_buffers[stage] &= ~(1u << binding_index);
+            ASSERT(false);
+            return;
+        }
+#else
         device_memory.ReadBlockUnsafe(device_addr, span.data(), size);
+#endif
         return;
     }
     // Classic cached path
@@ -830,7 +838,8 @@ void BufferCache<P>::BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32
     }
     // Skip binding if it's not needed and if the bound buffer is not the fast version
     // This exists to avoid instances where the fast buffer is bound and a GPU write happens
-    needs_bind |= HasFastUniformBufferBound(stage, binding_index);
+    const bool was_fast_bound = HasFastUniformBufferBound(stage, binding_index);
+    needs_bind |= was_fast_bound;
     if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) {
         needs_bind |= channel_state->uniform_buffer_binding_sizes[stage][binding_index] != size;
     }
@@ -839,9 +848,6 @@ void BufferCache<P>::BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32
     }
     const u32 offset = buffer.Offset(device_addr);
     if constexpr (IS_OPENGL) {
-        // Fast buffer will be unbound
-        channel_state->fast_bound_uniform_buffers[stage] &= ~(1U << binding_index);
-
         // Mark the index as dirty if offset doesn't match
         const bool is_copy_bind = offset != 0 && !runtime.SupportsNonZeroUniformOffset();
         channel_state->dirty_uniform_buffers[stage] |= (is_copy_bind ? 1U : 0U) << index;
@@ -855,6 +861,7 @@ void BufferCache<P>::BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32
     } else {
         runtime.BindUniformBuffer(buffer, offset, size);
     }
+    channel_state->fast_bound_uniform_buffers[stage] &= ~(1u << binding_index);
 }
 
 template <class P>
@@ -1789,12 +1796,7 @@ std::span<u8> BufferCache<P>::ImmediateBuffer(size_t wanted_capacity) {
 
 template <class P>
 bool BufferCache<P>::HasFastUniformBufferBound(size_t stage, u32 binding_index) const noexcept {
-    if constexpr (IS_OPENGL) {
-        return ((channel_state->fast_bound_uniform_buffers[stage] >> binding_index) & 1) != 0;
-    } else {
-        // Only OpenGL has fast uniform buffers
-        return false;
-    }
+    return ((channel_state->fast_bound_uniform_buffers[stage] >> binding_index) & 1u) != 0;
 }
 
 template <class P>
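Note: the per-stage fast-UBO state is tracked as a single u32 bitmask (one bit per binding), which is what the static_assert added in the next hunk protects. A minimal sketch of the three mask operations the cache performs; the helper names are illustrative, not part of the buffer cache API:

#include <cstdint>
#include <iostream>

// Illustrative helpers for the u32 bitmask behind fast_bound_uniform_buffers.
constexpr std::uint32_t SetFastBound(std::uint32_t mask, std::uint32_t binding) {
    return mask | (1u << binding);        // set on fast bind: |= 1u << binding_index
}
constexpr std::uint32_t ClearFastBound(std::uint32_t mask, std::uint32_t binding) {
    return mask & ~(1u << binding);       // clear on classic bind: &= ~(1u << binding_index)
}
constexpr bool HasFastBound(std::uint32_t mask, std::uint32_t binding) {
    return ((mask >> binding) & 1u) != 0; // test, as in HasFastUniformBufferBound
}

int main() {
    std::uint32_t stage_mask = 0;
    stage_mask = SetFastBound(stage_mask, 3);
    std::cout << HasFastBound(stage_mask, 3) << '\n'; // prints 1
    stage_mask = ClearFastBound(stage_mask, 3);
    std::cout << HasFastBound(stage_mask, 3) << '\n'; // prints 0
    return 0;
}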
@@ -53,6 +53,7 @@ constexpr u32 NUM_COMPUTE_UNIFORM_BUFFERS = 8;
 constexpr u32 NUM_STORAGE_BUFFERS = 16;
 constexpr u32 NUM_TEXTURE_BUFFERS = 32;
 constexpr u32 NUM_STAGES = 5;
+static_assert(NUM_GRAPHICS_UNIFORM_BUFFERS <= 32, "fast bitmask must fit u32");
 
 using UniformBufferSizes = std::array<std::array<u32, NUM_GRAPHICS_UNIFORM_BUFFERS>, NUM_STAGES>;
 using ComputeUniformBufferSizes = std::array<u32, NUM_COMPUTE_UNIFORM_BUFFERS>;
@@ -137,8 +138,8 @@ public:
     u32 written_compute_texture_buffers = 0;
     u32 image_compute_texture_buffers = 0;
 
-    std::array<u32, 16> uniform_cache_hits{};
-    std::array<u32, 16> uniform_cache_shots{};
+    std::array<u32, NUM_GRAPHICS_UNIFORM_BUFFERS> uniform_cache_hits{};
+    std::array<u32, NUM_GRAPHICS_UNIFORM_BUFFERS> uniform_cache_shots{};
 
     u32 uniform_buffer_skip_cache_size = DEFAULT_SKIP_CACHE_SIZE;
 
@@ -25,12 +25,12 @@ namespace {
 
 using namespace Common::Literals;
 
-// Maximum potential alignment of a Vulkan buffer
-constexpr VkDeviceSize MAX_ALIGNMENT = 256;
+// Minimum alignment we want to enforce for the streaming ring
+constexpr VkDeviceSize MIN_STREAM_ALIGNMENT = 256;
 // Stream buffer size in bytes
 constexpr VkDeviceSize MAX_STREAM_BUFFER_SIZE = 128_MiB;
 
-size_t GetStreamBufferSize(const Device& device) {
+size_t GetStreamBufferSize(const Device& device, VkDeviceSize alignment) {
     VkDeviceSize size{0};
     if (device.HasDebuggingToolAttached()) {
         bool found_heap = false;
@@ -53,8 +53,9 @@ size_t GetStreamBufferSize(const Device& device) {
 
     // Clamp to the configured maximum, align up for safety, and ensure a sane minimum so
     // region_size (stream_buffer_size / NUM_SYNCS) never becomes zero.
-    const VkDeviceSize aligned = (std::min)(Common::AlignUp(size, MAX_ALIGNMENT), MAX_STREAM_BUFFER_SIZE);
-    const VkDeviceSize min_size = MAX_ALIGNMENT * StagingBufferPool::NUM_SYNCS;
+    const VkDeviceSize aligned =
+        (std::min)(Common::AlignUp(size, alignment), MAX_STREAM_BUFFER_SIZE);
+    const VkDeviceSize min_size = alignment * StagingBufferPool::NUM_SYNCS;
     return static_cast<size_t>((std::max)(aligned, min_size));
 }
 } // Anonymous namespace
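Note: a rough worked example of the sizing heuristic above, with illustrative numbers (the NUM_SYNCS value is assumed here, it is not quoted in this diff): clamping to MAX_STREAM_BUFFER_SIZE, aligning up, and flooring at alignment * NUM_SYNCS keeps region_size = stream_buffer_size / NUM_SYNCS non-zero even when the detected budget is tiny.

#include <algorithm>
#include <cstdint>
#include <iostream>

int main() {
    using VkDeviceSize = std::uint64_t;
    constexpr VkDeviceSize MAX_STREAM_BUFFER_SIZE = 128ull << 20; // 128 MiB, as in the diff
    constexpr VkDeviceSize NUM_SYNCS = 16;                        // assumed value for illustration
    constexpr VkDeviceSize alignment = 256;                       // MIN_STREAM_ALIGNMENT fallback

    const auto align_up = [](VkDeviceSize v, VkDeviceSize a) { return (v + a - 1) / a * a; };

    // Mirrors GetStreamBufferSize(): clamp, align, then enforce the minimum.
    const auto stream_size = [&](VkDeviceSize detected) {
        const VkDeviceSize aligned = std::min(align_up(detected, alignment), MAX_STREAM_BUFFER_SIZE);
        const VkDeviceSize min_size = alignment * NUM_SYNCS;
        return std::max(aligned, min_size);
    };

    // A tiny detected budget still yields a usable ring with non-zero regions.
    const VkDeviceSize size = stream_size(100);
    std::cout << size << " bytes, region_size = " << size / NUM_SYNCS << '\n'; // 4096 bytes, 256
    return 0;
}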
@@ -62,8 +63,10 @@ size_t GetStreamBufferSize(const Device& device) {
 StagingBufferPool::StagingBufferPool(const Device& device_, MemoryAllocator& memory_allocator_,
                                      Scheduler& scheduler_)
     : device{device_}, memory_allocator{memory_allocator_}, scheduler{scheduler_},
-      stream_buffer_size{GetStreamBufferSize(device)}, region_size{stream_buffer_size /
-                                                                   StagingBufferPool::NUM_SYNCS} {
+      stream_alignment{std::max<VkDeviceSize>(device_.GetUniformBufferAlignment(),
+                                              MIN_STREAM_ALIGNMENT)},
+      stream_buffer_size{GetStreamBufferSize(device_, stream_alignment)},
+      region_size{stream_buffer_size / StagingBufferPool::NUM_SYNCS} {
     VkBufferCreateInfo stream_ci = {
         .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
         .pNext = nullptr,
@@ -116,10 +119,11 @@ void StagingBufferPool::TickFrame() {
 }
 
 StagingBufferRef StagingBufferPool::GetStreamBuffer(size_t size) {
-    const size_t aligned_size = Common::AlignUp(size, MAX_ALIGNMENT);
+    const size_t alignment = static_cast<size_t>(stream_alignment);
+    const size_t aligned_size = Common::AlignUp(size, alignment);
     const bool wraps = iterator + size >= stream_buffer_size;
     const size_t new_iterator =
-        wraps ? aligned_size : Common::AlignUp(iterator + size, MAX_ALIGNMENT);
+        wraps ? aligned_size : Common::AlignUp(iterator + size, alignment);
     const size_t begin_region = wraps ? 0 : Region(iterator);
     const size_t last_byte = new_iterator == 0 ? 0 : new_iterator - 1;
     const size_t end_region = (std::min)(Region(last_byte) + 1, NUM_SYNCS);
@@ -145,7 +149,7 @@ StagingBufferRef StagingBufferPool::GetStreamBuffer(size_t size) {
                      current_tick);
         used_iterator = 0;
         iterator = 0;
-        free_iterator = size;
+        free_iterator = aligned_size;
         const size_t head_last_byte = aligned_size == 0 ? 0 : aligned_size - 1;
         const size_t head_end_region = (std::min)(Region(head_last_byte) + 1, NUM_SYNCS);
         if (AreRegionsActive(0, head_end_region)) {
@@ -160,7 +164,7 @@ StagingBufferRef StagingBufferPool::GetStreamBuffer(size_t size) {
     iterator = new_iterator;
 
     if (!wraps) {
-        free_iterator = (std::max)(free_iterator, offset + size);
+        free_iterator = (std::max)(free_iterator, offset + aligned_size);
     }
 
     return StagingBufferRef{
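Note: free_iterator now advances by aligned_size rather than the raw request size, presumably because the next allocation starts at the aligned offset, so the padding bytes belong to the reservation too. A simplified model of the ring arithmetic, without fences or region tracking (names and structure are illustrative only):

#include <algorithm>
#include <cstddef>
#include <iostream>

// Simplified model of the stream-buffer ring arithmetic (no syncs/regions).
struct Ring {
    std::size_t capacity;
    std::size_t alignment;
    std::size_t iterator = 0;      // next allocation offset
    std::size_t free_iterator = 0; // highest offset handed out so far

    std::size_t AlignUp(std::size_t v) const { return (v + alignment - 1) / alignment * alignment; }

    std::size_t Allocate(std::size_t size) {
        const std::size_t aligned_size = AlignUp(size);
        const bool wraps = iterator + size >= capacity;
        const std::size_t offset = wraps ? 0 : iterator;
        iterator = wraps ? aligned_size : AlignUp(iterator + size);
        if (wraps) {
            free_iterator = aligned_size; // the ring head is reserved again
        } else {
            free_iterator = std::max(free_iterator, offset + aligned_size);
        }
        return offset;
    }
};

int main() {
    Ring ring{1024, 256};
    std::cout << ring.Allocate(100) << ' ' << ring.free_iterator << '\n'; // 0 256
    std::cout << ring.Allocate(100) << ' ' << ring.free_iterator << '\n'; // 256 512
    return 0;
}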
@@ -1,3 +1,6 @@
+// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
+
 // SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project
 // SPDX-License-Identifier: GPL-3.0-or-later
 
@@ -102,6 +105,7 @@ private:
     MemoryAllocator& memory_allocator;
     Scheduler& scheduler;
 
+    VkDeviceSize stream_alignment;
     vk::Buffer stream_buffer;
     std::span<u8> stream_pointer;
     VkDeviceSize stream_buffer_size;