From 94ffebe6eaad2ff4ce4041e303908714969732f5 Mon Sep 17 00:00:00 2001 From: Shinmegumi Date: Sun, 3 Aug 2025 02:28:55 +0200 Subject: [PATCH] [vulkan] Enable Reset Query Pool per spec Per the Vulkan spec, every query must be reset before it is used, which requires vkCmdResetQueryPool. That command must be recorded outside a render pass, so the scheduler has to leave the current render pass in order to reset the query. Afterwards the render pass must be resumed, which is done with scheduler.RequestRenderpass(texture_cache.GetFramebuffer()). --- .../renderer_vulkan/vk_graphics_pipeline.cpp | 1 - .../renderer_vulkan/vk_query_cache.cpp | 37 +++-- .../renderer_vulkan/vk_query_cache.h | 5 +- .../renderer_vulkan/vk_rasterizer.cpp | 2 +- .../renderer_vulkan/vk_rasterizer.h | 2 - .../renderer_vulkan/vk_texture_cache.cpp | 140 ++++++++++-------- .../vulkan_common/vulkan_wrapper.cpp | 1 + src/video_core/vulkan_common/vulkan_wrapper.h | 5 +- 8 files changed, 110 insertions(+), 83 deletions(-) diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp index 6c40ff1bab..e73e885e66 100644 --- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp @@ -502,7 +502,6 @@ bool GraphicsPipeline::ConfigureImpl(bool is_indexed) { void GraphicsPipeline::ConfigureDraw(const RescalingPushConstant& rescaling, const RenderAreaPushConstant& render_area) { scheduler.RequestRenderpass(texture_cache.GetFramebuffer()); - if (!is_built.load(std::memory_order::relaxed)) { // Wait for the pipeline to be built scheduler.Record([this](vk::CommandBuffer) { diff --git a/src/video_core/renderer_vulkan/vk_query_cache.cpp b/src/video_core/renderer_vulkan/vk_query_cache.cpp index 1f71bc68c6..d6ecc2b65c 100644 --- a/src/video_core/renderer_vulkan/vk_query_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_query_cache.cpp @@ -13,7 +13,7 @@ #include #include #include - +#include "video_core/renderer_vulkan/vk_texture_cache.h" 
#include "common/bit_util.h" #include "common/common_types.h" #include "video_core/engines/draw_manager.h" @@ -116,11 +116,11 @@ struct HostSyncValues { class SamplesStreamer : public BaseStreamer { public: explicit SamplesStreamer(size_t id_, QueryCacheRuntime& runtime_, - VideoCore::RasterizerInterface* rasterizer_, const Device& device_, + VideoCore::RasterizerInterface* rasterizer_, TextureCache& texture_cache_, const Device& device_, Scheduler& scheduler_, const MemoryAllocator& memory_allocator_, ComputePassDescriptorQueue& compute_pass_descriptor_queue, DescriptorPool& descriptor_pool) - : BaseStreamer(id_), runtime{runtime_}, rasterizer{rasterizer_}, device{device_}, + : BaseStreamer(id_), texture_cache{texture_cache_}, runtime{runtime_}, rasterizer{rasterizer_}, device{device_}, scheduler{scheduler_}, memory_allocator{memory_allocator_} { current_bank = nullptr; current_query = nullptr; @@ -153,16 +153,33 @@ public: if (has_started) { return; } + ReserveHostQuery(); + + // Ensure outside render pass + scheduler.RequestOutsideRenderPassOperationContext(); + + // Reset query pool outside render pass scheduler.Record([query_pool = current_query_pool, - query_index = current_bank_slot](vk::CommandBuffer cmdbuf) { + query_index = current_bank_slot](vk::CommandBuffer cmdbuf) { + cmdbuf.ResetQueryPool(query_pool, static_cast(query_index), 1); + }); + + // Manually restart the render pass (required for vkCmdClearAttachments, etc.) + scheduler.RequestRenderpass(texture_cache.GetFramebuffer()); + + // Begin query inside the newly started render pass + scheduler.Record([query_pool = current_query_pool, + query_index = current_bank_slot](vk::CommandBuffer cmdbuf) { const bool use_precise = Settings::IsGPULevelHigh(); cmdbuf.BeginQuery(query_pool, static_cast(query_index), use_precise ? 
VK_QUERY_CONTROL_PRECISE_BIT : 0); }); + has_started = true; } + void PauseCounter() override { if (!has_started) { return; @@ -404,7 +421,7 @@ private: size_slots -= amount; } } - + TextureCache& texture_cache; template void ApplyBanksWideOp(std::vector& queries, Func&& func) { std::conditional_t>, @@ -1163,13 +1180,13 @@ struct QueryCacheRuntimeImpl { const MemoryAllocator& memory_allocator_, Scheduler& scheduler_, StagingBufferPool& staging_pool_, ComputePassDescriptorQueue& compute_pass_descriptor_queue, - DescriptorPool& descriptor_pool) + DescriptorPool& descriptor_pool, TextureCache& texture_cache_) : rasterizer{rasterizer_}, device_memory{device_memory_}, buffer_cache{buffer_cache_}, device{device_}, memory_allocator{memory_allocator_}, scheduler{scheduler_}, staging_pool{staging_pool_}, guest_streamer(0, runtime), sample_streamer(static_cast(QueryType::ZPassPixelCount64), runtime, rasterizer, - device, scheduler, memory_allocator, compute_pass_descriptor_queue, - descriptor_pool), + texture_cache_, device, scheduler, memory_allocator, + compute_pass_descriptor_queue, descriptor_pool), tfb_streamer(static_cast(QueryType::StreamingByteCount), runtime, device, scheduler, memory_allocator, staging_pool), primitives_succeeded_streamer( @@ -1240,10 +1257,10 @@ QueryCacheRuntime::QueryCacheRuntime(VideoCore::RasterizerInterface* rasterizer, const MemoryAllocator& memory_allocator_, Scheduler& scheduler_, StagingBufferPool& staging_pool_, ComputePassDescriptorQueue& compute_pass_descriptor_queue, - DescriptorPool& descriptor_pool) { + DescriptorPool& descriptor_pool, TextureCache& texture_cache_) { impl = std::make_unique( *this, rasterizer, device_memory_, buffer_cache_, device_, memory_allocator_, scheduler_, - staging_pool_, compute_pass_descriptor_queue, descriptor_pool); + staging_pool_, compute_pass_descriptor_queue, descriptor_pool, texture_cache_); } void QueryCacheRuntime::Bind3DEngine(Maxwell3D* maxwell3d) { diff --git 
a/src/video_core/renderer_vulkan/vk_query_cache.h b/src/video_core/renderer_vulkan/vk_query_cache.h index f6151123ec..b8dae9bc2d 100644 --- a/src/video_core/renderer_vulkan/vk_query_cache.h +++ b/src/video_core/renderer_vulkan/vk_query_cache.h @@ -7,7 +7,7 @@ #include "video_core/query_cache/query_cache_base.h" #include "video_core/renderer_vulkan/vk_buffer_cache.h" - +#include "video_core/renderer_vulkan/vk_texture_cache.h" namespace VideoCore { class RasterizerInterface; } @@ -17,7 +17,6 @@ class StreamerInterface; } namespace Vulkan { - class Device; class Scheduler; class StagingBufferPool; @@ -32,7 +31,7 @@ public: const MemoryAllocator& memory_allocator_, Scheduler& scheduler_, StagingBufferPool& staging_pool_, ComputePassDescriptorQueue& compute_pass_descriptor_queue, - DescriptorPool& descriptor_pool); + DescriptorPool& descriptor_pool, TextureCache& texture_cache_); ~QueryCacheRuntime(); template diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index dbe2ce66c9..c511a51720 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -189,7 +189,7 @@ RasterizerVulkan::RasterizerVulkan(Core::Frontend::EmuWindow& emu_window_, Tegra guest_descriptor_queue, compute_pass_descriptor_queue, descriptor_pool), buffer_cache(device_memory, buffer_cache_runtime), query_cache_runtime(this, device_memory, buffer_cache, device, memory_allocator, scheduler, - staging_pool, compute_pass_descriptor_queue, descriptor_pool), + staging_pool, compute_pass_descriptor_queue, descriptor_pool, texture_cache), query_cache(gpu, *this, device_memory, query_cache_runtime), pipeline_cache(device_memory, device, scheduler, descriptor_pool, guest_descriptor_queue, render_pass_cache, buffer_cache, texture_cache, gpu.ShaderNotify()), diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index ea032635c2..30780b9cbd 
100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h @@ -136,7 +136,6 @@ public: void BindChannel(Tegra::Control::ChannelState& channel) override; void ReleaseChannel(s32 channel_id) override; - std::optional AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr, u32 pixel_stride); @@ -147,7 +146,6 @@ private: 0x0100E95004038000ULL, // XC2 0x0100A6301214E000ULL, // FE:Engage }; - static constexpr size_t MAX_TEXTURES = 192; static constexpr size_t MAX_IMAGES = 48; static constexpr size_t MAX_IMAGE_VIEWS = MAX_TEXTURES + MAX_IMAGES; diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index 9259639107..963e7b1c40 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -507,58 +507,84 @@ TransformBufferCopies(std::span copies, size_t bu return value; } } +struct RangedBarrierRange { + u32 min_mip = std::numeric_limits::max(); + u32 max_mip = std::numeric_limits::min(); + u32 min_layer = std::numeric_limits::max(); + u32 max_layer = std::numeric_limits::min(); + void AddLayers(const VkImageSubresourceLayers& layers) { + min_mip = std::min(min_mip, layers.mipLevel); + max_mip = std::max(max_mip, layers.mipLevel + 1); + min_layer = std::min(min_layer, layers.baseArrayLayer); + max_layer = std::max(max_layer, layers.baseArrayLayer + layers.layerCount); + } + + VkImageSubresourceRange SubresourceRange(VkImageAspectFlags aspect_mask) const noexcept { + return VkImageSubresourceRange{ + .aspectMask = aspect_mask, + .baseMipLevel = min_mip, + .levelCount = max_mip - min_mip, + .baseArrayLayer = min_layer, + .layerCount = max_layer - min_layer, + }; + } +}; void CopyBufferToImage(vk::CommandBuffer cmdbuf, VkBuffer src_buffer, VkImage image, VkImageAspectFlags aspect_mask, bool is_initialized, std::span copies) { static constexpr VkAccessFlags 
WRITE_ACCESS_FLAGS = - VK_ACCESS_SHADER_WRITE_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | - VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT; + VK_ACCESS_SHADER_WRITE_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | + VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT; static constexpr VkAccessFlags READ_ACCESS_FLAGS = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT; + + // Compute exact mip/layer range being written to + RangedBarrierRange range; + for (const auto& region : copies) { + range.AddLayers(region.imageSubresource); + } + const VkImageSubresourceRange subresource_range = range.SubresourceRange(aspect_mask); + const VkImageMemoryBarrier read_barrier{ - .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, - .pNext = nullptr, - .srcAccessMask = WRITE_ACCESS_FLAGS, - .dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, - .oldLayout = is_initialized ? VK_IMAGE_LAYOUT_GENERAL : VK_IMAGE_LAYOUT_UNDEFINED, - .newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, - .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - .image = image, - .subresourceRange{ - .aspectMask = aspect_mask, - .baseMipLevel = 0, - .levelCount = VK_REMAINING_MIP_LEVELS, - .baseArrayLayer = 0, - .layerCount = VK_REMAINING_ARRAY_LAYERS, - }, + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = WRITE_ACCESS_FLAGS, + .dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, + .oldLayout = is_initialized ? 
VK_IMAGE_LAYOUT_GENERAL : VK_IMAGE_LAYOUT_UNDEFINED, + .newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = image, + .subresourceRange = subresource_range, }; + const VkImageMemoryBarrier write_barrier{ - .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, - .pNext = nullptr, - .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, - .dstAccessMask = WRITE_ACCESS_FLAGS | READ_ACCESS_FLAGS, - .oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, - .newLayout = VK_IMAGE_LAYOUT_GENERAL, - .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - .image = image, - .subresourceRange{ - .aspectMask = aspect_mask, - .baseMipLevel = 0, - .levelCount = VK_REMAINING_MIP_LEVELS, - .baseArrayLayer = 0, - .layerCount = VK_REMAINING_ARRAY_LAYERS, - }, + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, + .dstAccessMask = WRITE_ACCESS_FLAGS | READ_ACCESS_FLAGS, + .oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + .newLayout = VK_IMAGE_LAYOUT_GENERAL, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = image, + .subresourceRange = subresource_range, }; - cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, 0, + + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT | + VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT | + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, 0, read_barrier); cmdbuf.CopyBufferToImage(src_buffer, image, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, copies); // TODO: Move this to another API - cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, - write_barrier); + cmdbuf.PipelineBarrier( + VK_PIPELINE_STAGE_TRANSFER_BIT, + VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT | + 
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT | + VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, + 0, nullptr, nullptr, write_barrier); } [[nodiscard]] VkImageBlit MakeImageBlit(const Region2D& dst_region, const Region2D& src_region, @@ -651,29 +677,7 @@ void TryTransformSwizzleIfNeeded(PixelFormat format, std::array::max(); - u32 max_mip = std::numeric_limits::min(); - u32 min_layer = std::numeric_limits::max(); - u32 max_layer = std::numeric_limits::min(); - void AddLayers(const VkImageSubresourceLayers& layers) { - min_mip = std::min(min_mip, layers.mipLevel); - max_mip = std::max(max_mip, layers.mipLevel + 1); - min_layer = std::min(min_layer, layers.baseArrayLayer); - max_layer = std::max(max_layer, layers.baseArrayLayer + layers.layerCount); - } - - VkImageSubresourceRange SubresourceRange(VkImageAspectFlags aspect_mask) const noexcept { - return VkImageSubresourceRange{ - .aspectMask = aspect_mask, - .baseMipLevel = min_mip, - .levelCount = max_mip - min_mip, - .baseArrayLayer = min_layer, - .layerCount = max_layer - min_layer, - }; - } -}; [[nodiscard]] VkFormat Format(Shader::ImageFormat format) { switch (format) { @@ -1457,12 +1461,18 @@ void TextureCacheRuntime::CopyImage(Image& dst, Image& src, .subresourceRange = dst_range.SubresourceRange(aspect_mask), }, }; - cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, - 0, {}, {}, pre_barriers); + cmdbuf.PipelineBarrier( + VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT | + VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT | VK_PIPELINE_STAGE_TRANSFER_BIT, + VK_PIPELINE_STAGE_TRANSFER_BIT, + 0, nullptr, nullptr, pre_barriers); cmdbuf.CopyImage(src_image, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, dst_image, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, vk_copies); - cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, - 0, {}, {}, post_barriers); + cmdbuf.PipelineBarrier( + VK_PIPELINE_STAGE_TRANSFER_BIT, + 
VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT | + VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT | VK_PIPELINE_STAGE_TRANSFER_BIT, + 0, nullptr, nullptr, post_barriers); }); } @@ -1546,7 +1556,7 @@ void Image::UploadMemory(VkBuffer buffer, VkDeviceSize offset, } // Handle MSAA upload if necessary - /* WARNING, TODO: This code uses some hacks, besides being fundamentally ugly + /* WARNING, TODO: This code uses some hacks, besides being fundamentally ugly since tropic didn't want to touch it for a long time, so it needs a rewrite from someone better than me at vulkan.*/ if (info.num_samples > 1 && runtime->CanUploadMSAA()) { // Only use MSAA copy pass for color formats @@ -2352,7 +2362,7 @@ void TextureCacheRuntime::TransitionImageLayout(Image& image) { }; scheduler.RequestOutsideRenderPassOperationContext(); scheduler.Record([barrier](vk::CommandBuffer cmdbuf) { - cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, barrier); }); } diff --git a/src/video_core/vulkan_common/vulkan_wrapper.cpp b/src/video_core/vulkan_common/vulkan_wrapper.cpp index 5d80531b47..106630182f 100644 --- a/src/video_core/vulkan_common/vulkan_wrapper.cpp +++ b/src/video_core/vulkan_common/vulkan_wrapper.cpp @@ -120,6 +120,7 @@ void Load(VkDevice device, DeviceDispatch& dld) noexcept { X(vkCmdEndConditionalRenderingEXT); X(vkCmdEndQuery); X(vkCmdEndRenderPass); + X(vkCmdResetQueryPool); X(vkCmdEndTransformFeedbackEXT); X(vkCmdEndDebugUtilsLabelEXT); X(vkCmdFillBuffer); diff --git a/src/video_core/vulkan_common/vulkan_wrapper.h b/src/video_core/vulkan_common/vulkan_wrapper.h index 5e99444e61..8fd0bff6af 100644 --- a/src/video_core/vulkan_common/vulkan_wrapper.h +++ b/src/video_core/vulkan_common/vulkan_wrapper.h @@ -219,6 +219,7 @@ struct DeviceDispatch : InstanceDispatch { PFN_vkCmdEndConditionalRenderingEXT vkCmdEndConditionalRenderingEXT{}; 
PFN_vkCmdEndDebugUtilsLabelEXT vkCmdEndDebugUtilsLabelEXT{}; PFN_vkCmdEndQuery vkCmdEndQuery{}; + PFN_vkCmdResetQueryPool vkCmdResetQueryPool{}; PFN_vkCmdEndRenderPass vkCmdEndRenderPass{}; PFN_vkCmdEndTransformFeedbackEXT vkCmdEndTransformFeedbackEXT{}; PFN_vkCmdFillBuffer vkCmdFillBuffer{}; @@ -1137,7 +1138,9 @@ public: VkCommandBuffer operator*() const noexcept { return handle; } - + void ResetQueryPool(VkQueryPool query_pool, uint32_t first, uint32_t count) const noexcept { + dld->vkCmdResetQueryPool(handle, query_pool, first, count); + } void Begin(const VkCommandBufferBeginInfo& begin_info) const { Check(dld->vkBeginCommandBuffer(handle, &begin_info)); }