forked from eden-emu/eden

Merge pull request #11225 from FernandoS27/no-laxatives-in-santas-cookies

Y.F.C: Rework the Query Cache.

commit 854457a392
45 changed files with 3571 additions and 384 deletions
@@ -130,13 +130,17 @@ void LogSettings() {
     log_path("DataStorage_SDMCDir", Common::FS::GetYuzuPath(Common::FS::YuzuPath::SDMCDir));
 }
 
+void UpdateGPUAccuracy() {
+    values.current_gpu_accuracy = values.gpu_accuracy.GetValue();
+}
+
 bool IsGPULevelExtreme() {
-    return values.gpu_accuracy.GetValue() == GpuAccuracy::Extreme;
+    return values.current_gpu_accuracy == GpuAccuracy::Extreme;
 }
 
 bool IsGPULevelHigh() {
-    return values.gpu_accuracy.GetValue() == GpuAccuracy::Extreme ||
-           values.gpu_accuracy.GetValue() == GpuAccuracy::High;
+    return values.current_gpu_accuracy == GpuAccuracy::Extreme ||
+           values.current_gpu_accuracy == GpuAccuracy::High;
 }
 
 bool IsFastmemEnabled() {
@@ -307,6 +307,7 @@ struct Values {
                                                       Specialization::Default,
                                                       true,
                                                       true};
+    GpuAccuracy current_gpu_accuracy{GpuAccuracy::High};
     SwitchableSetting<AnisotropyMode, true> max_anisotropy{
         linkage,          AnisotropyMode::Automatic, AnisotropyMode::Automatic, AnisotropyMode::X16,
         "max_anisotropy", Category::RendererAdvanced};
@@ -522,6 +523,7 @@ struct Values {
 
 extern Values values;
 
+void UpdateGPUAccuracy();
 bool IsGPULevelExtreme();
 bool IsGPULevelHigh();
 
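The two hunks above stop reading the switchable `gpu_accuracy` setting on every check: the value is snapshotted into a plain `current_gpu_accuracy` field by `UpdateGPUAccuracy()`, and `IsGPULevelExtreme()`/`IsGPULevelHigh()` become simple enum comparisons. The snapshot is refreshed at GPU start-up and at the end of each command list (see the gpu.cpp hunk further down). A standalone sketch of that pattern, with the yuzu-specific setting types replaced by illustrative stand-ins:

// Sketch only: cache a frequently-read setting into a plain field so hot-path
// checks become a simple compare. Names and types here are illustrative.
#include <atomic>

enum class GpuAccuracy { Normal, High, Extreme };

struct Values {
    std::atomic<GpuAccuracy> gpu_accuracy{GpuAccuracy::High}; // stand-in for SwitchableSetting
    GpuAccuracy current_gpu_accuracy{GpuAccuracy::High};      // cached copy, as in the diff
} values;

void UpdateGPUAccuracy() {
    // Called at well-defined points (GPU start, end of command list).
    values.current_gpu_accuracy = values.gpu_accuracy.load();
}

bool IsGPULevelHigh() {
    // No per-call lookup of the live setting anymore.
    return values.current_gpu_accuracy == GpuAccuracy::Extreme ||
           values.current_gpu_accuracy == GpuAccuracy::High;
}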
@@ -95,6 +95,12 @@ add_library(video_core STATIC
     memory_manager.h
     precompiled_headers.h
     pte_kind.h
+    query_cache/bank_base.h
+    query_cache/query_base.h
+    query_cache/query_cache_base.h
+    query_cache/query_cache.h
+    query_cache/query_stream.h
+    query_cache/types.h
     query_cache.h
     rasterizer_accelerated.cpp
     rasterizer_accelerated.h
@@ -272,13 +272,19 @@ std::pair<typename P::Buffer*, u32> BufferCache<P>::ObtainBuffer(GPUVAddr gpu_ad
     if (!cpu_addr) {
         return {&slot_buffers[NULL_BUFFER_ID], 0};
     }
-    const BufferId buffer_id = FindBuffer(*cpu_addr, size);
+    return ObtainCPUBuffer(*cpu_addr, size, sync_info, post_op);
+}
+
+template <class P>
+std::pair<typename P::Buffer*, u32> BufferCache<P>::ObtainCPUBuffer(
+    VAddr cpu_addr, u32 size, ObtainBufferSynchronize sync_info, ObtainBufferOperation post_op) {
+    const BufferId buffer_id = FindBuffer(cpu_addr, size);
     Buffer& buffer = slot_buffers[buffer_id];
 
     // synchronize op
     switch (sync_info) {
     case ObtainBufferSynchronize::FullSynchronize:
-        SynchronizeBuffer(buffer, *cpu_addr, size);
+        SynchronizeBuffer(buffer, cpu_addr, size);
         break;
     default:
         break;
@@ -286,11 +292,11 @@ std::pair<typename P::Buffer*, u32> BufferCache<P>::ObtainBuffer(GPUVAddr gpu_ad
 
     switch (post_op) {
     case ObtainBufferOperation::MarkAsWritten:
-        MarkWrittenBuffer(buffer_id, *cpu_addr, size);
+        MarkWrittenBuffer(buffer_id, cpu_addr, size);
         break;
     case ObtainBufferOperation::DiscardWrite: {
-        VAddr cpu_addr_start = Common::AlignDown(*cpu_addr, 64);
-        VAddr cpu_addr_end = Common::AlignUp(*cpu_addr + size, 64);
+        VAddr cpu_addr_start = Common::AlignDown(cpu_addr, 64);
+        VAddr cpu_addr_end = Common::AlignUp(cpu_addr + size, 64);
         IntervalType interval{cpu_addr_start, cpu_addr_end};
         ClearDownload(interval);
         common_ranges.subtract(interval);
@@ -300,7 +306,7 @@ std::pair<typename P::Buffer*, u32> BufferCache<P>::ObtainBuffer(GPUVAddr gpu_ad
         break;
     }
 
-    return {&buffer, buffer.Offset(*cpu_addr)};
+    return {&buffer, buffer.Offset(cpu_addr)};
 }
 
 template <class P>
@@ -295,6 +295,10 @@ public:
     [[nodiscard]] std::pair<Buffer*, u32> ObtainBuffer(GPUVAddr gpu_addr, u32 size,
                                                        ObtainBufferSynchronize sync_info,
                                                        ObtainBufferOperation post_op);
+
+    [[nodiscard]] std::pair<Buffer*, u32> ObtainCPUBuffer(VAddr gpu_addr, u32 size,
+                                                          ObtainBufferSynchronize sync_info,
+                                                          ObtainBufferOperation post_op);
     void FlushCachedWrites();
 
     /// Return true when there are uncommitted buffers to be downloaded
@@ -335,6 +339,14 @@ public:
 
     [[nodiscard]] std::pair<Buffer*, u32> GetDrawIndirectBuffer();
 
+    template <typename Func>
+    void BufferOperations(Func&& func) {
+        do {
+            channel_state->has_deleted_buffers = false;
+            func();
+        } while (channel_state->has_deleted_buffers);
+    }
+
     std::recursive_mutex mutex;
     Runtime& runtime;
 
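The new `BufferOperations` helper re-runs a batch of buffer-cache work whenever the callback ends up deleting buffers part-way through, since that invalidates anything the caller grabbed earlier in the pass. A hedged usage sketch with `channel_state` and the calling code reduced to stand-ins:

// Hypothetical, reduced mirror of the template added above.
struct ChannelStateSketch {
    bool has_deleted_buffers = false;
};

struct BufferCacheSketch {
    ChannelStateSketch channel_state; // the real cache goes through channel_state->

    template <typename Func>
    void BufferOperations(Func&& func) {
        // Retry until a full pass completes without any buffer being deleted underneath it.
        do {
            channel_state.has_deleted_buffers = false;
            func();
        } while (channel_state.has_deleted_buffers);
    }
};

void FlushQueries(BufferCacheSketch& buffer_cache) {
    buffer_cache.BufferOperations([&] {
        // Obtain staging buffers, copy query results, etc.; if any step deletes a
        // buffer it sets has_deleted_buffers and the whole lambda runs again.
    });
}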
@@ -51,7 +51,7 @@ public:
     virtual void CreateChannel(Tegra::Control::ChannelState& channel);
 
     /// Bind a channel for execution.
-    void BindToChannel(s32 id);
+    virtual void BindToChannel(s32 id);
 
     /// Erase channel's state.
     void EraseChannel(s32 id);
@@ -46,6 +46,7 @@ public:
     };
 
     struct IndirectParams {
+        bool is_byte_count;
         bool is_indexed;
         bool include_count;
         GPUVAddr count_start_address;
@@ -20,8 +20,6 @@
 
 namespace Tegra::Engines {
 
-using VideoCore::QueryType;
-
 /// First register id that is actually a Macro call.
 constexpr u32 MacroRegistersStart = 0xE00;
 
@@ -500,27 +498,21 @@ void Maxwell3D::StampQueryResult(u64 payload, bool long_query) {
 }
 
 void Maxwell3D::ProcessQueryGet() {
+    VideoCommon::QueryPropertiesFlags flags{};
+    if (regs.report_semaphore.query.short_query == 0) {
+        flags |= VideoCommon::QueryPropertiesFlags::HasTimeout;
+    }
+    const GPUVAddr sequence_address{regs.report_semaphore.Address()};
+    const VideoCommon::QueryType query_type =
+        static_cast<VideoCommon::QueryType>(regs.report_semaphore.query.report.Value());
+    const u32 payload = regs.report_semaphore.payload;
+    const u32 subreport = regs.report_semaphore.query.sub_report;
     switch (regs.report_semaphore.query.operation) {
     case Regs::ReportSemaphore::Operation::Release:
         if (regs.report_semaphore.query.short_query != 0) {
-            const GPUVAddr sequence_address{regs.report_semaphore.Address()};
-            const u32 payload = regs.report_semaphore.payload;
-            std::function<void()> operation([this, sequence_address, payload] {
-                memory_manager.Write<u32>(sequence_address, payload);
-            });
-            rasterizer->SignalFence(std::move(operation));
-        } else {
-            struct LongQueryResult {
-                u64_le value;
-                u64_le timestamp;
-            };
-            const GPUVAddr sequence_address{regs.report_semaphore.Address()};
-            const u32 payload = regs.report_semaphore.payload;
-            [this, sequence_address, payload] {
-                memory_manager.Write<u64>(sequence_address + sizeof(u64), system.GPU().GetTicks());
-                memory_manager.Write<u64>(sequence_address, payload);
-            }();
+            flags |= VideoCommon::QueryPropertiesFlags::IsAFence;
         }
+        rasterizer->Query(sequence_address, query_type, flags, payload, subreport);
         break;
     case Regs::ReportSemaphore::Operation::Acquire:
         // TODO(Blinkhawk): Under this operation, the GPU waits for the CPU to write a value that
@@ -528,11 +520,7 @@ void Maxwell3D::ProcessQueryGet() {
         UNIMPLEMENTED_MSG("Unimplemented query operation ACQUIRE");
         break;
     case Regs::ReportSemaphore::Operation::ReportOnly:
-        if (const std::optional<u64> result = GetQueryResult()) {
-            // If the query returns an empty optional it means it's cached and deferred.
-            // In this case we have a non-empty result, so we stamp it immediately.
-            StampQueryResult(*result, regs.report_semaphore.query.short_query == 0);
-        }
+        rasterizer->Query(sequence_address, query_type, flags, payload, subreport);
         break;
     case Regs::ReportSemaphore::Operation::Trap:
         UNIMPLEMENTED_MSG("Unimplemented query operation TRAP");
@@ -544,6 +532,10 @@ void Maxwell3D::ProcessQueryGet() {
 }
 
 void Maxwell3D::ProcessQueryCondition() {
+    if (rasterizer->AccelerateConditionalRendering()) {
+        execute_on = true;
+        return;
+    }
     const GPUVAddr condition_address{regs.render_enable.Address()};
     switch (regs.render_enable_override) {
     case Regs::RenderEnable::Override::AlwaysRender:
@@ -553,10 +545,6 @@ void Maxwell3D::ProcessQueryCondition() {
         execute_on = false;
         break;
     case Regs::RenderEnable::Override::UseRenderEnable: {
-        if (rasterizer->AccelerateConditionalRendering()) {
-            execute_on = true;
-            return;
-        }
         switch (regs.render_enable.mode) {
         case Regs::RenderEnable::Mode::True: {
             execute_on = true;
@@ -598,15 +586,9 @@ void Maxwell3D::ProcessQueryCondition() {
 }
 
 void Maxwell3D::ProcessCounterReset() {
-#if ANDROID
-    if (!Settings::IsGPULevelHigh()) {
-        // This is problematic on Android, disable on GPU Normal.
-        return;
-    }
-#endif
     switch (regs.clear_report_value) {
     case Regs::ClearReport::ZPassPixelCount:
-        rasterizer->ResetCounter(QueryType::SamplesPassed);
+        rasterizer->ResetCounter(VideoCommon::QueryType::ZPassPixelCount64);
         break;
     default:
         LOG_DEBUG(Render_OpenGL, "Unimplemented counter reset={}", regs.clear_report_value);
@@ -620,28 +602,6 @@ void Maxwell3D::ProcessSyncPoint() {
     rasterizer->SignalSyncPoint(sync_point);
 }
 
-std::optional<u64> Maxwell3D::GetQueryResult() {
-    switch (regs.report_semaphore.query.report) {
-    case Regs::ReportSemaphore::Report::Payload:
-        return regs.report_semaphore.payload;
-    case Regs::ReportSemaphore::Report::ZPassPixelCount64:
-#if ANDROID
-        if (!Settings::IsGPULevelHigh()) {
-            // This is problematic on Android, disable on GPU Normal.
-            return 120;
-        }
-#endif
-        // Deferred.
-        rasterizer->Query(regs.report_semaphore.Address(), QueryType::SamplesPassed,
-                          system.GPU().GetTicks());
-        return std::nullopt;
-    default:
-        LOG_DEBUG(HW_GPU, "Unimplemented query report type {}",
-                  regs.report_semaphore.query.report.Value());
-        return 1;
-    }
-}
-
 void Maxwell3D::ProcessCBBind(size_t stage_index) {
     // Bind the buffer currently in CB_ADDRESS to the specified index in the desired shader
     // stage.
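With this change every report-semaphore release and report funnels through a single `rasterizer->Query(address, type, flags, payload, subreport)` call instead of writing guest memory inline; `HasTimeout` marks the long (value plus timestamp) report format and `IsAFence` marks releases that must also act as fences. A purely illustrative sketch of what a sink for the simple payload case could look like (enum values and helper names here are assumptions, not the new query cache API):

#include <cstdint>

using GPUVAddr = std::uint64_t;

// Illustrative values only; the real enums live in the new query_cache headers.
enum class QueryType : std::uint32_t { Payload = 0, ZPassPixelCount64 = 2 };
enum class QueryPropertiesFlags : std::uint32_t { None = 0, HasTimeout = 1, IsAFence = 2 };

inline QueryPropertiesFlags operator|(QueryPropertiesFlags a, QueryPropertiesFlags b) {
    return static_cast<QueryPropertiesFlags>(static_cast<std::uint32_t>(a) |
                                             static_cast<std::uint32_t>(b));
}
inline bool HasFlag(QueryPropertiesFlags v, QueryPropertiesFlags f) {
    return (static_cast<std::uint32_t>(v) & static_cast<std::uint32_t>(f)) != 0;
}

// Hypothetical sink: a plain payload report writes 32 bits, a timestamped one 128 bits.
struct QuerySinkSketch {
    void Write32(GPUVAddr, std::uint32_t) {}
    void Write64(GPUVAddr, std::uint64_t) {}
    std::uint64_t Ticks() { return 0; }

    void Query(GPUVAddr addr, QueryType type, QueryPropertiesFlags flags, std::uint32_t payload,
               std::uint32_t /*subreport*/) {
        if (type != QueryType::Payload) {
            return; // counter queries go through the real query cache instead
        }
        if (HasFlag(flags, QueryPropertiesFlags::HasTimeout)) {
            Write64(addr + 8, Ticks()); // long report: value followed by timestamp
            Write64(addr, payload);
        } else {
            Write32(addr, payload); // short report
        }
    }
};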
@@ -3182,9 +3182,6 @@ private:
     /// Handles writes to syncing register.
     void ProcessSyncPoint();
 
-    /// Returns a query's value or an empty object if the value will be deferred through a cache.
-    std::optional<u64> GetQueryResult();
-
     void RefreshParametersImpl();
 
     bool IsMethodExecutable(u32 method);
@@ -362,21 +362,17 @@ void MaxwellDMA::ReleaseSemaphore() {
     const auto type = regs.launch_dma.semaphore_type;
     const GPUVAddr address = regs.semaphore.address;
     const u32 payload = regs.semaphore.payload;
+    VideoCommon::QueryPropertiesFlags flags{VideoCommon::QueryPropertiesFlags::IsAFence};
     switch (type) {
     case LaunchDMA::SemaphoreType::NONE:
         break;
     case LaunchDMA::SemaphoreType::RELEASE_ONE_WORD_SEMAPHORE: {
-        std::function<void()> operation(
-            [this, address, payload] { memory_manager.Write<u32>(address, payload); });
-        rasterizer->SignalFence(std::move(operation));
+        rasterizer->Query(address, VideoCommon::QueryType::Payload, flags, payload, 0);
         break;
     }
     case LaunchDMA::SemaphoreType::RELEASE_FOUR_WORD_SEMAPHORE: {
-        std::function<void()> operation([this, address, payload] {
-            memory_manager.Write<u64>(address + sizeof(u64), system.GPU().GetTicks());
-            memory_manager.Write<u64>(address, payload);
-        });
-        rasterizer->SignalFence(std::move(operation));
+        rasterizer->Query(address, VideoCommon::QueryType::Payload,
+                          flags | VideoCommon::QueryPropertiesFlags::HasTimeout, payload, 0);
         break;
     }
     default:
@@ -82,10 +82,8 @@ void Puller::ProcessSemaphoreTriggerMethod() {
     if (op == GpuSemaphoreOperation::WriteLong) {
         const GPUVAddr sequence_address{regs.semaphore_address.SemaphoreAddress()};
         const u32 payload = regs.semaphore_sequence;
-        [this, sequence_address, payload] {
-            memory_manager.Write<u64>(sequence_address + sizeof(u64), gpu.GetTicks());
-            memory_manager.Write<u64>(sequence_address, payload);
-        }();
+        rasterizer->Query(sequence_address, VideoCommon::QueryType::Payload,
+                          VideoCommon::QueryPropertiesFlags::HasTimeout, payload, 0);
     } else {
         do {
             const u32 word{memory_manager.Read<u32>(regs.semaphore_address.SemaphoreAddress())};
@@ -120,10 +118,8 @@ void Puller::ProcessSemaphoreTriggerMethod() {
 void Puller::ProcessSemaphoreRelease() {
     const GPUVAddr sequence_address{regs.semaphore_address.SemaphoreAddress()};
     const u32 payload = regs.semaphore_release;
-    std::function<void()> operation([this, sequence_address, payload] {
-        memory_manager.Write<u32>(sequence_address, payload);
-    });
-    rasterizer->SignalFence(std::move(operation));
+    rasterizer->Query(sequence_address, VideoCommon::QueryType::Payload,
+                      VideoCommon::QueryPropertiesFlags::IsAFence, payload, 0);
 }
 
 void Puller::ProcessSemaphoreAcquire() {
@@ -132,7 +128,6 @@ void Puller::ProcessSemaphoreAcquire() {
     while (word != value) {
         regs.acquire_active = true;
         regs.acquire_value = value;
-        std::this_thread::sleep_for(std::chrono::milliseconds(1));
         rasterizer->ReleaseFences();
         word = memory_manager.Read<u32>(regs.semaphore_address.SemaphoreAddress());
         // TODO(kemathe73) figure out how to do the acquire_timeout
@@ -55,6 +55,9 @@ public:
 
     // Unlike other fences, this one doesn't
     void SignalOrdering() {
+        if constexpr (!can_async_check) {
+            TryReleasePendingFences<false>();
+        }
         std::scoped_lock lock{buffer_cache.mutex};
         buffer_cache.AccumulateFlushes();
     }
@@ -104,9 +107,25 @@ public:
         SignalFence(std::move(func));
     }
 
-    void WaitPendingFences() {
+    void WaitPendingFences([[maybe_unused]] bool force) {
         if constexpr (!can_async_check) {
             TryReleasePendingFences<true>();
+        } else {
+            if (!force) {
+                return;
+            }
+            std::mutex wait_mutex;
+            std::condition_variable wait_cv;
+            std::atomic<bool> wait_finished{};
+            std::function<void()> func([&] {
+                std::scoped_lock lk(wait_mutex);
+                wait_finished.store(true, std::memory_order_relaxed);
+                wait_cv.notify_all();
+            });
+            SignalFence(std::move(func));
+            std::unique_lock lk(wait_mutex);
+            wait_cv.wait(
+                lk, [&wait_finished] { return wait_finished.load(std::memory_order_relaxed); });
         }
     }
 
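On the asynchronous path, `WaitPendingFences(true)` now blocks by queueing one more fence whose callback flips a flag under a mutex and notifies a condition variable. The same wait-for-callback idiom in isolation (a standalone sketch, not yuzu's `FenceManager`):

#include <atomic>
#include <condition_variable>
#include <functional>
#include <mutex>
#include <thread>

// Block until 'signal_fence' eventually runs the callback on another thread.
void WaitForFence(const std::function<void(std::function<void()>)>& signal_fence) {
    std::mutex wait_mutex;
    std::condition_variable wait_cv;
    std::atomic<bool> wait_finished{};
    signal_fence([&] {
        std::scoped_lock lk(wait_mutex); // pair the store with the waiter's lock
        wait_finished.store(true, std::memory_order_relaxed);
        wait_cv.notify_all();
    });
    std::unique_lock lk(wait_mutex);
    wait_cv.wait(lk, [&] { return wait_finished.load(std::memory_order_relaxed); });
}

int main() {
    std::thread worker;
    WaitForFence([&](std::function<void()> callback) {
        worker = std::thread(std::move(callback)); // stand-in for the GPU releasing the fence
    });
    worker.join();
}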
@@ -102,7 +102,8 @@ struct GPU::Impl {
 
     /// Signal the ending of command list.
     void OnCommandListEnd() {
-        rasterizer->ReleaseFences();
+        rasterizer->ReleaseFences(false);
+        Settings::UpdateGPUAccuracy();
     }
 
     /// Request a host GPU memory flush from the CPU.
@@ -220,6 +221,7 @@ struct GPU::Impl {
     /// This can be used to launch any necessary threads and register any necessary
     /// core timing events.
     void Start() {
+        Settings::UpdateGPUAccuracy();
         gpu_thread.StartThread(*renderer, renderer->Context(), *scheduler);
     }
 
@@ -41,6 +41,9 @@ set(SHADER_FILES
     pitch_unswizzle.comp
     present_bicubic.frag
     present_gaussian.frag
+    queries_prefix_scan_sum.comp
+    queries_prefix_scan_sum_nosubgroups.comp
+    resolve_conditional_render.comp
     smaa_edge_detection.vert
     smaa_edge_detection.frag
     smaa_blending_weight_calculation.vert
@@ -70,6 +73,7 @@ if ("${GLSLANGVALIDATOR}" STREQUAL "GLSLANGVALIDATOR-NOTFOUND")
 endif()
 
 set(GLSL_FLAGS "")
+set(SPIR_V_VERSION "spirv1.3")
 set(QUIET_FLAG "--quiet")
 
 set(SHADER_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/include)
@@ -123,7 +127,7 @@ foreach(FILENAME IN ITEMS ${SHADER_FILES})
             OUTPUT
                 ${SPIRV_HEADER_FILE}
             COMMAND
-                ${GLSLANGVALIDATOR} -V ${QUIET_FLAG} -I"${FIDELITYFX_INCLUDE_DIR}" ${GLSL_FLAGS} --variable-name ${SPIRV_VARIABLE_NAME} -o ${SPIRV_HEADER_FILE} ${SOURCE_FILE}
+                ${GLSLANGVALIDATOR} -V ${QUIET_FLAG} -I"${FIDELITYFX_INCLUDE_DIR}" ${GLSL_FLAGS} --variable-name ${SPIRV_VARIABLE_NAME} -o ${SPIRV_HEADER_FILE} ${SOURCE_FILE} --target-env ${SPIR_V_VERSION}
             MAIN_DEPENDENCY
                 ${SOURCE_FILE}
         )
src/video_core/host_shaders/queries_prefix_scan_sum.comp (new file, 173 lines)
@@ -0,0 +1,173 @@
// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
// SPDX-License-Identifier: GPL-3.0-or-later

#version 460 core

#extension GL_KHR_shader_subgroup_basic : require
#extension GL_KHR_shader_subgroup_shuffle : require
#extension GL_KHR_shader_subgroup_shuffle_relative : require
#extension GL_KHR_shader_subgroup_arithmetic : require

#ifdef VULKAN

#define HAS_EXTENDED_TYPES 1
#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants {
#define END_PUSH_CONSTANTS };
#define UNIFORM(n)
#define BINDING_INPUT_BUFFER 0
#define BINDING_OUTPUT_IMAGE 1

#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv

#extension GL_NV_gpu_shader5 : enable
#ifdef GL_NV_gpu_shader5
#define HAS_EXTENDED_TYPES 1
#else
#define HAS_EXTENDED_TYPES 0
#endif
#define BEGIN_PUSH_CONSTANTS
#define END_PUSH_CONSTANTS
#define UNIFORM(n) layout(location = n) uniform
#define BINDING_INPUT_BUFFER 0
#define BINDING_OUTPUT_IMAGE 0

#endif

BEGIN_PUSH_CONSTANTS
UNIFORM(0) uint min_accumulation_base;
UNIFORM(1) uint max_accumulation_base;
UNIFORM(2) uint accumulation_limit;
UNIFORM(3) uint buffer_offset;
END_PUSH_CONSTANTS

#define LOCAL_RESULTS 8
#define QUERIES_PER_INVOC 2048

layout(local_size_x = QUERIES_PER_INVOC / LOCAL_RESULTS) in;

layout(std430, binding = 0) readonly buffer block1 {
    uvec2 input_data[];
};

layout(std430, binding = 1) coherent buffer block2 {
    uvec2 output_data[];
};

layout(std430, binding = 2) coherent buffer block3 {
    uvec2 accumulated_data;
};

shared uvec2 shared_data[128];

// Simple Uint64 add that uses 2 uint variables for GPUs that don't support uint64
uvec2 AddUint64(uvec2 value_1, uvec2 value_2) {
    uint carry = 0;
    uvec2 result;
    result.x = uaddCarry(value_1.x, value_2.x, carry);
    result.y = value_1.y + value_2.y + carry;
    return result;
}

// do subgroup Prefix Sum using Hillis and Steele's algorithm
uvec2 subgroupInclusiveAddUint64(uvec2 value) {
    uvec2 result = value;
    for (uint i = 1; i < gl_SubgroupSize; i *= 2) {
        uvec2 other = subgroupShuffleUp(result, i); // get value from subgroup_inv_id - i;
        if (i <= gl_SubgroupInvocationID) {
            result = AddUint64(result, other);
        }
    }
    return result;
}

// Writes down the results to the output buffer and to the accumulation buffer
void WriteResults(uvec2 results[LOCAL_RESULTS]) {
    const uint current_id = gl_LocalInvocationID.x;
    const uvec2 accum = accumulated_data;
    for (uint i = 0; i < LOCAL_RESULTS; i++) {
        uvec2 base_data = current_id * LOCAL_RESULTS + i < min_accumulation_base ? accum : uvec2(0, 0);
        AddUint64(results[i], base_data);
    }
    for (uint i = 0; i < LOCAL_RESULTS; i++) {
        output_data[buffer_offset + current_id * LOCAL_RESULTS + i] = results[i];
    }
    uint index = accumulation_limit % LOCAL_RESULTS;
    uint base_id = accumulation_limit / LOCAL_RESULTS;
    if (min_accumulation_base >= accumulation_limit + 1) {
        if (current_id == base_id) {
            accumulated_data = results[index];
        }
        return;
    }
    // We have that ugly case in which the accumulation data is reset in the middle somewhere.
    barrier();
    groupMemoryBarrier();

    if (current_id == base_id) {
        uvec2 reset_value = output_data[max_accumulation_base - 1];
        // Calculate two complement / negate manually
        reset_value = AddUint64(uvec2(1,0), ~reset_value);
        accumulated_data = AddUint64(results[index], reset_value);
    }
}

void main() {
    const uint subgroup_inv_id = gl_SubgroupInvocationID;
    const uint subgroup_id = gl_SubgroupID + gl_WorkGroupID.x * gl_NumSubgroups;
    const uint last_subgroup_id = subgroupMax(subgroup_inv_id);
    const uint current_id = gl_LocalInvocationID.x;
    const uint total_work = accumulation_limit;
    const uint last_result_id = LOCAL_RESULTS - 1;
    uvec2 data[LOCAL_RESULTS];
    for (uint i = 0; i < LOCAL_RESULTS; i++) {
        data[i] = input_data[buffer_offset + current_id * LOCAL_RESULTS + i];
    }
    uvec2 results[LOCAL_RESULTS];
    results[0] = data[0];
    for (uint i = 1; i < LOCAL_RESULTS; i++) {
        results[i] = AddUint64(data[i], results[i - 1]);
    }
    // make sure all input data has been loaded
    subgroupBarrier();
    subgroupMemoryBarrier();

    // on the last local result, do a subgroup inclusive scan sum
    results[last_result_id] = subgroupInclusiveAddUint64(results[last_result_id]);
    // get the last local result from the subgroup behind the current
    uvec2 result_behind = subgroupShuffleUp(results[last_result_id], 1);
    if (subgroup_inv_id != 0) {
        for (uint i = 1; i < LOCAL_RESULTS; i++) {
            results[i - 1] = AddUint64(results[i - 1], result_behind);
        }
    }

    // if we had less queries than our subgroup, just write down the results.
    if (total_work <= gl_SubgroupSize * LOCAL_RESULTS) { // This condition is constant per dispatch.
        WriteResults(results);
        return;
    }

    // We now have more, so lets write the last result into shared memory.
    // Only pick the last subgroup.
    if (subgroup_inv_id == last_subgroup_id) {
        shared_data[subgroup_id] = results[last_result_id];
    }
    // wait until everyone loaded their stuffs
    barrier();
    memoryBarrierShared();

    // only if it's not the first subgroup
    if (subgroup_id != 0) {
        // get the results from some previous invocation
        uvec2 tmp = shared_data[subgroup_inv_id];
        subgroupBarrier();
        subgroupMemoryBarrierShared();
        tmp = subgroupInclusiveAddUint64(tmp);
        // obtain the result that would be equivalent to the previous result
        uvec2 shuffled_result = subgroupShuffle(tmp, subgroup_id - 1);
        for (uint i = 0; i < LOCAL_RESULTS; i++) {
            results[i] = AddUint64(results[i], shuffled_result);
        }
    }
    WriteResults(results);
}
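The shader stores 64-bit counters as `uvec2` (low word, high word) so it also works on hosts without native 64-bit integer support, and produces an inclusive prefix sum over them. A CPU reference of the same arithmetic, handy for sanity-checking the scan's output (standalone sketch):

#include <cstddef>
#include <cstdint>
#include <vector>

// (low, high) pair standing in for the shader's uvec2 representation of a 64-bit value.
struct U64Pair {
    std::uint32_t lo;
    std::uint32_t hi;
};

// Mirrors AddUint64(): 32-bit add with a manual carry into the high word.
U64Pair AddUint64(U64Pair a, U64Pair b) {
    U64Pair r;
    r.lo = a.lo + b.lo;
    const std::uint32_t carry = (r.lo < a.lo) ? 1u : 0u;
    r.hi = a.hi + b.hi + carry;
    return r;
}

// Inclusive prefix sum; this is the result the GPU scan should reproduce.
std::vector<U64Pair> InclusiveScan(const std::vector<U64Pair>& input) {
    std::vector<U64Pair> out(input.size());
    U64Pair running{0, 0};
    for (std::size_t i = 0; i < input.size(); ++i) {
        running = AddUint64(running, input[i]);
        out[i] = running;
    }
    return out;
}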
src/video_core/host_shaders/queries_prefix_scan_sum_nosubgroups.comp (new file, 138 lines)
@@ -0,0 +1,138 @@
// SPDX-FileCopyrightText: Copyright 2015 Graham Sellers, Richard Wright Jr. and Nicholas Haemel
// SPDX-License-Identifier: MIT

// Code obtained from OpenGL SuperBible, Seventh Edition by Graham Sellers, Richard Wright Jr. and
// Nicholas Haemel. Modified to suit needs.

#version 460 core

#ifdef VULKAN

#define HAS_EXTENDED_TYPES 1
#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants {
#define END_PUSH_CONSTANTS };
#define UNIFORM(n)
#define BINDING_INPUT_BUFFER 0
#define BINDING_OUTPUT_IMAGE 1

#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv

#extension GL_NV_gpu_shader5 : enable
#ifdef GL_NV_gpu_shader5
#define HAS_EXTENDED_TYPES 1
#else
#define HAS_EXTENDED_TYPES 0
#endif
#define BEGIN_PUSH_CONSTANTS
#define END_PUSH_CONSTANTS
#define UNIFORM(n) layout(location = n) uniform
#define BINDING_INPUT_BUFFER 0
#define BINDING_OUTPUT_IMAGE 0

#endif

BEGIN_PUSH_CONSTANTS
UNIFORM(0) uint min_accumulation_base;
UNIFORM(1) uint max_accumulation_base;
UNIFORM(2) uint accumulation_limit;
UNIFORM(3) uint buffer_offset;
END_PUSH_CONSTANTS

#define LOCAL_RESULTS 4
#define QUERIES_PER_INVOC 2048

layout(local_size_x = QUERIES_PER_INVOC / LOCAL_RESULTS) in;

layout(std430, binding = 0) readonly buffer block1 {
    uvec2 input_data[gl_WorkGroupSize.x * LOCAL_RESULTS];
};

layout(std430, binding = 1) writeonly coherent buffer block2 {
    uvec2 output_data[gl_WorkGroupSize.x * LOCAL_RESULTS];
};

layout(std430, binding = 2) coherent buffer block3 {
    uvec2 accumulated_data;
};

shared uvec2 shared_data[gl_WorkGroupSize.x * LOCAL_RESULTS];

uvec2 AddUint64(uvec2 value_1, uvec2 value_2) {
    uint carry = 0;
    uvec2 result;
    result.x = uaddCarry(value_1.x, value_2.x, carry);
    result.y = value_1.y + value_2.y + carry;
    return result;
}

void main(void) {
    uint id = gl_LocalInvocationID.x;
    uvec2 base_value[LOCAL_RESULTS];
    const uvec2 accum = accumulated_data;
    for (uint i = 0; i < LOCAL_RESULTS; i++) {
        base_value[i] = (buffer_offset + id * LOCAL_RESULTS + i) < min_accumulation_base
                            ? accumulated_data
                            : uvec2(0);
    }
    uint work_size = gl_WorkGroupSize.x;
    uint rd_id;
    uint wr_id;
    uint mask;
    uvec2 inputs[LOCAL_RESULTS];
    for (uint i = 0; i < LOCAL_RESULTS; i++) {
        inputs[i] = input_data[buffer_offset + id * LOCAL_RESULTS + i];
    }
    // The number of steps is the log base 2 of the
    // work group size, which should be a power of 2
    const uint steps = uint(log2(work_size)) + uint(log2(LOCAL_RESULTS));
    uint step = 0;

    // Each invocation is responsible for the content of
    // two elements of the output array
    for (uint i = 0; i < LOCAL_RESULTS; i++) {
        shared_data[id * LOCAL_RESULTS + i] = inputs[i];
    }
    // Synchronize to make sure that everyone has initialized
    // their elements of shared_data[] with data loaded from
    // the input arrays
    barrier();
    memoryBarrierShared();
    // For each step...
    for (step = 0; step < steps; step++) {
        // Calculate the read and write index in the
        // shared array
        mask = (1 << step) - 1;
        rd_id = ((id >> step) << (step + 1)) + mask;
        wr_id = rd_id + 1 + (id & mask);
        // Accumulate the read data into our element

        shared_data[wr_id] = AddUint64(shared_data[rd_id], shared_data[wr_id]);
        // Synchronize again to make sure that everyone
        // has caught up with us
        barrier();
        memoryBarrierShared();
    }
    // Add the accumulation
    for (uint i = 0; i < LOCAL_RESULTS; i++) {
        shared_data[id * LOCAL_RESULTS + i] =
            AddUint64(shared_data[id * LOCAL_RESULTS + i], base_value[i]);
    }
    barrier();
    memoryBarrierShared();

    // Finally write our data back to the output buffer
    for (uint i = 0; i < LOCAL_RESULTS; i++) {
        output_data[buffer_offset + id * LOCAL_RESULTS + i] = shared_data[id * LOCAL_RESULTS + i];
    }
    if (id == 0) {
        if (min_accumulation_base >= accumulation_limit + 1) {
            accumulated_data = shared_data[accumulation_limit];
            return;
        }
        uvec2 reset_value = shared_data[max_accumulation_base - 1];
        uvec2 final_value = shared_data[accumulation_limit];
        // Two complements
        reset_value = AddUint64(uvec2(1, 0), ~reset_value);
        accumulated_data = AddUint64(final_value, reset_value);
    }
}

src/video_core/host_shaders/resolve_conditional_render.comp (new file, 20 lines)
@@ -0,0 +1,20 @@
// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
// SPDX-License-Identifier: GPL-3.0-or-later

#version 450

layout(local_size_x = 1) in;

layout(std430, binding = 0) buffer Query {
    uvec2 initial;
    uvec2 unknown;
    uvec2 current;
};

layout(std430, binding = 1) buffer Result {
    uint result;
};

void main() {
    result = all(equal(initial, current)) ? 1 : 0;
}
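For reference, the dispatch above reduces two 64-bit query snapshots (`initial` and `current`, stored as `uvec2`) to a single 32-bit predicate: 1 when they are still equal, 0 otherwise. The equivalent CPU check (illustrative only):

#include <cstdint>

// 1 when the two 64-bit snapshots (lo, hi pairs) match, 0 otherwise - what the
// resolve_conditional_render.comp dispatch writes into its Result buffer.
std::uint32_t ResolveConditionalRender(const std::uint32_t initial[2],
                                       const std::uint32_t current[2]) {
    return (initial[0] == current[0] && initial[1] == current[1]) ? 1u : 0u;
}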
@@ -67,6 +67,7 @@ public:
         }
 
         auto& params = maxwell3d.draw_manager->GetIndirectParams();
+        params.is_byte_count = false;
         params.is_indexed = false;
         params.include_count = false;
         params.count_start_address = 0;
@@ -161,6 +162,7 @@ public:
                 0, 0x644, Maxwell3D::HLEReplacementAttributeType::BaseInstance);
         }
         auto& params = maxwell3d.draw_manager->GetIndirectParams();
+        params.is_byte_count = false;
         params.is_indexed = true;
         params.include_count = false;
         params.count_start_address = 0;
@@ -256,6 +258,7 @@ public:
         const u32 estimate = static_cast<u32>(maxwell3d.EstimateIndexBufferSize());
         maxwell3d.dirty.flags[VideoCommon::Dirty::IndexBuffer] = true;
         auto& params = maxwell3d.draw_manager->GetIndirectParams();
+        params.is_byte_count = false;
         params.is_indexed = true;
         params.include_count = true;
         params.count_start_address = maxwell3d.GetMacroAddress(4);
@@ -319,6 +322,47 @@ private:
     }
 };
 
+class HLE_DrawIndirectByteCount final : public HLEMacroImpl {
+public:
+    explicit HLE_DrawIndirectByteCount(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {}
+
+    void Execute(const std::vector<u32>& parameters, [[maybe_unused]] u32 method) override {
+        auto topology = static_cast<Maxwell3D::Regs::PrimitiveTopology>(parameters[0] & 0xFFFFU);
+        if (!maxwell3d.AnyParametersDirty() || !IsTopologySafe(topology)) {
+            Fallback(parameters);
+            return;
+        }
+
+        auto& params = maxwell3d.draw_manager->GetIndirectParams();
+        params.is_byte_count = true;
+        params.is_indexed = false;
+        params.include_count = false;
+        params.count_start_address = 0;
+        params.indirect_start_address = maxwell3d.GetMacroAddress(2);
+        params.buffer_size = 4;
+        params.max_draw_counts = 1;
+        params.stride = parameters[1];
+        maxwell3d.regs.draw.begin = parameters[0];
+        maxwell3d.regs.draw_auto_stride = parameters[1];
+        maxwell3d.regs.draw_auto_byte_count = parameters[2];
+
+        maxwell3d.draw_manager->DrawArrayIndirect(topology);
+    }
+
+private:
+    void Fallback(const std::vector<u32>& parameters) {
+        maxwell3d.RefreshParameters();
+
+        maxwell3d.regs.draw.begin = parameters[0];
+        maxwell3d.regs.draw_auto_stride = parameters[1];
+        maxwell3d.regs.draw_auto_byte_count = parameters[2];
+
+        maxwell3d.draw_manager->DrawArray(
+            maxwell3d.regs.draw.topology, 0,
+            maxwell3d.regs.draw_auto_byte_count / maxwell3d.regs.draw_auto_stride, 0, 1);
+    }
+};
+
 class HLE_C713C83D8F63CCF3 final : public HLEMacroImpl {
 public:
     explicit HLE_C713C83D8F63CCF3(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {}
@@ -536,6 +580,11 @@ HLEMacro::HLEMacro(Maxwell3D& maxwell3d_) : maxwell3d{maxwell3d_} {
                          [](Maxwell3D& maxwell3d__) -> std::unique_ptr<CachedMacro> {
                              return std::make_unique<HLE_TransformFeedbackSetup>(maxwell3d__);
                          }));
+    builders.emplace(0xB5F74EDB717278ECULL,
+                     std::function<std::unique_ptr<CachedMacro>(Maxwell3D&)>(
+                         [](Maxwell3D& maxwell3d__) -> std::unique_ptr<CachedMacro> {
+                             return std::make_unique<HLE_DrawIndirectByteCount>(maxwell3d__);
+                         }));
 }
 
 HLEMacro::~HLEMacro() = default;
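In the `HLE_DrawIndirectByteCount` fallback above, the vertex count is derived by dividing the transform-feedback byte count by the vertex stride; for example, 64 bytes at a 16-byte stride is 4 vertices. A tiny sketch of that conversion (hypothetical helper; the macro itself divides directly and assumes a non-zero stride):

#include <cassert>
#include <cstdint>

// Vertices drawn by the byte-count fallback: draw_auto_byte_count / draw_auto_stride.
std::uint32_t ByteCountToVertexCount(std::uint32_t draw_auto_byte_count,
                                     std::uint32_t draw_auto_stride) {
    if (draw_auto_stride == 0) {
        return 0; // defensive only; see the note above
    }
    return draw_auto_byte_count / draw_auto_stride;
}

int main() {
    // 64 bytes written by transform feedback with a 16-byte vertex stride -> 4 vertices.
    assert(ByteCountToVertexCount(64, 16) == 4);
}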
@@ -25,6 +25,13 @@
 #include "video_core/rasterizer_interface.h"
 #include "video_core/texture_cache/slot_vector.h"
 
+namespace VideoCore {
+enum class QueryType {
+    SamplesPassed,
+};
+constexpr std::size_t NumQueryTypes = 1;
+} // namespace VideoCore
+
 namespace VideoCommon {
 
 using AsyncJobId = SlotId;
@@ -98,10 +105,10 @@ private:
 };
 
 template <class QueryCache, class CachedQuery, class CounterStream, class HostCounter>
-class QueryCacheBase : public VideoCommon::ChannelSetupCaches<VideoCommon::ChannelInfo> {
+class QueryCacheLegacy : public VideoCommon::ChannelSetupCaches<VideoCommon::ChannelInfo> {
 public:
-    explicit QueryCacheBase(VideoCore::RasterizerInterface& rasterizer_,
-                            Core::Memory::Memory& cpu_memory_)
+    explicit QueryCacheLegacy(VideoCore::RasterizerInterface& rasterizer_,
+                              Core::Memory::Memory& cpu_memory_)
         : rasterizer{rasterizer_},
           // Use reinterpret_cast instead of static_cast as workaround for
           // UBSan bug (https://github.com/llvm/llvm-project/issues/59060)

src/video_core/query_cache/bank_base.h (new file, 104 lines)
@@ -0,0 +1,104 @@
// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
// SPDX-License-Identifier: GPL-3.0-or-later

#pragma once

#include <atomic>
#include <deque>
#include <utility>

#include "common/common_types.h"

namespace VideoCommon {

class BankBase {
protected:
    const size_t base_bank_size{};
    size_t bank_size{};
    std::atomic<size_t> references{};
    size_t current_slot{};

public:
    explicit BankBase(size_t bank_size_) : base_bank_size{bank_size_}, bank_size(bank_size_) {}

    virtual ~BankBase() = default;

    virtual std::pair<bool, size_t> Reserve() {
        if (IsClosed()) {
            return {false, bank_size};
        }
        const size_t result = current_slot++;
        return {true, result};
    }

    virtual void Reset() {
        current_slot = 0;
        references = 0;
        bank_size = base_bank_size;
    }

    size_t Size() const {
        return bank_size;
    }

    void AddReference(size_t how_many = 1) {
        references.fetch_add(how_many, std::memory_order_relaxed);
    }

    void CloseReference(size_t how_many = 1) {
        if (how_many > references.load(std::memory_order_relaxed)) {
            UNREACHABLE();
        }
        references.fetch_sub(how_many, std::memory_order_relaxed);
    }

    void Close() {
        bank_size = current_slot;
    }

    bool IsClosed() const {
        return current_slot >= bank_size;
    }

    bool IsDead() const {
        return IsClosed() && references == 0;
    }
};

template <typename BankType>
class BankPool {
private:
    std::deque<BankType> bank_pool;
    std::deque<size_t> bank_indices;

public:
    BankPool() = default;
    ~BankPool() = default;

    // Reserve a bank from the pool and return its index
    template <typename Func>
    size_t ReserveBank(Func&& builder) {
        if (!bank_indices.empty() && bank_pool[bank_indices.front()].IsDead()) {
            size_t new_index = bank_indices.front();
            bank_indices.pop_front();
            bank_pool[new_index].Reset();
            return new_index;
        }
        size_t new_index = bank_pool.size();
        builder(bank_pool, new_index);
        bank_indices.push_back(new_index);
        return new_index;
    }

    // Get a reference to a bank using its index
    BankType& GetBank(size_t index) {
        return bank_pool[index];
    }

    // Get the total number of banks in the pool
    size_t BankCount() const {
        return bank_pool.size();
    }
};

} // namespace VideoCommon
 | ||||||
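
The builder callback handed to ReserveBank receives the underlying deque plus the index the new bank must occupy, so the callback is expected to construct the bank itself. Below is a minimal usage sketch; ExampleBank, ReserveOneSlot and the capacity of 512 are illustrative placeholders, not types or values taken from any backend in this change.

    // Hypothetical bank type and caller; only BankBase/BankPool come from the header above.
    #include <deque>
    #include <optional>
    #include <utility>

    #include "video_core/query_cache/bank_base.h"

    class ExampleBank : public VideoCommon::BankBase {
    public:
        explicit ExampleBank(size_t capacity) : BankBase(capacity) {}
    };

    // Returns {bank index, slot index} on success, or nullopt when the current bank is closed.
    std::optional<std::pair<size_t, size_t>> ReserveOneSlot(VideoCommon::BankPool<ExampleBank>& pool) {
        // The builder must place the new bank at new_index; appending to the deque does exactly that.
        const size_t bank_index =
            pool.ReserveBank([](std::deque<ExampleBank>& banks, [[maybe_unused]] size_t new_index) {
                banks.emplace_back(512); // assumed capacity; ends up at new_index == banks.size() - 1
            });
        ExampleBank& bank = pool.GetBank(bank_index);
        const auto [ok, slot] = bank.Reserve();
        if (!ok) {
            return std::nullopt;
        }
        bank.AddReference(); // hold one reference until the slot's result has been consumed
        return std::make_pair(bank_index, slot);
    }

A bank that is closed and has no outstanding references reports IsDead, so a later ReserveBank call can recycle it through Reset; callers are therefore expected to balance AddReference with CloseReference once a slot's result is read back.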
							
								
								
									
src/video_core/query_cache/query_base.h (new file)
							|  | @ -0,0 +1,70 @@ | ||||||
|  | // SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
 | ||||||
|  | // SPDX-License-Identifier: GPL-3.0-or-later
 | ||||||
|  | 
 | ||||||
|  | #pragma once | ||||||
|  | 
 | ||||||
|  | #include "common/common_funcs.h" | ||||||
|  | #include "common/common_types.h" | ||||||
|  | 
 | ||||||
|  | namespace VideoCommon { | ||||||
|  | 
 | ||||||
|  | enum class QueryFlagBits : u32 { | ||||||
|  |     HasTimestamp = 1 << 0,       ///< Indicates if this query has a timestamp.
 | ||||||
|  |     IsFinalValueSynced = 1 << 1, ///< Indicates if the final value of the query is already known
 | ||||||
|  |     IsHostSynced = 1 << 2,       ///< Indicates if the query has been synced in the host
 | ||||||
|  |     IsGuestSynced = 1 << 3,      ///< Indicates if the query has been synced with the guest.
 | ||||||
|  |     IsHostManaged = 1 << 4,      ///< Indicates if this query points to a host query
 | ||||||
|  |     IsRewritten = 1 << 5,        ///< Indicates if this query was rewritten by another query
 | ||||||
|  |     IsInvalidated = 1 << 6,      ///< Indicates the value of the query has been nullified.
 | ||||||
|  |     IsOrphan = 1 << 7,           ///< Indicates the query has not been set by a guest query.
 | ||||||
|  |     IsFence = 1 << 8,            ///< Indicates the query is a fence.
 | ||||||
|  | }; | ||||||
|  | DECLARE_ENUM_FLAG_OPERATORS(QueryFlagBits) | ||||||
|  | 
 | ||||||
|  | class QueryBase { | ||||||
|  | public: | ||||||
|  |     VAddr guest_address{}; | ||||||
|  |     QueryFlagBits flags{}; | ||||||
|  |     u64 value{}; | ||||||
|  | 
 | ||||||
|  | protected: | ||||||
|  |     // Default constructor
 | ||||||
|  |     QueryBase() = default; | ||||||
|  | 
 | ||||||
|  |     // Parameterized constructor
 | ||||||
|  |     QueryBase(VAddr address, QueryFlagBits flags_, u64 value_) | ||||||
|  |         : guest_address(address), flags(flags_), value{value_} {} | ||||||
|  | }; | ||||||
|  | 
 | ||||||
|  | class GuestQuery : public QueryBase { | ||||||
|  | public: | ||||||
|  |     // Parameterized constructor
 | ||||||
|  |     GuestQuery(bool isLong, VAddr address, u64 queryValue) | ||||||
|  |         : QueryBase(address, QueryFlagBits::IsFinalValueSynced, queryValue) { | ||||||
|  |         if (isLong) { | ||||||
|  |             flags |= QueryFlagBits::HasTimestamp; | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  | }; | ||||||
|  | 
 | ||||||
|  | class HostQueryBase : public QueryBase { | ||||||
|  | public: | ||||||
|  |     // Default constructor
 | ||||||
|  |     HostQueryBase() : QueryBase(0, QueryFlagBits::IsHostManaged | QueryFlagBits::IsOrphan, 0) {} | ||||||
|  | 
 | ||||||
|  |     // Parameterized constructor
 | ||||||
|  |     HostQueryBase(bool has_timestamp, VAddr address) | ||||||
|  |         : QueryBase(address, QueryFlagBits::IsHostManaged, 0), start_bank_id{}, size_banks{}, | ||||||
|  |           start_slot{}, size_slots{} { | ||||||
|  |         if (has_timestamp) { | ||||||
|  |             flags |= QueryFlagBits::HasTimestamp; | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     u32 start_bank_id{}; | ||||||
|  |     u32 size_banks{}; | ||||||
|  |     size_t start_slot{}; | ||||||
|  |     size_t size_slots{}; | ||||||
|  | }; | ||||||
|  | 
 | ||||||
|  | } // namespace VideoCommon
 | ||||||
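
Because the enum gets DECLARE_ENUM_FLAG_OPERATORS, bits are combined with | and tested through the True/False helpers from common_funcs.h, exactly as the cache code later in this change does. A small sketch, where both helper functions are made up for illustration:

    #include "common/common_funcs.h"
    #include "video_core/query_cache/query_base.h"

    using VideoCommon::GuestQuery;
    using VideoCommon::QueryBase;
    using VideoCommon::QueryFlagBits;

    // Hypothetical helper: how many bytes a write-back of the final value touches in guest
    // memory, mirroring the 8-vs-4 byte choice GuestStreamer::SyncWrites makes below.
    inline u64 GuestWriteSize(const QueryBase& query) {
        return True(query.flags & QueryFlagBits::HasTimestamp) ? 8 : 4;
    }

    // Hypothetical helper: a 64-bit (timestamped) guest query whose final value is already known.
    inline GuestQuery MakeTimestampedGuestQuery(VAddr address, u64 value) {
        return GuestQuery(/*isLong=*/true, address, value);
    }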
							
								
								
									
src/video_core/query_cache/query_cache.h (new file)
							|  | @ -0,0 +1,581 @@ | ||||||
|  | // SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
 | ||||||
|  | // SPDX-License-Identifier: GPL-3.0-or-later
 | ||||||
|  | 
 | ||||||
|  | #pragma once | ||||||
|  | 
 | ||||||
|  | #include <array> | ||||||
|  | #include <deque> | ||||||
|  | #include <memory> | ||||||
|  | #include <mutex> | ||||||
|  | #include <unordered_map> | ||||||
|  | #include <utility> | ||||||
|  | 
 | ||||||
|  | #include "common/assert.h" | ||||||
|  | #include "common/common_types.h" | ||||||
|  | #include "common/logging/log.h" | ||||||
|  | #include "common/scope_exit.h" | ||||||
|  | #include "common/settings.h" | ||||||
|  | #include "core/memory.h" | ||||||
|  | #include "video_core/engines/maxwell_3d.h" | ||||||
|  | #include "video_core/gpu.h" | ||||||
|  | #include "video_core/memory_manager.h" | ||||||
|  | #include "video_core/query_cache/bank_base.h" | ||||||
|  | #include "video_core/query_cache/query_base.h" | ||||||
|  | #include "video_core/query_cache/query_cache_base.h" | ||||||
|  | #include "video_core/query_cache/query_stream.h" | ||||||
|  | #include "video_core/query_cache/types.h" | ||||||
|  | 
 | ||||||
|  | namespace VideoCommon { | ||||||
|  | 
 | ||||||
|  | using Maxwell = Tegra::Engines::Maxwell3D; | ||||||
|  | 
 | ||||||
|  | struct SyncValuesStruct { | ||||||
|  |     VAddr address; | ||||||
|  |     u64 value; | ||||||
|  |     u64 size; | ||||||
|  | 
 | ||||||
|  |     static constexpr bool GeneratesBaseBuffer = true; | ||||||
|  | }; | ||||||
|  | 
 | ||||||
|  | template <typename Traits> | ||||||
|  | class GuestStreamer : public SimpleStreamer<GuestQuery> { | ||||||
|  | public: | ||||||
|  |     using RuntimeType = typename Traits::RuntimeType; | ||||||
|  | 
 | ||||||
|  |     GuestStreamer(size_t id_, RuntimeType& runtime_) | ||||||
|  |         : SimpleStreamer<GuestQuery>(id_), runtime{runtime_} {} | ||||||
|  | 
 | ||||||
|  |     virtual ~GuestStreamer() = default; | ||||||
|  | 
 | ||||||
|  |     size_t WriteCounter(VAddr address, bool has_timestamp, u32 value, | ||||||
|  |                         std::optional<u32> subreport = std::nullopt) override { | ||||||
|  |         auto new_id = BuildQuery(has_timestamp, address, static_cast<u64>(value)); | ||||||
|  |         pending_sync.push_back(new_id); | ||||||
|  |         return new_id; | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     bool HasPendingSync() const override { | ||||||
|  |         return !pending_sync.empty(); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     void SyncWrites() override { | ||||||
|  |         if (pending_sync.empty()) { | ||||||
|  |             return; | ||||||
|  |         } | ||||||
|  |         std::vector<SyncValuesStruct> sync_values; | ||||||
|  |         sync_values.reserve(pending_sync.size()); | ||||||
|  |         for (size_t pending_id : pending_sync) { | ||||||
|  |             auto& query = slot_queries[pending_id]; | ||||||
|  |             if (True(query.flags & QueryFlagBits::IsRewritten) || | ||||||
|  |                 True(query.flags & QueryFlagBits::IsInvalidated)) { | ||||||
|  |                 continue; | ||||||
|  |             } | ||||||
|  |             query.flags |= QueryFlagBits::IsHostSynced; | ||||||
|  |             sync_values.emplace_back(SyncValuesStruct{ | ||||||
|  |                 .address = query.guest_address, | ||||||
|  |                 .value = query.value, | ||||||
|  |                 .size = static_cast<u64>(True(query.flags & QueryFlagBits::HasTimestamp) ? 8 : 4)}); | ||||||
|  |         } | ||||||
|  |         pending_sync.clear(); | ||||||
|  |         if (sync_values.size() > 0) { | ||||||
|  |             runtime.template SyncValues<SyncValuesStruct>(sync_values); | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  | private: | ||||||
|  |     RuntimeType& runtime; | ||||||
|  |     std::deque<size_t> pending_sync; | ||||||
|  | }; | ||||||
|  | 
 | ||||||
|  | template <typename Traits> | ||||||
|  | class StubStreamer : public GuestStreamer<Traits> { | ||||||
|  | public: | ||||||
|  |     using RuntimeType = typename Traits::RuntimeType; | ||||||
|  | 
 | ||||||
|  |     StubStreamer(size_t id_, RuntimeType& runtime_, u32 stub_value_) | ||||||
|  |         : GuestStreamer<Traits>(id_, runtime_), stub_value{stub_value_} {} | ||||||
|  | 
 | ||||||
|  |     ~StubStreamer() override = default; | ||||||
|  | 
 | ||||||
|  |     size_t WriteCounter(VAddr address, bool has_timestamp, [[maybe_unused]] u32 value, | ||||||
|  |                         std::optional<u32> subreport = std::nullopt) override { | ||||||
|  |         size_t new_id = | ||||||
|  |             GuestStreamer<Traits>::WriteCounter(address, has_timestamp, stub_value, subreport); | ||||||
|  |         return new_id; | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  | private: | ||||||
|  |     u32 stub_value; | ||||||
|  | }; | ||||||
|  | 
 | ||||||
|  | template <typename Traits> | ||||||
|  | struct QueryCacheBase<Traits>::QueryCacheBaseImpl { | ||||||
|  |     using RuntimeType = typename Traits::RuntimeType; | ||||||
|  | 
 | ||||||
|  |     QueryCacheBaseImpl(QueryCacheBase<Traits>* owner_, VideoCore::RasterizerInterface& rasterizer_, | ||||||
|  |                        Core::Memory::Memory& cpu_memory_, RuntimeType& runtime_, Tegra::GPU& gpu_) | ||||||
|  |         : owner{owner_}, rasterizer{rasterizer_}, | ||||||
|  |           cpu_memory{cpu_memory_}, runtime{runtime_}, gpu{gpu_} { | ||||||
|  |         streamer_mask = 0; | ||||||
|  |         for (size_t i = 0; i < static_cast<size_t>(QueryType::MaxQueryTypes); i++) { | ||||||
|  |             streamers[i] = runtime.GetStreamerInterface(static_cast<QueryType>(i)); | ||||||
|  |             if (streamers[i]) { | ||||||
|  |                 streamer_mask |= 1ULL << streamers[i]->GetId(); | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     template <typename Func> | ||||||
|  |     void ForEachStreamerIn(u64 mask, Func&& func) { | ||||||
|  |         static constexpr bool RETURNS_BOOL = | ||||||
|  |             std::is_same_v<std::invoke_result_t<Func, StreamerInterface*>, bool>; | ||||||
|  |         while (mask != 0) { | ||||||
|  |             size_t position = std::countr_zero(mask); | ||||||
|  |             mask &= ~(1ULL << position); | ||||||
|  |             if constexpr (RETURNS_BOOL) { | ||||||
|  |                 if (func(streamers[position])) { | ||||||
|  |                     return; | ||||||
|  |                 } | ||||||
|  |             } else { | ||||||
|  |                 func(streamers[position]); | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     template <typename Func> | ||||||
|  |     void ForEachStreamer(Func&& func) { | ||||||
|  |         ForEachStreamerIn(streamer_mask, func); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     QueryBase* ObtainQuery(QueryCacheBase<Traits>::QueryLocation location) { | ||||||
|  |         size_t which_stream = location.stream_id.Value(); | ||||||
|  |         auto* streamer = streamers[which_stream]; | ||||||
|  |         if (!streamer) { | ||||||
|  |             return nullptr; | ||||||
|  |         } | ||||||
|  |         return streamer->GetQuery(location.query_id.Value()); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     QueryCacheBase<Traits>* owner; | ||||||
|  |     VideoCore::RasterizerInterface& rasterizer; | ||||||
|  |     Core::Memory::Memory& cpu_memory; | ||||||
|  |     RuntimeType& runtime; | ||||||
|  |     Tegra::GPU& gpu; | ||||||
|  |     std::array<StreamerInterface*, static_cast<size_t>(QueryType::MaxQueryTypes)> streamers; | ||||||
|  |     u64 streamer_mask; | ||||||
|  |     std::mutex flush_guard; | ||||||
|  |     std::deque<u64> flushes_pending; | ||||||
|  |     std::vector<QueryCacheBase<Traits>::QueryLocation> pending_unregister; | ||||||
|  | }; | ||||||
|  | 
 | ||||||
|  | template <typename Traits> | ||||||
|  | QueryCacheBase<Traits>::QueryCacheBase(Tegra::GPU& gpu_, | ||||||
|  |                                        VideoCore::RasterizerInterface& rasterizer_, | ||||||
|  |                                        Core::Memory::Memory& cpu_memory_, RuntimeType& runtime_) | ||||||
|  |     : cached_queries{} { | ||||||
|  |     impl = std::make_unique<QueryCacheBase<Traits>::QueryCacheBaseImpl>( | ||||||
|  |         this, rasterizer_, cpu_memory_, runtime_, gpu_); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | template <typename Traits> | ||||||
|  | QueryCacheBase<Traits>::~QueryCacheBase() = default; | ||||||
|  | 
 | ||||||
|  | template <typename Traits> | ||||||
|  | void QueryCacheBase<Traits>::CounterEnable(QueryType counter_type, bool is_enabled) { | ||||||
|  |     size_t index = static_cast<size_t>(counter_type); | ||||||
|  |     StreamerInterface* streamer = impl->streamers[index]; | ||||||
|  |     if (!streamer) [[unlikely]] { | ||||||
|  |         UNREACHABLE(); | ||||||
|  |         return; | ||||||
|  |     } | ||||||
|  |     if (is_enabled) { | ||||||
|  |         streamer->StartCounter(); | ||||||
|  |     } else { | ||||||
|  |         streamer->PauseCounter(); | ||||||
|  |     } | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | template <typename Traits> | ||||||
|  | void QueryCacheBase<Traits>::CounterClose(QueryType counter_type) { | ||||||
|  |     size_t index = static_cast<size_t>(counter_type); | ||||||
|  |     StreamerInterface* streamer = impl->streamers[index]; | ||||||
|  |     if (!streamer) [[unlikely]] { | ||||||
|  |         UNREACHABLE(); | ||||||
|  |         return; | ||||||
|  |     } | ||||||
|  |     streamer->CloseCounter(); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | template <typename Traits> | ||||||
|  | void QueryCacheBase<Traits>::CounterReset(QueryType counter_type) { | ||||||
|  |     size_t index = static_cast<size_t>(counter_type); | ||||||
|  |     StreamerInterface* streamer = impl->streamers[index]; | ||||||
|  |     if (!streamer) [[unlikely]] { | ||||||
|  |         UNIMPLEMENTED(); | ||||||
|  |         return; | ||||||
|  |     } | ||||||
|  |     streamer->ResetCounter(); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | template <typename Traits> | ||||||
|  | void QueryCacheBase<Traits>::BindToChannel(s32 id) { | ||||||
|  |     VideoCommon::ChannelSetupCaches<VideoCommon::ChannelInfo>::BindToChannel(id); | ||||||
|  |     impl->runtime.Bind3DEngine(maxwell3d); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | template <typename Traits> | ||||||
|  | void QueryCacheBase<Traits>::CounterReport(GPUVAddr addr, QueryType counter_type, | ||||||
|  |                                            QueryPropertiesFlags flags, u32 payload, u32 subreport) { | ||||||
|  |     const bool has_timestamp = True(flags & QueryPropertiesFlags::HasTimeout); | ||||||
|  |     const bool is_fence = True(flags & QueryPropertiesFlags::IsAFence); | ||||||
|  |     size_t streamer_id = static_cast<size_t>(counter_type); | ||||||
|  |     auto* streamer = impl->streamers[streamer_id]; | ||||||
|  |     if (streamer == nullptr) [[unlikely]] { | ||||||
|  |         counter_type = QueryType::Payload; | ||||||
|  |         payload = 1U; | ||||||
|  |         streamer_id = static_cast<size_t>(counter_type); | ||||||
|  |         streamer = impl->streamers[streamer_id]; | ||||||
|  |     } | ||||||
|  |     auto cpu_addr_opt = gpu_memory->GpuToCpuAddress(addr); | ||||||
|  |     if (!cpu_addr_opt) [[unlikely]] { | ||||||
|  |         return; | ||||||
|  |     } | ||||||
|  |     VAddr cpu_addr = *cpu_addr_opt; | ||||||
|  |     const size_t new_query_id = streamer->WriteCounter(cpu_addr, has_timestamp, payload, subreport); | ||||||
|  |     auto* query = streamer->GetQuery(new_query_id); | ||||||
|  |     if (is_fence) { | ||||||
|  |         query->flags |= QueryFlagBits::IsFence; | ||||||
|  |     } | ||||||
|  |     QueryLocation query_location{}; | ||||||
|  |     query_location.stream_id.Assign(static_cast<u32>(streamer_id)); | ||||||
|  |     query_location.query_id.Assign(static_cast<u32>(new_query_id)); | ||||||
|  |     const auto gen_caching_indexing = [](VAddr cur_addr) { | ||||||
|  |         return std::make_pair<u64, u32>(cur_addr >> Core::Memory::YUZU_PAGEBITS, | ||||||
|  |                                         static_cast<u32>(cur_addr & Core::Memory::YUZU_PAGEMASK)); | ||||||
|  |     }; | ||||||
|  |     u8* pointer = impl->cpu_memory.GetPointer(cpu_addr); | ||||||
|  |     u8* pointer_timestamp = impl->cpu_memory.GetPointer(cpu_addr + 8); | ||||||
|  |     bool is_synced = !Settings::IsGPULevelHigh() && is_fence; | ||||||
|  | 
 | ||||||
|  |     std::function<void()> operation([this, is_synced, streamer, query_base = query, query_location, | ||||||
|  |                                      pointer, pointer_timestamp] { | ||||||
|  |         if (True(query_base->flags & QueryFlagBits::IsInvalidated)) { | ||||||
|  |             if (!is_synced) [[likely]] { | ||||||
|  |                 impl->pending_unregister.push_back(query_location); | ||||||
|  |             } | ||||||
|  |             return; | ||||||
|  |         } | ||||||
|  |         if (False(query_base->flags & QueryFlagBits::IsFinalValueSynced)) [[unlikely]] { | ||||||
|  |             UNREACHABLE(); | ||||||
|  |             return; | ||||||
|  |         } | ||||||
|  |         query_base->value += streamer->GetAmmendValue(); | ||||||
|  |         streamer->SetAccumulationValue(query_base->value); | ||||||
|  |         if (True(query_base->flags & QueryFlagBits::HasTimestamp)) { | ||||||
|  |             u64 timestamp = impl->gpu.GetTicks(); | ||||||
|  |             std::memcpy(pointer_timestamp, &timestamp, sizeof(timestamp)); | ||||||
|  |             std::memcpy(pointer, &query_base->value, sizeof(query_base->value)); | ||||||
|  |         } else { | ||||||
|  |             u32 value = static_cast<u32>(query_base->value); | ||||||
|  |             std::memcpy(pointer, &value, sizeof(value)); | ||||||
|  |         } | ||||||
|  |         if (!is_synced) [[likely]] { | ||||||
|  |             impl->pending_unregister.push_back(query_location); | ||||||
|  |         } | ||||||
|  |     }); | ||||||
|  |     if (is_fence) { | ||||||
|  |         impl->rasterizer.SignalFence(std::move(operation)); | ||||||
|  |     } else { | ||||||
|  |         if (!Settings::IsGPULevelHigh() && counter_type == QueryType::Payload) { | ||||||
|  |             if (has_timestamp) { | ||||||
|  |                 u64 timestamp = impl->gpu.GetTicks(); | ||||||
|  |                 u64 value = static_cast<u64>(payload); | ||||||
|  |                 std::memcpy(pointer_timestamp, &timestamp, sizeof(timestamp)); | ||||||
|  |                 std::memcpy(pointer, &value, sizeof(value)); | ||||||
|  |             } else { | ||||||
|  |                 std::memcpy(pointer, &payload, sizeof(payload)); | ||||||
|  |             } | ||||||
|  |             streamer->Free(new_query_id); | ||||||
|  |             return; | ||||||
|  |         } | ||||||
|  |         impl->rasterizer.SyncOperation(std::move(operation)); | ||||||
|  |     } | ||||||
|  |     if (is_synced) { | ||||||
|  |         streamer->Free(new_query_id); | ||||||
|  |         return; | ||||||
|  |     } | ||||||
|  |     auto [cont_addr, base] = gen_caching_indexing(cpu_addr); | ||||||
|  |     { | ||||||
|  |         std::scoped_lock lock(cache_mutex); | ||||||
|  |         auto it1 = cached_queries.try_emplace(cont_addr); | ||||||
|  |         auto& sub_container = it1.first->second; | ||||||
|  |         auto it_current = sub_container.find(base); | ||||||
|  |         if (it_current == sub_container.end()) { | ||||||
|  |             sub_container.insert_or_assign(base, query_location); | ||||||
|  |             return; | ||||||
|  |         } | ||||||
|  |         auto* old_query = impl->ObtainQuery(it_current->second); | ||||||
|  |         old_query->flags |= QueryFlagBits::IsRewritten; | ||||||
|  |         sub_container.insert_or_assign(base, query_location); | ||||||
|  |     } | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | template <typename Traits> | ||||||
|  | void QueryCacheBase<Traits>::UnregisterPending() { | ||||||
|  |     const auto gen_caching_indexing = [](VAddr cur_addr) { | ||||||
|  |         return std::make_pair<u64, u32>(cur_addr >> Core::Memory::YUZU_PAGEBITS, | ||||||
|  |                                         static_cast<u32>(cur_addr & Core::Memory::YUZU_PAGEMASK)); | ||||||
|  |     }; | ||||||
|  |     std::scoped_lock lock(cache_mutex); | ||||||
|  |     for (QueryLocation loc : impl->pending_unregister) { | ||||||
|  |         const auto [streamer_id, query_id] = loc.unpack(); | ||||||
|  |         auto* streamer = impl->streamers[streamer_id]; | ||||||
|  |         if (!streamer) [[unlikely]] { | ||||||
|  |             continue; | ||||||
|  |         } | ||||||
|  |         auto* query = streamer->GetQuery(query_id); | ||||||
|  |         auto [cont_addr, base] = gen_caching_indexing(query->guest_address); | ||||||
|  |         auto it1 = cached_queries.find(cont_addr); | ||||||
|  |         if (it1 != cached_queries.end()) { | ||||||
|  |             auto it2 = it1->second.find(base); | ||||||
|  |             if (it2 != it1->second.end()) { | ||||||
|  |                 if (it2->second.raw == loc.raw) { | ||||||
|  |                     it1->second.erase(it2); | ||||||
|  |                 } | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |         streamer->Free(query_id); | ||||||
|  |     } | ||||||
|  |     impl->pending_unregister.clear(); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | template <typename Traits> | ||||||
|  | void QueryCacheBase<Traits>::NotifyWFI() { | ||||||
|  |     bool should_sync = false; | ||||||
|  |     impl->ForEachStreamer( | ||||||
|  |         [&should_sync](StreamerInterface* streamer) { should_sync |= streamer->HasPendingSync(); }); | ||||||
|  |     if (!should_sync) { | ||||||
|  |         return; | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     impl->ForEachStreamer([](StreamerInterface* streamer) { streamer->PresyncWrites(); }); | ||||||
|  |     impl->runtime.Barriers(true); | ||||||
|  |     impl->ForEachStreamer([](StreamerInterface* streamer) { streamer->SyncWrites(); }); | ||||||
|  |     impl->runtime.Barriers(false); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | template <typename Traits> | ||||||
|  | void QueryCacheBase<Traits>::NotifySegment(bool resume) { | ||||||
|  |     if (resume) { | ||||||
|  |         impl->runtime.ResumeHostConditionalRendering(); | ||||||
|  |     } else { | ||||||
|  |         CounterClose(VideoCommon::QueryType::ZPassPixelCount64); | ||||||
|  |         CounterClose(VideoCommon::QueryType::StreamingByteCount); | ||||||
|  |         impl->runtime.PauseHostConditionalRendering(); | ||||||
|  |     } | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | template <typename Traits> | ||||||
|  | bool QueryCacheBase<Traits>::AccelerateHostConditionalRendering() { | ||||||
|  |     bool qc_dirty = false; | ||||||
|  |     const auto gen_lookup = [this, &qc_dirty](GPUVAddr address) -> VideoCommon::LookupData { | ||||||
|  |         auto cpu_addr_opt = gpu_memory->GpuToCpuAddress(address); | ||||||
|  |         if (!cpu_addr_opt) [[unlikely]] { | ||||||
|  |             return VideoCommon::LookupData{ | ||||||
|  |                 .address = 0, | ||||||
|  |                 .found_query = nullptr, | ||||||
|  |             }; | ||||||
|  |         } | ||||||
|  |         VAddr cpu_addr = *cpu_addr_opt; | ||||||
|  |         std::scoped_lock lock(cache_mutex); | ||||||
|  |         auto it1 = cached_queries.find(cpu_addr >> Core::Memory::YUZU_PAGEBITS); | ||||||
|  |         if (it1 == cached_queries.end()) { | ||||||
|  |             return VideoCommon::LookupData{ | ||||||
|  |                 .address = cpu_addr, | ||||||
|  |                 .found_query = nullptr, | ||||||
|  |             }; | ||||||
|  |         } | ||||||
|  |         auto& sub_container = it1->second; | ||||||
|  |         auto it_current = sub_container.find(cpu_addr & Core::Memory::YUZU_PAGEMASK); | ||||||
|  | 
 | ||||||
|  |         if (it_current == sub_container.end()) { | ||||||
|  |             auto it_current_2 = sub_container.find((cpu_addr & Core::Memory::YUZU_PAGEMASK) + 4); | ||||||
|  |             if (it_current_2 == sub_container.end()) { | ||||||
|  |                 return VideoCommon::LookupData{ | ||||||
|  |                     .address = cpu_addr, | ||||||
|  |                     .found_query = nullptr, | ||||||
|  |                 }; | ||||||
|  |             } | ||||||
|  |             it_current = it_current_2; | ||||||
|  |         } | ||||||
|  |         auto* query = impl->ObtainQuery(it_current->second); | ||||||
|  |         qc_dirty |= True(query->flags & QueryFlagBits::IsHostManaged) && | ||||||
|  |                     False(query->flags & QueryFlagBits::IsGuestSynced); | ||||||
|  |         return VideoCommon::LookupData{ | ||||||
|  |             .address = cpu_addr, | ||||||
|  |             .found_query = query, | ||||||
|  |         }; | ||||||
|  |     }; | ||||||
|  | 
 | ||||||
|  |     auto& regs = maxwell3d->regs; | ||||||
|  |     if (regs.render_enable_override != Maxwell::Regs::RenderEnable::Override::UseRenderEnable) { | ||||||
|  |         impl->runtime.EndHostConditionalRendering(); | ||||||
|  |         return false; | ||||||
|  |     } | ||||||
|  |     const ComparisonMode mode = static_cast<ComparisonMode>(regs.render_enable.mode); | ||||||
|  |     const GPUVAddr address = regs.render_enable.Address(); | ||||||
|  |     switch (mode) { | ||||||
|  |     case ComparisonMode::True: | ||||||
|  |         impl->runtime.EndHostConditionalRendering(); | ||||||
|  |         return false; | ||||||
|  |     case ComparisonMode::False: | ||||||
|  |         impl->runtime.EndHostConditionalRendering(); | ||||||
|  |         return false; | ||||||
|  |     case ComparisonMode::Conditional: { | ||||||
|  |         VideoCommon::LookupData object_1{gen_lookup(address)}; | ||||||
|  |         return impl->runtime.HostConditionalRenderingCompareValue(object_1, qc_dirty); | ||||||
|  |     } | ||||||
|  |     case ComparisonMode::IfEqual: { | ||||||
|  |         VideoCommon::LookupData object_1{gen_lookup(address)}; | ||||||
|  |         VideoCommon::LookupData object_2{gen_lookup(address + 16)}; | ||||||
|  |         return impl->runtime.HostConditionalRenderingCompareValues(object_1, object_2, qc_dirty, | ||||||
|  |                                                                    true); | ||||||
|  |     } | ||||||
|  |     case ComparisonMode::IfNotEqual: { | ||||||
|  |         VideoCommon::LookupData object_1{gen_lookup(address)}; | ||||||
|  |         VideoCommon::LookupData object_2{gen_lookup(address + 16)}; | ||||||
|  |         return impl->runtime.HostConditionalRenderingCompareValues(object_1, object_2, qc_dirty, | ||||||
|  |                                                                    false); | ||||||
|  |     } | ||||||
|  |     default: | ||||||
|  |         return false; | ||||||
|  |     } | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | // Async downloads
 | ||||||
|  | template <typename Traits> | ||||||
|  | void QueryCacheBase<Traits>::CommitAsyncFlushes() { | ||||||
|  |     // Make sure the results have been synced on the host.
 | ||||||
|  |     NotifyWFI(); | ||||||
|  | 
 | ||||||
|  |     u64 mask{}; | ||||||
|  |     { | ||||||
|  |         std::scoped_lock lk(impl->flush_guard); | ||||||
|  |         impl->ForEachStreamer([&mask](StreamerInterface* streamer) { | ||||||
|  |             bool local_result = streamer->HasUnsyncedQueries(); | ||||||
|  |             if (local_result) { | ||||||
|  |                 mask |= 1ULL << streamer->GetId(); | ||||||
|  |             } | ||||||
|  |         }); | ||||||
|  |         impl->flushes_pending.push_back(mask); | ||||||
|  |     } | ||||||
|  |     std::function<void()> func([this] { UnregisterPending(); }); | ||||||
|  |     impl->rasterizer.SyncOperation(std::move(func)); | ||||||
|  |     if (mask == 0) { | ||||||
|  |         return; | ||||||
|  |     } | ||||||
|  |     u64 ran_mask = ~mask; | ||||||
|  |     while (mask) { | ||||||
|  |         impl->ForEachStreamerIn(mask, [&mask, &ran_mask](StreamerInterface* streamer) { | ||||||
|  |             u64 dep_mask = streamer->GetDependentMask(); | ||||||
|  |             if ((dep_mask & ~ran_mask) != 0) { | ||||||
|  |                 return; | ||||||
|  |             } | ||||||
|  |             u64 index = streamer->GetId(); | ||||||
|  |             ran_mask |= (1ULL << index); | ||||||
|  |             mask &= ~(1ULL << index); | ||||||
|  |             streamer->PushUnsyncedQueries(); | ||||||
|  |         }); | ||||||
|  |     } | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | template <typename Traits> | ||||||
|  | bool QueryCacheBase<Traits>::HasUncommittedFlushes() const { | ||||||
|  |     bool result = false; | ||||||
|  |     impl->ForEachStreamer([&result](StreamerInterface* streamer) { | ||||||
|  |         result |= streamer->HasUnsyncedQueries(); | ||||||
|  |         return result; | ||||||
|  |     }); | ||||||
|  |     return result; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | template <typename Traits> | ||||||
|  | bool QueryCacheBase<Traits>::ShouldWaitAsyncFlushes() { | ||||||
|  |     std::scoped_lock lk(impl->flush_guard); | ||||||
|  |     return !impl->flushes_pending.empty() && impl->flushes_pending.front() != 0ULL; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | template <typename Traits> | ||||||
|  | void QueryCacheBase<Traits>::PopAsyncFlushes() { | ||||||
|  |     u64 mask; | ||||||
|  |     { | ||||||
|  |         std::scoped_lock lk(impl->flush_guard); | ||||||
|  |         mask = impl->flushes_pending.front(); | ||||||
|  |         impl->flushes_pending.pop_front(); | ||||||
|  |     } | ||||||
|  |     if (mask == 0) { | ||||||
|  |         return; | ||||||
|  |     } | ||||||
|  |     u64 ran_mask = ~mask; | ||||||
|  |     while (mask) { | ||||||
|  |         impl->ForEachStreamerIn(mask, [&mask, &ran_mask](StreamerInterface* streamer) { | ||||||
|  |             u64 dep_mask = streamer->GetDependenceMask(); | ||||||
|  |             if ((dep_mask & ~ran_mask) != 0) { | ||||||
|  |                 return; | ||||||
|  |             } | ||||||
|  |             u64 index = streamer->GetId(); | ||||||
|  |             ran_mask |= (1ULL << index); | ||||||
|  |             mask &= ~(1ULL << index); | ||||||
|  |             streamer->PopUnsyncedQueries(); | ||||||
|  |         }); | ||||||
|  |     } | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | // Invalidation
 | ||||||
|  | 
 | ||||||
|  | template <typename Traits> | ||||||
|  | void QueryCacheBase<Traits>::InvalidateQuery(QueryCacheBase<Traits>::QueryLocation location) { | ||||||
|  |     auto* query_base = impl->ObtainQuery(location); | ||||||
|  |     if (!query_base) { | ||||||
|  |         return; | ||||||
|  |     } | ||||||
|  |     query_base->flags |= QueryFlagBits::IsInvalidated; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | template <typename Traits> | ||||||
|  | bool QueryCacheBase<Traits>::IsQueryDirty(QueryCacheBase<Traits>::QueryLocation location) { | ||||||
|  |     auto* query_base = impl->ObtainQuery(location); | ||||||
|  |     if (!query_base) { | ||||||
|  |         return false; | ||||||
|  |     } | ||||||
|  |     return True(query_base->flags & QueryFlagBits::IsHostManaged) && | ||||||
|  |            False(query_base->flags & QueryFlagBits::IsGuestSynced); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | template <typename Traits> | ||||||
|  | bool QueryCacheBase<Traits>::SemiFlushQueryDirty(QueryCacheBase<Traits>::QueryLocation location) { | ||||||
|  |     auto* query_base = impl->ObtainQuery(location); | ||||||
|  |     if (!query_base) { | ||||||
|  |         return false; | ||||||
|  |     } | ||||||
|  |     if (True(query_base->flags & QueryFlagBits::IsFinalValueSynced) && | ||||||
|  |         False(query_base->flags & QueryFlagBits::IsGuestSynced)) { | ||||||
|  |         auto* ptr = impl->cpu_memory.GetPointer(query_base->guest_address); | ||||||
|  |         if (True(query_base->flags & QueryFlagBits::HasTimestamp)) { | ||||||
|  |             std::memcpy(ptr, &query_base->value, sizeof(query_base->value)); | ||||||
|  |             return false; | ||||||
|  |         } | ||||||
|  |         u32 value_l = static_cast<u32>(query_base->value); | ||||||
|  |         std::memcpy(ptr, &value_l, sizeof(value_l)); | ||||||
|  |         return false; | ||||||
|  |     } | ||||||
|  |     return True(query_base->flags & QueryFlagBits::IsHostManaged) && | ||||||
|  |            False(query_base->flags & QueryFlagBits::IsGuestSynced); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | template <typename Traits> | ||||||
|  | void QueryCacheBase<Traits>::RequestGuestHostSync() { | ||||||
|  |     impl->rasterizer.ReleaseFences(); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | } // namespace VideoCommon
 | ||||||
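
QueryCacheBase is parameterized on a Traits type whose RuntimeType supplies every backend hook called above. The actual OpenGL and Vulkan runtimes are not part of the portion quoted here, so the outline below is only inferred from the calls query_cache.h makes; the class name and exact signatures are a best guess rather than the real backend API.

    #include <span>

    #include "video_core/query_cache/query_cache_base.h"
    #include "video_core/query_cache/query_stream.h"
    #include "video_core/query_cache/types.h"

    namespace Tegra::Engines {
    class Maxwell3D;
    }

    // Inferred outline of what Traits::RuntimeType has to expose.
    class ExampleQueryRuntime {
    public:
        // One streamer per QueryType; nullptr makes CounterReport fall back to the payload path.
        VideoCommon::StreamerInterface* GetStreamerInterface(VideoCommon::QueryType type);

        // Channel binding (BindToChannel) and pre/post write barriers (NotifyWFI).
        void Bind3DEngine(Tegra::Engines::Maxwell3D* maxwell3d);
        void Barriers(bool is_prebarrier);

        // Batched write-back of final query values to guest memory (SyncValuesStruct above).
        template <typename SyncValueType>
        void SyncValues(std::span<SyncValueType> values);

        // Host conditional rendering hooks driven by NotifySegment and
        // AccelerateHostConditionalRendering.
        void PauseHostConditionalRendering();
        void ResumeHostConditionalRendering();
        void EndHostConditionalRendering();
        bool HostConditionalRenderingCompareValue(VideoCommon::LookupData object, bool qc_dirty);
        bool HostConditionalRenderingCompareValues(VideoCommon::LookupData object_1,
                                                   VideoCommon::LookupData object_2, bool qc_dirty,
                                                   bool equal_check);
    };

    struct ExampleTraits {
        using RuntimeType = ExampleQueryRuntime;
    };

A backend would then instantiate VideoCommon::QueryCacheBase<ExampleTraits> (or its real equivalent) and implement the streamers returned by GetStreamerInterface on top of StreamerInterface/SimpleStreamer from query_stream.h.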
							
								
								
									
src/video_core/query_cache/query_cache_base.h (new file)
							|  | @ -0,0 +1,181 @@ | ||||||
|  | // SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
 | ||||||
|  | // SPDX-License-Identifier: GPL-3.0-or-later
 | ||||||
|  | 
 | ||||||
|  | #pragma once | ||||||
|  | 
 | ||||||
|  | #include <functional> | ||||||
|  | #include <mutex> | ||||||
|  | #include <optional> | ||||||
|  | #include <span> | ||||||
|  | #include <unordered_map> | ||||||
|  | #include <utility> | ||||||
|  | 
 | ||||||
|  | #include "common/assert.h" | ||||||
|  | #include "common/bit_field.h" | ||||||
|  | #include "common/common_types.h" | ||||||
|  | #include "core/memory.h" | ||||||
|  | #include "video_core/control/channel_state_cache.h" | ||||||
|  | #include "video_core/query_cache/query_base.h" | ||||||
|  | #include "video_core/query_cache/types.h" | ||||||
|  | 
 | ||||||
|  | namespace Core::Memory { | ||||||
|  | class Memory; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | namespace VideoCore { | ||||||
|  | class RasterizerInterface; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | namespace Tegra { | ||||||
|  | class GPU; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | namespace VideoCommon { | ||||||
|  | 
 | ||||||
|  | struct LookupData { | ||||||
|  |     VAddr address; | ||||||
|  |     QueryBase* found_query; | ||||||
|  | }; | ||||||
|  | 
 | ||||||
|  | template <typename Traits> | ||||||
|  | class QueryCacheBase : public VideoCommon::ChannelSetupCaches<VideoCommon::ChannelInfo> { | ||||||
|  |     using RuntimeType = typename Traits::RuntimeType; | ||||||
|  | 
 | ||||||
|  | public: | ||||||
|  |     union QueryLocation { | ||||||
|  |         BitField<27, 5, u32> stream_id; | ||||||
|  |         BitField<0, 27, u32> query_id; | ||||||
|  |         u32 raw; | ||||||
|  | 
 | ||||||
|  |         std::pair<size_t, size_t> unpack() const { | ||||||
|  |             return {static_cast<size_t>(stream_id.Value()), static_cast<size_t>(query_id.Value())}; | ||||||
|  |         } | ||||||
|  |     }; | ||||||
|  | 
 | ||||||
|  |     explicit QueryCacheBase(Tegra::GPU& gpu, VideoCore::RasterizerInterface& rasterizer_, | ||||||
|  |                             Core::Memory::Memory& cpu_memory_, RuntimeType& runtime_); | ||||||
|  | 
 | ||||||
|  |     ~QueryCacheBase(); | ||||||
|  | 
 | ||||||
|  |     void InvalidateRegion(VAddr addr, std::size_t size) { | ||||||
|  |         IterateCache<true>(addr, size, | ||||||
|  |                            [this](QueryLocation location) { InvalidateQuery(location); }); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     void FlushRegion(VAddr addr, std::size_t size) { | ||||||
|  |         bool result = false; | ||||||
|  |         IterateCache<false>(addr, size, [this, &result](QueryLocation location) { | ||||||
|  |             result |= SemiFlushQueryDirty(location); | ||||||
|  |             return result; | ||||||
|  |         }); | ||||||
|  |         if (result) { | ||||||
|  |             RequestGuestHostSync(); | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     static u64 BuildMask(std::span<const QueryType> types) { | ||||||
|  |         u64 mask = 0; | ||||||
|  |         for (auto query_type : types) { | ||||||
|  |             mask |= 1ULL << (static_cast<u64>(query_type)); | ||||||
|  |         } | ||||||
|  |         return mask; | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     /// Return true when a CPU region is modified from the GPU
 | ||||||
|  |     [[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size) { | ||||||
|  |         bool result = false; | ||||||
|  |         IterateCache<false>(addr, size, [this, &result](QueryLocation location) { | ||||||
|  |             result |= IsQueryDirty(location); | ||||||
|  |             return result; | ||||||
|  |         }); | ||||||
|  |         return result; | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     void CounterEnable(QueryType counter_type, bool is_enabled); | ||||||
|  | 
 | ||||||
|  |     void CounterReset(QueryType counter_type); | ||||||
|  | 
 | ||||||
|  |     void CounterClose(QueryType counter_type); | ||||||
|  | 
 | ||||||
|  |     void CounterReport(GPUVAddr addr, QueryType counter_type, QueryPropertiesFlags flags, | ||||||
|  |                        u32 payload, u32 subreport); | ||||||
|  | 
 | ||||||
|  |     void NotifyWFI(); | ||||||
|  | 
 | ||||||
|  |     bool AccelerateHostConditionalRendering(); | ||||||
|  | 
 | ||||||
|  |     // Async downloads
 | ||||||
|  |     void CommitAsyncFlushes(); | ||||||
|  | 
 | ||||||
|  |     bool HasUncommittedFlushes() const; | ||||||
|  | 
 | ||||||
|  |     bool ShouldWaitAsyncFlushes(); | ||||||
|  | 
 | ||||||
|  |     void PopAsyncFlushes(); | ||||||
|  | 
 | ||||||
|  |     void NotifySegment(bool resume); | ||||||
|  | 
 | ||||||
|  |     void BindToChannel(s32 id) override; | ||||||
|  | 
 | ||||||
|  | protected: | ||||||
|  |     template <bool remove_from_cache, typename Func> | ||||||
|  |     void IterateCache(VAddr addr, std::size_t size, Func&& func) { | ||||||
|  |         static constexpr bool RETURNS_BOOL = | ||||||
|  |             std::is_same_v<std::invoke_result_t<Func, QueryLocation>, bool>; | ||||||
|  |         const u64 addr_begin = addr; | ||||||
|  |         const u64 addr_end = addr_begin + size; | ||||||
|  | 
 | ||||||
|  |         const u64 page_end = addr_end >> Core::Memory::YUZU_PAGEBITS; | ||||||
|  |         std::scoped_lock lock(cache_mutex); | ||||||
|  |         for (u64 page = addr_begin >> Core::Memory::YUZU_PAGEBITS; page <= page_end; ++page) { | ||||||
|  |             const u64 page_start = page << Core::Memory::YUZU_PAGEBITS; | ||||||
|  |             const auto in_range = [page_start, addr_begin, addr_end](const u32 query_location) { | ||||||
|  |                 const u64 cache_begin = page_start + query_location; | ||||||
|  |                 const u64 cache_end = cache_begin + sizeof(u32); | ||||||
|  |                 return cache_begin < addr_end && addr_begin < cache_end; | ||||||
|  |             }; | ||||||
|  |             const auto& it = cached_queries.find(page); | ||||||
|  |             if (it == std::end(cached_queries)) { | ||||||
|  |                 continue; | ||||||
|  |             } | ||||||
|  |             auto& contents = it->second; | ||||||
|  |             for (auto& query : contents) { | ||||||
|  |                 if (!in_range(query.first)) { | ||||||
|  |                     continue; | ||||||
|  |                 } | ||||||
|  |                 if constexpr (RETURNS_BOOL) { | ||||||
|  |                     if (func(query.second)) { | ||||||
|  |                         return; | ||||||
|  |                     } | ||||||
|  |                 } else { | ||||||
|  |                     func(query.second); | ||||||
|  |                 } | ||||||
|  |             } | ||||||
|  |             if constexpr (remove_from_cache) { | ||||||
|  |                 const auto in_range2 = [&](const std::pair<u32, QueryLocation>& pair) { | ||||||
|  |                     return in_range(pair.first); | ||||||
|  |                 }; | ||||||
|  |                 std::erase_if(contents, in_range2); | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     using ContentCache = std::unordered_map<u64, std::unordered_map<u32, QueryLocation>>; | ||||||
|  | 
 | ||||||
|  |     void InvalidateQuery(QueryLocation location); | ||||||
|  |     bool IsQueryDirty(QueryLocation location); | ||||||
|  |     bool SemiFlushQueryDirty(QueryLocation location); | ||||||
|  |     void RequestGuestHostSync(); | ||||||
|  |     void UnregisterPending(); | ||||||
|  | 
 | ||||||
|  |     std::unordered_map<u64, std::unordered_map<u32, QueryLocation>> cached_queries; | ||||||
|  |     std::mutex cache_mutex; | ||||||
|  | 
 | ||||||
|  |     struct QueryCacheBaseImpl; | ||||||
|  |     friend struct QueryCacheBaseImpl; | ||||||
|  |     friend RuntimeType; | ||||||
|  | 
 | ||||||
|  |     std::unique_ptr<QueryCacheBaseImpl> impl; | ||||||
|  | }; | ||||||
|  | 
 | ||||||
|  | } // namespace VideoCommon
 | ||||||
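
IterateCache and CounterReport agree on the same two-level key: the guest page of the report address first, then the byte offset of the 32-bit payload inside that page. Restated as small free functions (the names are made up; the arithmetic is taken directly from the code above):

    #include <utility>

    #include "common/common_types.h"
    #include "core/memory.h"

    // Mirrors the gen_caching_indexing lambdas in query_cache.h: page number, then in-page offset.
    inline std::pair<u64, u32> ExampleCachingIndex(VAddr cpu_addr) {
        return {cpu_addr >> Core::Memory::YUZU_PAGEBITS,
                static_cast<u32>(cpu_addr & Core::Memory::YUZU_PAGEMASK)};
    }

    // Overlap test matching IterateCache's in_range lambda: a cached entry is hit when its
    // 4-byte payload word intersects the flushed/invalidated range [range_begin, range_end).
    inline bool ExampleEntryTouchesRange(u64 page_start, u32 offset_in_page, u64 range_begin,
                                         u64 range_end) {
        const u64 entry_begin = page_start + offset_in_page;
        const u64 entry_end = entry_begin + sizeof(u32);
        return entry_begin < range_end && range_begin < entry_end;
    }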
							
								
								
									
src/video_core/query_cache/query_stream.h (new file)
							|  | @ -0,0 +1,149 @@ | ||||||
|  | // SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
 | ||||||
|  | // SPDX-License-Identifier: GPL-3.0-or-later
 | ||||||
|  | 
 | ||||||
|  | #pragma once | ||||||
|  | 
 | ||||||
|  | #include <deque> | ||||||
|  | #include <optional> | ||||||
|  | #include <vector> | ||||||
|  | 
 | ||||||
|  | #include "common/assert.h" | ||||||
|  | #include "common/common_types.h" | ||||||
|  | #include "video_core/query_cache/bank_base.h" | ||||||
|  | #include "video_core/query_cache/query_base.h" | ||||||
|  | 
 | ||||||
|  | namespace VideoCommon { | ||||||
|  | 
 | ||||||
|  | class StreamerInterface { | ||||||
|  | public: | ||||||
|  |     explicit StreamerInterface(size_t id_) : id{id_}, dependence_mask{}, dependent_mask{} {} | ||||||
|  |     virtual ~StreamerInterface() = default; | ||||||
|  | 
 | ||||||
|  |     virtual QueryBase* GetQuery(size_t id) = 0; | ||||||
|  | 
 | ||||||
|  |     virtual void StartCounter() { | ||||||
|  |         /* Do Nothing */ | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     virtual void PauseCounter() { | ||||||
|  |         /* Do Nothing */ | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     virtual void ResetCounter() { | ||||||
|  |         /* Do Nothing */ | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     virtual void CloseCounter() { | ||||||
|  |         /* Do Nothing */ | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     virtual bool HasPendingSync() const { | ||||||
|  |         return false; | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     virtual void PresyncWrites() { | ||||||
|  |         /* Do Nothing */ | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     virtual void SyncWrites() { | ||||||
|  |         /* Do Nothing */ | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     virtual size_t WriteCounter(VAddr address, bool has_timestamp, u32 value, | ||||||
|  |                                 std::optional<u32> subreport = std::nullopt) = 0; | ||||||
|  | 
 | ||||||
|  |     virtual bool HasUnsyncedQueries() const { | ||||||
|  |         return false; | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     virtual void PushUnsyncedQueries() { | ||||||
|  |         /* Do Nothing */ | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     virtual void PopUnsyncedQueries() { | ||||||
|  |         /* Do Nothing */ | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     virtual void Free(size_t query_id) = 0; | ||||||
|  | 
 | ||||||
|  |     size_t GetId() const { | ||||||
|  |         return id; | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     u64 GetDependenceMask() const { | ||||||
|  |         return dependence_mask; | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     u64 GetDependentMask() const { | ||||||
|  |         return dependent_mask; | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     u64 GetAmmendValue() const { | ||||||
|  |         return ammend_value; | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     void SetAccumulationValue(u64 new_value) { | ||||||
|  |         acumulation_value = new_value; | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  | protected: | ||||||
|  |     void MakeDependent(StreamerInterface* depend_on) { | ||||||
|  |         dependence_mask |= 1ULL << depend_on->id; | ||||||
|  |         depend_on->dependent_mask |= 1ULL << id; | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     const size_t id; | ||||||
|  |     u64 dependence_mask; | ||||||
|  |     u64 dependent_mask; | ||||||
|  |     u64 ammend_value{}; | ||||||
|  |     u64 acumulation_value{}; | ||||||
|  | }; | ||||||
|  | 
 | ||||||
|  | template <typename QueryType> | ||||||
|  | class SimpleStreamer : public StreamerInterface { | ||||||
|  | public: | ||||||
|  |     explicit SimpleStreamer(size_t id_) : StreamerInterface{id_} {} | ||||||
|  |     virtual ~SimpleStreamer() = default; | ||||||
|  | 
 | ||||||
|  | protected: | ||||||
|  |     virtual QueryType* GetQuery(size_t query_id) override { | ||||||
|  |         if (query_id < slot_queries.size()) { | ||||||
|  |             return &slot_queries[query_id]; | ||||||
|  |         } | ||||||
|  |         return nullptr; | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     virtual void Free(size_t query_id) override { | ||||||
|  |         std::scoped_lock lk(guard); | ||||||
|  |         ReleaseQuery(query_id); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     template <typename... Args, typename = decltype(QueryType(std::declval<Args>()...))> | ||||||
|  |     size_t BuildQuery(Args&&... args) { | ||||||
|  |         std::scoped_lock lk(guard); | ||||||
|  |         if (!old_queries.empty()) { | ||||||
|  |             size_t new_id = old_queries.front(); | ||||||
|  |             old_queries.pop_front(); | ||||||
|  |             new (&slot_queries[new_id]) QueryType(std::forward<Args>(args)...); | ||||||
|  |             return new_id; | ||||||
|  |         } | ||||||
|  |         size_t new_id = slot_queries.size(); | ||||||
|  |         slot_queries.emplace_back(std::forward<Args>(args)...); | ||||||
|  |         return new_id; | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     void ReleaseQuery(size_t query_id) { | ||||||
|  | 
 | ||||||
|  |         if (query_id < slot_queries.size()) { | ||||||
|  |             old_queries.push_back(query_id); | ||||||
|  |             return; | ||||||
|  |         } | ||||||
|  |         UNREACHABLE(); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     std::mutex guard; | ||||||
|  |     std::deque<QueryType> slot_queries; | ||||||
|  |     std::deque<size_t> old_queries; | ||||||
|  | }; | ||||||
|  | 
 | ||||||
|  | } // namespace VideoCommon
 | ||||||
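
Streamers declare ordering constraints through MakeDependent; the resulting dependence and dependent masks are what CommitAsyncFlushes and PopAsyncFlushes consult when ordering PushUnsyncedQueries/PopUnsyncedQueries across streamers. A hedged sketch of a derived streamer recording such a dependency (the class is illustrative only and does not correspond to a real backend streamer):

    #include <optional>

    #include "common/common_types.h"
    #include "video_core/query_cache/query_base.h"
    #include "video_core/query_cache/query_stream.h"

    // Illustrative streamer whose results are derived from another streamer's output, so it
    // records a dependence on that streamer at construction time.
    class ExampleDerivedStreamer final
        : public VideoCommon::SimpleStreamer<VideoCommon::GuestQuery> {
    public:
        ExampleDerivedStreamer(size_t id, VideoCommon::StreamerInterface& source)
            : SimpleStreamer(id) {
            MakeDependent(&source); // links the dependence/dependent masks of the two streamers
        }

        size_t WriteCounter(VAddr address, bool has_timestamp, u32 value,
                            std::optional<u32> subreport = std::nullopt) override {
            // Reuse the slot recycling that SimpleStreamer::BuildQuery provides.
            return BuildQuery(has_timestamp, address, static_cast<u64>(value));
        }
    };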
							
								
								
									
src/video_core/query_cache/types.h (new file)
							|  | @ -0,0 +1,74 @@ | ||||||
|  | // SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
 | ||||||
|  | // SPDX-License-Identifier: GPL-3.0-or-later
 | ||||||
|  | 
 | ||||||
|  | #pragma once | ||||||
|  | 
 | ||||||
|  | #include "common/common_funcs.h" | ||||||
|  | #include "common/common_types.h" | ||||||
|  | 
 | ||||||
|  | namespace VideoCommon { | ||||||
|  | 
 | ||||||
|  | enum class QueryPropertiesFlags : u32 { | ||||||
|  |     HasTimeout = 1 << 0, | ||||||
|  |     IsAFence = 1 << 1, | ||||||
|  | }; | ||||||
|  | DECLARE_ENUM_FLAG_OPERATORS(QueryPropertiesFlags) | ||||||
|  | 
 | ||||||
|  | // This must always stay equivalent to the report types of the maxwell3d report semaphore
 | ||||||
|  | enum class QueryType : u32 { | ||||||
|  |     Payload = 0, // "None" in docs, but confirmed via hardware to return the payload
 | ||||||
|  |     VerticesGenerated = 1, | ||||||
|  |     ZPassPixelCount = 2, | ||||||
|  |     PrimitivesGenerated = 3, | ||||||
|  |     AlphaBetaClocks = 4, | ||||||
|  |     VertexShaderInvocations = 5, | ||||||
|  |     StreamingPrimitivesNeededMinusSucceeded = 6, | ||||||
|  |     GeometryShaderInvocations = 7, | ||||||
|  |     GeometryShaderPrimitivesGenerated = 9, | ||||||
|  |     ZCullStats0 = 10, | ||||||
|  |     StreamingPrimitivesSucceeded = 11, | ||||||
|  |     ZCullStats1 = 12, | ||||||
|  |     StreamingPrimitivesNeeded = 13, | ||||||
|  |     ZCullStats2 = 14, | ||||||
|  |     ClipperInvocations = 15, | ||||||
|  |     ZCullStats3 = 16, | ||||||
|  |     ClipperPrimitivesGenerated = 17, | ||||||
|  |     VtgPrimitivesOut = 18, | ||||||
|  |     PixelShaderInvocations = 19, | ||||||
|  |     ZPassPixelCount64 = 21, | ||||||
|  |     IEEECleanColorTarget = 24, | ||||||
|  |     IEEECleanZetaTarget = 25, | ||||||
|  |     StreamingByteCount = 26, | ||||||
|  |     TessellationInitInvocations = 27, | ||||||
|  |     BoundingRectangle = 28, | ||||||
|  |     TessellationShaderInvocations = 29, | ||||||
|  |     TotalStreamingPrimitivesNeededMinusSucceeded = 30, | ||||||
|  |     TessellationShaderPrimitivesGenerated = 31, | ||||||
|  |     // max.
 | ||||||
|  |     MaxQueryTypes, | ||||||
|  | }; | ||||||
|  | 
 | ||||||
|  | // Comparison modes for Host Conditional Rendering
 | ||||||
|  | enum class ComparisonMode : u32 { | ||||||
|  |     False = 0, | ||||||
|  |     True = 1, | ||||||
|  |     Conditional = 2, | ||||||
|  |     IfEqual = 3, | ||||||
|  |     IfNotEqual = 4, | ||||||
|  |     MaxComparisonMode, | ||||||
|  | }; | ||||||
|  | 
 | ||||||
|  | // Reduction ops.
 | ||||||
|  | enum class ReductionOp : u32 { | ||||||
|  |     RedAdd = 0, | ||||||
|  |     RedMin = 1, | ||||||
|  |     RedMax = 2, | ||||||
|  |     RedInc = 3, | ||||||
|  |     RedDec = 4, | ||||||
|  |     RedAnd = 5, | ||||||
|  |     RedOr = 6, | ||||||
|  |     RedXor = 7, | ||||||
|  |     MaxReductionOp, | ||||||
|  | }; | ||||||
|  | 
 | ||||||
|  | } // namespace VideoCommon
 | ||||||
|  | @ -12,6 +12,7 @@ | ||||||
| #include "video_core/cache_types.h" | #include "video_core/cache_types.h" | ||||||
| #include "video_core/engines/fermi_2d.h" | #include "video_core/engines/fermi_2d.h" | ||||||
| #include "video_core/gpu.h" | #include "video_core/gpu.h" | ||||||
|  | #include "video_core/query_cache/types.h" | ||||||
| #include "video_core/rasterizer_download_area.h" | #include "video_core/rasterizer_download_area.h" | ||||||
| 
 | 
 | ||||||
| namespace Tegra { | namespace Tegra { | ||||||
|  | @ -26,11 +27,6 @@ struct ChannelState; | ||||||
| 
 | 
 | ||||||
| namespace VideoCore { | namespace VideoCore { | ||||||
| 
 | 
 | ||||||
| enum class QueryType { |  | ||||||
|     SamplesPassed, |  | ||||||
| }; |  | ||||||
| constexpr std::size_t NumQueryTypes = 1; |  | ||||||
| 
 |  | ||||||
| enum class LoadCallbackStage { | enum class LoadCallbackStage { | ||||||
|     Prepare, |     Prepare, | ||||||
|     Build, |     Build, | ||||||
|  | @ -58,10 +54,11 @@ public: | ||||||
|     virtual void DispatchCompute() = 0; |     virtual void DispatchCompute() = 0; | ||||||
| 
 | 
 | ||||||
|     /// Resets the counter of a query
 |     /// Resets the counter of a query
 | ||||||
|     virtual void ResetCounter(QueryType type) = 0; |     virtual void ResetCounter(VideoCommon::QueryType type) = 0; | ||||||
| 
 | 
 | ||||||
|     /// Records a GPU query and caches it
 |     /// Records a GPU query and caches it
 | ||||||
|     virtual void Query(GPUVAddr gpu_addr, QueryType type, std::optional<u64> timestamp) = 0; |     virtual void Query(GPUVAddr gpu_addr, VideoCommon::QueryType type, | ||||||
|  |                        VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) = 0; | ||||||
| 
 | 
 | ||||||
|     /// Signal an uniform buffer binding
 |     /// Signal an uniform buffer binding
 | ||||||
|     virtual void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, |     virtual void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, | ||||||
|  | @ -83,7 +80,7 @@ public: | ||||||
|     virtual void SignalReference() = 0; |     virtual void SignalReference() = 0; | ||||||
| 
 | 
 | ||||||
|     /// Release all pending fences.
 |     /// Release all pending fences.
 | ||||||
|     virtual void ReleaseFences() = 0; |     virtual void ReleaseFences(bool force = true) = 0; | ||||||
| 
 | 
 | ||||||
|     /// Notify rasterizer that all caches should be flushed to Switch memory
 |     /// Notify rasterizer that all caches should be flushed to Switch memory
 | ||||||
|     virtual void FlushAll() = 0; |     virtual void FlushAll() = 0; | ||||||
|  |  | ||||||
|  | @ -26,16 +26,18 @@ void RasterizerNull::Draw(bool is_indexed, u32 instance_count) {} | ||||||
| void RasterizerNull::DrawTexture() {} | void RasterizerNull::DrawTexture() {} | ||||||
| void RasterizerNull::Clear(u32 layer_count) {} | void RasterizerNull::Clear(u32 layer_count) {} | ||||||
| void RasterizerNull::DispatchCompute() {} | void RasterizerNull::DispatchCompute() {} | ||||||
| void RasterizerNull::ResetCounter(VideoCore::QueryType type) {} | void RasterizerNull::ResetCounter(VideoCommon::QueryType type) {} | ||||||
| void RasterizerNull::Query(GPUVAddr gpu_addr, VideoCore::QueryType type, | void RasterizerNull::Query(GPUVAddr gpu_addr, VideoCommon::QueryType type, | ||||||
|                            std::optional<u64> timestamp) { |                            VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) { | ||||||
|     if (!gpu_memory) { |     if (!gpu_memory) { | ||||||
|         return; |         return; | ||||||
|     } |     } | ||||||
| 
 |     if (True(flags & VideoCommon::QueryPropertiesFlags::HasTimeout)) { | ||||||
|     gpu_memory->Write(gpu_addr, u64{0}); |         u64 ticks = m_gpu.GetTicks(); | ||||||
|     if (timestamp) { |         gpu_memory->Write<u64>(gpu_addr + 8, ticks); | ||||||
|         gpu_memory->Write(gpu_addr + 8, *timestamp); |         gpu_memory->Write<u64>(gpu_addr, static_cast<u64>(payload)); | ||||||
|  |     } else { | ||||||
|  |         gpu_memory->Write<u32>(gpu_addr, payload); | ||||||
|     } |     } | ||||||
| } | } | ||||||
| void RasterizerNull::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, | void RasterizerNull::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, | ||||||
|  | @ -74,7 +76,7 @@ void RasterizerNull::SignalSyncPoint(u32 value) { | ||||||
|     syncpoint_manager.IncrementHost(value); |     syncpoint_manager.IncrementHost(value); | ||||||
| } | } | ||||||
| void RasterizerNull::SignalReference() {} | void RasterizerNull::SignalReference() {} | ||||||
| void RasterizerNull::ReleaseFences() {} | void RasterizerNull::ReleaseFences(bool) {} | ||||||
| void RasterizerNull::FlushAndInvalidateRegion(VAddr addr, u64 size, VideoCommon::CacheType) {} | void RasterizerNull::FlushAndInvalidateRegion(VAddr addr, u64 size, VideoCommon::CacheType) {} | ||||||
| void RasterizerNull::WaitForIdle() {} | void RasterizerNull::WaitForIdle() {} | ||||||
| void RasterizerNull::FragmentBarrier() {} | void RasterizerNull::FragmentBarrier() {} | ||||||
|  |  | ||||||
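The null backend above doubles as a specification of the new report contract: a query flagged with HasTimeout produces a 16-byte report (a 64-bit payload followed by the GPU tick at offset 8), while a plain query writes only a 32-bit payload. A minimal stand-alone sketch of that write logic, assuming direct access to guest memory instead of the emulator's gpu_memory->Write<T>() helpers:

    #include <cstdint>
    #include <cstring>

    // Illustration only: mirrors the branches in RasterizerNull::Query above,
    // but writes through a raw pointer rather than the emulator's memory manager.
    void WriteQueryReport(std::uint8_t* report, bool has_timeout,
                          std::uint32_t payload, std::uint64_t gpu_ticks) {
        if (has_timeout) {
            const std::uint64_t wide_payload = payload;               // payload widened to 64 bits
            std::memcpy(report, &wide_payload, sizeof(wide_payload)); // payload at offset 0
            std::memcpy(report + 8, &gpu_ticks, sizeof(gpu_ticks));   // timestamp at offset 8
        } else {
            std::memcpy(report, &payload, sizeof(payload));           // bare 32-bit payload
        }
    }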
|  | @ -42,8 +42,9 @@ public: | ||||||
|     void DrawTexture() override; |     void DrawTexture() override; | ||||||
|     void Clear(u32 layer_count) override; |     void Clear(u32 layer_count) override; | ||||||
|     void DispatchCompute() override; |     void DispatchCompute() override; | ||||||
|     void ResetCounter(VideoCore::QueryType type) override; |     void ResetCounter(VideoCommon::QueryType type) override; | ||||||
|     void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional<u64> timestamp) override; |     void Query(GPUVAddr gpu_addr, VideoCommon::QueryType type, | ||||||
|  |                VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) override; | ||||||
|     void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size) override; |     void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size) override; | ||||||
|     void DisableGraphicsUniformBuffer(size_t stage, u32 index) override; |     void DisableGraphicsUniformBuffer(size_t stage, u32 index) override; | ||||||
|     void FlushAll() override; |     void FlushAll() override; | ||||||
|  | @ -63,7 +64,7 @@ public: | ||||||
|     void SyncOperation(std::function<void()>&& func) override; |     void SyncOperation(std::function<void()>&& func) override; | ||||||
|     void SignalSyncPoint(u32 value) override; |     void SignalSyncPoint(u32 value) override; | ||||||
|     void SignalReference() override; |     void SignalReference() override; | ||||||
|     void ReleaseFences() override; |     void ReleaseFences(bool force) override; | ||||||
|     void FlushAndInvalidateRegion( |     void FlushAndInvalidateRegion( | ||||||
|         VAddr addr, u64 size, VideoCommon::CacheType which = VideoCommon::CacheType::All) override; |         VAddr addr, u64 size, VideoCommon::CacheType which = VideoCommon::CacheType::All) override; | ||||||
|     void WaitForIdle() override; |     void WaitForIdle() override; | ||||||
|  |  | ||||||
|  | @ -27,7 +27,7 @@ constexpr GLenum GetTarget(VideoCore::QueryType type) { | ||||||
| } // Anonymous namespace
 | } // Anonymous namespace
 | ||||||
| 
 | 
 | ||||||
| QueryCache::QueryCache(RasterizerOpenGL& rasterizer_, Core::Memory::Memory& cpu_memory_) | QueryCache::QueryCache(RasterizerOpenGL& rasterizer_, Core::Memory::Memory& cpu_memory_) | ||||||
|     : QueryCacheBase(rasterizer_, cpu_memory_), gl_rasterizer{rasterizer_} {} |     : QueryCacheLegacy(rasterizer_, cpu_memory_), gl_rasterizer{rasterizer_} {} | ||||||
| 
 | 
 | ||||||
| QueryCache::~QueryCache() = default; | QueryCache::~QueryCache() = default; | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -26,7 +26,7 @@ class RasterizerOpenGL; | ||||||
| using CounterStream = VideoCommon::CounterStreamBase<QueryCache, HostCounter>; | using CounterStream = VideoCommon::CounterStreamBase<QueryCache, HostCounter>; | ||||||
| 
 | 
 | ||||||
| class QueryCache final | class QueryCache final | ||||||
|     : public VideoCommon::QueryCacheBase<QueryCache, CachedQuery, CounterStream, HostCounter> { |     : public VideoCommon::QueryCacheLegacy<QueryCache, CachedQuery, CounterStream, HostCounter> { | ||||||
| public: | public: | ||||||
|     explicit QueryCache(RasterizerOpenGL& rasterizer_, Core::Memory::Memory& cpu_memory_); |     explicit QueryCache(RasterizerOpenGL& rasterizer_, Core::Memory::Memory& cpu_memory_); | ||||||
|     ~QueryCache(); |     ~QueryCache(); | ||||||
|  |  | ||||||
|  | @ -396,13 +396,39 @@ void RasterizerOpenGL::DispatchCompute() { | ||||||
|     has_written_global_memory |= pipeline->WritesGlobalMemory(); |     has_written_global_memory |= pipeline->WritesGlobalMemory(); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| void RasterizerOpenGL::ResetCounter(VideoCore::QueryType type) { | void RasterizerOpenGL::ResetCounter(VideoCommon::QueryType type) { | ||||||
|     query_cache.ResetCounter(type); |     if (type == VideoCommon::QueryType::ZPassPixelCount64) { | ||||||
|  |         query_cache.ResetCounter(VideoCore::QueryType::SamplesPassed); | ||||||
|  |     } | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| void RasterizerOpenGL::Query(GPUVAddr gpu_addr, VideoCore::QueryType type, | void RasterizerOpenGL::Query(GPUVAddr gpu_addr, VideoCommon::QueryType type, | ||||||
|                              std::optional<u64> timestamp) { |                              VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) { | ||||||
|     query_cache.Query(gpu_addr, type, timestamp); |     if (type == VideoCommon::QueryType::ZPassPixelCount64) { | ||||||
|  |         if (True(flags & VideoCommon::QueryPropertiesFlags::HasTimeout)) { | ||||||
|  |             query_cache.Query(gpu_addr, VideoCore::QueryType::SamplesPassed, {gpu.GetTicks()}); | ||||||
|  |         } else { | ||||||
|  |             query_cache.Query(gpu_addr, VideoCore::QueryType::SamplesPassed, std::nullopt); | ||||||
|  |         } | ||||||
|  |         return; | ||||||
|  |     } | ||||||
|  |     if (type != VideoCommon::QueryType::Payload) { | ||||||
|  |         payload = 1u; | ||||||
|  |     } | ||||||
|  |     std::function<void()> func([this, gpu_addr, flags, memory_manager = gpu_memory, payload]() { | ||||||
|  |         if (True(flags & VideoCommon::QueryPropertiesFlags::HasTimeout)) { | ||||||
|  |             u64 ticks = gpu.GetTicks(); | ||||||
|  |             memory_manager->Write<u64>(gpu_addr + 8, ticks); | ||||||
|  |             memory_manager->Write<u64>(gpu_addr, static_cast<u64>(payload)); | ||||||
|  |         } else { | ||||||
|  |             memory_manager->Write<u32>(gpu_addr, payload); | ||||||
|  |         } | ||||||
|  |     }); | ||||||
|  |     if (True(flags & VideoCommon::QueryPropertiesFlags::IsAFence)) { | ||||||
|  |         SignalFence(std::move(func)); | ||||||
|  |         return; | ||||||
|  |     } | ||||||
|  |     func(); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| void RasterizerOpenGL::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, | void RasterizerOpenGL::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, | ||||||
|  | @ -573,8 +599,8 @@ void RasterizerOpenGL::SignalReference() { | ||||||
|     fence_manager.SignalOrdering(); |     fence_manager.SignalOrdering(); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| void RasterizerOpenGL::ReleaseFences() { | void RasterizerOpenGL::ReleaseFences(bool force) { | ||||||
|     fence_manager.WaitPendingFences(); |     fence_manager.WaitPendingFences(force); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| void RasterizerOpenGL::FlushAndInvalidateRegion(VAddr addr, u64 size, | void RasterizerOpenGL::FlushAndInvalidateRegion(VAddr addr, u64 size, | ||||||
|  |  | ||||||
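Both rasterizers now forward a force flag to the fence manager's WaitPendingFences(). One plausible reading of that flag, sketched here as an assumption rather than the project's actual FenceManager code, is that a forced release blocks on every pending fence while an unforced release only pops fences that have already signaled:

    #include <deque>
    #include <functional>

    // Assumed semantics for illustration; the real implementation lives in
    // video_core/fence_manager.h and may differ in detail.
    struct PendingFence {
        std::function<bool()> is_signaled;  // non-blocking status query
        std::function<void()> wait;         // blocking wait on the host fence
        std::function<void()> on_release;   // deferred work, e.g. query/semaphore writes
    };

    void WaitPendingFences(std::deque<PendingFence>& fences, bool force) {
        while (!fences.empty()) {
            PendingFence& fence = fences.front();
            if (!force && !fence.is_signaled()) {
                break;  // keep unsignaled fences around until a forced release
            }
            if (force) {
                fence.wait();
            }
            fence.on_release();
            fences.pop_front();
        }
    }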
|  | @ -86,8 +86,9 @@ public: | ||||||
|     void DrawTexture() override; |     void DrawTexture() override; | ||||||
|     void Clear(u32 layer_count) override; |     void Clear(u32 layer_count) override; | ||||||
|     void DispatchCompute() override; |     void DispatchCompute() override; | ||||||
|     void ResetCounter(VideoCore::QueryType type) override; |     void ResetCounter(VideoCommon::QueryType type) override; | ||||||
|     void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional<u64> timestamp) override; |     void Query(GPUVAddr gpu_addr, VideoCommon::QueryType type, | ||||||
|  |                VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) override; | ||||||
|     void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size) override; |     void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size) override; | ||||||
|     void DisableGraphicsUniformBuffer(size_t stage, u32 index) override; |     void DisableGraphicsUniformBuffer(size_t stage, u32 index) override; | ||||||
|     void FlushAll() override; |     void FlushAll() override; | ||||||
|  | @ -107,7 +108,7 @@ public: | ||||||
|     void SyncOperation(std::function<void()>&& func) override; |     void SyncOperation(std::function<void()>&& func) override; | ||||||
|     void SignalSyncPoint(u32 value) override; |     void SignalSyncPoint(u32 value) override; | ||||||
|     void SignalReference() override; |     void SignalReference() override; | ||||||
|     void ReleaseFences() override; |     void ReleaseFences(bool force = true) override; | ||||||
|     void FlushAndInvalidateRegion( |     void FlushAndInvalidateRegion( | ||||||
|         VAddr addr, u64 size, VideoCommon::CacheType which = VideoCommon::CacheType::All) override; |         VAddr addr, u64 size, VideoCommon::CacheType which = VideoCommon::CacheType::All) override; | ||||||
|     void WaitForIdle() override; |     void WaitForIdle() override; | ||||||
|  |  | ||||||
|  | @ -61,6 +61,9 @@ vk::Buffer CreateBuffer(const Device& device, const MemoryAllocator& memory_allo | ||||||
|     if (device.IsExtTransformFeedbackSupported()) { |     if (device.IsExtTransformFeedbackSupported()) { | ||||||
|         flags |= VK_BUFFER_USAGE_TRANSFORM_FEEDBACK_BUFFER_BIT_EXT; |         flags |= VK_BUFFER_USAGE_TRANSFORM_FEEDBACK_BUFFER_BIT_EXT; | ||||||
|     } |     } | ||||||
|  |     if (device.IsExtConditionalRendering()) { | ||||||
|  |         flags |= VK_BUFFER_USAGE_CONDITIONAL_RENDERING_BIT_EXT; | ||||||
|  |     } | ||||||
|     const VkBufferCreateInfo buffer_ci = { |     const VkBufferCreateInfo buffer_ci = { | ||||||
|         .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, |         .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, | ||||||
|         .pNext = nullptr, |         .pNext = nullptr, | ||||||
|  |  | ||||||
|  | @ -12,6 +12,9 @@ | ||||||
| #include "common/common_types.h" | #include "common/common_types.h" | ||||||
| #include "common/div_ceil.h" | #include "common/div_ceil.h" | ||||||
| #include "video_core/host_shaders/astc_decoder_comp_spv.h" | #include "video_core/host_shaders/astc_decoder_comp_spv.h" | ||||||
|  | #include "video_core/host_shaders/queries_prefix_scan_sum_comp_spv.h" | ||||||
|  | #include "video_core/host_shaders/queries_prefix_scan_sum_nosubgroups_comp_spv.h" | ||||||
|  | #include "video_core/host_shaders/resolve_conditional_render_comp_spv.h" | ||||||
| #include "video_core/host_shaders/vulkan_quad_indexed_comp_spv.h" | #include "video_core/host_shaders/vulkan_quad_indexed_comp_spv.h" | ||||||
| #include "video_core/host_shaders/vulkan_uint8_comp_spv.h" | #include "video_core/host_shaders/vulkan_uint8_comp_spv.h" | ||||||
| #include "video_core/renderer_vulkan/vk_compute_pass.h" | #include "video_core/renderer_vulkan/vk_compute_pass.h" | ||||||
|  | @ -57,6 +60,30 @@ constexpr std::array<VkDescriptorSetLayoutBinding, 2> INPUT_OUTPUT_DESCRIPTOR_SE | ||||||
|     }, |     }, | ||||||
| }}; | }}; | ||||||
| 
 | 
 | ||||||
|  | constexpr std::array<VkDescriptorSetLayoutBinding, 3> QUERIES_SCAN_DESCRIPTOR_SET_BINDINGS{{ | ||||||
|  |     { | ||||||
|  |         .binding = 0, | ||||||
|  |         .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, | ||||||
|  |         .descriptorCount = 1, | ||||||
|  |         .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, | ||||||
|  |         .pImmutableSamplers = nullptr, | ||||||
|  |     }, | ||||||
|  |     { | ||||||
|  |         .binding = 1, | ||||||
|  |         .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, | ||||||
|  |         .descriptorCount = 1, | ||||||
|  |         .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, | ||||||
|  |         .pImmutableSamplers = nullptr, | ||||||
|  |     }, | ||||||
|  |     { | ||||||
|  |         .binding = 2, | ||||||
|  |         .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, | ||||||
|  |         .descriptorCount = 1, | ||||||
|  |         .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, | ||||||
|  |         .pImmutableSamplers = nullptr, | ||||||
|  |     }, | ||||||
|  | }}; | ||||||
|  | 
 | ||||||
| constexpr DescriptorBankInfo INPUT_OUTPUT_BANK_INFO{ | constexpr DescriptorBankInfo INPUT_OUTPUT_BANK_INFO{ | ||||||
|     .uniform_buffers = 0, |     .uniform_buffers = 0, | ||||||
|     .storage_buffers = 2, |     .storage_buffers = 2, | ||||||
|  | @ -67,6 +94,16 @@ constexpr DescriptorBankInfo INPUT_OUTPUT_BANK_INFO{ | ||||||
|     .score = 2, |     .score = 2, | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
|  | constexpr DescriptorBankInfo QUERIES_SCAN_BANK_INFO{ | ||||||
|  |     .uniform_buffers = 0, | ||||||
|  |     .storage_buffers = 3, | ||||||
|  |     .texture_buffers = 0, | ||||||
|  |     .image_buffers = 0, | ||||||
|  |     .textures = 0, | ||||||
|  |     .images = 0, | ||||||
|  |     .score = 3, | ||||||
|  | }; | ||||||
|  | 
 | ||||||
| constexpr std::array<VkDescriptorSetLayoutBinding, ASTC_NUM_BINDINGS> ASTC_DESCRIPTOR_SET_BINDINGS{{ | constexpr std::array<VkDescriptorSetLayoutBinding, ASTC_NUM_BINDINGS> ASTC_DESCRIPTOR_SET_BINDINGS{{ | ||||||
|     { |     { | ||||||
|         .binding = ASTC_BINDING_INPUT_BUFFER, |         .binding = ASTC_BINDING_INPUT_BUFFER, | ||||||
|  | @ -103,6 +140,15 @@ constexpr VkDescriptorUpdateTemplateEntry INPUT_OUTPUT_DESCRIPTOR_UPDATE_TEMPLAT | ||||||
|     .stride = sizeof(DescriptorUpdateEntry), |     .stride = sizeof(DescriptorUpdateEntry), | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
|  | constexpr VkDescriptorUpdateTemplateEntry QUERIES_SCAN_DESCRIPTOR_UPDATE_TEMPLATE{ | ||||||
|  |     .dstBinding = 0, | ||||||
|  |     .dstArrayElement = 0, | ||||||
|  |     .descriptorCount = 3, | ||||||
|  |     .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, | ||||||
|  |     .offset = 0, | ||||||
|  |     .stride = sizeof(DescriptorUpdateEntry), | ||||||
|  | }; | ||||||
|  | 
 | ||||||
| constexpr std::array<VkDescriptorUpdateTemplateEntry, ASTC_NUM_BINDINGS> | constexpr std::array<VkDescriptorUpdateTemplateEntry, ASTC_NUM_BINDINGS> | ||||||
|     ASTC_PASS_DESCRIPTOR_UPDATE_TEMPLATE_ENTRY{{ |     ASTC_PASS_DESCRIPTOR_UPDATE_TEMPLATE_ENTRY{{ | ||||||
|         { |         { | ||||||
|  | @ -131,13 +177,21 @@ struct AstcPushConstants { | ||||||
|     u32 block_height; |     u32 block_height; | ||||||
|     u32 block_height_mask; |     u32 block_height_mask; | ||||||
| }; | }; | ||||||
|  | 
 | ||||||
|  | struct QueriesPrefixScanPushConstants { | ||||||
|  |     u32 min_accumulation_base; | ||||||
|  |     u32 max_accumulation_base; | ||||||
|  |     u32 accumulation_limit; | ||||||
|  |     u32 buffer_offset; | ||||||
|  | }; | ||||||
| } // Anonymous namespace
 | } // Anonymous namespace
 | ||||||
| 
 | 
 | ||||||
| ComputePass::ComputePass(const Device& device_, DescriptorPool& descriptor_pool, | ComputePass::ComputePass(const Device& device_, DescriptorPool& descriptor_pool, | ||||||
|                          vk::Span<VkDescriptorSetLayoutBinding> bindings, |                          vk::Span<VkDescriptorSetLayoutBinding> bindings, | ||||||
|                          vk::Span<VkDescriptorUpdateTemplateEntry> templates, |                          vk::Span<VkDescriptorUpdateTemplateEntry> templates, | ||||||
|                          const DescriptorBankInfo& bank_info, |                          const DescriptorBankInfo& bank_info, | ||||||
|                          vk::Span<VkPushConstantRange> push_constants, std::span<const u32> code) |                          vk::Span<VkPushConstantRange> push_constants, std::span<const u32> code, | ||||||
|  |                          std::optional<u32> optional_subgroup_size) | ||||||
|     : device{device_} { |     : device{device_} { | ||||||
|     descriptor_set_layout = device.GetLogical().CreateDescriptorSetLayout({ |     descriptor_set_layout = device.GetLogical().CreateDescriptorSetLayout({ | ||||||
|         .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, |         .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, | ||||||
|  | @ -178,13 +232,19 @@ ComputePass::ComputePass(const Device& device_, DescriptorPool& descriptor_pool, | ||||||
|         .pCode = code.data(), |         .pCode = code.data(), | ||||||
|     }); |     }); | ||||||
|     device.SaveShader(code); |     device.SaveShader(code); | ||||||
|  |     const VkPipelineShaderStageRequiredSubgroupSizeCreateInfoEXT subgroup_size_ci{ | ||||||
|  |         .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO_EXT, | ||||||
|  |         .pNext = nullptr, | ||||||
|  |         .requiredSubgroupSize = optional_subgroup_size ? *optional_subgroup_size : 32U, | ||||||
|  |     }; | ||||||
|  |     bool use_setup_size = device.IsExtSubgroupSizeControlSupported() && optional_subgroup_size; | ||||||
|     pipeline = device.GetLogical().CreateComputePipeline({ |     pipeline = device.GetLogical().CreateComputePipeline({ | ||||||
|         .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, |         .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, | ||||||
|         .pNext = nullptr, |         .pNext = nullptr, | ||||||
|         .flags = 0, |         .flags = 0, | ||||||
|         .stage{ |         .stage{ | ||||||
|             .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, |             .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, | ||||||
|             .pNext = nullptr, |             .pNext = use_setup_size ? &subgroup_size_ci : nullptr, | ||||||
|             .flags = 0, |             .flags = 0, | ||||||
|             .stage = VK_SHADER_STAGE_COMPUTE_BIT, |             .stage = VK_SHADER_STAGE_COMPUTE_BIT, | ||||||
|             .module = *module, |             .module = *module, | ||||||
|  | @ -302,6 +362,123 @@ std::pair<VkBuffer, VkDeviceSize> QuadIndexedPass::Assemble( | ||||||
|     return {staging.buffer, staging.offset}; |     return {staging.buffer, staging.offset}; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | ConditionalRenderingResolvePass::ConditionalRenderingResolvePass( | ||||||
|  |     const Device& device_, Scheduler& scheduler_, DescriptorPool& descriptor_pool_, | ||||||
|  |     ComputePassDescriptorQueue& compute_pass_descriptor_queue_) | ||||||
|  |     : ComputePass(device_, descriptor_pool_, INPUT_OUTPUT_DESCRIPTOR_SET_BINDINGS, | ||||||
|  |                   INPUT_OUTPUT_DESCRIPTOR_UPDATE_TEMPLATE, INPUT_OUTPUT_BANK_INFO, nullptr, | ||||||
|  |                   RESOLVE_CONDITIONAL_RENDER_COMP_SPV), | ||||||
|  |       scheduler{scheduler_}, compute_pass_descriptor_queue{compute_pass_descriptor_queue_} {} | ||||||
|  | 
 | ||||||
|  | void ConditionalRenderingResolvePass::Resolve(VkBuffer dst_buffer, VkBuffer src_buffer, | ||||||
|  |                                               u32 src_offset, bool compare_to_zero) { | ||||||
|  |     const size_t compare_size = compare_to_zero ? 8 : 24; | ||||||
|  | 
 | ||||||
|  |     compute_pass_descriptor_queue.Acquire(); | ||||||
|  |     compute_pass_descriptor_queue.AddBuffer(src_buffer, src_offset, compare_size); | ||||||
|  |     compute_pass_descriptor_queue.AddBuffer(dst_buffer, 0, sizeof(u32)); | ||||||
|  |     const void* const descriptor_data{compute_pass_descriptor_queue.UpdateData()}; | ||||||
|  | 
 | ||||||
|  |     scheduler.RequestOutsideRenderPassOperationContext(); | ||||||
|  |     scheduler.Record([this, descriptor_data](vk::CommandBuffer cmdbuf) { | ||||||
|  |         static constexpr VkMemoryBarrier read_barrier{ | ||||||
|  |             .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, | ||||||
|  |             .pNext = nullptr, | ||||||
|  |             .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT | VK_ACCESS_SHADER_WRITE_BIT, | ||||||
|  |             .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, | ||||||
|  |         }; | ||||||
|  |         static constexpr VkMemoryBarrier write_barrier{ | ||||||
|  |             .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, | ||||||
|  |             .pNext = nullptr, | ||||||
|  |             .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT, | ||||||
|  |             .dstAccessMask = VK_ACCESS_CONDITIONAL_RENDERING_READ_BIT_EXT, | ||||||
|  |         }; | ||||||
|  |         const VkDescriptorSet set = descriptor_allocator.Commit(); | ||||||
|  |         device.GetLogical().UpdateDescriptorSet(set, *descriptor_template, descriptor_data); | ||||||
|  | 
 | ||||||
|  |         cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, | ||||||
|  |                                VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, read_barrier); | ||||||
|  |         cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline); | ||||||
|  |         cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, *layout, 0, set, {}); | ||||||
|  |         cmdbuf.Dispatch(1, 1, 1); | ||||||
|  |         cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, | ||||||
|  |                                VK_PIPELINE_STAGE_CONDITIONAL_RENDERING_BIT_EXT, 0, write_barrier); | ||||||
|  |     }); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | QueriesPrefixScanPass::QueriesPrefixScanPass( | ||||||
|  |     const Device& device_, Scheduler& scheduler_, DescriptorPool& descriptor_pool_, | ||||||
|  |     ComputePassDescriptorQueue& compute_pass_descriptor_queue_) | ||||||
|  |     : ComputePass( | ||||||
|  |           device_, descriptor_pool_, QUERIES_SCAN_DESCRIPTOR_SET_BINDINGS, | ||||||
|  |           QUERIES_SCAN_DESCRIPTOR_UPDATE_TEMPLATE, QUERIES_SCAN_BANK_INFO, | ||||||
|  |           COMPUTE_PUSH_CONSTANT_RANGE<sizeof(QueriesPrefixScanPushConstants)>, | ||||||
|  |           device_.IsSubgroupFeatureSupported(VK_SUBGROUP_FEATURE_BASIC_BIT) && | ||||||
|  |                   device_.IsSubgroupFeatureSupported(VK_SUBGROUP_FEATURE_ARITHMETIC_BIT) && | ||||||
|  |                   device_.IsSubgroupFeatureSupported(VK_SUBGROUP_FEATURE_SHUFFLE_BIT) && | ||||||
|  |                   device_.IsSubgroupFeatureSupported(VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT) | ||||||
|  |               ? std::span<const u32>(QUERIES_PREFIX_SCAN_SUM_COMP_SPV) | ||||||
|  |               : std::span<const u32>(QUERIES_PREFIX_SCAN_SUM_NOSUBGROUPS_COMP_SPV)), | ||||||
|  |       scheduler{scheduler_}, compute_pass_descriptor_queue{compute_pass_descriptor_queue_} {} | ||||||
|  | 
 | ||||||
|  | void QueriesPrefixScanPass::Run(VkBuffer accumulation_buffer, VkBuffer dst_buffer, | ||||||
|  |                                 VkBuffer src_buffer, size_t number_of_sums, | ||||||
|  |                                 size_t min_accumulation_limit, size_t max_accumulation_limit) { | ||||||
|  |     size_t current_runs = number_of_sums; | ||||||
|  |     size_t offset = 0; | ||||||
|  |     while (current_runs != 0) { | ||||||
|  |         static constexpr size_t DISPATCH_SIZE = 2048U; | ||||||
|  |         size_t runs_to_do = std::min<size_t>(current_runs, DISPATCH_SIZE); | ||||||
|  |         current_runs -= runs_to_do; | ||||||
|  |         compute_pass_descriptor_queue.Acquire(); | ||||||
|  |         compute_pass_descriptor_queue.AddBuffer(src_buffer, 0, number_of_sums * sizeof(u64)); | ||||||
|  |         compute_pass_descriptor_queue.AddBuffer(dst_buffer, 0, number_of_sums * sizeof(u64)); | ||||||
|  |         compute_pass_descriptor_queue.AddBuffer(accumulation_buffer, 0, sizeof(u64)); | ||||||
|  |         const void* const descriptor_data{compute_pass_descriptor_queue.UpdateData()}; | ||||||
|  |         size_t used_offset = offset; | ||||||
|  |         offset += runs_to_do; | ||||||
|  | 
 | ||||||
|  |         scheduler.RequestOutsideRenderPassOperationContext(); | ||||||
|  |         scheduler.Record([this, descriptor_data, min_accumulation_limit, max_accumulation_limit, | ||||||
|  |                           runs_to_do, used_offset](vk::CommandBuffer cmdbuf) { | ||||||
|  |             static constexpr VkMemoryBarrier read_barrier{ | ||||||
|  |                 .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, | ||||||
|  |                 .pNext = nullptr, | ||||||
|  |                 .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, | ||||||
|  |                 .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, | ||||||
|  |             }; | ||||||
|  |             static constexpr VkMemoryBarrier write_barrier{ | ||||||
|  |                 .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, | ||||||
|  |                 .pNext = nullptr, | ||||||
|  |                 .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT, | ||||||
|  |                 .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_TRANSFER_READ_BIT | | ||||||
|  |                                  VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT | | ||||||
|  |                                  VK_ACCESS_INDIRECT_COMMAND_READ_BIT | VK_ACCESS_INDEX_READ_BIT | | ||||||
|  |                                  VK_ACCESS_UNIFORM_READ_BIT | | ||||||
|  |                                  VK_ACCESS_CONDITIONAL_RENDERING_READ_BIT_EXT, | ||||||
|  |             }; | ||||||
|  |             const QueriesPrefixScanPushConstants uniforms{ | ||||||
|  |                 .min_accumulation_base = static_cast<u32>(min_accumulation_limit), | ||||||
|  |                 .max_accumulation_base = static_cast<u32>(max_accumulation_limit), | ||||||
|  |                 .accumulation_limit = static_cast<u32>(runs_to_do - 1), | ||||||
|  |                 .buffer_offset = static_cast<u32>(used_offset), | ||||||
|  |             }; | ||||||
|  |             const VkDescriptorSet set = descriptor_allocator.Commit(); | ||||||
|  |             device.GetLogical().UpdateDescriptorSet(set, *descriptor_template, descriptor_data); | ||||||
|  | 
 | ||||||
|  |             cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, | ||||||
|  |                                    VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, read_barrier); | ||||||
|  |             cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline); | ||||||
|  |             cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, *layout, 0, set, {}); | ||||||
|  |             cmdbuf.PushConstants(*layout, VK_SHADER_STAGE_COMPUTE_BIT, uniforms); | ||||||
|  |             cmdbuf.Dispatch(1, 1, 1); | ||||||
|  |             cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, | ||||||
|  |                                    VK_PIPELINE_STAGE_CONDITIONAL_RENDERING_BIT_EXT, 0, | ||||||
|  |                                    write_barrier); | ||||||
|  |         }); | ||||||
|  |     } | ||||||
|  | } | ||||||
|  | 
 | ||||||
| ASTCDecoderPass::ASTCDecoderPass(const Device& device_, Scheduler& scheduler_, | ASTCDecoderPass::ASTCDecoderPass(const Device& device_, Scheduler& scheduler_, | ||||||
|                                  DescriptorPool& descriptor_pool_, |                                  DescriptorPool& descriptor_pool_, | ||||||
|                                  StagingBufferPool& staging_buffer_pool_, |                                  StagingBufferPool& staging_buffer_pool_, | ||||||
|  |  | ||||||
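QueriesPrefixScanPass::Run above walks the query list in windows of at most 2048 sums and carries the running total through a one-element accumulation buffer. As a rough CPU-side analogue (an assumption about the shader's intent; the actual GPU algorithm, including how the min/max accumulation bases are applied, is defined in queries_prefix_scan_sum.comp):

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Inclusive prefix scan over one window of query deltas, carrying the running
    // total between dispatches. Illustration only; not the GPU implementation.
    void PrefixScanWindow(std::vector<std::uint64_t>& values, std::uint64_t& accumulation,
                          std::size_t offset, std::size_t count) {
        std::uint64_t running = accumulation;
        for (std::size_t i = 0; i < count; ++i) {
            running += values[offset + i];
            values[offset + i] = running;  // each slot becomes the sum so far
        }
        accumulation = running;  // carried into the next window of up to 2048 sums
    }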
|  | @ -3,6 +3,7 @@ | ||||||
| 
 | 
 | ||||||
| #pragma once | #pragma once | ||||||
| 
 | 
 | ||||||
|  | #include <optional> | ||||||
| #include <span> | #include <span> | ||||||
| #include <utility> | #include <utility> | ||||||
| 
 | 
 | ||||||
|  | @ -31,7 +32,8 @@ public: | ||||||
|                          vk::Span<VkDescriptorSetLayoutBinding> bindings, |                          vk::Span<VkDescriptorSetLayoutBinding> bindings, | ||||||
|                          vk::Span<VkDescriptorUpdateTemplateEntry> templates, |                          vk::Span<VkDescriptorUpdateTemplateEntry> templates, | ||||||
|                          const DescriptorBankInfo& bank_info, |                          const DescriptorBankInfo& bank_info, | ||||||
|                          vk::Span<VkPushConstantRange> push_constants, std::span<const u32> code); |                          vk::Span<VkPushConstantRange> push_constants, std::span<const u32> code, | ||||||
|  |                          std::optional<u32> optional_subgroup_size = std::nullopt); | ||||||
|     ~ComputePass(); |     ~ComputePass(); | ||||||
| 
 | 
 | ||||||
| protected: | protected: | ||||||
|  | @ -82,6 +84,33 @@ private: | ||||||
|     ComputePassDescriptorQueue& compute_pass_descriptor_queue; |     ComputePassDescriptorQueue& compute_pass_descriptor_queue; | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
|  | class ConditionalRenderingResolvePass final : public ComputePass { | ||||||
|  | public: | ||||||
|  |     explicit ConditionalRenderingResolvePass( | ||||||
|  |         const Device& device_, Scheduler& scheduler_, DescriptorPool& descriptor_pool_, | ||||||
|  |         ComputePassDescriptorQueue& compute_pass_descriptor_queue_); | ||||||
|  | 
 | ||||||
|  |     void Resolve(VkBuffer dst_buffer, VkBuffer src_buffer, u32 src_offset, bool compare_to_zero); | ||||||
|  | 
 | ||||||
|  | private: | ||||||
|  |     Scheduler& scheduler; | ||||||
|  |     ComputePassDescriptorQueue& compute_pass_descriptor_queue; | ||||||
|  | }; | ||||||
|  | 
 | ||||||
|  | class QueriesPrefixScanPass final : public ComputePass { | ||||||
|  | public: | ||||||
|  |     explicit QueriesPrefixScanPass(const Device& device_, Scheduler& scheduler_, | ||||||
|  |                                    DescriptorPool& descriptor_pool_, | ||||||
|  |                                    ComputePassDescriptorQueue& compute_pass_descriptor_queue_); | ||||||
|  | 
 | ||||||
|  |     void Run(VkBuffer accumulation_buffer, VkBuffer dst_buffer, VkBuffer src_buffer, | ||||||
|  |              size_t number_of_sums, size_t min_accumulation_limit, size_t max_accumulation_limit); | ||||||
|  | 
 | ||||||
|  | private: | ||||||
|  |     Scheduler& scheduler; | ||||||
|  |     ComputePassDescriptorQueue& compute_pass_descriptor_queue; | ||||||
|  | }; | ||||||
|  | 
 | ||||||
| class ASTCDecoderPass final : public ComputePass { | class ASTCDecoderPass final : public ComputePass { | ||||||
| public: | public: | ||||||
|     explicit ASTCDecoderPass(const Device& device_, Scheduler& scheduler_, |     explicit ASTCDecoderPass(const Device& device_, Scheduler& scheduler_, | ||||||
|  |  | ||||||
|  | @ -7,6 +7,7 @@ | ||||||
| 
 | 
 | ||||||
| #include "video_core/fence_manager.h" | #include "video_core/fence_manager.h" | ||||||
| #include "video_core/renderer_vulkan/vk_buffer_cache.h" | #include "video_core/renderer_vulkan/vk_buffer_cache.h" | ||||||
|  | #include "video_core/renderer_vulkan/vk_query_cache.h" | ||||||
| #include "video_core/renderer_vulkan/vk_texture_cache.h" | #include "video_core/renderer_vulkan/vk_texture_cache.h" | ||||||
| 
 | 
 | ||||||
| namespace Core { | namespace Core { | ||||||
|  | @ -20,7 +21,6 @@ class RasterizerInterface; | ||||||
| namespace Vulkan { | namespace Vulkan { | ||||||
| 
 | 
 | ||||||
| class Device; | class Device; | ||||||
| class QueryCache; |  | ||||||
| class Scheduler; | class Scheduler; | ||||||
| 
 | 
 | ||||||
| class InnerFence : public VideoCommon::FenceBase { | class InnerFence : public VideoCommon::FenceBase { | ||||||
|  |  | ||||||
										
											
												File diff suppressed because it is too large

							|  | @ -1,101 +1,75 @@ | ||||||
| // SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project
 | // SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
 | ||||||
| // SPDX-License-Identifier: GPL-2.0-or-later
 | // SPDX-License-Identifier: GPL-3.0-or-later
 | ||||||
| 
 | 
 | ||||||
| #pragma once | #pragma once | ||||||
| 
 | 
 | ||||||
| #include <cstddef> |  | ||||||
| #include <memory> | #include <memory> | ||||||
| #include <utility> |  | ||||||
| #include <vector> |  | ||||||
| 
 | 
 | ||||||
| #include "common/common_types.h" | #include "video_core/query_cache/query_cache_base.h" | ||||||
| #include "video_core/query_cache.h" | #include "video_core/renderer_vulkan/vk_buffer_cache.h" | ||||||
| #include "video_core/renderer_vulkan/vk_resource_pool.h" |  | ||||||
| #include "video_core/vulkan_common/vulkan_wrapper.h" |  | ||||||
| 
 | 
 | ||||||
| namespace VideoCore { | namespace VideoCore { | ||||||
| class RasterizerInterface; | class RasterizerInterface; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | namespace VideoCommon { | ||||||
|  | class StreamerInterface; | ||||||
|  | } | ||||||
|  | 
 | ||||||
| namespace Vulkan { | namespace Vulkan { | ||||||
| 
 | 
 | ||||||
| class CachedQuery; |  | ||||||
| class Device; | class Device; | ||||||
| class HostCounter; |  | ||||||
| class QueryCache; |  | ||||||
| class Scheduler; | class Scheduler; | ||||||
|  | class StagingBufferPool; | ||||||
| 
 | 
 | ||||||
| using CounterStream = VideoCommon::CounterStreamBase<QueryCache, HostCounter>; | struct QueryCacheRuntimeImpl; | ||||||
| 
 | 
 | ||||||
| class QueryPool final : public ResourcePool { | class QueryCacheRuntime { | ||||||
| public: | public: | ||||||
|     explicit QueryPool(const Device& device, Scheduler& scheduler, VideoCore::QueryType type); |     explicit QueryCacheRuntime(VideoCore::RasterizerInterface* rasterizer, | ||||||
|     ~QueryPool() override; |                                Core::Memory::Memory& cpu_memory_, | ||||||
|  |                                Vulkan::BufferCache& buffer_cache_, const Device& device_, | ||||||
|  |                                const MemoryAllocator& memory_allocator_, Scheduler& scheduler_, | ||||||
|  |                                StagingBufferPool& staging_pool_, | ||||||
|  |                                ComputePassDescriptorQueue& compute_pass_descriptor_queue, | ||||||
|  |                                DescriptorPool& descriptor_pool); | ||||||
|  |     ~QueryCacheRuntime(); | ||||||
| 
 | 
 | ||||||
|     std::pair<VkQueryPool, u32> Commit(); |     template <typename SyncValuesType> | ||||||
|  |     void SyncValues(std::span<SyncValuesType> values, VkBuffer base_src_buffer = nullptr); | ||||||
| 
 | 
 | ||||||
|     void Reserve(std::pair<VkQueryPool, u32> query); |     void Barriers(bool is_prebarrier); | ||||||
| 
 | 
 | ||||||
| protected: |     void EndHostConditionalRendering(); | ||||||
|     void Allocate(std::size_t begin, std::size_t end) override; | 
 | ||||||
|  |     void PauseHostConditionalRendering(); | ||||||
|  | 
 | ||||||
|  |     void ResumeHostConditionalRendering(); | ||||||
|  | 
 | ||||||
|  |     bool HostConditionalRenderingCompareValue(VideoCommon::LookupData object_1, bool qc_dirty); | ||||||
|  | 
 | ||||||
|  |     bool HostConditionalRenderingCompareValues(VideoCommon::LookupData object_1, | ||||||
|  |                                                VideoCommon::LookupData object_2, bool qc_dirty, | ||||||
|  |                                                bool equal_check); | ||||||
|  | 
 | ||||||
|  |     VideoCommon::StreamerInterface* GetStreamerInterface(VideoCommon::QueryType query_type); | ||||||
|  | 
 | ||||||
|  |     void Bind3DEngine(Tegra::Engines::Maxwell3D* maxwell3d); | ||||||
|  | 
 | ||||||
|  |     template <typename Func> | ||||||
|  |     void View3DRegs(Func&& func); | ||||||
| 
 | 
 | ||||||
| private: | private: | ||||||
|     static constexpr std::size_t GROW_STEP = 512; |     void HostConditionalRenderingCompareValueImpl(VideoCommon::LookupData object, bool is_equal); | ||||||
| 
 |     void HostConditionalRenderingCompareBCImpl(VAddr address, bool is_equal); | ||||||
|     const Device& device; |     friend struct QueryCacheRuntimeImpl; | ||||||
|     const VideoCore::QueryType type; |     std::unique_ptr<QueryCacheRuntimeImpl> impl; | ||||||
| 
 |  | ||||||
|     std::vector<vk::QueryPool> pools; |  | ||||||
|     std::vector<bool> usage; |  | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
| class QueryCache final | struct QueryCacheParams { | ||||||
|     : public VideoCommon::QueryCacheBase<QueryCache, CachedQuery, CounterStream, HostCounter> { |     using RuntimeType = typename Vulkan::QueryCacheRuntime; | ||||||
| public: |  | ||||||
|     explicit QueryCache(VideoCore::RasterizerInterface& rasterizer_, |  | ||||||
|                         Core::Memory::Memory& cpu_memory_, const Device& device_, |  | ||||||
|                         Scheduler& scheduler_); |  | ||||||
|     ~QueryCache(); |  | ||||||
| 
 |  | ||||||
|     std::pair<VkQueryPool, u32> AllocateQuery(VideoCore::QueryType type); |  | ||||||
| 
 |  | ||||||
|     void Reserve(VideoCore::QueryType type, std::pair<VkQueryPool, u32> query); |  | ||||||
| 
 |  | ||||||
|     const Device& GetDevice() const noexcept { |  | ||||||
|         return device; |  | ||||||
|     } |  | ||||||
| 
 |  | ||||||
|     Scheduler& GetScheduler() const noexcept { |  | ||||||
|         return scheduler; |  | ||||||
|     } |  | ||||||
| 
 |  | ||||||
| private: |  | ||||||
|     const Device& device; |  | ||||||
|     Scheduler& scheduler; |  | ||||||
|     std::array<QueryPool, VideoCore::NumQueryTypes> query_pools; |  | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
| class HostCounter final : public VideoCommon::HostCounterBase<QueryCache, HostCounter> { | using QueryCache = VideoCommon::QueryCacheBase<QueryCacheParams>; | ||||||
| public: |  | ||||||
|     explicit HostCounter(QueryCache& cache_, std::shared_ptr<HostCounter> dependency_, |  | ||||||
|                          VideoCore::QueryType type_); |  | ||||||
|     ~HostCounter(); |  | ||||||
| 
 |  | ||||||
|     void EndQuery(); |  | ||||||
| 
 |  | ||||||
| private: |  | ||||||
|     u64 BlockingQuery(bool async = false) const override; |  | ||||||
| 
 |  | ||||||
|     QueryCache& cache; |  | ||||||
|     const VideoCore::QueryType type; |  | ||||||
|     const std::pair<VkQueryPool, u32> query; |  | ||||||
|     const u64 tick; |  | ||||||
| }; |  | ||||||
| 
 |  | ||||||
| class CachedQuery : public VideoCommon::CachedQueryBase<HostCounter> { |  | ||||||
| public: |  | ||||||
|     explicit CachedQuery(QueryCache&, VideoCore::QueryType, VAddr cpu_addr_, u8* host_ptr_) |  | ||||||
|         : CachedQueryBase{cpu_addr_, host_ptr_} {} |  | ||||||
| }; |  | ||||||
| 
 | 
 | ||||||
| } // namespace Vulkan
 | } // namespace Vulkan
 | ||||||
|  |  | ||||||
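The rewritten header reduces the Vulkan query cache to a runtime type behind a pimpl plus a small params struct that the shared template is instantiated with. A hypothetical backend would follow the same trait pattern; every name in this sketch other than QueryCacheBase is made up for the example:

    // Sketch of the trait pattern, not real project code.
    namespace ExampleBackend {

    class Runtime;  // backend-specific runtime, analogous to Vulkan::QueryCacheRuntime

    struct Params {
        using RuntimeType = Runtime;  // mirrors QueryCacheParams above, which only names the runtime
    };

    // The backend's cache is then just an alias of the shared implementation:
    // using QueryCache = VideoCommon::QueryCacheBase<Params>;

    }  // namespace ExampleBackend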
|  | @ -24,6 +24,7 @@ | ||||||
| #include "video_core/renderer_vulkan/vk_compute_pipeline.h" | #include "video_core/renderer_vulkan/vk_compute_pipeline.h" | ||||||
| #include "video_core/renderer_vulkan/vk_descriptor_pool.h" | #include "video_core/renderer_vulkan/vk_descriptor_pool.h" | ||||||
| #include "video_core/renderer_vulkan/vk_pipeline_cache.h" | #include "video_core/renderer_vulkan/vk_pipeline_cache.h" | ||||||
|  | #include "video_core/renderer_vulkan/vk_query_cache.h" | ||||||
| #include "video_core/renderer_vulkan/vk_rasterizer.h" | #include "video_core/renderer_vulkan/vk_rasterizer.h" | ||||||
| #include "video_core/renderer_vulkan/vk_scheduler.h" | #include "video_core/renderer_vulkan/vk_scheduler.h" | ||||||
| #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" | #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" | ||||||
|  | @ -170,9 +171,11 @@ RasterizerVulkan::RasterizerVulkan(Core::Frontend::EmuWindow& emu_window_, Tegra | ||||||
|       buffer_cache_runtime(device, memory_allocator, scheduler, staging_pool, |       buffer_cache_runtime(device, memory_allocator, scheduler, staging_pool, | ||||||
|                            guest_descriptor_queue, compute_pass_descriptor_queue, descriptor_pool), |                            guest_descriptor_queue, compute_pass_descriptor_queue, descriptor_pool), | ||||||
|       buffer_cache(*this, cpu_memory_, buffer_cache_runtime), |       buffer_cache(*this, cpu_memory_, buffer_cache_runtime), | ||||||
|  |       query_cache_runtime(this, cpu_memory_, buffer_cache, device, memory_allocator, scheduler, | ||||||
|  |                           staging_pool, compute_pass_descriptor_queue, descriptor_pool), | ||||||
|  |       query_cache(gpu, *this, cpu_memory_, query_cache_runtime), | ||||||
|       pipeline_cache(*this, device, scheduler, descriptor_pool, guest_descriptor_queue, |       pipeline_cache(*this, device, scheduler, descriptor_pool, guest_descriptor_queue, | ||||||
|                      render_pass_cache, buffer_cache, texture_cache, gpu.ShaderNotify()), |                      render_pass_cache, buffer_cache, texture_cache, gpu.ShaderNotify()), | ||||||
|       query_cache{*this, cpu_memory_, device, scheduler}, |  | ||||||
|       accelerate_dma(buffer_cache, texture_cache, scheduler), |       accelerate_dma(buffer_cache, texture_cache, scheduler), | ||||||
|       fence_manager(*this, gpu, texture_cache, buffer_cache, query_cache, device, scheduler), |       fence_manager(*this, gpu, texture_cache, buffer_cache, query_cache, device, scheduler), | ||||||
|       wfi_event(device.GetLogical().CreateEvent()) { |       wfi_event(device.GetLogical().CreateEvent()) { | ||||||
|  | @ -189,14 +192,7 @@ void RasterizerVulkan::PrepareDraw(bool is_indexed, Func&& draw_func) { | ||||||
|     FlushWork(); |     FlushWork(); | ||||||
|     gpu_memory->FlushCaching(); |     gpu_memory->FlushCaching(); | ||||||
| 
 | 
 | ||||||
| #if ANDROID |     query_cache.NotifySegment(true); | ||||||
|     if (Settings::IsGPULevelHigh()) { |  | ||||||
|         // This is problematic on Android, disable on GPU Normal.
 |  | ||||||
|         query_cache.UpdateCounters(); |  | ||||||
|     } |  | ||||||
| #else |  | ||||||
|     query_cache.UpdateCounters(); |  | ||||||
| #endif |  | ||||||
| 
 | 
 | ||||||
|     GraphicsPipeline* const pipeline{pipeline_cache.CurrentGraphicsPipeline()}; |     GraphicsPipeline* const pipeline{pipeline_cache.CurrentGraphicsPipeline()}; | ||||||
|     if (!pipeline) { |     if (!pipeline) { | ||||||
|  | @ -207,13 +203,12 @@ void RasterizerVulkan::PrepareDraw(bool is_indexed, Func&& draw_func) { | ||||||
|     pipeline->SetEngine(maxwell3d, gpu_memory); |     pipeline->SetEngine(maxwell3d, gpu_memory); | ||||||
|     pipeline->Configure(is_indexed); |     pipeline->Configure(is_indexed); | ||||||
| 
 | 
 | ||||||
|     BeginTransformFeedback(); |  | ||||||
| 
 |  | ||||||
|     UpdateDynamicStates(); |     UpdateDynamicStates(); | ||||||
| 
 | 
 | ||||||
|  |     HandleTransformFeedback(); | ||||||
|  |     query_cache.CounterEnable(VideoCommon::QueryType::ZPassPixelCount64, | ||||||
|  |                               maxwell3d->regs.zpass_pixel_count_enable); | ||||||
|     draw_func(); |     draw_func(); | ||||||
| 
 |  | ||||||
|     EndTransformFeedback(); |  | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| void RasterizerVulkan::Draw(bool is_indexed, u32 instance_count) { | void RasterizerVulkan::Draw(bool is_indexed, u32 instance_count) { | ||||||
|  | @ -241,6 +236,14 @@ void RasterizerVulkan::DrawIndirect() { | ||||||
|         const auto indirect_buffer = buffer_cache.GetDrawIndirectBuffer(); |         const auto indirect_buffer = buffer_cache.GetDrawIndirectBuffer(); | ||||||
|         const auto& buffer = indirect_buffer.first; |         const auto& buffer = indirect_buffer.first; | ||||||
|         const auto& offset = indirect_buffer.second; |         const auto& offset = indirect_buffer.second; | ||||||
|  |         if (params.is_byte_count) { | ||||||
|  |             scheduler.Record([buffer_obj = buffer->Handle(), offset, | ||||||
|  |                               stride = params.stride](vk::CommandBuffer cmdbuf) { | ||||||
|  |                 cmdbuf.DrawIndirectByteCountEXT(1, 0, buffer_obj, offset, 0, | ||||||
|  |                                                 static_cast<u32>(stride)); | ||||||
|  |             }); | ||||||
|  |             return; | ||||||
|  |         } | ||||||
|         if (params.include_count) { |         if (params.include_count) { | ||||||
|             const auto count = buffer_cache.GetDrawIndirectCount(); |             const auto count = buffer_cache.GetDrawIndirectCount(); | ||||||
|             const auto& draw_buffer = count.first; |             const auto& draw_buffer = count.first; | ||||||
|  | @ -280,20 +283,15 @@ void RasterizerVulkan::DrawTexture() { | ||||||
|     SCOPE_EXIT({ gpu.TickWork(); }); |     SCOPE_EXIT({ gpu.TickWork(); }); | ||||||
|     FlushWork(); |     FlushWork(); | ||||||
| 
 | 
 | ||||||
| #if ANDROID |     query_cache.NotifySegment(true); | ||||||
|     if (Settings::IsGPULevelHigh()) { |  | ||||||
|         // This is problematic on Android, disable on GPU Normal.
 |  | ||||||
|         query_cache.UpdateCounters(); |  | ||||||
|     } |  | ||||||
| #else |  | ||||||
|     query_cache.UpdateCounters(); |  | ||||||
| #endif |  | ||||||
| 
 | 
 | ||||||
|     texture_cache.SynchronizeGraphicsDescriptors(); |     texture_cache.SynchronizeGraphicsDescriptors(); | ||||||
|     texture_cache.UpdateRenderTargets(false); |     texture_cache.UpdateRenderTargets(false); | ||||||
| 
 | 
 | ||||||
|     UpdateDynamicStates(); |     UpdateDynamicStates(); | ||||||
| 
 | 
 | ||||||
|  |     query_cache.CounterEnable(VideoCommon::QueryType::ZPassPixelCount64, | ||||||
|  |                               maxwell3d->regs.zpass_pixel_count_enable); | ||||||
|     const auto& draw_texture_state = maxwell3d->draw_manager->GetDrawTextureState(); |     const auto& draw_texture_state = maxwell3d->draw_manager->GetDrawTextureState(); | ||||||
|     const auto& sampler = texture_cache.GetGraphicsSampler(draw_texture_state.src_sampler); |     const auto& sampler = texture_cache.GetGraphicsSampler(draw_texture_state.src_sampler); | ||||||
|     const auto& texture = texture_cache.GetImageView(draw_texture_state.src_texture); |     const auto& texture = texture_cache.GetImageView(draw_texture_state.src_texture); | ||||||
|  | @ -316,14 +314,9 @@ void RasterizerVulkan::Clear(u32 layer_count) { | ||||||
|     FlushWork(); |     FlushWork(); | ||||||
|     gpu_memory->FlushCaching(); |     gpu_memory->FlushCaching(); | ||||||
| 
 | 
 | ||||||
| #if ANDROID |     query_cache.NotifySegment(true); | ||||||
|     if (Settings::IsGPULevelHigh()) { |     query_cache.CounterEnable(VideoCommon::QueryType::ZPassPixelCount64, | ||||||
|         // This is problematic on Android, disable on GPU Normal.
 |                               maxwell3d->regs.zpass_pixel_count_enable); | ||||||
|         query_cache.UpdateCounters(); |  | ||||||
|     } |  | ||||||
| #else |  | ||||||
|     query_cache.UpdateCounters(); |  | ||||||
| #endif |  | ||||||
| 
 | 
 | ||||||
|     auto& regs = maxwell3d->regs; |     auto& regs = maxwell3d->regs; | ||||||
|     const bool use_color = regs.clear_surface.R || regs.clear_surface.G || regs.clear_surface.B || |     const bool use_color = regs.clear_surface.R || regs.clear_surface.G || regs.clear_surface.B || | ||||||
|  | @ -482,13 +475,13 @@ void RasterizerVulkan::DispatchCompute() { | ||||||
|     scheduler.Record([dim](vk::CommandBuffer cmdbuf) { cmdbuf.Dispatch(dim[0], dim[1], dim[2]); }); |     scheduler.Record([dim](vk::CommandBuffer cmdbuf) { cmdbuf.Dispatch(dim[0], dim[1], dim[2]); }); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| void RasterizerVulkan::ResetCounter(VideoCore::QueryType type) { | void RasterizerVulkan::ResetCounter(VideoCommon::QueryType type) { | ||||||
|     query_cache.ResetCounter(type); |     query_cache.CounterReset(type); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| void RasterizerVulkan::Query(GPUVAddr gpu_addr, VideoCore::QueryType type, | void RasterizerVulkan::Query(GPUVAddr gpu_addr, VideoCommon::QueryType type, | ||||||
|                              std::optional<u64> timestamp) { |                              VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) { | ||||||
|     query_cache.Query(gpu_addr, type, timestamp); |     query_cache.CounterReport(gpu_addr, type, flags, payload, subreport); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| void RasterizerVulkan::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, | void RasterizerVulkan::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, | ||||||
|  | @ -669,8 +662,8 @@ void RasterizerVulkan::SignalReference() { | ||||||
|     fence_manager.SignalReference(); |     fence_manager.SignalReference(); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| void RasterizerVulkan::ReleaseFences() { | void RasterizerVulkan::ReleaseFences(bool force) { | ||||||
|     fence_manager.WaitPendingFences(); |     fence_manager.WaitPendingFences(force); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| void RasterizerVulkan::FlushAndInvalidateRegion(VAddr addr, u64 size, | void RasterizerVulkan::FlushAndInvalidateRegion(VAddr addr, u64 size, | ||||||
|  | @ -694,6 +687,8 @@ void RasterizerVulkan::WaitForIdle() { | ||||||
|         flags |= VK_PIPELINE_STAGE_TRANSFORM_FEEDBACK_BIT_EXT; |         flags |= VK_PIPELINE_STAGE_TRANSFORM_FEEDBACK_BIT_EXT; | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|  |     query_cache.NotifyWFI(); | ||||||
|  | 
 | ||||||
|     scheduler.RequestOutsideRenderPassOperationContext(); |     scheduler.RequestOutsideRenderPassOperationContext(); | ||||||
|     scheduler.Record([event = *wfi_event, flags](vk::CommandBuffer cmdbuf) { |     scheduler.Record([event = *wfi_event, flags](vk::CommandBuffer cmdbuf) { | ||||||
|         cmdbuf.SetEvent(event, flags); |         cmdbuf.SetEvent(event, flags); | ||||||
|  | @ -737,19 +732,7 @@ void RasterizerVulkan::TickFrame() { | ||||||
| 
 | 
 | ||||||
| bool RasterizerVulkan::AccelerateConditionalRendering() { | bool RasterizerVulkan::AccelerateConditionalRendering() { | ||||||
|     gpu_memory->FlushCaching(); |     gpu_memory->FlushCaching(); | ||||||
|     if (Settings::IsGPULevelHigh()) { |     return query_cache.AccelerateHostConditionalRendering(); | ||||||
|         // TODO(Blinkhawk): Reimplement Host conditional rendering.
 |  | ||||||
|         return false; |  | ||||||
|     } |  | ||||||
|     // Medium / Low Hack: stub any checks on queries written into the buffer cache.
 |  | ||||||
|     const GPUVAddr condition_address{maxwell3d->regs.render_enable.Address()}; |  | ||||||
|     Maxwell::ReportSemaphore::Compare cmp; |  | ||||||
|     if (gpu_memory->IsMemoryDirty(condition_address, sizeof(cmp), |  | ||||||
|                                   VideoCommon::CacheType::BufferCache | |  | ||||||
|                                       VideoCommon::CacheType::QueryCache)) { |  | ||||||
|         return true; |  | ||||||
|     } |  | ||||||
|     return false; |  | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| bool RasterizerVulkan::AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Surface& src, | bool RasterizerVulkan::AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Surface& src, | ||||||
|  | @ -795,6 +778,7 @@ bool RasterizerVulkan::AccelerateDisplay(const Tegra::FramebufferConfig& config, | ||||||
|     if (!image_view) { |     if (!image_view) { | ||||||
|         return false; |         return false; | ||||||
|     } |     } | ||||||
|  |     query_cache.NotifySegment(false); | ||||||
|     screen_info.image = image_view->ImageHandle(); |     screen_info.image = image_view->ImageHandle(); | ||||||
|     screen_info.image_view = image_view->Handle(Shader::TextureType::Color2D); |     screen_info.image_view = image_view->Handle(Shader::TextureType::Color2D); | ||||||
|     screen_info.width = image_view->size.width; |     screen_info.width = image_view->size.width; | ||||||
|  | @ -933,31 +917,18 @@ void RasterizerVulkan::UpdateDynamicStates() { | ||||||
|     } |     } | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| void RasterizerVulkan::BeginTransformFeedback() { | void RasterizerVulkan::HandleTransformFeedback() { | ||||||
|     const auto& regs = maxwell3d->regs; |     const auto& regs = maxwell3d->regs; | ||||||
|     if (regs.transform_feedback_enabled == 0) { |  | ||||||
|         return; |  | ||||||
|     } |  | ||||||
|     if (!device.IsExtTransformFeedbackSupported()) { |     if (!device.IsExtTransformFeedbackSupported()) { | ||||||
|         LOG_ERROR(Render_Vulkan, "Transform feedbacks used but not supported"); |         LOG_ERROR(Render_Vulkan, "Transform feedbacks used but not supported"); | ||||||
|         return; |         return; | ||||||
|     } |     } | ||||||
|     UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderType::TessellationInit) || |     query_cache.CounterEnable(VideoCommon::QueryType::StreamingByteCount, | ||||||
|                      regs.IsShaderConfigEnabled(Maxwell::ShaderType::Tessellation)); |                               regs.transform_feedback_enabled); | ||||||
|     scheduler.Record( |     if (regs.transform_feedback_enabled != 0) { | ||||||
|         [](vk::CommandBuffer cmdbuf) { cmdbuf.BeginTransformFeedbackEXT(0, 0, nullptr, nullptr); }); |         UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderType::TessellationInit) || | ||||||
| } |                          regs.IsShaderConfigEnabled(Maxwell::ShaderType::Tessellation)); | ||||||
| 
 |  | ||||||
| void RasterizerVulkan::EndTransformFeedback() { |  | ||||||
|     const auto& regs = maxwell3d->regs; |  | ||||||
|     if (regs.transform_feedback_enabled == 0) { |  | ||||||
|         return; |  | ||||||
|     } |     } | ||||||
|     if (!device.IsExtTransformFeedbackSupported()) { |  | ||||||
|         return; |  | ||||||
|     } |  | ||||||
|     scheduler.Record( |  | ||||||
|         [](vk::CommandBuffer cmdbuf) { cmdbuf.EndTransformFeedbackEXT(0, 0, nullptr, nullptr); }); |  | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| void RasterizerVulkan::UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& regs) { | void RasterizerVulkan::UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& regs) { | ||||||
|  |  | ||||||
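For readers unfamiliar with VK_EXT_transform_feedback, the DrawIndirectByteCountEXT wrapper used in DrawIndirect() above presumably forwards to the raw extension entry point, whose prototype makes the argument mapping easier to follow:

    #include <vulkan/vulkan.h>

    // Prototype reproduced for reference; the wrapper call
    // cmdbuf.DrawIndirectByteCountEXT(1, 0, buffer, offset, 0, stride) maps to
    // instanceCount = 1, firstInstance = 0, counterBuffer, counterBufferOffset,
    // counterOffset = 0 and vertexStride respectively.
    VKAPI_ATTR void VKAPI_CALL vkCmdDrawIndirectByteCountEXT(
        VkCommandBuffer commandBuffer, uint32_t instanceCount, uint32_t firstInstance,
        VkBuffer counterBuffer, VkDeviceSize counterBufferOffset,
        uint32_t counterOffset, uint32_t vertexStride);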
|  | @ -84,8 +84,9 @@ public: | ||||||
|     void DrawTexture() override; |     void DrawTexture() override; | ||||||
|     void Clear(u32 layer_count) override; |     void Clear(u32 layer_count) override; | ||||||
|     void DispatchCompute() override; |     void DispatchCompute() override; | ||||||
|     void ResetCounter(VideoCore::QueryType type) override; |     void ResetCounter(VideoCommon::QueryType type) override; | ||||||
|     void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional<u64> timestamp) override; |     void Query(GPUVAddr gpu_addr, VideoCommon::QueryType type, | ||||||
|  |                VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) override; | ||||||
|     void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size) override; |     void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size) override; | ||||||
|     void DisableGraphicsUniformBuffer(size_t stage, u32 index) override; |     void DisableGraphicsUniformBuffer(size_t stage, u32 index) override; | ||||||
|     void FlushAll() override; |     void FlushAll() override; | ||||||
|  | @ -106,7 +107,7 @@ public: | ||||||
|     void SyncOperation(std::function<void()>&& func) override; |     void SyncOperation(std::function<void()>&& func) override; | ||||||
|     void SignalSyncPoint(u32 value) override; |     void SignalSyncPoint(u32 value) override; | ||||||
|     void SignalReference() override; |     void SignalReference() override; | ||||||
|     void ReleaseFences() override; |     void ReleaseFences(bool force = true) override; | ||||||
|     void FlushAndInvalidateRegion( |     void FlushAndInvalidateRegion( | ||||||
|         VAddr addr, u64 size, VideoCommon::CacheType which = VideoCommon::CacheType::All) override; |         VAddr addr, u64 size, VideoCommon::CacheType which = VideoCommon::CacheType::All) override; | ||||||
|     void WaitForIdle() override; |     void WaitForIdle() override; | ||||||
|  | @ -146,9 +147,7 @@ private: | ||||||
| 
 | 
 | ||||||
|     void UpdateDynamicStates(); |     void UpdateDynamicStates(); | ||||||
| 
 | 
 | ||||||
|     void BeginTransformFeedback(); |     void HandleTransformFeedback(); | ||||||
| 
 |  | ||||||
|     void EndTransformFeedback(); |  | ||||||
| 
 | 
 | ||||||
|     void UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& regs); |     void UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& regs); | ||||||
|     void UpdateScissorsState(Tegra::Engines::Maxwell3D::Regs& regs); |     void UpdateScissorsState(Tegra::Engines::Maxwell3D::Regs& regs); | ||||||
|  | @ -195,8 +194,9 @@ private: | ||||||
|     TextureCache texture_cache; |     TextureCache texture_cache; | ||||||
|     BufferCacheRuntime buffer_cache_runtime; |     BufferCacheRuntime buffer_cache_runtime; | ||||||
|     BufferCache buffer_cache; |     BufferCache buffer_cache; | ||||||
|     PipelineCache pipeline_cache; |     QueryCacheRuntime query_cache_runtime; | ||||||
|     QueryCache query_cache; |     QueryCache query_cache; | ||||||
|  |     PipelineCache pipeline_cache; | ||||||
|     AccelerateDMA accelerate_dma; |     AccelerateDMA accelerate_dma; | ||||||
|     FenceManager fence_manager; |     FenceManager fence_manager; | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
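Note on the vk_rasterizer.h hunk above: the query overrides now take the VideoCommon types (QueryType, QueryPropertiesFlags, payload, subreport), ReleaseFences gains a `force` flag, and the query-cache members are declared before PipelineCache. The reorder is significant because C++ constructs non-static data members in declaration order. A minimal sketch with illustrative types (not the real yuzu classes) showing the rule the new ordering relies on:

    struct QueryCache {};

    struct PipelineCache {
        // qc is guaranteed to be fully constructed here if it is declared earlier in the owner.
        explicit PipelineCache(QueryCache& qc) : cache{&qc} {}
        QueryCache* cache;
    };

    struct Rasterizer {
        QueryCache query_cache;                     // declared first -> constructed first
        PipelineCache pipeline_cache{query_cache};  // safe: follows declaration order
    };

    int main() {
        Rasterizer rasterizer{};  // query_cache is built before pipeline_cache
    }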
|  | @ -243,10 +243,10 @@ void Scheduler::AllocateNewContext() { | ||||||
| #if ANDROID | #if ANDROID | ||||||
|         if (Settings::IsGPULevelHigh()) { |         if (Settings::IsGPULevelHigh()) { | ||||||
|             // This is problematic on Android, disable on GPU Normal.
 |             // This is problematic on Android, disable on GPU Normal.
 | ||||||
|             query_cache->UpdateCounters(); |             query_cache->NotifySegment(true); | ||||||
|         } |         } | ||||||
| #else | #else | ||||||
|         query_cache->UpdateCounters(); |         query_cache->NotifySegment(true); | ||||||
| #endif | #endif | ||||||
|     } |     } | ||||||
| } | } | ||||||
|  | @ -261,11 +261,12 @@ void Scheduler::EndPendingOperations() { | ||||||
| #if ANDROID | #if ANDROID | ||||||
|     if (Settings::IsGPULevelHigh()) { |     if (Settings::IsGPULevelHigh()) { | ||||||
|         // This is problematic on Android, disable on GPU Normal.
 |         // This is problematic on Android, disable on GPU Normal.
 | ||||||
|         query_cache->DisableStreams(); |         // query_cache->DisableStreams();
 | ||||||
|     } |     } | ||||||
| #else | #else | ||||||
|     query_cache->DisableStreams(); |     // query_cache->DisableStreams();
 | ||||||
| #endif | #endif | ||||||
|  |     query_cache->NotifySegment(false); | ||||||
|     EndRenderPass(); |     EndRenderPass(); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
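In the vk_scheduler.cpp hunks, the per-counter UpdateCounters()/DisableStreams() pair collapses into a single NotifySegment(bool) notification: true when a fresh command buffer (segment) is started, false just before pending operations are flushed and the render pass ends. A rough sketch of that call pattern, using illustrative stand-in types with the same method names (the guard mirrors the Android GPU-level check in the hunk):

    // Illustrative stand-ins; the real types live elsewhere in the codebase.
    struct QueryCache {
        void NotifySegment(bool begin) { /* open or close a query segment */ }
    };
    namespace Settings { inline bool IsGPULevelHigh() { return true; } }

    void OnNewCommandBuffer(QueryCache& query_cache) {
    #if ANDROID
        if (Settings::IsGPULevelHigh()) {
            // Problematic on Android at GPU Normal, so only notify on High/Extreme.
            query_cache.NotifySegment(true);
        }
    #else
        query_cache.NotifySegment(true);
    #endif
    }

    void OnEndPendingOperations(QueryCache& query_cache) {
        query_cache.NotifySegment(false);  // close the segment, then end the render pass
    }

Note that, unlike the old DisableStreams() call, NotifySegment(false) is issued unconditionally in the diff; only the segment-begin notification keeps the Android guard.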
|  | @ -17,6 +17,11 @@ | ||||||
| #include "video_core/renderer_vulkan/vk_master_semaphore.h" | #include "video_core/renderer_vulkan/vk_master_semaphore.h" | ||||||
| #include "video_core/vulkan_common/vulkan_wrapper.h" | #include "video_core/vulkan_common/vulkan_wrapper.h" | ||||||
| 
 | 
 | ||||||
|  | namespace VideoCommon { | ||||||
|  | template <typename Trait> | ||||||
|  | class QueryCacheBase; | ||||||
|  | } | ||||||
|  | 
 | ||||||
| namespace Vulkan { | namespace Vulkan { | ||||||
| 
 | 
 | ||||||
| class CommandPool; | class CommandPool; | ||||||
|  | @ -24,7 +29,8 @@ class Device; | ||||||
| class Framebuffer; | class Framebuffer; | ||||||
| class GraphicsPipeline; | class GraphicsPipeline; | ||||||
| class StateTracker; | class StateTracker; | ||||||
| class QueryCache; | 
 | ||||||
|  | struct QueryCacheParams; | ||||||
| 
 | 
 | ||||||
| /// The scheduler abstracts command buffer and fence management with an interface that's able to do
 | /// The scheduler abstracts command buffer and fence management with an interface that's able to do
 | ||||||
| /// OpenGL-like operations on Vulkan command buffers.
 | /// OpenGL-like operations on Vulkan command buffers.
 | ||||||
|  | @ -63,7 +69,7 @@ public: | ||||||
|     void InvalidateState(); |     void InvalidateState(); | ||||||
| 
 | 
 | ||||||
|     /// Assigns the query cache.
 |     /// Assigns the query cache.
 | ||||||
|     void SetQueryCache(QueryCache& query_cache_) { |     void SetQueryCache(VideoCommon::QueryCacheBase<QueryCacheParams>& query_cache_) { | ||||||
|         query_cache = &query_cache_; |         query_cache = &query_cache_; | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|  | @ -219,7 +225,7 @@ private: | ||||||
|     std::unique_ptr<MasterSemaphore> master_semaphore; |     std::unique_ptr<MasterSemaphore> master_semaphore; | ||||||
|     std::unique_ptr<CommandPool> command_pool; |     std::unique_ptr<CommandPool> command_pool; | ||||||
| 
 | 
 | ||||||
|     QueryCache* query_cache = nullptr; |     VideoCommon::QueryCacheBase<QueryCacheParams>* query_cache = nullptr; | ||||||
| 
 | 
 | ||||||
|     vk::CommandBuffer current_cmdbuf; |     vk::CommandBuffer current_cmdbuf; | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
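With the vk_scheduler.h change, the scheduler no longer names the concrete Vulkan QueryCache class; it only stores a pointer to VideoCommon::QueryCacheBase<QueryCacheParams>, so a forward declaration of the class template and of the trait struct is all the header needs, keeping the heavy query-cache headers confined to the .cpp files that dereference the pointer. A condensed sketch of the pattern with illustrative names:

    // Header side: a pointer member only needs an incomplete type.
    namespace VideoCommon {
    template <typename Trait>
    class QueryCacheBase;
    }
    struct QueryCacheParams;  // backend-specific trait, defined by the Vulkan query cache

    class Scheduler {
    public:
        void SetQueryCache(VideoCommon::QueryCacheBase<QueryCacheParams>& cache) {
            query_cache = &cache;
        }
    private:
        VideoCommon::QueryCacheBase<QueryCacheParams>* query_cache = nullptr;
    };
    // Only the .cpp files that call through query_cache include the full template definition.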
|  | @ -61,6 +61,7 @@ VK_DEFINE_HANDLE(VmaAllocator) | ||||||
| 
 | 
 | ||||||
| // Define miscellaneous extensions which may be used by the implementation here.
 | // Define miscellaneous extensions which may be used by the implementation here.
 | ||||||
| #define FOR_EACH_VK_EXTENSION(EXTENSION)                                                           \ | #define FOR_EACH_VK_EXTENSION(EXTENSION)                                                           \ | ||||||
|  |     EXTENSION(EXT, CONDITIONAL_RENDERING, conditional_rendering)                                   \ | ||||||
|     EXTENSION(EXT, CONSERVATIVE_RASTERIZATION, conservative_rasterization)                         \ |     EXTENSION(EXT, CONSERVATIVE_RASTERIZATION, conservative_rasterization)                         \ | ||||||
|     EXTENSION(EXT, DEPTH_RANGE_UNRESTRICTED, depth_range_unrestricted)                             \ |     EXTENSION(EXT, DEPTH_RANGE_UNRESTRICTED, depth_range_unrestricted)                             \ | ||||||
|     EXTENSION(EXT, MEMORY_BUDGET, memory_budget)                                                   \ |     EXTENSION(EXT, MEMORY_BUDGET, memory_budget)                                                   \ | ||||||
|  | @ -93,6 +94,7 @@ VK_DEFINE_HANDLE(VmaAllocator) | ||||||
| 
 | 
 | ||||||
| // Define extensions where the absence of the extension may result in a degraded experience.
 | // Define extensions where the absence of the extension may result in a degraded experience.
 | ||||||
| #define FOR_EACH_VK_RECOMMENDED_EXTENSION(EXTENSION_NAME)                                          \ | #define FOR_EACH_VK_RECOMMENDED_EXTENSION(EXTENSION_NAME)                                          \ | ||||||
|  |     EXTENSION_NAME(VK_EXT_CONDITIONAL_RENDERING_EXTENSION_NAME)                                    \ | ||||||
|     EXTENSION_NAME(VK_EXT_CONSERVATIVE_RASTERIZATION_EXTENSION_NAME)                               \ |     EXTENSION_NAME(VK_EXT_CONSERVATIVE_RASTERIZATION_EXTENSION_NAME)                               \ | ||||||
|     EXTENSION_NAME(VK_EXT_DEPTH_RANGE_UNRESTRICTED_EXTENSION_NAME)                                 \ |     EXTENSION_NAME(VK_EXT_DEPTH_RANGE_UNRESTRICTED_EXTENSION_NAME)                                 \ | ||||||
|     EXTENSION_NAME(VK_EXT_EXTENDED_DYNAMIC_STATE_EXTENSION_NAME)                                   \ |     EXTENSION_NAME(VK_EXT_EXTENDED_DYNAMIC_STATE_EXTENSION_NAME)                                   \ | ||||||
|  | @ -541,6 +543,10 @@ public: | ||||||
|         return extensions.shader_atomic_int64; |         return extensions.shader_atomic_int64; | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|  |     bool IsExtConditionalRendering() const { | ||||||
|  |         return extensions.conditional_rendering; | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|     bool HasTimelineSemaphore() const; |     bool HasTimelineSemaphore() const; | ||||||
| 
 | 
 | ||||||
|     /// Returns the minimum supported version of SPIR-V.
 |     /// Returns the minimum supported version of SPIR-V.
 | ||||||
|  |  | ||||||
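vulkan_device.h adds VK_EXT_conditional_rendering to both the enabled-when-present list and the recommended list and exposes IsExtConditionalRendering(). Because the extension remains optional, callers have to check it at runtime before recording predicated work. A hedged sketch of that check using the CommandBuffer wrappers introduced later in this PR (the function, buffer, and offset names are illustrative; the 32-bit predicate is treated as "skip when zero" per the Vulkan spec):

    // Sketch only: Device and vk::CommandBuffer are the yuzu wrappers shown in this diff.
    void DrawPredicated(const Device& device, vk::CommandBuffer cmdbuf,
                        VkBuffer predicate_buffer, VkDeviceSize offset) {
        if (!device.IsExtConditionalRendering()) {
            return;  // fall back to an unconditional path or host-side predication
        }
        const VkConditionalRenderingBeginInfoEXT info{
            .sType = VK_STRUCTURE_TYPE_CONDITIONAL_RENDERING_BEGIN_INFO_EXT,
            .pNext = nullptr,
            .buffer = predicate_buffer,  // 32-bit predicate; draws are discarded when it is zero
            .offset = offset,
            .flags = 0,
        };
        cmdbuf.BeginConditionalRenderingEXT(info);
        // ... record the conditionally executed draws here ...
        cmdbuf.EndConditionalRenderingEXT();
    }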
|  | @ -75,6 +75,7 @@ void Load(VkDevice device, DeviceDispatch& dld) noexcept { | ||||||
|     X(vkBeginCommandBuffer); |     X(vkBeginCommandBuffer); | ||||||
|     X(vkBindBufferMemory); |     X(vkBindBufferMemory); | ||||||
|     X(vkBindImageMemory); |     X(vkBindImageMemory); | ||||||
|  |     X(vkCmdBeginConditionalRenderingEXT); | ||||||
|     X(vkCmdBeginQuery); |     X(vkCmdBeginQuery); | ||||||
|     X(vkCmdBeginRenderPass); |     X(vkCmdBeginRenderPass); | ||||||
|     X(vkCmdBeginTransformFeedbackEXT); |     X(vkCmdBeginTransformFeedbackEXT); | ||||||
|  | @ -91,6 +92,7 @@ void Load(VkDevice device, DeviceDispatch& dld) noexcept { | ||||||
|     X(vkCmdCopyBufferToImage); |     X(vkCmdCopyBufferToImage); | ||||||
|     X(vkCmdCopyImage); |     X(vkCmdCopyImage); | ||||||
|     X(vkCmdCopyImageToBuffer); |     X(vkCmdCopyImageToBuffer); | ||||||
|  |     X(vkCmdCopyQueryPoolResults); | ||||||
|     X(vkCmdDispatch); |     X(vkCmdDispatch); | ||||||
|     X(vkCmdDispatchIndirect); |     X(vkCmdDispatchIndirect); | ||||||
|     X(vkCmdDraw); |     X(vkCmdDraw); | ||||||
|  | @ -99,6 +101,8 @@ void Load(VkDevice device, DeviceDispatch& dld) noexcept { | ||||||
|     X(vkCmdDrawIndexedIndirect); |     X(vkCmdDrawIndexedIndirect); | ||||||
|     X(vkCmdDrawIndirectCount); |     X(vkCmdDrawIndirectCount); | ||||||
|     X(vkCmdDrawIndexedIndirectCount); |     X(vkCmdDrawIndexedIndirectCount); | ||||||
|  |     X(vkCmdDrawIndirectByteCountEXT); | ||||||
|  |     X(vkCmdEndConditionalRenderingEXT); | ||||||
|     X(vkCmdEndQuery); |     X(vkCmdEndQuery); | ||||||
|     X(vkCmdEndRenderPass); |     X(vkCmdEndRenderPass); | ||||||
|     X(vkCmdEndTransformFeedbackEXT); |     X(vkCmdEndTransformFeedbackEXT); | ||||||
|  |  | ||||||
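The vulkan_wrapper.cpp hunks only append the new entry points to the X(...) list; the macro body itself is not part of this diff. For context, a device-level command added this way is typically resolved through vkGetDeviceProcAddr, roughly as below (illustrative only; the real X macro in the file may differ):

    // Hypothetical expansion for one of the new entries:
    dld.vkCmdCopyQueryPoolResults = reinterpret_cast<PFN_vkCmdCopyQueryPoolResults>(
        dld.vkGetDeviceProcAddr(device, "vkCmdCopyQueryPoolResults"));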
|  | @ -185,6 +185,7 @@ struct DeviceDispatch : InstanceDispatch { | ||||||
|     PFN_vkBeginCommandBuffer vkBeginCommandBuffer{}; |     PFN_vkBeginCommandBuffer vkBeginCommandBuffer{}; | ||||||
|     PFN_vkBindBufferMemory vkBindBufferMemory{}; |     PFN_vkBindBufferMemory vkBindBufferMemory{}; | ||||||
|     PFN_vkBindImageMemory vkBindImageMemory{}; |     PFN_vkBindImageMemory vkBindImageMemory{}; | ||||||
|  |     PFN_vkCmdBeginConditionalRenderingEXT vkCmdBeginConditionalRenderingEXT{}; | ||||||
|     PFN_vkCmdBeginDebugUtilsLabelEXT vkCmdBeginDebugUtilsLabelEXT{}; |     PFN_vkCmdBeginDebugUtilsLabelEXT vkCmdBeginDebugUtilsLabelEXT{}; | ||||||
|     PFN_vkCmdBeginQuery vkCmdBeginQuery{}; |     PFN_vkCmdBeginQuery vkCmdBeginQuery{}; | ||||||
|     PFN_vkCmdBeginRenderPass vkCmdBeginRenderPass{}; |     PFN_vkCmdBeginRenderPass vkCmdBeginRenderPass{}; | ||||||
|  | @ -202,6 +203,7 @@ struct DeviceDispatch : InstanceDispatch { | ||||||
|     PFN_vkCmdCopyBufferToImage vkCmdCopyBufferToImage{}; |     PFN_vkCmdCopyBufferToImage vkCmdCopyBufferToImage{}; | ||||||
|     PFN_vkCmdCopyImage vkCmdCopyImage{}; |     PFN_vkCmdCopyImage vkCmdCopyImage{}; | ||||||
|     PFN_vkCmdCopyImageToBuffer vkCmdCopyImageToBuffer{}; |     PFN_vkCmdCopyImageToBuffer vkCmdCopyImageToBuffer{}; | ||||||
|  |     PFN_vkCmdCopyQueryPoolResults vkCmdCopyQueryPoolResults{}; | ||||||
|     PFN_vkCmdDispatch vkCmdDispatch{}; |     PFN_vkCmdDispatch vkCmdDispatch{}; | ||||||
|     PFN_vkCmdDispatchIndirect vkCmdDispatchIndirect{}; |     PFN_vkCmdDispatchIndirect vkCmdDispatchIndirect{}; | ||||||
|     PFN_vkCmdDraw vkCmdDraw{}; |     PFN_vkCmdDraw vkCmdDraw{}; | ||||||
|  | @ -210,6 +212,8 @@ struct DeviceDispatch : InstanceDispatch { | ||||||
|     PFN_vkCmdDrawIndexedIndirect vkCmdDrawIndexedIndirect{}; |     PFN_vkCmdDrawIndexedIndirect vkCmdDrawIndexedIndirect{}; | ||||||
|     PFN_vkCmdDrawIndirectCount vkCmdDrawIndirectCount{}; |     PFN_vkCmdDrawIndirectCount vkCmdDrawIndirectCount{}; | ||||||
|     PFN_vkCmdDrawIndexedIndirectCount vkCmdDrawIndexedIndirectCount{}; |     PFN_vkCmdDrawIndexedIndirectCount vkCmdDrawIndexedIndirectCount{}; | ||||||
|  |     PFN_vkCmdDrawIndirectByteCountEXT vkCmdDrawIndirectByteCountEXT{}; | ||||||
|  |     PFN_vkCmdEndConditionalRenderingEXT vkCmdEndConditionalRenderingEXT{}; | ||||||
|     PFN_vkCmdEndDebugUtilsLabelEXT vkCmdEndDebugUtilsLabelEXT{}; |     PFN_vkCmdEndDebugUtilsLabelEXT vkCmdEndDebugUtilsLabelEXT{}; | ||||||
|     PFN_vkCmdEndQuery vkCmdEndQuery{}; |     PFN_vkCmdEndQuery vkCmdEndQuery{}; | ||||||
|     PFN_vkCmdEndRenderPass vkCmdEndRenderPass{}; |     PFN_vkCmdEndRenderPass vkCmdEndRenderPass{}; | ||||||
|  | @ -1182,6 +1186,13 @@ public: | ||||||
|                                            count_offset, draw_count, stride); |                                            count_offset, draw_count, stride); | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|  |     void DrawIndirectByteCountEXT(u32 instance_count, u32 first_instance, VkBuffer counter_buffer, | ||||||
|  |                                   VkDeviceSize counter_buffer_offset, u32 counter_offset, | ||||||
|  |                                   u32 stride) { | ||||||
|  |         dld->vkCmdDrawIndirectByteCountEXT(handle, instance_count, first_instance, counter_buffer, | ||||||
|  |                                            counter_buffer_offset, counter_offset, stride); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|     void ClearAttachments(Span<VkClearAttachment> attachments, |     void ClearAttachments(Span<VkClearAttachment> attachments, | ||||||
|                           Span<VkClearRect> rects) const noexcept { |                           Span<VkClearRect> rects) const noexcept { | ||||||
|         dld->vkCmdClearAttachments(handle, attachments.size(), attachments.data(), rects.size(), |         dld->vkCmdClearAttachments(handle, attachments.size(), attachments.data(), rects.size(), | ||||||
|  | @ -1270,6 +1281,13 @@ public: | ||||||
|                                     regions.data()); |                                     regions.data()); | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|  |     void CopyQueryPoolResults(VkQueryPool query_pool, u32 first_query, u32 query_count, | ||||||
|  |                               VkBuffer dst_buffer, VkDeviceSize dst_offset, VkDeviceSize stride, | ||||||
|  |                               VkQueryResultFlags flags) const noexcept { | ||||||
|  |         dld->vkCmdCopyQueryPoolResults(handle, query_pool, first_query, query_count, dst_buffer, | ||||||
|  |                                        dst_offset, stride, flags); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|     void FillBuffer(VkBuffer dst_buffer, VkDeviceSize dst_offset, VkDeviceSize size, |     void FillBuffer(VkBuffer dst_buffer, VkDeviceSize dst_offset, VkDeviceSize size, | ||||||
|                     u32 data) const noexcept { |                     u32 data) const noexcept { | ||||||
|         dld->vkCmdFillBuffer(handle, dst_buffer, dst_offset, size, data); |         dld->vkCmdFillBuffer(handle, dst_buffer, dst_offset, size, data); | ||||||
|  | @ -1448,6 +1466,15 @@ public: | ||||||
|                                           counter_buffers, counter_buffer_offsets); |                                           counter_buffers, counter_buffer_offsets); | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|  |     void BeginConditionalRenderingEXT( | ||||||
|  |         const VkConditionalRenderingBeginInfoEXT& info) const noexcept { | ||||||
|  |         dld->vkCmdBeginConditionalRenderingEXT(handle, &info); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     void EndConditionalRenderingEXT() const noexcept { | ||||||
|  |         dld->vkCmdEndConditionalRenderingEXT(handle); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|     void BeginDebugUtilsLabelEXT(const char* label, std::span<float, 4> color) const noexcept { |     void BeginDebugUtilsLabelEXT(const char* label, std::span<float, 4> color) const noexcept { | ||||||
|         const VkDebugUtilsLabelEXT label_info{ |         const VkDebugUtilsLabelEXT label_info{ | ||||||
|             .sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_LABEL_EXT, |             .sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_LABEL_EXT, | ||||||
|  |  | ||||||
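The new vk::CommandBuffer helpers are thin 1:1 wrappers over vkCmdCopyQueryPoolResults, vkCmdDrawIndirectByteCountEXT, and the conditional-rendering entry points. A hedged usage sketch for the first two, with illustrative pool, buffer, and stride names (`count` stands for the number of queries recorded in the current segment):

    // Resolve `count` 64-bit query results into dst_buffer, waiting until they are available.
    cmdbuf.CopyQueryPoolResults(query_pool, /*first_query=*/0, count, dst_buffer,
                                /*dst_offset=*/0, /*stride=*/sizeof(u64),
                                VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT);

    // Draw as many vertices as transform feedback wrote, using the captured counter buffer.
    cmdbuf.DrawIndirectByteCountEXT(/*instance_count=*/1, /*first_instance=*/0, counter_buffer,
                                    /*counter_buffer_offset=*/0, /*counter_offset=*/0,
                                    /*stride=*/vertex_stride);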