forked from eden-emu/eden
		
	Merge pull request #6465 from FernandoS27/sex-on-the-beach
GPU: Implement a garbage collector for GPU Caches (project Reaper+)
This commit is contained in:
		
						commit
						17fff10e06
					
				
					 25 changed files with 493 additions and 63 deletions
				
			
		|  | @ -24,6 +24,7 @@ enum : u64 { | ||||||
|     Size_128_MB = 128ULL * Size_1_MB, |     Size_128_MB = 128ULL * Size_1_MB, | ||||||
|     Size_448_MB = 448ULL * Size_1_MB, |     Size_448_MB = 448ULL * Size_1_MB, | ||||||
|     Size_507_MB = 507ULL * Size_1_MB, |     Size_507_MB = 507ULL * Size_1_MB, | ||||||
|  |     Size_512_MB = 512ULL * Size_1_MB, | ||||||
|     Size_562_MB = 562ULL * Size_1_MB, |     Size_562_MB = 562ULL * Size_1_MB, | ||||||
|     Size_1554_MB = 1554ULL * Size_1_MB, |     Size_1554_MB = 1554ULL * Size_1_MB, | ||||||
|     Size_2048_MB = 2048ULL * Size_1_MB, |     Size_2048_MB = 2048ULL * Size_1_MB, | ||||||
|  |  | ||||||
|  | @ -59,6 +59,7 @@ void LogSettings() { | ||||||
|     log_setting("Renderer_UseVsync", values.use_vsync.GetValue()); |     log_setting("Renderer_UseVsync", values.use_vsync.GetValue()); | ||||||
|     log_setting("Renderer_UseAssemblyShaders", values.use_assembly_shaders.GetValue()); |     log_setting("Renderer_UseAssemblyShaders", values.use_assembly_shaders.GetValue()); | ||||||
|     log_setting("Renderer_UseAsynchronousShaders", values.use_asynchronous_shaders.GetValue()); |     log_setting("Renderer_UseAsynchronousShaders", values.use_asynchronous_shaders.GetValue()); | ||||||
|  |     log_setting("Renderer_UseGarbageCollection", values.use_caches_gc.GetValue()); | ||||||
|     log_setting("Renderer_AnisotropicFilteringLevel", values.max_anisotropy.GetValue()); |     log_setting("Renderer_AnisotropicFilteringLevel", values.max_anisotropy.GetValue()); | ||||||
|     log_setting("Audio_OutputEngine", values.sink_id); |     log_setting("Audio_OutputEngine", values.sink_id); | ||||||
|     log_setting("Audio_EnableAudioStretching", values.enable_audio_stretching.GetValue()); |     log_setting("Audio_EnableAudioStretching", values.enable_audio_stretching.GetValue()); | ||||||
|  | @ -142,6 +143,7 @@ void RestoreGlobalState(bool is_powered_on) { | ||||||
|     values.use_assembly_shaders.SetGlobal(true); |     values.use_assembly_shaders.SetGlobal(true); | ||||||
|     values.use_asynchronous_shaders.SetGlobal(true); |     values.use_asynchronous_shaders.SetGlobal(true); | ||||||
|     values.use_fast_gpu_time.SetGlobal(true); |     values.use_fast_gpu_time.SetGlobal(true); | ||||||
|  |     values.use_caches_gc.SetGlobal(true); | ||||||
|     values.bg_red.SetGlobal(true); |     values.bg_red.SetGlobal(true); | ||||||
|     values.bg_green.SetGlobal(true); |     values.bg_green.SetGlobal(true); | ||||||
|     values.bg_blue.SetGlobal(true); |     values.bg_blue.SetGlobal(true); | ||||||
|  |  | ||||||
|  | @ -154,6 +154,7 @@ struct Values { | ||||||
|     Setting<bool> use_assembly_shaders; |     Setting<bool> use_assembly_shaders; | ||||||
|     Setting<bool> use_asynchronous_shaders; |     Setting<bool> use_asynchronous_shaders; | ||||||
|     Setting<bool> use_fast_gpu_time; |     Setting<bool> use_fast_gpu_time; | ||||||
|  |     Setting<bool> use_caches_gc; | ||||||
| 
 | 
 | ||||||
|     Setting<float> bg_red; |     Setting<float> bg_red; | ||||||
|     Setting<float> bg_green; |     Setting<float> bg_green; | ||||||
|  |  | ||||||
|  | @ -256,6 +256,16 @@ public: | ||||||
|         stream_score += score; |         stream_score += score; | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|  |     /// Sets the new frame tick
 | ||||||
|  |     void SetFrameTick(u64 new_frame_tick) noexcept { | ||||||
|  |         frame_tick = new_frame_tick; | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     /// Returns the new frame tick
 | ||||||
|  |     [[nodiscard]] u64 FrameTick() const noexcept { | ||||||
|  |         return frame_tick; | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|     /// Returns the likeliness of this being a stream buffer
 |     /// Returns the likeliness of this being a stream buffer
 | ||||||
|     [[nodiscard]] int StreamScore() const noexcept { |     [[nodiscard]] int StreamScore() const noexcept { | ||||||
|         return stream_score; |         return stream_score; | ||||||
|  | @ -586,6 +596,7 @@ private: | ||||||
|     RasterizerInterface* rasterizer = nullptr; |     RasterizerInterface* rasterizer = nullptr; | ||||||
|     VAddr cpu_addr = 0; |     VAddr cpu_addr = 0; | ||||||
|     Words words; |     Words words; | ||||||
|  |     u64 frame_tick = 0; | ||||||
|     BufferFlagBits flags{}; |     BufferFlagBits flags{}; | ||||||
|     int stream_score = 0; |     int stream_score = 0; | ||||||
| }; | }; | ||||||
|  |  | ||||||
|  | @ -16,6 +16,7 @@ | ||||||
| 
 | 
 | ||||||
| #include <boost/container/small_vector.hpp> | #include <boost/container/small_vector.hpp> | ||||||
| 
 | 
 | ||||||
|  | #include "common/common_sizes.h" | ||||||
| #include "common/common_types.h" | #include "common/common_types.h" | ||||||
| #include "common/div_ceil.h" | #include "common/div_ceil.h" | ||||||
| #include "common/microprofile.h" | #include "common/microprofile.h" | ||||||
|  | @ -65,6 +66,9 @@ class BufferCache { | ||||||
| 
 | 
 | ||||||
|     static constexpr BufferId NULL_BUFFER_ID{0}; |     static constexpr BufferId NULL_BUFFER_ID{0}; | ||||||
| 
 | 
 | ||||||
|  |     static constexpr u64 EXPECTED_MEMORY = Common::Size_512_MB; | ||||||
|  |     static constexpr u64 CRITICAL_MEMORY = Common::Size_1_GB; | ||||||
|  | 
 | ||||||
|     using Maxwell = Tegra::Engines::Maxwell3D::Regs; |     using Maxwell = Tegra::Engines::Maxwell3D::Regs; | ||||||
| 
 | 
 | ||||||
|     using Runtime = typename P::Runtime; |     using Runtime = typename P::Runtime; | ||||||
|  | @ -102,6 +106,8 @@ public: | ||||||
| 
 | 
 | ||||||
|     void TickFrame(); |     void TickFrame(); | ||||||
| 
 | 
 | ||||||
|  |     void RunGarbageCollector(); | ||||||
|  | 
 | ||||||
|     void WriteMemory(VAddr cpu_addr, u64 size); |     void WriteMemory(VAddr cpu_addr, u64 size); | ||||||
| 
 | 
 | ||||||
|     void CachedWriteMemory(VAddr cpu_addr, u64 size); |     void CachedWriteMemory(VAddr cpu_addr, u64 size); | ||||||
|  | @ -243,6 +249,8 @@ private: | ||||||
|     template <bool insert> |     template <bool insert> | ||||||
|     void ChangeRegister(BufferId buffer_id); |     void ChangeRegister(BufferId buffer_id); | ||||||
| 
 | 
 | ||||||
|  |     void TouchBuffer(Buffer& buffer) const noexcept; | ||||||
|  | 
 | ||||||
|     bool SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size); |     bool SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size); | ||||||
| 
 | 
 | ||||||
|     bool SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 size); |     bool SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 size); | ||||||
|  | @ -255,6 +263,10 @@ private: | ||||||
| 
 | 
 | ||||||
|     void MappedUploadMemory(Buffer& buffer, u64 total_size_bytes, std::span<BufferCopy> copies); |     void MappedUploadMemory(Buffer& buffer, u64 total_size_bytes, std::span<BufferCopy> copies); | ||||||
| 
 | 
 | ||||||
|  |     void DownloadBufferMemory(Buffer& buffer_id); | ||||||
|  | 
 | ||||||
|  |     void DownloadBufferMemory(Buffer& buffer_id, VAddr cpu_addr, u64 size); | ||||||
|  | 
 | ||||||
|     void DeleteBuffer(BufferId buffer_id); |     void DeleteBuffer(BufferId buffer_id); | ||||||
| 
 | 
 | ||||||
|     void ReplaceBufferDownloads(BufferId old_buffer_id, BufferId new_buffer_id); |     void ReplaceBufferDownloads(BufferId old_buffer_id, BufferId new_buffer_id); | ||||||
|  | @ -319,6 +331,10 @@ private: | ||||||
|     size_t immediate_buffer_capacity = 0; |     size_t immediate_buffer_capacity = 0; | ||||||
|     std::unique_ptr<u8[]> immediate_buffer_alloc; |     std::unique_ptr<u8[]> immediate_buffer_alloc; | ||||||
| 
 | 
 | ||||||
|  |     typename SlotVector<Buffer>::Iterator deletion_iterator; | ||||||
|  |     u64 frame_tick = 0; | ||||||
|  |     u64 total_used_memory = 0; | ||||||
|  | 
 | ||||||
|     std::array<BufferId, ((1ULL << 39) >> PAGE_BITS)> page_table; |     std::array<BufferId, ((1ULL << 39) >> PAGE_BITS)> page_table; | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
|  | @ -332,6 +348,28 @@ BufferCache<P>::BufferCache(VideoCore::RasterizerInterface& rasterizer_, | ||||||
|       gpu_memory{gpu_memory_}, cpu_memory{cpu_memory_}, runtime{runtime_} { |       gpu_memory{gpu_memory_}, cpu_memory{cpu_memory_}, runtime{runtime_} { | ||||||
|     // Ensure the first slot is used for the null buffer
 |     // Ensure the first slot is used for the null buffer
 | ||||||
|     void(slot_buffers.insert(runtime, NullBufferParams{})); |     void(slot_buffers.insert(runtime, NullBufferParams{})); | ||||||
|  |     deletion_iterator = slot_buffers.end(); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | template <class P> | ||||||
|  | void BufferCache<P>::RunGarbageCollector() { | ||||||
|  |     const bool aggressive_gc = total_used_memory >= CRITICAL_MEMORY; | ||||||
|  |     const u64 ticks_to_destroy = aggressive_gc ? 60 : 120; | ||||||
|  |     int num_iterations = aggressive_gc ? 64 : 32; | ||||||
|  |     for (; num_iterations > 0; --num_iterations) { | ||||||
|  |         if (deletion_iterator == slot_buffers.end()) { | ||||||
|  |             deletion_iterator = slot_buffers.begin(); | ||||||
|  |         } | ||||||
|  |         ++deletion_iterator; | ||||||
|  |         if (deletion_iterator == slot_buffers.end()) { | ||||||
|  |             break; | ||||||
|  |         } | ||||||
|  |         const auto [buffer_id, buffer] = *deletion_iterator; | ||||||
|  |         if (buffer->FrameTick() + ticks_to_destroy < frame_tick) { | ||||||
|  |             DownloadBufferMemory(*buffer); | ||||||
|  |             DeleteBuffer(buffer_id); | ||||||
|  |         } | ||||||
|  |     } | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| template <class P> | template <class P> | ||||||
|  | @ -349,6 +387,10 @@ void BufferCache<P>::TickFrame() { | ||||||
|     const bool skip_preferred = hits * 256 < shots * 251; |     const bool skip_preferred = hits * 256 < shots * 251; | ||||||
|     uniform_buffer_skip_cache_size = skip_preferred ? DEFAULT_SKIP_CACHE_SIZE : 0; |     uniform_buffer_skip_cache_size = skip_preferred ? DEFAULT_SKIP_CACHE_SIZE : 0; | ||||||
| 
 | 
 | ||||||
|  |     if (Settings::values.use_caches_gc.GetValue() && total_used_memory >= EXPECTED_MEMORY) { | ||||||
|  |         RunGarbageCollector(); | ||||||
|  |     } | ||||||
|  |     ++frame_tick; | ||||||
|     delayed_destruction_ring.Tick(); |     delayed_destruction_ring.Tick(); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | @ -371,50 +413,8 @@ void BufferCache<P>::CachedWriteMemory(VAddr cpu_addr, u64 size) { | ||||||
| 
 | 
 | ||||||
| template <class P> | template <class P> | ||||||
| void BufferCache<P>::DownloadMemory(VAddr cpu_addr, u64 size) { | void BufferCache<P>::DownloadMemory(VAddr cpu_addr, u64 size) { | ||||||
|     ForEachBufferInRange(cpu_addr, size, [&](BufferId, Buffer& buffer) { |     ForEachBufferInRange(cpu_addr, size, | ||||||
|         boost::container::small_vector<BufferCopy, 1> copies; |                          [&](BufferId, Buffer& buffer) { DownloadBufferMemory(buffer); }); | ||||||
|         u64 total_size_bytes = 0; |  | ||||||
|         u64 largest_copy = 0; |  | ||||||
|         buffer.ForEachDownloadRange(cpu_addr, size, [&](u64 range_offset, u64 range_size) { |  | ||||||
|             copies.push_back(BufferCopy{ |  | ||||||
|                 .src_offset = range_offset, |  | ||||||
|                 .dst_offset = total_size_bytes, |  | ||||||
|                 .size = range_size, |  | ||||||
|             }); |  | ||||||
|             total_size_bytes += range_size; |  | ||||||
|             largest_copy = std::max(largest_copy, range_size); |  | ||||||
|         }); |  | ||||||
|         if (total_size_bytes == 0) { |  | ||||||
|             return; |  | ||||||
|         } |  | ||||||
|         MICROPROFILE_SCOPE(GPU_DownloadMemory); |  | ||||||
| 
 |  | ||||||
|         if constexpr (USE_MEMORY_MAPS) { |  | ||||||
|             auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes); |  | ||||||
|             const u8* const mapped_memory = download_staging.mapped_span.data(); |  | ||||||
|             const std::span<BufferCopy> copies_span(copies.data(), copies.data() + copies.size()); |  | ||||||
|             for (BufferCopy& copy : copies) { |  | ||||||
|                 // Modify copies to have the staging offset in mind
 |  | ||||||
|                 copy.dst_offset += download_staging.offset; |  | ||||||
|             } |  | ||||||
|             runtime.CopyBuffer(download_staging.buffer, buffer, copies_span); |  | ||||||
|             runtime.Finish(); |  | ||||||
|             for (const BufferCopy& copy : copies) { |  | ||||||
|                 const VAddr copy_cpu_addr = buffer.CpuAddr() + copy.src_offset; |  | ||||||
|                 // Undo the modified offset
 |  | ||||||
|                 const u64 dst_offset = copy.dst_offset - download_staging.offset; |  | ||||||
|                 const u8* copy_mapped_memory = mapped_memory + dst_offset; |  | ||||||
|                 cpu_memory.WriteBlockUnsafe(copy_cpu_addr, copy_mapped_memory, copy.size); |  | ||||||
|             } |  | ||||||
|         } else { |  | ||||||
|             const std::span<u8> immediate_buffer = ImmediateBuffer(largest_copy); |  | ||||||
|             for (const BufferCopy& copy : copies) { |  | ||||||
|                 buffer.ImmediateDownload(copy.src_offset, immediate_buffer.subspan(0, copy.size)); |  | ||||||
|                 const VAddr copy_cpu_addr = buffer.CpuAddr() + copy.src_offset; |  | ||||||
|                 cpu_memory.WriteBlockUnsafe(copy_cpu_addr, immediate_buffer.data(), copy.size); |  | ||||||
|             } |  | ||||||
|         } |  | ||||||
|     }); |  | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| template <class P> | template <class P> | ||||||
|  | @ -640,6 +640,7 @@ bool BufferCache<P>::IsRegionGpuModified(VAddr addr, size_t size) { | ||||||
| template <class P> | template <class P> | ||||||
| void BufferCache<P>::BindHostIndexBuffer() { | void BufferCache<P>::BindHostIndexBuffer() { | ||||||
|     Buffer& buffer = slot_buffers[index_buffer.buffer_id]; |     Buffer& buffer = slot_buffers[index_buffer.buffer_id]; | ||||||
|  |     TouchBuffer(buffer); | ||||||
|     const u32 offset = buffer.Offset(index_buffer.cpu_addr); |     const u32 offset = buffer.Offset(index_buffer.cpu_addr); | ||||||
|     const u32 size = index_buffer.size; |     const u32 size = index_buffer.size; | ||||||
|     SynchronizeBuffer(buffer, index_buffer.cpu_addr, size); |     SynchronizeBuffer(buffer, index_buffer.cpu_addr, size); | ||||||
|  | @ -658,6 +659,7 @@ void BufferCache<P>::BindHostVertexBuffers() { | ||||||
|     for (u32 index = 0; index < NUM_VERTEX_BUFFERS; ++index) { |     for (u32 index = 0; index < NUM_VERTEX_BUFFERS; ++index) { | ||||||
|         const Binding& binding = vertex_buffers[index]; |         const Binding& binding = vertex_buffers[index]; | ||||||
|         Buffer& buffer = slot_buffers[binding.buffer_id]; |         Buffer& buffer = slot_buffers[binding.buffer_id]; | ||||||
|  |         TouchBuffer(buffer); | ||||||
|         SynchronizeBuffer(buffer, binding.cpu_addr, binding.size); |         SynchronizeBuffer(buffer, binding.cpu_addr, binding.size); | ||||||
|         if (!flags[Dirty::VertexBuffer0 + index]) { |         if (!flags[Dirty::VertexBuffer0 + index]) { | ||||||
|             continue; |             continue; | ||||||
|  | @ -693,6 +695,7 @@ void BufferCache<P>::BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32 | ||||||
|     const VAddr cpu_addr = binding.cpu_addr; |     const VAddr cpu_addr = binding.cpu_addr; | ||||||
|     const u32 size = binding.size; |     const u32 size = binding.size; | ||||||
|     Buffer& buffer = slot_buffers[binding.buffer_id]; |     Buffer& buffer = slot_buffers[binding.buffer_id]; | ||||||
|  |     TouchBuffer(buffer); | ||||||
|     const bool use_fast_buffer = binding.buffer_id != NULL_BUFFER_ID && |     const bool use_fast_buffer = binding.buffer_id != NULL_BUFFER_ID && | ||||||
|                                  size <= uniform_buffer_skip_cache_size && |                                  size <= uniform_buffer_skip_cache_size && | ||||||
|                                  !buffer.IsRegionGpuModified(cpu_addr, size); |                                  !buffer.IsRegionGpuModified(cpu_addr, size); | ||||||
|  | @ -744,6 +747,7 @@ void BufferCache<P>::BindHostGraphicsStorageBuffers(size_t stage) { | ||||||
|     ForEachEnabledBit(enabled_storage_buffers[stage], [&](u32 index) { |     ForEachEnabledBit(enabled_storage_buffers[stage], [&](u32 index) { | ||||||
|         const Binding& binding = storage_buffers[stage][index]; |         const Binding& binding = storage_buffers[stage][index]; | ||||||
|         Buffer& buffer = slot_buffers[binding.buffer_id]; |         Buffer& buffer = slot_buffers[binding.buffer_id]; | ||||||
|  |         TouchBuffer(buffer); | ||||||
|         const u32 size = binding.size; |         const u32 size = binding.size; | ||||||
|         SynchronizeBuffer(buffer, binding.cpu_addr, size); |         SynchronizeBuffer(buffer, binding.cpu_addr, size); | ||||||
| 
 | 
 | ||||||
|  | @ -766,6 +770,7 @@ void BufferCache<P>::BindHostTransformFeedbackBuffers() { | ||||||
|     for (u32 index = 0; index < NUM_TRANSFORM_FEEDBACK_BUFFERS; ++index) { |     for (u32 index = 0; index < NUM_TRANSFORM_FEEDBACK_BUFFERS; ++index) { | ||||||
|         const Binding& binding = transform_feedback_buffers[index]; |         const Binding& binding = transform_feedback_buffers[index]; | ||||||
|         Buffer& buffer = slot_buffers[binding.buffer_id]; |         Buffer& buffer = slot_buffers[binding.buffer_id]; | ||||||
|  |         TouchBuffer(buffer); | ||||||
|         const u32 size = binding.size; |         const u32 size = binding.size; | ||||||
|         SynchronizeBuffer(buffer, binding.cpu_addr, size); |         SynchronizeBuffer(buffer, binding.cpu_addr, size); | ||||||
| 
 | 
 | ||||||
|  | @ -784,6 +789,7 @@ void BufferCache<P>::BindHostComputeUniformBuffers() { | ||||||
|     ForEachEnabledBit(enabled_compute_uniform_buffers, [&](u32 index) { |     ForEachEnabledBit(enabled_compute_uniform_buffers, [&](u32 index) { | ||||||
|         const Binding& binding = compute_uniform_buffers[index]; |         const Binding& binding = compute_uniform_buffers[index]; | ||||||
|         Buffer& buffer = slot_buffers[binding.buffer_id]; |         Buffer& buffer = slot_buffers[binding.buffer_id]; | ||||||
|  |         TouchBuffer(buffer); | ||||||
|         const u32 size = binding.size; |         const u32 size = binding.size; | ||||||
|         SynchronizeBuffer(buffer, binding.cpu_addr, size); |         SynchronizeBuffer(buffer, binding.cpu_addr, size); | ||||||
| 
 | 
 | ||||||
|  | @ -803,6 +809,7 @@ void BufferCache<P>::BindHostComputeStorageBuffers() { | ||||||
|     ForEachEnabledBit(enabled_compute_storage_buffers, [&](u32 index) { |     ForEachEnabledBit(enabled_compute_storage_buffers, [&](u32 index) { | ||||||
|         const Binding& binding = compute_storage_buffers[index]; |         const Binding& binding = compute_storage_buffers[index]; | ||||||
|         Buffer& buffer = slot_buffers[binding.buffer_id]; |         Buffer& buffer = slot_buffers[binding.buffer_id]; | ||||||
|  |         TouchBuffer(buffer); | ||||||
|         const u32 size = binding.size; |         const u32 size = binding.size; | ||||||
|         SynchronizeBuffer(buffer, binding.cpu_addr, size); |         SynchronizeBuffer(buffer, binding.cpu_addr, size); | ||||||
| 
 | 
 | ||||||
|  | @ -1101,6 +1108,7 @@ BufferId BufferCache<P>::CreateBuffer(VAddr cpu_addr, u32 wanted_size) { | ||||||
|     const OverlapResult overlap = ResolveOverlaps(cpu_addr, wanted_size); |     const OverlapResult overlap = ResolveOverlaps(cpu_addr, wanted_size); | ||||||
|     const u32 size = static_cast<u32>(overlap.end - overlap.begin); |     const u32 size = static_cast<u32>(overlap.end - overlap.begin); | ||||||
|     const BufferId new_buffer_id = slot_buffers.insert(runtime, rasterizer, overlap.begin, size); |     const BufferId new_buffer_id = slot_buffers.insert(runtime, rasterizer, overlap.begin, size); | ||||||
|  |     TouchBuffer(slot_buffers[new_buffer_id]); | ||||||
|     for (const BufferId overlap_id : overlap.ids) { |     for (const BufferId overlap_id : overlap.ids) { | ||||||
|         JoinOverlap(new_buffer_id, overlap_id, !overlap.has_stream_leap); |         JoinOverlap(new_buffer_id, overlap_id, !overlap.has_stream_leap); | ||||||
|     } |     } | ||||||
|  | @ -1122,8 +1130,14 @@ template <class P> | ||||||
| template <bool insert> | template <bool insert> | ||||||
| void BufferCache<P>::ChangeRegister(BufferId buffer_id) { | void BufferCache<P>::ChangeRegister(BufferId buffer_id) { | ||||||
|     const Buffer& buffer = slot_buffers[buffer_id]; |     const Buffer& buffer = slot_buffers[buffer_id]; | ||||||
|  |     const auto size = buffer.SizeBytes(); | ||||||
|  |     if (insert) { | ||||||
|  |         total_used_memory += Common::AlignUp(size, 1024); | ||||||
|  |     } else { | ||||||
|  |         total_used_memory -= Common::AlignUp(size, 1024); | ||||||
|  |     } | ||||||
|     const VAddr cpu_addr_begin = buffer.CpuAddr(); |     const VAddr cpu_addr_begin = buffer.CpuAddr(); | ||||||
|     const VAddr cpu_addr_end = cpu_addr_begin + buffer.SizeBytes(); |     const VAddr cpu_addr_end = cpu_addr_begin + size; | ||||||
|     const u64 page_begin = cpu_addr_begin / PAGE_SIZE; |     const u64 page_begin = cpu_addr_begin / PAGE_SIZE; | ||||||
|     const u64 page_end = Common::DivCeil(cpu_addr_end, PAGE_SIZE); |     const u64 page_end = Common::DivCeil(cpu_addr_end, PAGE_SIZE); | ||||||
|     for (u64 page = page_begin; page != page_end; ++page) { |     for (u64 page = page_begin; page != page_end; ++page) { | ||||||
|  | @ -1135,6 +1149,11 @@ void BufferCache<P>::ChangeRegister(BufferId buffer_id) { | ||||||
|     } |     } | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | template <class P> | ||||||
|  | void BufferCache<P>::TouchBuffer(Buffer& buffer) const noexcept { | ||||||
|  |     buffer.SetFrameTick(frame_tick); | ||||||
|  | } | ||||||
|  | 
 | ||||||
| template <class P> | template <class P> | ||||||
| bool BufferCache<P>::SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size) { | bool BufferCache<P>::SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size) { | ||||||
|     if (buffer.CpuAddr() == 0) { |     if (buffer.CpuAddr() == 0) { | ||||||
|  | @ -1211,6 +1230,57 @@ void BufferCache<P>::MappedUploadMemory(Buffer& buffer, u64 total_size_bytes, | ||||||
|     runtime.CopyBuffer(buffer, upload_staging.buffer, copies); |     runtime.CopyBuffer(buffer, upload_staging.buffer, copies); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | template <class P> | ||||||
|  | void BufferCache<P>::DownloadBufferMemory(Buffer& buffer) { | ||||||
|  |     DownloadBufferMemory(buffer, buffer.CpuAddr(), buffer.SizeBytes()); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | template <class P> | ||||||
|  | void BufferCache<P>::DownloadBufferMemory(Buffer& buffer, VAddr cpu_addr, u64 size) { | ||||||
|  |     boost::container::small_vector<BufferCopy, 1> copies; | ||||||
|  |     u64 total_size_bytes = 0; | ||||||
|  |     u64 largest_copy = 0; | ||||||
|  |     buffer.ForEachDownloadRange(cpu_addr, size, [&](u64 range_offset, u64 range_size) { | ||||||
|  |         copies.push_back(BufferCopy{ | ||||||
|  |             .src_offset = range_offset, | ||||||
|  |             .dst_offset = total_size_bytes, | ||||||
|  |             .size = range_size, | ||||||
|  |         }); | ||||||
|  |         total_size_bytes += range_size; | ||||||
|  |         largest_copy = std::max(largest_copy, range_size); | ||||||
|  |     }); | ||||||
|  |     if (total_size_bytes == 0) { | ||||||
|  |         return; | ||||||
|  |     } | ||||||
|  |     MICROPROFILE_SCOPE(GPU_DownloadMemory); | ||||||
|  | 
 | ||||||
|  |     if constexpr (USE_MEMORY_MAPS) { | ||||||
|  |         auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes); | ||||||
|  |         const u8* const mapped_memory = download_staging.mapped_span.data(); | ||||||
|  |         const std::span<BufferCopy> copies_span(copies.data(), copies.data() + copies.size()); | ||||||
|  |         for (BufferCopy& copy : copies) { | ||||||
|  |             // Modify copies to have the staging offset in mind
 | ||||||
|  |             copy.dst_offset += download_staging.offset; | ||||||
|  |         } | ||||||
|  |         runtime.CopyBuffer(download_staging.buffer, buffer, copies_span); | ||||||
|  |         runtime.Finish(); | ||||||
|  |         for (const BufferCopy& copy : copies) { | ||||||
|  |             const VAddr copy_cpu_addr = buffer.CpuAddr() + copy.src_offset; | ||||||
|  |             // Undo the modified offset
 | ||||||
|  |             const u64 dst_offset = copy.dst_offset - download_staging.offset; | ||||||
|  |             const u8* copy_mapped_memory = mapped_memory + dst_offset; | ||||||
|  |             cpu_memory.WriteBlockUnsafe(copy_cpu_addr, copy_mapped_memory, copy.size); | ||||||
|  |         } | ||||||
|  |     } else { | ||||||
|  |         const std::span<u8> immediate_buffer = ImmediateBuffer(largest_copy); | ||||||
|  |         for (const BufferCopy& copy : copies) { | ||||||
|  |             buffer.ImmediateDownload(copy.src_offset, immediate_buffer.subspan(0, copy.size)); | ||||||
|  |             const VAddr copy_cpu_addr = buffer.CpuAddr() + copy.src_offset; | ||||||
|  |             cpu_memory.WriteBlockUnsafe(copy_cpu_addr, immediate_buffer.data(), copy.size); | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  | } | ||||||
|  | 
 | ||||||
| template <class P> | template <class P> | ||||||
| void BufferCache<P>::DeleteBuffer(BufferId buffer_id) { | void BufferCache<P>::DeleteBuffer(BufferId buffer_id) { | ||||||
|     const auto scalar_replace = [buffer_id](Binding& binding) { |     const auto scalar_replace = [buffer_id](Binding& binding) { | ||||||
|  | @ -1236,6 +1306,7 @@ void BufferCache<P>::DeleteBuffer(BufferId buffer_id) { | ||||||
| 
 | 
 | ||||||
|     Unregister(buffer_id); |     Unregister(buffer_id); | ||||||
|     delayed_destruction_ring.Push(std::move(slot_buffers[buffer_id])); |     delayed_destruction_ring.Push(std::move(slot_buffers[buffer_id])); | ||||||
|  |     slot_buffers.erase(buffer_id); | ||||||
| 
 | 
 | ||||||
|     NotifyBufferDeletion(); |     NotifyBufferDeletion(); | ||||||
| } | } | ||||||
|  |  | ||||||
|  | @ -737,6 +737,8 @@ Image::Image(TextureCacheRuntime& runtime, const VideoCommon::ImageInfo& info_, | ||||||
|     } |     } | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | Image::~Image() = default; | ||||||
|  | 
 | ||||||
| void Image::UploadMemory(const ImageBufferMap& map, | void Image::UploadMemory(const ImageBufferMap& map, | ||||||
|                          std::span<const VideoCommon::BufferImageCopy> copies) { |                          std::span<const VideoCommon::BufferImageCopy> copies) { | ||||||
|     glBindBuffer(GL_PIXEL_UNPACK_BUFFER, map.buffer); |     glBindBuffer(GL_PIXEL_UNPACK_BUFFER, map.buffer); | ||||||
|  |  | ||||||
|  | @ -143,6 +143,14 @@ public: | ||||||
|     explicit Image(TextureCacheRuntime&, const VideoCommon::ImageInfo& info, GPUVAddr gpu_addr, |     explicit Image(TextureCacheRuntime&, const VideoCommon::ImageInfo& info, GPUVAddr gpu_addr, | ||||||
|                    VAddr cpu_addr); |                    VAddr cpu_addr); | ||||||
| 
 | 
 | ||||||
|  |     ~Image(); | ||||||
|  | 
 | ||||||
|  |     Image(const Image&) = delete; | ||||||
|  |     Image& operator=(const Image&) = delete; | ||||||
|  | 
 | ||||||
|  |     Image(Image&&) = default; | ||||||
|  |     Image& operator=(Image&&) = default; | ||||||
|  | 
 | ||||||
|     void UploadMemory(const ImageBufferMap& map, |     void UploadMemory(const ImageBufferMap& map, | ||||||
|                       std::span<const VideoCommon::BufferImageCopy> copies); |                       std::span<const VideoCommon::BufferImageCopy> copies); | ||||||
| 
 | 
 | ||||||
|  | @ -235,6 +243,7 @@ struct TextureCacheParams { | ||||||
|     static constexpr bool ENABLE_VALIDATION = true; |     static constexpr bool ENABLE_VALIDATION = true; | ||||||
|     static constexpr bool FRAMEBUFFER_BLITS = true; |     static constexpr bool FRAMEBUFFER_BLITS = true; | ||||||
|     static constexpr bool HAS_EMULATED_COPIES = true; |     static constexpr bool HAS_EMULATED_COPIES = true; | ||||||
|  |     static constexpr bool HAS_DEVICE_MEMORY_INFO = false; | ||||||
| 
 | 
 | ||||||
|     using Runtime = OpenGL::TextureCacheRuntime; |     using Runtime = OpenGL::TextureCacheRuntime; | ||||||
|     using Image = OpenGL::Image; |     using Image = OpenGL::Image; | ||||||
|  |  | ||||||
|  | @ -818,6 +818,10 @@ void TextureCacheRuntime::CopyImage(Image& dst, Image& src, | ||||||
|     }); |     }); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | u64 TextureCacheRuntime::GetDeviceLocalMemory() const { | ||||||
|  |     return device.GetDeviceLocalMemory(); | ||||||
|  | } | ||||||
|  | 
 | ||||||
| Image::Image(TextureCacheRuntime& runtime, const ImageInfo& info_, GPUVAddr gpu_addr_, | Image::Image(TextureCacheRuntime& runtime, const ImageInfo& info_, GPUVAddr gpu_addr_, | ||||||
|              VAddr cpu_addr_) |              VAddr cpu_addr_) | ||||||
|     : VideoCommon::ImageBase(info_, gpu_addr_, cpu_addr_), scheduler{&runtime.scheduler}, |     : VideoCommon::ImageBase(info_, gpu_addr_, cpu_addr_), scheduler{&runtime.scheduler}, | ||||||
|  | @ -876,6 +880,8 @@ Image::Image(TextureCacheRuntime& runtime, const ImageInfo& info_, GPUVAddr gpu_ | ||||||
|     } |     } | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | Image::~Image() = default; | ||||||
|  | 
 | ||||||
| void Image::UploadMemory(const StagingBufferRef& map, std::span<const BufferImageCopy> copies) { | void Image::UploadMemory(const StagingBufferRef& map, std::span<const BufferImageCopy> copies) { | ||||||
|     // TODO: Move this to another API
 |     // TODO: Move this to another API
 | ||||||
|     scheduler->RequestOutsideRenderPassOperationContext(); |     scheduler->RequestOutsideRenderPassOperationContext(); | ||||||
|  |  | ||||||
|  | @ -97,6 +97,8 @@ struct TextureCacheRuntime { | ||||||
|         // All known Vulkan drivers can natively handle BGR textures
 |         // All known Vulkan drivers can natively handle BGR textures
 | ||||||
|         return true; |         return true; | ||||||
|     } |     } | ||||||
|  | 
 | ||||||
|  |     u64 GetDeviceLocalMemory() const; | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
| class Image : public VideoCommon::ImageBase { | class Image : public VideoCommon::ImageBase { | ||||||
|  | @ -104,6 +106,14 @@ public: | ||||||
|     explicit Image(TextureCacheRuntime&, const VideoCommon::ImageInfo& info, GPUVAddr gpu_addr, |     explicit Image(TextureCacheRuntime&, const VideoCommon::ImageInfo& info, GPUVAddr gpu_addr, | ||||||
|                    VAddr cpu_addr); |                    VAddr cpu_addr); | ||||||
| 
 | 
 | ||||||
|  |     ~Image(); | ||||||
|  | 
 | ||||||
|  |     Image(const Image&) = delete; | ||||||
|  |     Image& operator=(const Image&) = delete; | ||||||
|  | 
 | ||||||
|  |     Image(Image&&) = default; | ||||||
|  |     Image& operator=(Image&&) = default; | ||||||
|  | 
 | ||||||
|     void UploadMemory(const StagingBufferRef& map, |     void UploadMemory(const StagingBufferRef& map, | ||||||
|                       std::span<const VideoCommon::BufferImageCopy> copies); |                       std::span<const VideoCommon::BufferImageCopy> copies); | ||||||
| 
 | 
 | ||||||
|  | @ -257,6 +267,7 @@ struct TextureCacheParams { | ||||||
|     static constexpr bool ENABLE_VALIDATION = true; |     static constexpr bool ENABLE_VALIDATION = true; | ||||||
|     static constexpr bool FRAMEBUFFER_BLITS = false; |     static constexpr bool FRAMEBUFFER_BLITS = false; | ||||||
|     static constexpr bool HAS_EMULATED_COPIES = false; |     static constexpr bool HAS_EMULATED_COPIES = false; | ||||||
|  |     static constexpr bool HAS_DEVICE_MEMORY_INFO = true; | ||||||
| 
 | 
 | ||||||
|     using Runtime = Vulkan::TextureCacheRuntime; |     using Runtime = Vulkan::TextureCacheRuntime; | ||||||
|     using Image = Vulkan::Image; |     using Image = Vulkan::Image; | ||||||
|  |  | ||||||
|  | @ -283,4 +283,11 @@ std::pair<u32, u32> GetASTCBlockSize(PixelFormat format) { | ||||||
|     return {DefaultBlockWidth(format), DefaultBlockHeight(format)}; |     return {DefaultBlockWidth(format), DefaultBlockHeight(format)}; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | u64 EstimatedDecompressedSize(u64 base_size, PixelFormat format) { | ||||||
|  |     constexpr u64 RGBA8_PIXEL_SIZE = 4; | ||||||
|  |     const u64 base_block_size = static_cast<u64>(DefaultBlockWidth(format)) * | ||||||
|  |                                 static_cast<u64>(DefaultBlockHeight(format)) * RGBA8_PIXEL_SIZE; | ||||||
|  |     return (base_size * base_block_size) / BytesPerBlock(format); | ||||||
|  | } | ||||||
|  | 
 | ||||||
| } // namespace VideoCore::Surface
 | } // namespace VideoCore::Surface
 | ||||||
|  |  | ||||||
|  | @ -462,4 +462,6 @@ bool IsPixelFormatSRGB(PixelFormat format); | ||||||
| 
 | 
 | ||||||
| std::pair<u32, u32> GetASTCBlockSize(PixelFormat format); | std::pair<u32, u32> GetASTCBlockSize(PixelFormat format); | ||||||
| 
 | 
 | ||||||
|  | u64 EstimatedDecompressedSize(u64 base_size, PixelFormat format); | ||||||
|  | 
 | ||||||
| } // namespace VideoCore::Surface
 | } // namespace VideoCore::Surface
 | ||||||
|  |  | ||||||
|  | @ -113,6 +113,43 @@ void ImageBase::InsertView(const ImageViewInfo& view_info, ImageViewId image_vie | ||||||
|     image_view_ids.push_back(image_view_id); |     image_view_ids.push_back(image_view_id); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | bool ImageBase::IsSafeDownload() const noexcept { | ||||||
|  |     // Skip images that were not modified from the GPU
 | ||||||
|  |     if (False(flags & ImageFlagBits::GpuModified)) { | ||||||
|  |         return false; | ||||||
|  |     } | ||||||
|  |     // Skip images that .are. modified from the CPU
 | ||||||
|  |     // We don't want to write sensitive data from the guest
 | ||||||
|  |     if (True(flags & ImageFlagBits::CpuModified)) { | ||||||
|  |         return false; | ||||||
|  |     } | ||||||
|  |     if (info.num_samples > 1) { | ||||||
|  |         LOG_WARNING(HW_GPU, "MSAA image downloads are not implemented"); | ||||||
|  |         return false; | ||||||
|  |     } | ||||||
|  |     return true; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void ImageBase::CheckBadOverlapState() { | ||||||
|  |     if (False(flags & ImageFlagBits::BadOverlap)) { | ||||||
|  |         return; | ||||||
|  |     } | ||||||
|  |     if (!overlapping_images.empty()) { | ||||||
|  |         return; | ||||||
|  |     } | ||||||
|  |     flags &= ~ImageFlagBits::BadOverlap; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void ImageBase::CheckAliasState() { | ||||||
|  |     if (False(flags & ImageFlagBits::Alias)) { | ||||||
|  |         return; | ||||||
|  |     } | ||||||
|  |     if (!aliased_images.empty()) { | ||||||
|  |         return; | ||||||
|  |     } | ||||||
|  |     flags &= ~ImageFlagBits::Alias; | ||||||
|  | } | ||||||
|  | 
 | ||||||
| void AddImageAlias(ImageBase& lhs, ImageBase& rhs, ImageId lhs_id, ImageId rhs_id) { | void AddImageAlias(ImageBase& lhs, ImageBase& rhs, ImageId lhs_id, ImageId rhs_id) { | ||||||
|     static constexpr auto OPTIONS = RelaxedOptions::Size | RelaxedOptions::Format; |     static constexpr auto OPTIONS = RelaxedOptions::Size | RelaxedOptions::Format; | ||||||
|     ASSERT(lhs.info.type == rhs.info.type); |     ASSERT(lhs.info.type == rhs.info.type); | ||||||
|  |  | ||||||
|  | @ -25,6 +25,12 @@ enum class ImageFlagBits : u32 { | ||||||
|     Strong = 1 << 5,      ///< Exists in the image table, the dimensions are can be trusted
 |     Strong = 1 << 5,      ///< Exists in the image table, the dimensions are can be trusted
 | ||||||
|     Registered = 1 << 6,  ///< True when the image is registered
 |     Registered = 1 << 6,  ///< True when the image is registered
 | ||||||
|     Picked = 1 << 7,      ///< Temporary flag to mark the image as picked
 |     Picked = 1 << 7,      ///< Temporary flag to mark the image as picked
 | ||||||
|  | 
 | ||||||
|  |     // Garbage Collection Flags
 | ||||||
|  |     BadOverlap = 1 << 8, ///< This image overlaps other but doesn't fit, has higher
 | ||||||
|  |                          ///< garbage collection priority
 | ||||||
|  |     Alias = 1 << 9,      ///< This image has aliases and has priority on garbage
 | ||||||
|  |                          ///< collection
 | ||||||
| }; | }; | ||||||
| DECLARE_ENUM_FLAG_OPERATORS(ImageFlagBits) | DECLARE_ENUM_FLAG_OPERATORS(ImageFlagBits) | ||||||
| 
 | 
 | ||||||
|  | @ -44,11 +50,16 @@ struct ImageBase { | ||||||
| 
 | 
 | ||||||
|     void InsertView(const ImageViewInfo& view_info, ImageViewId image_view_id); |     void InsertView(const ImageViewInfo& view_info, ImageViewId image_view_id); | ||||||
| 
 | 
 | ||||||
|  |     [[nodiscard]] bool IsSafeDownload() const noexcept; | ||||||
|  | 
 | ||||||
|     [[nodiscard]] bool Overlaps(VAddr overlap_cpu_addr, size_t overlap_size) const noexcept { |     [[nodiscard]] bool Overlaps(VAddr overlap_cpu_addr, size_t overlap_size) const noexcept { | ||||||
|         const VAddr overlap_end = overlap_cpu_addr + overlap_size; |         const VAddr overlap_end = overlap_cpu_addr + overlap_size; | ||||||
|         return cpu_addr < overlap_end && overlap_cpu_addr < cpu_addr_end; |         return cpu_addr < overlap_end && overlap_cpu_addr < cpu_addr_end; | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|  |     void CheckBadOverlapState(); | ||||||
|  |     void CheckAliasState(); | ||||||
|  | 
 | ||||||
|     ImageInfo info; |     ImageInfo info; | ||||||
| 
 | 
 | ||||||
|     u32 guest_size_bytes = 0; |     u32 guest_size_bytes = 0; | ||||||
|  | @ -72,6 +83,7 @@ struct ImageBase { | ||||||
|     std::vector<SubresourceBase> slice_subresources; |     std::vector<SubresourceBase> slice_subresources; | ||||||
| 
 | 
 | ||||||
|     std::vector<AliasedImage> aliased_images; |     std::vector<AliasedImage> aliased_images; | ||||||
|  |     std::vector<ImageId> overlapping_images; | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
| struct ImageAllocBase { | struct ImageAllocBase { | ||||||
|  |  | ||||||
|  | @ -5,6 +5,7 @@ | ||||||
| #pragma once | #pragma once | ||||||
| 
 | 
 | ||||||
| #include <array> | #include <array> | ||||||
|  | #include <bit> | ||||||
| #include <concepts> | #include <concepts> | ||||||
| #include <numeric> | #include <numeric> | ||||||
| #include <type_traits> | #include <type_traits> | ||||||
|  | @ -32,6 +33,60 @@ template <class T> | ||||||
| requires std::is_nothrow_move_assignable_v<T>&& | requires std::is_nothrow_move_assignable_v<T>&& | ||||||
|     std::is_nothrow_move_constructible_v<T> class SlotVector { |     std::is_nothrow_move_constructible_v<T> class SlotVector { | ||||||
| public: | public: | ||||||
|  |     class Iterator { | ||||||
|  |         friend SlotVector<T>; | ||||||
|  | 
 | ||||||
|  |     public: | ||||||
|  |         constexpr Iterator() = default; | ||||||
|  | 
 | ||||||
|  |         Iterator& operator++() noexcept { | ||||||
|  |             const u64* const bitset = slot_vector->stored_bitset.data(); | ||||||
|  |             const u32 size = static_cast<u32>(slot_vector->stored_bitset.size()) * 64; | ||||||
|  |             if (id.index < size) { | ||||||
|  |                 do { | ||||||
|  |                     ++id.index; | ||||||
|  |                 } while (id.index < size && !IsValid(bitset)); | ||||||
|  |                 if (id.index == size) { | ||||||
|  |                     id.index = SlotId::INVALID_INDEX; | ||||||
|  |                 } | ||||||
|  |             } | ||||||
|  |             return *this; | ||||||
|  |         } | ||||||
|  | 
 | ||||||
|  |         Iterator operator++(int) noexcept { | ||||||
|  |             const Iterator copy{*this}; | ||||||
|  |             ++*this; | ||||||
|  |             return copy; | ||||||
|  |         } | ||||||
|  | 
 | ||||||
|  |         bool operator==(const Iterator& other) const noexcept { | ||||||
|  |             return id.index == other.id.index; | ||||||
|  |         } | ||||||
|  | 
 | ||||||
|  |         bool operator!=(const Iterator& other) const noexcept { | ||||||
|  |             return id.index != other.id.index; | ||||||
|  |         } | ||||||
|  | 
 | ||||||
|  |         std::pair<SlotId, T*> operator*() const noexcept { | ||||||
|  |             return {id, std::addressof((*slot_vector)[id])}; | ||||||
|  |         } | ||||||
|  | 
 | ||||||
|  |         T* operator->() const noexcept { | ||||||
|  |             return std::addressof((*slot_vector)[id]); | ||||||
|  |         } | ||||||
|  | 
 | ||||||
|  |     private: | ||||||
|  |         Iterator(SlotVector<T>* slot_vector_, SlotId id_) noexcept | ||||||
|  |             : slot_vector{slot_vector_}, id{id_} {} | ||||||
|  | 
 | ||||||
|  |         bool IsValid(const u64* bitset) const noexcept { | ||||||
|  |             return ((bitset[id.index / 64] >> (id.index % 64)) & 1) != 0; | ||||||
|  |         } | ||||||
|  | 
 | ||||||
|  |         SlotVector<T>* slot_vector; | ||||||
|  |         SlotId id; | ||||||
|  |     }; | ||||||
|  | 
 | ||||||
|     ~SlotVector() noexcept { |     ~SlotVector() noexcept { | ||||||
|         size_t index = 0; |         size_t index = 0; | ||||||
|         for (u64 bits : stored_bitset) { |         for (u64 bits : stored_bitset) { | ||||||
|  | @ -70,6 +125,20 @@ public: | ||||||
|         ResetStorageBit(id.index); |         ResetStorageBit(id.index); | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|  |     [[nodiscard]] Iterator begin() noexcept { | ||||||
|  |         const auto it = std::ranges::find_if(stored_bitset, [](u64 value) { return value != 0; }); | ||||||
|  |         if (it == stored_bitset.end()) { | ||||||
|  |             return end(); | ||||||
|  |         } | ||||||
|  |         const u32 word_index = static_cast<u32>(std::distance(it, stored_bitset.begin())); | ||||||
|  |         const SlotId first_id{word_index * 64 + static_cast<u32>(std::countr_zero(*it))}; | ||||||
|  |         return Iterator(this, first_id); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     [[nodiscard]] Iterator end() noexcept { | ||||||
|  |         return Iterator(this, SlotId{SlotId::INVALID_INDEX}); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
| private: | private: | ||||||
|     struct NonTrivialDummy { |     struct NonTrivialDummy { | ||||||
|         NonTrivialDummy() noexcept {} |         NonTrivialDummy() noexcept {} | ||||||
|  | @ -140,7 +209,6 @@ private: | ||||||
| 
 | 
 | ||||||
|     Entry* values = nullptr; |     Entry* values = nullptr; | ||||||
|     size_t values_capacity = 0; |     size_t values_capacity = 0; | ||||||
|     size_t values_size = 0; |  | ||||||
| 
 | 
 | ||||||
|     std::vector<u64> stored_bitset; |     std::vector<u64> stored_bitset; | ||||||
|     std::vector<u32> free_list; |     std::vector<u32> free_list; | ||||||
|  |  | ||||||
|  | @ -20,8 +20,10 @@ | ||||||
| 
 | 
 | ||||||
| #include "common/alignment.h" | #include "common/alignment.h" | ||||||
| #include "common/common_funcs.h" | #include "common/common_funcs.h" | ||||||
|  | #include "common/common_sizes.h" | ||||||
| #include "common/common_types.h" | #include "common/common_types.h" | ||||||
| #include "common/logging/log.h" | #include "common/logging/log.h" | ||||||
|  | #include "common/settings.h" | ||||||
| #include "video_core/compatible_formats.h" | #include "video_core/compatible_formats.h" | ||||||
| #include "video_core/delayed_destruction_ring.h" | #include "video_core/delayed_destruction_ring.h" | ||||||
| #include "video_core/dirty_flags.h" | #include "video_core/dirty_flags.h" | ||||||
|  | @ -69,12 +71,17 @@ class TextureCache { | ||||||
|     static constexpr bool FRAMEBUFFER_BLITS = P::FRAMEBUFFER_BLITS; |     static constexpr bool FRAMEBUFFER_BLITS = P::FRAMEBUFFER_BLITS; | ||||||
|     /// True when some copies have to be emulated
 |     /// True when some copies have to be emulated
 | ||||||
|     static constexpr bool HAS_EMULATED_COPIES = P::HAS_EMULATED_COPIES; |     static constexpr bool HAS_EMULATED_COPIES = P::HAS_EMULATED_COPIES; | ||||||
|  |     /// True when the API can provide info about the memory of the device.
 | ||||||
|  |     static constexpr bool HAS_DEVICE_MEMORY_INFO = P::HAS_DEVICE_MEMORY_INFO; | ||||||
| 
 | 
 | ||||||
|     /// Image view ID for null descriptors
 |     /// Image view ID for null descriptors
 | ||||||
|     static constexpr ImageViewId NULL_IMAGE_VIEW_ID{0}; |     static constexpr ImageViewId NULL_IMAGE_VIEW_ID{0}; | ||||||
|     /// Sampler ID for bugged sampler ids
 |     /// Sampler ID for bugged sampler ids
 | ||||||
|     static constexpr SamplerId NULL_SAMPLER_ID{0}; |     static constexpr SamplerId NULL_SAMPLER_ID{0}; | ||||||
| 
 | 
 | ||||||
|  |     static constexpr u64 DEFAULT_EXPECTED_MEMORY = Common::Size_1_GB; | ||||||
|  |     static constexpr u64 DEFAULT_CRITICAL_MEMORY = Common::Size_2_GB; | ||||||
|  | 
 | ||||||
|     using Runtime = typename P::Runtime; |     using Runtime = typename P::Runtime; | ||||||
|     using Image = typename P::Image; |     using Image = typename P::Image; | ||||||
|     using ImageAlloc = typename P::ImageAlloc; |     using ImageAlloc = typename P::ImageAlloc; | ||||||
|  | @ -103,6 +110,9 @@ public: | ||||||
|     /// Notify the cache that a new frame has been queued
 |     /// Notify the cache that a new frame has been queued
 | ||||||
|     void TickFrame(); |     void TickFrame(); | ||||||
| 
 | 
 | ||||||
|  |     /// Runs the Garbage Collector.
 | ||||||
|  |     void RunGarbageCollector(); | ||||||
|  | 
 | ||||||
|     /// Return a constant reference to the given image view id
 |     /// Return a constant reference to the given image view id
 | ||||||
|     [[nodiscard]] const ImageView& GetImageView(ImageViewId id) const noexcept; |     [[nodiscard]] const ImageView& GetImageView(ImageViewId id) const noexcept; | ||||||
| 
 | 
 | ||||||
|  | @ -333,6 +343,10 @@ private: | ||||||
|     std::unordered_map<u64, std::vector<ImageId>, IdentityHash<u64>> page_table; |     std::unordered_map<u64, std::vector<ImageId>, IdentityHash<u64>> page_table; | ||||||
| 
 | 
 | ||||||
|     bool has_deleted_images = false; |     bool has_deleted_images = false; | ||||||
|  |     u64 total_used_memory = 0; | ||||||
|  |     u64 minimum_memory; | ||||||
|  |     u64 expected_memory; | ||||||
|  |     u64 critical_memory; | ||||||
| 
 | 
 | ||||||
|     SlotVector<Image> slot_images; |     SlotVector<Image> slot_images; | ||||||
|     SlotVector<ImageView> slot_image_views; |     SlotVector<ImageView> slot_image_views; | ||||||
|  | @ -353,6 +367,7 @@ private: | ||||||
| 
 | 
 | ||||||
|     u64 modification_tick = 0; |     u64 modification_tick = 0; | ||||||
|     u64 frame_tick = 0; |     u64 frame_tick = 0; | ||||||
|  |     typename SlotVector<Image>::Iterator deletion_iterator; | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
| template <class P> | template <class P> | ||||||
|  | @ -373,11 +388,94 @@ TextureCache<P>::TextureCache(Runtime& runtime_, VideoCore::RasterizerInterface& | ||||||
|     // This way the null resource becomes a compile time constant
 |     // This way the null resource becomes a compile time constant
 | ||||||
|     void(slot_image_views.insert(runtime, NullImageParams{})); |     void(slot_image_views.insert(runtime, NullImageParams{})); | ||||||
|     void(slot_samplers.insert(runtime, sampler_descriptor)); |     void(slot_samplers.insert(runtime, sampler_descriptor)); | ||||||
|  | 
 | ||||||
|  |     deletion_iterator = slot_images.begin(); | ||||||
|  | 
 | ||||||
|  |     if constexpr (HAS_DEVICE_MEMORY_INFO) { | ||||||
|  |         const auto device_memory = runtime.GetDeviceLocalMemory(); | ||||||
|  |         const u64 possible_expected_memory = (device_memory * 3) / 10; | ||||||
|  |         const u64 possible_critical_memory = (device_memory * 6) / 10; | ||||||
|  |         expected_memory = std::max(possible_expected_memory, DEFAULT_EXPECTED_MEMORY); | ||||||
|  |         critical_memory = std::max(possible_critical_memory, DEFAULT_CRITICAL_MEMORY); | ||||||
|  |         minimum_memory = 0; | ||||||
|  |     } else { | ||||||
|  |         // On OpenGL we can be more conservative, as the driver takes care of it.
 | ||||||
|  |         expected_memory = DEFAULT_EXPECTED_MEMORY + Common::Size_512_MB; | ||||||
|  |         critical_memory = DEFAULT_CRITICAL_MEMORY + Common::Size_1_GB; | ||||||
|  |         minimum_memory = expected_memory; | ||||||
|  |     } | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | template <class P> | ||||||
|  | void TextureCache<P>::RunGarbageCollector() { | ||||||
|  |     const bool high_priority_mode = total_used_memory >= expected_memory; | ||||||
|  |     const bool aggressive_mode = total_used_memory >= critical_memory; | ||||||
|  |     const u64 ticks_to_destroy = high_priority_mode ? 60 : 100; | ||||||
|  |     int num_iterations = aggressive_mode ? 256 : (high_priority_mode ? 128 : 64); | ||||||
|  |     for (; num_iterations > 0; --num_iterations) { | ||||||
|  |         if (deletion_iterator == slot_images.end()) { | ||||||
|  |             deletion_iterator = slot_images.begin(); | ||||||
|  |             if (deletion_iterator == slot_images.end()) { | ||||||
|  |                 break; | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |         auto [image_id, image_tmp] = *deletion_iterator; | ||||||
|  |         Image* image = image_tmp; // fix clang error.
 | ||||||
|  |         const bool is_alias = True(image->flags & ImageFlagBits::Alias); | ||||||
|  |         const bool is_bad_overlap = True(image->flags & ImageFlagBits::BadOverlap); | ||||||
|  |         const bool must_download = image->IsSafeDownload(); | ||||||
|  |         bool should_care = is_bad_overlap || is_alias || (high_priority_mode && !must_download); | ||||||
|  |         const u64 ticks_needed = | ||||||
|  |             is_bad_overlap | ||||||
|  |                 ? ticks_to_destroy >> 4 | ||||||
|  |                 : ((should_care && aggressive_mode) ? ticks_to_destroy >> 1 : ticks_to_destroy); | ||||||
|  |         should_care |= aggressive_mode; | ||||||
|  |         if (should_care && image->frame_tick + ticks_needed < frame_tick) { | ||||||
|  |             if (is_bad_overlap) { | ||||||
|  |                 const bool overlap_check = std::ranges::all_of( | ||||||
|  |                     image->overlapping_images, [&, image](const ImageId& overlap_id) { | ||||||
|  |                         auto& overlap = slot_images[overlap_id]; | ||||||
|  |                         return overlap.frame_tick >= image->frame_tick; | ||||||
|  |                     }); | ||||||
|  |                 if (!overlap_check) { | ||||||
|  |                     ++deletion_iterator; | ||||||
|  |                     continue; | ||||||
|  |                 } | ||||||
|  |             } | ||||||
|  |             if (!is_bad_overlap && must_download) { | ||||||
|  |                 const bool alias_check = std::ranges::none_of( | ||||||
|  |                     image->aliased_images, [&, image](const AliasedImage& alias) { | ||||||
|  |                         auto& alias_image = slot_images[alias.id]; | ||||||
|  |                         return (alias_image.frame_tick < image->frame_tick) || | ||||||
|  |                                (alias_image.modification_tick < image->modification_tick); | ||||||
|  |                     }); | ||||||
|  | 
 | ||||||
|  |                 if (alias_check) { | ||||||
|  |                     auto map = runtime.DownloadStagingBuffer(image->unswizzled_size_bytes); | ||||||
|  |                     const auto copies = FullDownloadCopies(image->info); | ||||||
|  |                     image->DownloadMemory(map, copies); | ||||||
|  |                     runtime.Finish(); | ||||||
|  |                     SwizzleImage(gpu_memory, image->gpu_addr, image->info, copies, map.mapped_span); | ||||||
|  |                 } | ||||||
|  |             } | ||||||
|  |             if (True(image->flags & ImageFlagBits::Tracked)) { | ||||||
|  |                 UntrackImage(*image); | ||||||
|  |             } | ||||||
|  |             UnregisterImage(image_id); | ||||||
|  |             DeleteImage(image_id); | ||||||
|  |             if (is_bad_overlap) { | ||||||
|  |                 ++num_iterations; | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |         ++deletion_iterator; | ||||||
|  |     } | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| template <class P> | template <class P> | ||||||
| void TextureCache<P>::TickFrame() { | void TextureCache<P>::TickFrame() { | ||||||
|     // Tick sentenced resources in this order to ensure they are destroyed in the right order
 |     if (Settings::values.use_caches_gc.GetValue() && total_used_memory > minimum_memory) { | ||||||
|  |         RunGarbageCollector(); | ||||||
|  |     } | ||||||
|     sentenced_images.Tick(); |     sentenced_images.Tick(); | ||||||
|     sentenced_framebuffers.Tick(); |     sentenced_framebuffers.Tick(); | ||||||
|     sentenced_image_view.Tick(); |     sentenced_image_view.Tick(); | ||||||
|  | @ -568,17 +666,7 @@ template <class P> | ||||||
| void TextureCache<P>::DownloadMemory(VAddr cpu_addr, size_t size) { | void TextureCache<P>::DownloadMemory(VAddr cpu_addr, size_t size) { | ||||||
|     std::vector<ImageId> images; |     std::vector<ImageId> images; | ||||||
|     ForEachImageInRegion(cpu_addr, size, [this, &images](ImageId image_id, ImageBase& image) { |     ForEachImageInRegion(cpu_addr, size, [this, &images](ImageId image_id, ImageBase& image) { | ||||||
|         // Skip images that were not modified from the GPU
 |         if (!image.IsSafeDownload()) { | ||||||
|         if (False(image.flags & ImageFlagBits::GpuModified)) { |  | ||||||
|             return; |  | ||||||
|         } |  | ||||||
|         // Skip images that .are. modified from the CPU
 |  | ||||||
|         // We don't want to write sensitive data from the guest
 |  | ||||||
|         if (True(image.flags & ImageFlagBits::CpuModified)) { |  | ||||||
|             return; |  | ||||||
|         } |  | ||||||
|         if (image.info.num_samples > 1) { |  | ||||||
|             LOG_WARNING(HW_GPU, "MSAA image downloads are not implemented"); |  | ||||||
|             return; |             return; | ||||||
|         } |         } | ||||||
|         image.flags &= ~ImageFlagBits::GpuModified; |         image.flags &= ~ImageFlagBits::GpuModified; | ||||||
|  | @ -967,6 +1055,7 @@ ImageId TextureCache<P>::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VA | ||||||
|     std::vector<ImageId> overlap_ids; |     std::vector<ImageId> overlap_ids; | ||||||
|     std::vector<ImageId> left_aliased_ids; |     std::vector<ImageId> left_aliased_ids; | ||||||
|     std::vector<ImageId> right_aliased_ids; |     std::vector<ImageId> right_aliased_ids; | ||||||
|  |     std::vector<ImageId> bad_overlap_ids; | ||||||
|     ForEachImageInRegion(cpu_addr, size_bytes, [&](ImageId overlap_id, ImageBase& overlap) { |     ForEachImageInRegion(cpu_addr, size_bytes, [&](ImageId overlap_id, ImageBase& overlap) { | ||||||
|         if (info.type != overlap.info.type) { |         if (info.type != overlap.info.type) { | ||||||
|             return; |             return; | ||||||
|  | @ -992,9 +1081,14 @@ ImageId TextureCache<P>::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VA | ||||||
|         const ImageBase new_image_base(new_info, gpu_addr, cpu_addr); |         const ImageBase new_image_base(new_info, gpu_addr, cpu_addr); | ||||||
|         if (IsSubresource(new_info, overlap, gpu_addr, options, broken_views, native_bgr)) { |         if (IsSubresource(new_info, overlap, gpu_addr, options, broken_views, native_bgr)) { | ||||||
|             left_aliased_ids.push_back(overlap_id); |             left_aliased_ids.push_back(overlap_id); | ||||||
|  |             overlap.flags |= ImageFlagBits::Alias; | ||||||
|         } else if (IsSubresource(overlap.info, new_image_base, overlap.gpu_addr, options, |         } else if (IsSubresource(overlap.info, new_image_base, overlap.gpu_addr, options, | ||||||
|                                  broken_views, native_bgr)) { |                                  broken_views, native_bgr)) { | ||||||
|             right_aliased_ids.push_back(overlap_id); |             right_aliased_ids.push_back(overlap_id); | ||||||
|  |             overlap.flags |= ImageFlagBits::Alias; | ||||||
|  |         } else { | ||||||
|  |             bad_overlap_ids.push_back(overlap_id); | ||||||
|  |             overlap.flags |= ImageFlagBits::BadOverlap; | ||||||
|         } |         } | ||||||
|     }); |     }); | ||||||
|     const ImageId new_image_id = slot_images.insert(runtime, new_info, gpu_addr, cpu_addr); |     const ImageId new_image_id = slot_images.insert(runtime, new_info, gpu_addr, cpu_addr); | ||||||
|  | @ -1022,10 +1116,18 @@ ImageId TextureCache<P>::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VA | ||||||
|     for (const ImageId aliased_id : right_aliased_ids) { |     for (const ImageId aliased_id : right_aliased_ids) { | ||||||
|         ImageBase& aliased = slot_images[aliased_id]; |         ImageBase& aliased = slot_images[aliased_id]; | ||||||
|         AddImageAlias(new_image_base, aliased, new_image_id, aliased_id); |         AddImageAlias(new_image_base, aliased, new_image_id, aliased_id); | ||||||
|  |         new_image.flags |= ImageFlagBits::Alias; | ||||||
|     } |     } | ||||||
|     for (const ImageId aliased_id : left_aliased_ids) { |     for (const ImageId aliased_id : left_aliased_ids) { | ||||||
|         ImageBase& aliased = slot_images[aliased_id]; |         ImageBase& aliased = slot_images[aliased_id]; | ||||||
|         AddImageAlias(aliased, new_image_base, aliased_id, new_image_id); |         AddImageAlias(aliased, new_image_base, aliased_id, new_image_id); | ||||||
|  |         new_image.flags |= ImageFlagBits::Alias; | ||||||
|  |     } | ||||||
|  |     for (const ImageId aliased_id : bad_overlap_ids) { | ||||||
|  |         ImageBase& aliased = slot_images[aliased_id]; | ||||||
|  |         aliased.overlapping_images.push_back(new_image_id); | ||||||
|  |         new_image.overlapping_images.push_back(aliased_id); | ||||||
|  |         new_image.flags |= ImageFlagBits::BadOverlap; | ||||||
|     } |     } | ||||||
|     RegisterImage(new_image_id); |     RegisterImage(new_image_id); | ||||||
|     return new_image_id; |     return new_image_id; | ||||||
|  | @ -1195,6 +1297,13 @@ void TextureCache<P>::RegisterImage(ImageId image_id) { | ||||||
|     image.flags |= ImageFlagBits::Registered; |     image.flags |= ImageFlagBits::Registered; | ||||||
|     ForEachPage(image.cpu_addr, image.guest_size_bytes, |     ForEachPage(image.cpu_addr, image.guest_size_bytes, | ||||||
|                 [this, image_id](u64 page) { page_table[page].push_back(image_id); }); |                 [this, image_id](u64 page) { page_table[page].push_back(image_id); }); | ||||||
|  |     u64 tentative_size = std::max(image.guest_size_bytes, image.unswizzled_size_bytes); | ||||||
|  |     if ((IsPixelFormatASTC(image.info.format) && | ||||||
|  |          True(image.flags & ImageFlagBits::AcceleratedUpload)) || | ||||||
|  |         True(image.flags & ImageFlagBits::Converted)) { | ||||||
|  |         tentative_size = EstimatedDecompressedSize(tentative_size, image.info.format); | ||||||
|  |     } | ||||||
|  |     total_used_memory += Common::AlignUp(tentative_size, 1024); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| template <class P> | template <class P> | ||||||
|  | @ -1203,6 +1312,14 @@ void TextureCache<P>::UnregisterImage(ImageId image_id) { | ||||||
|     ASSERT_MSG(True(image.flags & ImageFlagBits::Registered), |     ASSERT_MSG(True(image.flags & ImageFlagBits::Registered), | ||||||
|                "Trying to unregister an already registered image"); |                "Trying to unregister an already registered image"); | ||||||
|     image.flags &= ~ImageFlagBits::Registered; |     image.flags &= ~ImageFlagBits::Registered; | ||||||
|  |     image.flags &= ~ImageFlagBits::BadOverlap; | ||||||
|  |     u64 tentative_size = std::max(image.guest_size_bytes, image.unswizzled_size_bytes); | ||||||
|  |     if ((IsPixelFormatASTC(image.info.format) && | ||||||
|  |          True(image.flags & ImageFlagBits::AcceleratedUpload)) || | ||||||
|  |         True(image.flags & ImageFlagBits::Converted)) { | ||||||
|  |         tentative_size = EstimatedDecompressedSize(tentative_size, image.info.format); | ||||||
|  |     } | ||||||
|  |     total_used_memory -= Common::AlignUp(tentative_size, 1024); | ||||||
|     ForEachPage(image.cpu_addr, image.guest_size_bytes, [this, image_id](u64 page) { |     ForEachPage(image.cpu_addr, image.guest_size_bytes, [this, image_id](u64 page) { | ||||||
|         const auto page_it = page_table.find(page); |         const auto page_it = page_table.find(page); | ||||||
|         if (page_it == page_table.end()) { |         if (page_it == page_table.end()) { | ||||||
|  | @ -1276,9 +1393,19 @@ void TextureCache<P>::DeleteImage(ImageId image_id) { | ||||||
|             std::erase_if(other_image.aliased_images, [image_id](const AliasedImage& other_alias) { |             std::erase_if(other_image.aliased_images, [image_id](const AliasedImage& other_alias) { | ||||||
|                 return other_alias.id == image_id; |                 return other_alias.id == image_id; | ||||||
|             }); |             }); | ||||||
|  |         other_image.CheckAliasState(); | ||||||
|         ASSERT_MSG(num_removed_aliases == 1, "Invalid number of removed aliases: {}", |         ASSERT_MSG(num_removed_aliases == 1, "Invalid number of removed aliases: {}", | ||||||
|                    num_removed_aliases); |                    num_removed_aliases); | ||||||
|     } |     } | ||||||
|  |     for (const ImageId overlap_id : image.overlapping_images) { | ||||||
|  |         ImageBase& other_image = slot_images[overlap_id]; | ||||||
|  |         [[maybe_unused]] const size_t num_removed_overlaps = std::erase_if( | ||||||
|  |             other_image.overlapping_images, | ||||||
|  |             [image_id](const ImageId other_overlap_id) { return other_overlap_id == image_id; }); | ||||||
|  |         other_image.CheckBadOverlapState(); | ||||||
|  |         ASSERT_MSG(num_removed_overlaps == 1, "Invalid number of removed overlapps: {}", | ||||||
|  |                    num_removed_overlaps); | ||||||
|  |     } | ||||||
|     for (const ImageViewId image_view_id : image_view_ids) { |     for (const ImageViewId image_view_id : image_view_ids) { | ||||||
|         sentenced_image_view.Push(std::move(slot_image_views[image_view_id])); |         sentenced_image_view.Push(std::move(slot_image_views[image_view_id])); | ||||||
|         slot_image_views.erase(image_view_id); |         slot_image_views.erase(image_view_id); | ||||||
|  |  | ||||||
|  | @ -581,6 +581,8 @@ void SwizzleBlockLinearImage(Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr | ||||||
| 
 | 
 | ||||||
|     for (s32 layer = 0; layer < info.resources.layers; ++layer) { |     for (s32 layer = 0; layer < info.resources.layers; ++layer) { | ||||||
|         const std::span<const u8> src = input.subspan(host_offset); |         const std::span<const u8> src = input.subspan(host_offset); | ||||||
|  |         gpu_memory.ReadBlockUnsafe(gpu_addr + guest_offset, dst.data(), dst.size_bytes()); | ||||||
|  | 
 | ||||||
|         SwizzleTexture(dst, src, bytes_per_block, num_tiles.width, num_tiles.height, |         SwizzleTexture(dst, src, bytes_per_block, num_tiles.width, num_tiles.height, | ||||||
|                        num_tiles.depth, block.height, block.depth); |                        num_tiles.depth, block.height, block.depth); | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -408,6 +408,7 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR | ||||||
|     } |     } | ||||||
|     logical = vk::Device::Create(physical, queue_cis, extensions, first_next, dld); |     logical = vk::Device::Create(physical, queue_cis, extensions, first_next, dld); | ||||||
| 
 | 
 | ||||||
|  |     CollectPhysicalMemoryInfo(); | ||||||
|     CollectTelemetryParameters(); |     CollectTelemetryParameters(); | ||||||
|     CollectToolingInfo(); |     CollectToolingInfo(); | ||||||
| 
 | 
 | ||||||
|  | @ -818,6 +819,17 @@ void Device::CollectTelemetryParameters() { | ||||||
|     } |     } | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | void Device::CollectPhysicalMemoryInfo() { | ||||||
|  |     const auto mem_properties = physical.GetMemoryProperties(); | ||||||
|  |     const std::size_t num_properties = mem_properties.memoryHeapCount; | ||||||
|  |     device_access_memory = 0; | ||||||
|  |     for (std::size_t element = 0; element < num_properties; element++) { | ||||||
|  |         if ((mem_properties.memoryHeaps[element].flags & VK_MEMORY_HEAP_DEVICE_LOCAL_BIT) != 0) { | ||||||
|  |             device_access_memory += mem_properties.memoryHeaps[element].size; | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  | } | ||||||
|  | 
 | ||||||
| void Device::CollectToolingInfo() { | void Device::CollectToolingInfo() { | ||||||
|     if (!ext_tooling_info) { |     if (!ext_tooling_info) { | ||||||
|         return; |         return; | ||||||
|  |  | ||||||
|  | @ -225,6 +225,10 @@ public: | ||||||
|         return use_asynchronous_shaders; |         return use_asynchronous_shaders; | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|  |     u64 GetDeviceLocalMemory() const { | ||||||
|  |         return device_access_memory; | ||||||
|  |     } | ||||||
|  | 
 | ||||||
| private: | private: | ||||||
|     /// Checks if the physical device is suitable.
 |     /// Checks if the physical device is suitable.
 | ||||||
|     void CheckSuitability(bool requires_swapchain) const; |     void CheckSuitability(bool requires_swapchain) const; | ||||||
|  | @ -244,6 +248,9 @@ private: | ||||||
|     /// Collects information about attached tools.
 |     /// Collects information about attached tools.
 | ||||||
|     void CollectToolingInfo(); |     void CollectToolingInfo(); | ||||||
| 
 | 
 | ||||||
|  |     /// Collects information about the device's local memory.
 | ||||||
|  |     void CollectPhysicalMemoryInfo(); | ||||||
|  | 
 | ||||||
|     /// Returns a list of queue initialization descriptors.
 |     /// Returns a list of queue initialization descriptors.
 | ||||||
|     std::vector<VkDeviceQueueCreateInfo> GetDeviceQueueCreateInfos() const; |     std::vector<VkDeviceQueueCreateInfo> GetDeviceQueueCreateInfos() const; | ||||||
| 
 | 
 | ||||||
|  | @ -302,6 +309,8 @@ private: | ||||||
| 
 | 
 | ||||||
|     /// Nsight Aftermath GPU crash tracker
 |     /// Nsight Aftermath GPU crash tracker
 | ||||||
|     std::unique_ptr<NsightAftermathTracker> nsight_aftermath_tracker; |     std::unique_ptr<NsightAftermathTracker> nsight_aftermath_tracker; | ||||||
|  | 
 | ||||||
|  |     u64 device_access_memory; | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
| } // namespace Vulkan
 | } // namespace Vulkan
 | ||||||
|  |  | ||||||
|  | @ -69,10 +69,10 @@ constexpr VkExportMemoryAllocateInfo EXPORT_ALLOCATE_INFO{ | ||||||
| 
 | 
 | ||||||
| class MemoryAllocation { | class MemoryAllocation { | ||||||
| public: | public: | ||||||
|     explicit MemoryAllocation(vk::DeviceMemory memory_, VkMemoryPropertyFlags properties, |     explicit MemoryAllocation(MemoryAllocator* const allocator_, vk::DeviceMemory memory_, | ||||||
|                               u64 allocation_size_, u32 type) |                               VkMemoryPropertyFlags properties, u64 allocation_size_, u32 type) | ||||||
|         : memory{std::move(memory_)}, allocation_size{allocation_size_}, property_flags{properties}, |         : allocator{allocator_}, memory{std::move(memory_)}, allocation_size{allocation_size_}, | ||||||
|           shifted_memory_type{1U << type} {} |           property_flags{properties}, shifted_memory_type{1U << type} {} | ||||||
| 
 | 
 | ||||||
| #if defined(_WIN32) || defined(__unix__) | #if defined(_WIN32) || defined(__unix__) | ||||||
|     ~MemoryAllocation() { |     ~MemoryAllocation() { | ||||||
|  | @ -106,6 +106,10 @@ public: | ||||||
|         const auto it = std::ranges::find(commits, begin, &Range::begin); |         const auto it = std::ranges::find(commits, begin, &Range::begin); | ||||||
|         ASSERT_MSG(it != commits.end(), "Invalid commit"); |         ASSERT_MSG(it != commits.end(), "Invalid commit"); | ||||||
|         commits.erase(it); |         commits.erase(it); | ||||||
|  |         if (commits.empty()) { | ||||||
|  |             // Do not call any code involving 'this' after this call, the object will be destroyed
 | ||||||
|  |             allocator->ReleaseMemory(this); | ||||||
|  |         } | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|     [[nodiscard]] std::span<u8> Map() { |     [[nodiscard]] std::span<u8> Map() { | ||||||
|  | @ -171,6 +175,7 @@ private: | ||||||
|         return candidate; |         return candidate; | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|  |     MemoryAllocator* const allocator;           ///< Parent memory allocator.
 | ||||||
|     const vk::DeviceMemory memory;              ///< Vulkan memory allocation handler.
 |     const vk::DeviceMemory memory;              ///< Vulkan memory allocation handler.
 | ||||||
|     const u64 allocation_size;                  ///< Size of this allocation.
 |     const u64 allocation_size;                  ///< Size of this allocation.
 | ||||||
|     const VkMemoryPropertyFlags property_flags; ///< Vulkan memory property flags.
 |     const VkMemoryPropertyFlags property_flags; ///< Vulkan memory property flags.
 | ||||||
|  | @ -275,10 +280,17 @@ bool MemoryAllocator::TryAllocMemory(VkMemoryPropertyFlags flags, u32 type_mask, | ||||||
|             return false; |             return false; | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
|     allocations.push_back(std::make_unique<MemoryAllocation>(std::move(memory), flags, size, type)); |     allocations.push_back( | ||||||
|  |         std::make_unique<MemoryAllocation>(this, std::move(memory), flags, size, type)); | ||||||
|     return true; |     return true; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | void MemoryAllocator::ReleaseMemory(MemoryAllocation* alloc) { | ||||||
|  |     const auto it = std::ranges::find(allocations, alloc, &std::unique_ptr<MemoryAllocation>::get); | ||||||
|  |     ASSERT(it != allocations.end()); | ||||||
|  |     allocations.erase(it); | ||||||
|  | } | ||||||
|  | 
 | ||||||
| std::optional<MemoryCommit> MemoryAllocator::TryCommit(const VkMemoryRequirements& requirements, | std::optional<MemoryCommit> MemoryAllocator::TryCommit(const VkMemoryRequirements& requirements, | ||||||
|                                                        VkMemoryPropertyFlags flags) { |                                                        VkMemoryPropertyFlags flags) { | ||||||
|     for (auto& allocation : allocations) { |     for (auto& allocation : allocations) { | ||||||
|  |  | ||||||
|  | @ -69,6 +69,8 @@ private: | ||||||
| /// Memory allocator container.
 | /// Memory allocator container.
 | ||||||
| /// Allocates and releases memory allocations on demand.
 | /// Allocates and releases memory allocations on demand.
 | ||||||
| class MemoryAllocator { | class MemoryAllocator { | ||||||
|  |     friend MemoryAllocation; | ||||||
|  | 
 | ||||||
| public: | public: | ||||||
|     /**
 |     /**
 | ||||||
|      * Construct memory allocator |      * Construct memory allocator | ||||||
|  | @ -104,6 +106,9 @@ private: | ||||||
|     /// Tries to allocate a chunk of memory.
 |     /// Tries to allocate a chunk of memory.
 | ||||||
|     bool TryAllocMemory(VkMemoryPropertyFlags flags, u32 type_mask, u64 size); |     bool TryAllocMemory(VkMemoryPropertyFlags flags, u32 type_mask, u64 size); | ||||||
| 
 | 
 | ||||||
|  |     /// Releases a chunk of memory.
 | ||||||
|  |     void ReleaseMemory(MemoryAllocation* alloc); | ||||||
|  | 
 | ||||||
|     /// Tries to allocate a memory commit.
 |     /// Tries to allocate a memory commit.
 | ||||||
|     std::optional<MemoryCommit> TryCommit(const VkMemoryRequirements& requirements, |     std::optional<MemoryCommit> TryCommit(const VkMemoryRequirements& requirements, | ||||||
|                                           VkMemoryPropertyFlags flags); |                                           VkMemoryPropertyFlags flags); | ||||||
|  |  | ||||||
|  | @ -822,6 +822,7 @@ void Config::ReadRendererValues() { | ||||||
|                       QStringLiteral("use_asynchronous_shaders"), false); |                       QStringLiteral("use_asynchronous_shaders"), false); | ||||||
|     ReadSettingGlobal(Settings::values.use_fast_gpu_time, QStringLiteral("use_fast_gpu_time"), |     ReadSettingGlobal(Settings::values.use_fast_gpu_time, QStringLiteral("use_fast_gpu_time"), | ||||||
|                       true); |                       true); | ||||||
|  |     ReadSettingGlobal(Settings::values.use_caches_gc, QStringLiteral("use_caches_gc"), false); | ||||||
|     ReadSettingGlobal(Settings::values.bg_red, QStringLiteral("bg_red"), 0.0); |     ReadSettingGlobal(Settings::values.bg_red, QStringLiteral("bg_red"), 0.0); | ||||||
|     ReadSettingGlobal(Settings::values.bg_green, QStringLiteral("bg_green"), 0.0); |     ReadSettingGlobal(Settings::values.bg_green, QStringLiteral("bg_green"), 0.0); | ||||||
|     ReadSettingGlobal(Settings::values.bg_blue, QStringLiteral("bg_blue"), 0.0); |     ReadSettingGlobal(Settings::values.bg_blue, QStringLiteral("bg_blue"), 0.0); | ||||||
|  | @ -1410,6 +1411,7 @@ void Config::SaveRendererValues() { | ||||||
|                        Settings::values.use_asynchronous_shaders, false); |                        Settings::values.use_asynchronous_shaders, false); | ||||||
|     WriteSettingGlobal(QStringLiteral("use_fast_gpu_time"), Settings::values.use_fast_gpu_time, |     WriteSettingGlobal(QStringLiteral("use_fast_gpu_time"), Settings::values.use_fast_gpu_time, | ||||||
|                        true); |                        true); | ||||||
|  |     WriteSettingGlobal(QStringLiteral("use_caches_gc"), Settings::values.use_caches_gc, false); | ||||||
|     // Cast to double because Qt's written float values are not human-readable
 |     // Cast to double because Qt's written float values are not human-readable
 | ||||||
|     WriteSettingGlobal(QStringLiteral("bg_red"), Settings::values.bg_red, 0.0); |     WriteSettingGlobal(QStringLiteral("bg_red"), Settings::values.bg_red, 0.0); | ||||||
|     WriteSettingGlobal(QStringLiteral("bg_green"), Settings::values.bg_green, 0.0); |     WriteSettingGlobal(QStringLiteral("bg_green"), Settings::values.bg_green, 0.0); | ||||||
|  |  | ||||||
|  | @ -31,6 +31,7 @@ void ConfigureGraphicsAdvanced::SetConfiguration() { | ||||||
|     ui->disable_fps_limit->setChecked(Settings::values.disable_fps_limit.GetValue()); |     ui->disable_fps_limit->setChecked(Settings::values.disable_fps_limit.GetValue()); | ||||||
|     ui->use_assembly_shaders->setChecked(Settings::values.use_assembly_shaders.GetValue()); |     ui->use_assembly_shaders->setChecked(Settings::values.use_assembly_shaders.GetValue()); | ||||||
|     ui->use_asynchronous_shaders->setChecked(Settings::values.use_asynchronous_shaders.GetValue()); |     ui->use_asynchronous_shaders->setChecked(Settings::values.use_asynchronous_shaders.GetValue()); | ||||||
|  |     ui->use_caches_gc->setChecked(Settings::values.use_caches_gc.GetValue()); | ||||||
|     ui->use_fast_gpu_time->setChecked(Settings::values.use_fast_gpu_time.GetValue()); |     ui->use_fast_gpu_time->setChecked(Settings::values.use_fast_gpu_time.GetValue()); | ||||||
| 
 | 
 | ||||||
|     if (Settings::IsConfiguringGlobal()) { |     if (Settings::IsConfiguringGlobal()) { | ||||||
|  | @ -65,6 +66,8 @@ void ConfigureGraphicsAdvanced::ApplyConfiguration() { | ||||||
|     ConfigurationShared::ApplyPerGameSetting(&Settings::values.use_asynchronous_shaders, |     ConfigurationShared::ApplyPerGameSetting(&Settings::values.use_asynchronous_shaders, | ||||||
|                                              ui->use_asynchronous_shaders, |                                              ui->use_asynchronous_shaders, | ||||||
|                                              use_asynchronous_shaders); |                                              use_asynchronous_shaders); | ||||||
|  |     ConfigurationShared::ApplyPerGameSetting(&Settings::values.use_caches_gc, ui->use_caches_gc, | ||||||
|  |                                              use_caches_gc); | ||||||
|     ConfigurationShared::ApplyPerGameSetting(&Settings::values.use_fast_gpu_time, |     ConfigurationShared::ApplyPerGameSetting(&Settings::values.use_fast_gpu_time, | ||||||
|                                              ui->use_fast_gpu_time, use_fast_gpu_time); |                                              ui->use_fast_gpu_time, use_fast_gpu_time); | ||||||
| 
 | 
 | ||||||
|  | @ -105,6 +108,7 @@ void ConfigureGraphicsAdvanced::SetupPerGameUI() { | ||||||
|         ui->use_asynchronous_shaders->setEnabled( |         ui->use_asynchronous_shaders->setEnabled( | ||||||
|             Settings::values.use_asynchronous_shaders.UsingGlobal()); |             Settings::values.use_asynchronous_shaders.UsingGlobal()); | ||||||
|         ui->use_fast_gpu_time->setEnabled(Settings::values.use_fast_gpu_time.UsingGlobal()); |         ui->use_fast_gpu_time->setEnabled(Settings::values.use_fast_gpu_time.UsingGlobal()); | ||||||
|  |         ui->use_caches_gc->setEnabled(Settings::values.use_caches_gc.UsingGlobal()); | ||||||
|         ui->anisotropic_filtering_combobox->setEnabled( |         ui->anisotropic_filtering_combobox->setEnabled( | ||||||
|             Settings::values.max_anisotropy.UsingGlobal()); |             Settings::values.max_anisotropy.UsingGlobal()); | ||||||
| 
 | 
 | ||||||
|  | @ -121,6 +125,8 @@ void ConfigureGraphicsAdvanced::SetupPerGameUI() { | ||||||
|                                             use_asynchronous_shaders); |                                             use_asynchronous_shaders); | ||||||
|     ConfigurationShared::SetColoredTristate(ui->use_fast_gpu_time, |     ConfigurationShared::SetColoredTristate(ui->use_fast_gpu_time, | ||||||
|                                             Settings::values.use_fast_gpu_time, use_fast_gpu_time); |                                             Settings::values.use_fast_gpu_time, use_fast_gpu_time); | ||||||
|  |     ConfigurationShared::SetColoredTristate(ui->use_caches_gc, Settings::values.use_caches_gc, | ||||||
|  |                                             use_caches_gc); | ||||||
|     ConfigurationShared::SetColoredComboBox( |     ConfigurationShared::SetColoredComboBox( | ||||||
|         ui->gpu_accuracy, ui->label_gpu_accuracy, |         ui->gpu_accuracy, ui->label_gpu_accuracy, | ||||||
|         static_cast<int>(Settings::values.gpu_accuracy.GetValue(true))); |         static_cast<int>(Settings::values.gpu_accuracy.GetValue(true))); | ||||||
|  |  | ||||||
|  | @ -39,4 +39,5 @@ private: | ||||||
|     ConfigurationShared::CheckState use_assembly_shaders; |     ConfigurationShared::CheckState use_assembly_shaders; | ||||||
|     ConfigurationShared::CheckState use_asynchronous_shaders; |     ConfigurationShared::CheckState use_asynchronous_shaders; | ||||||
|     ConfigurationShared::CheckState use_fast_gpu_time; |     ConfigurationShared::CheckState use_fast_gpu_time; | ||||||
|  |     ConfigurationShared::CheckState use_caches_gc; | ||||||
| }; | }; | ||||||
|  |  | ||||||
|  | @ -121,6 +121,16 @@ | ||||||
|           </property> |           </property> | ||||||
|          </widget> |          </widget> | ||||||
|         </item> |         </item> | ||||||
|  |         <item> | ||||||
|  |          <widget class="QCheckBox" name="use_caches_gc"> | ||||||
|  |           <property name="toolTip"> | ||||||
|  |            <string>Enables garbage collection for the GPU caches, this will try to keep VRAM within 3-4 GB by flushing the least used textures/buffers. May cause issues in a few games.</string> | ||||||
|  |           </property> | ||||||
|  |           <property name="text"> | ||||||
|  |            <string>Enable GPU cache garbage collection (experimental)</string> | ||||||
|  |           </property> | ||||||
|  |          </widget> | ||||||
|  |         </item> | ||||||
|         <item> |         <item> | ||||||
|          <widget class="QWidget" name="af_layout" native="true"> |          <widget class="QWidget" name="af_layout" native="true"> | ||||||
|           <layout class="QHBoxLayout" name="horizontalLayout_1"> |           <layout class="QHBoxLayout" name="horizontalLayout_1"> | ||||||
|  |  | ||||||
|  | @ -227,6 +227,10 @@ use_asynchronous_gpu_emulation = | ||||||
| # 0: Off, 1 (default): On | # 0: Off, 1 (default): On | ||||||
| use_vsync = | use_vsync = | ||||||
| 
 | 
 | ||||||
|  | # Whether to use garbage collection or not for GPU caches. | ||||||
|  | # 0 (default): Off, 1: On | ||||||
|  | use_caches_gc = | ||||||
|  | 
 | ||||||
| # The clear color for the renderer. What shows up on the sides of the bottom screen. | # The clear color for the renderer. What shows up on the sides of the bottom screen. | ||||||
| # Must be in range of 0.0-1.0. Defaults to 1.0 for all. | # Must be in range of 0.0-1.0. Defaults to 1.0 for all. | ||||||
| bg_red = | bg_red = | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Mai M
						Mai M