forked from eden-emu/eden
		
	buffer_cache: Heuristically decide to skip cache on uniform buffers
Some games benefit from skipping the cache (Pokémon Sword), and others don't (Animal Crossing: New Horizons). Add a heuristic to decide this at runtime. The cache hit ratio has to be ~98% or better for the cache not to be skipped. Hit statistics are tracked over a window of the last 16 frames.
This commit is contained in:
		
							parent
							
								
									d384b5101c
								
							
						
					
					
						commit
						06028cda0c
					
				
					 2 changed files with 37 additions and 11 deletions
				
			
		|  | @ -9,6 +9,7 @@ | |||
| #include <deque> | ||||
| #include <memory> | ||||
| #include <mutex> | ||||
| #include <numeric> | ||||
| #include <span> | ||||
| #include <unordered_map> | ||||
| #include <vector> | ||||
|  | @ -91,7 +92,7 @@ class BufferCache { | |||
|     }; | ||||
| 
 | ||||
| public: | ||||
|     static constexpr u32 SKIP_CACHE_SIZE = 4096; | ||||
|     static constexpr u32 DEFAULT_SKIP_CACHE_SIZE = 4096; | ||||
| 
 | ||||
|     explicit BufferCache(VideoCore::RasterizerInterface& rasterizer_, | ||||
|                          Tegra::Engines::Maxwell3D& maxwell3d_, | ||||
|  | @ -240,9 +241,9 @@ private: | |||
|     template <bool insert> | ||||
|     void ChangeRegister(BufferId buffer_id); | ||||
| 
 | ||||
|     void SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size); | ||||
|     bool SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size); | ||||
| 
 | ||||
|     void SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 size); | ||||
|     bool SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 size); | ||||
| 
 | ||||
|     void UploadMemory(Buffer& buffer, u64 total_size_bytes, u64 largest_copy, | ||||
|                       std::span<BufferCopy> copies); | ||||
|  | @ -297,6 +298,11 @@ private: | |||
| 
 | ||||
|     std::array<u32, NUM_STAGES> fast_bound_uniform_buffers{}; | ||||
| 
 | ||||
|     std::array<u32, 16> uniform_cache_hits{}; | ||||
|     std::array<u32, 16> uniform_cache_shots{}; | ||||
| 
 | ||||
|     u32 uniform_buffer_skip_cache_size = DEFAULT_SKIP_CACHE_SIZE; | ||||
| 
 | ||||
|     bool has_deleted_buffers = false; | ||||
| 
 | ||||
|     std::conditional_t<HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS, std::array<u32, NUM_STAGES>, Empty> | ||||
|  | @ -328,6 +334,19 @@ BufferCache<P>::BufferCache(VideoCore::RasterizerInterface& rasterizer_, | |||
| 
 | ||||
| template <class P> | ||||
| void BufferCache<P>::TickFrame() { | ||||
|     // Calculate hits and shots and move hit bits to the right
 | ||||
|     const u32 hits = std::reduce(uniform_cache_hits.begin(), uniform_cache_hits.end()); | ||||
|     const u32 shots = std::reduce(uniform_cache_shots.begin(), uniform_cache_shots.end()); | ||||
|     std::copy_n(uniform_cache_hits.begin(), uniform_cache_hits.size() - 1, | ||||
|                 uniform_cache_hits.begin() + 1); | ||||
|     std::copy_n(uniform_cache_shots.begin(), uniform_cache_shots.size() - 1, | ||||
|                 uniform_cache_shots.begin() + 1); | ||||
|     uniform_cache_hits[0] = 0; | ||||
|     uniform_cache_shots[0] = 0; | ||||
| 
 | ||||
|     const bool skip_preferred = hits * 256 < shots * 251; | ||||
|     uniform_buffer_skip_cache_size = skip_preferred ? DEFAULT_SKIP_CACHE_SIZE : 0; | ||||
| 
 | ||||
|     delayed_destruction_ring.Tick(); | ||||
| } | ||||
| 
 | ||||
|  | @ -671,7 +690,7 @@ void BufferCache<P>::BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32 | |||
|     const VAddr cpu_addr = binding.cpu_addr; | ||||
|     const u32 size = binding.size; | ||||
|     Buffer& buffer = slot_buffers[binding.buffer_id]; | ||||
|     if (size <= SKIP_CACHE_SIZE && !buffer.IsRegionGpuModified(cpu_addr, size)) { | ||||
|     if (size <= uniform_buffer_skip_cache_size && !buffer.IsRegionGpuModified(cpu_addr, size)) { | ||||
|         if constexpr (IS_OPENGL) { | ||||
|             if (runtime.HasFastBufferSubData()) { | ||||
|                 // Fast path for Nvidia
 | ||||
|  | @ -692,7 +711,12 @@ void BufferCache<P>::BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32 | |||
|         return; | ||||
|     } | ||||
|     // Classic cached path
 | ||||
|     SynchronizeBuffer(buffer, cpu_addr, size); | ||||
|     const bool sync_cached = SynchronizeBuffer(buffer, cpu_addr, size); | ||||
|     if (sync_cached) { | ||||
|         ++uniform_cache_hits[0]; | ||||
|     } | ||||
|     ++uniform_cache_shots[0]; | ||||
| 
 | ||||
|     if (!needs_bind && !HasFastUniformBufferBound(stage, binding_index)) { | ||||
|         // Skip binding if it's not needed and if the bound buffer is not the fast version
 | ||||
|         // This exists to avoid instances where the fast buffer is bound and a GPU write happens
 | ||||
|  | @ -1106,15 +1130,15 @@ void BufferCache<P>::ChangeRegister(BufferId buffer_id) { | |||
| } | ||||
| 
 | ||||
| template <class P> | ||||
| void BufferCache<P>::SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size) { | ||||
| bool BufferCache<P>::SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size) { | ||||
|     if (buffer.CpuAddr() == 0) { | ||||
|         return; | ||||
|         return true; | ||||
|     } | ||||
|     SynchronizeBufferImpl(buffer, cpu_addr, size); | ||||
|     return SynchronizeBufferImpl(buffer, cpu_addr, size); | ||||
| } | ||||
| 
 | ||||
| template <class P> | ||||
| void BufferCache<P>::SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 size) { | ||||
| bool BufferCache<P>::SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 size) { | ||||
|     boost::container::small_vector<BufferCopy, 4> copies; | ||||
|     u64 total_size_bytes = 0; | ||||
|     u64 largest_copy = 0; | ||||
|  | @ -1128,10 +1152,11 @@ void BufferCache<P>::SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 s | |||
|         largest_copy = std::max(largest_copy, range_size); | ||||
|     }); | ||||
|     if (total_size_bytes == 0) { | ||||
|         return; | ||||
|         return true; | ||||
|     } | ||||
|     const std::span<BufferCopy> copies_span(copies.data(), copies.size()); | ||||
|     UploadMemory(buffer, total_size_bytes, largest_copy, copies_span); | ||||
|     return false; | ||||
| } | ||||
| 
 | ||||
| template <class P> | ||||
|  |  | |||
|  | @ -73,7 +73,8 @@ BufferCacheRuntime::BufferCacheRuntime(const Device& device_) | |||
|     for (auto& stage_uniforms : fast_uniforms) { | ||||
|         for (OGLBuffer& buffer : stage_uniforms) { | ||||
|             buffer.Create(); | ||||
|             glNamedBufferData(buffer.handle, BufferCache::SKIP_CACHE_SIZE, nullptr, GL_STREAM_DRAW); | ||||
|             glNamedBufferData(buffer.handle, BufferCache::DEFAULT_SKIP_CACHE_SIZE, nullptr, | ||||
|                               GL_STREAM_DRAW); | ||||
|         } | ||||
|     } | ||||
|     for (auto& stage_uniforms : copy_uniforms) { | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 ReinUsesLisp
						ReinUsesLisp