forked from eden-emu/eden
		
	gl_rasterizer: Upload constant buffers with glNamedBufferSubData
Nvidia's OpenGL driver maps gl(Named)BufferSubData, when certain requirements are met, to a fast path. This path has an extra memcpy but updates the buffer without orphaning or waiting for previous calls. It can be seen as a better model for "push constants" that can upload a whole UBO instead of 256 bytes. The requirements for this path are described here: http://on-demand.gputechconf.com/gtc/2014/presentations/S4379-opengl-44-scene-rendering-techniques.pdf#page=24 Instead of using the stream buffer, this commit moves constant buffer uploads to calls of glNamedBufferSubData, and from my testing it brings a performance improvement. This is disabled when the vendor is not Nvidia, since it causes performance regressions there.
This commit is contained in:
		
							parent
							
								
									11e39da02b
								
							
						
					
					
						commit
						76ca2a5f82
					
				
					 6 changed files with 84 additions and 19 deletions
				
			
		|  | @ -30,7 +30,7 @@ public: | |||
|     using BufferInfo = std::pair<const TBufferType*, u64>; | ||||
| 
 | ||||
|     BufferInfo UploadMemory(GPUVAddr gpu_addr, std::size_t size, std::size_t alignment = 4, | ||||
|                             bool is_written = false) { | ||||
|                             bool is_written = false, bool use_fast_cbuf = false) { | ||||
|         std::lock_guard lock{mutex}; | ||||
| 
 | ||||
|         auto& memory_manager = system.GPU().MemoryManager(); | ||||
|  | @ -43,9 +43,13 @@ public: | |||
|         // Cache management is a big overhead, so only cache entries with a given size.
 | ||||
|         // TODO: Figure out which size is the best for given games.
 | ||||
|         constexpr std::size_t max_stream_size = 0x800; | ||||
|         if (size < max_stream_size) { | ||||
|         if (use_fast_cbuf || size < max_stream_size) { | ||||
|             if (!is_written && !IsRegionWritten(cache_addr, cache_addr + size - 1)) { | ||||
|                 return StreamBufferUpload(host_ptr, size, alignment); | ||||
|                 if (use_fast_cbuf) { | ||||
|                     return ConstBufferUpload(host_ptr, size); | ||||
|                 } else { | ||||
|                     return StreamBufferUpload(host_ptr, size, alignment); | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
| 
 | ||||
|  | @ -152,6 +156,10 @@ protected: | |||
|     virtual void CopyBlock(const TBuffer& src, const TBuffer& dst, std::size_t src_offset, | ||||
|                            std::size_t dst_offset, std::size_t size) = 0; | ||||
| 
 | ||||
|     virtual BufferInfo ConstBufferUpload(const void* raw_pointer, std::size_t size) { | ||||
|         return {}; | ||||
|     } | ||||
| 
 | ||||
|     /// Register an object into the cache
 | ||||
|     void Register(const MapInterval& new_map, bool inherit_written = false) { | ||||
|         const CacheAddr cache_ptr = new_map->GetStart(); | ||||
|  |  | |||
|  | @ -8,13 +8,17 @@ | |||
| 
 | ||||
| #include "common/assert.h" | ||||
| #include "common/microprofile.h" | ||||
| #include "video_core/engines/maxwell_3d.h" | ||||
| #include "video_core/rasterizer_interface.h" | ||||
| #include "video_core/renderer_opengl/gl_buffer_cache.h" | ||||
| #include "video_core/renderer_opengl/gl_device.h" | ||||
| #include "video_core/renderer_opengl/gl_rasterizer.h" | ||||
| #include "video_core/renderer_opengl/gl_resource_manager.h" | ||||
| 
 | ||||
| namespace OpenGL { | ||||
| 
 | ||||
| using Maxwell = Tegra::Engines::Maxwell3D::Regs; | ||||
| 
 | ||||
| MICROPROFILE_DEFINE(OpenGL_Buffer_Download, "OpenGL", "Buffer Download", MP_RGB(192, 192, 128)); | ||||
| 
 | ||||
| CachedBufferBlock::CachedBufferBlock(CacheAddr cache_addr, const std::size_t size) | ||||
|  | @ -26,11 +30,22 @@ CachedBufferBlock::CachedBufferBlock(CacheAddr cache_addr, const std::size_t siz | |||
| CachedBufferBlock::~CachedBufferBlock() = default; | ||||
| 
 | ||||
| OGLBufferCache::OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system, | ||||
|                                std::size_t stream_size) | ||||
|     : VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer>{ | ||||
|           rasterizer, system, std::make_unique<OGLStreamBuffer>(stream_size, true)} {} | ||||
|                                const Device& device, std::size_t stream_size) | ||||
|     : GenericBufferCache{rasterizer, system, std::make_unique<OGLStreamBuffer>(stream_size, true)} { | ||||
|     if (!device.HasFastBufferSubData()) { | ||||
|         return; | ||||
|     } | ||||
| 
 | ||||
| OGLBufferCache::~OGLBufferCache() = default; | ||||
|     static constexpr auto size = static_cast<GLsizeiptr>(Maxwell::MaxConstBufferSize); | ||||
|     glCreateBuffers(static_cast<GLsizei>(std::size(cbufs)), std::data(cbufs)); | ||||
|     for (const GLuint cbuf : cbufs) { | ||||
|         glNamedBufferData(cbuf, size, nullptr, GL_STREAM_DRAW); | ||||
|     } | ||||
| } | ||||
| 
 | ||||
| OGLBufferCache::~OGLBufferCache() { | ||||
|     glDeleteBuffers(static_cast<GLsizei>(std::size(cbufs)), std::data(cbufs)); | ||||
| } | ||||
| 
 | ||||
| Buffer OGLBufferCache::CreateBlock(CacheAddr cache_addr, std::size_t size) { | ||||
|     return std::make_shared<CachedBufferBlock>(cache_addr, size); | ||||
|  | @ -69,4 +84,12 @@ void OGLBufferCache::CopyBlock(const Buffer& src, const Buffer& dst, std::size_t | |||
|                              static_cast<GLsizeiptr>(size)); | ||||
| } | ||||
| 
 | ||||
| OGLBufferCache::BufferInfo OGLBufferCache::ConstBufferUpload(const void* raw_pointer, | ||||
|                                                              std::size_t size) { | ||||
|     DEBUG_ASSERT(cbuf_cursor < std::size(cbufs)); | ||||
|     const GLuint& cbuf = cbufs[cbuf_cursor++]; | ||||
|     glNamedBufferSubData(cbuf, 0, static_cast<GLsizeiptr>(size), raw_pointer); | ||||
|     return {&cbuf, 0}; | ||||
| } | ||||
| 
 | ||||
| } // namespace OpenGL
 | ||||
|  |  | |||
|  | @ -4,10 +4,12 @@ | |||
| 
 | ||||
| #pragma once | ||||
| 
 | ||||
| #include <array> | ||||
| #include <memory> | ||||
| 
 | ||||
| #include "common/common_types.h" | ||||
| #include "video_core/buffer_cache/buffer_cache.h" | ||||
| #include "video_core/engines/maxwell_3d.h" | ||||
| #include "video_core/rasterizer_cache.h" | ||||
| #include "video_core/renderer_opengl/gl_resource_manager.h" | ||||
| #include "video_core/renderer_opengl/gl_stream_buffer.h" | ||||
|  | @ -18,12 +20,14 @@ class System; | |||
| 
 | ||||
| namespace OpenGL { | ||||
| 
 | ||||
| class Device; | ||||
| class OGLStreamBuffer; | ||||
| class RasterizerOpenGL; | ||||
| 
 | ||||
| class CachedBufferBlock; | ||||
| 
 | ||||
| using Buffer = std::shared_ptr<CachedBufferBlock>; | ||||
| using GenericBufferCache = VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer>; | ||||
| 
 | ||||
| class CachedBufferBlock : public VideoCommon::BufferBlock { | ||||
| public: | ||||
|  | @ -38,14 +42,18 @@ private: | |||
|     OGLBuffer gl_buffer{}; | ||||
| }; | ||||
| 
 | ||||
| class OGLBufferCache final : public VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer> { | ||||
| class OGLBufferCache final : public GenericBufferCache { | ||||
| public: | ||||
|     explicit OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system, | ||||
|                             std::size_t stream_size); | ||||
|                             const Device& device, std::size_t stream_size); | ||||
|     ~OGLBufferCache(); | ||||
| 
 | ||||
|     const GLuint* GetEmptyBuffer(std::size_t) override; | ||||
| 
 | ||||
|     void Acquire() noexcept { | ||||
|         cbuf_cursor = 0; | ||||
|     } | ||||
| 
 | ||||
| protected: | ||||
|     Buffer CreateBlock(CacheAddr cache_addr, std::size_t size) override; | ||||
| 
 | ||||
|  | @ -61,6 +69,14 @@ protected: | |||
| 
 | ||||
|     void CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset, | ||||
|                    std::size_t dst_offset, std::size_t size) override; | ||||
| 
 | ||||
|     BufferInfo ConstBufferUpload(const void* raw_pointer, std::size_t size) override; | ||||
| 
 | ||||
| private: | ||||
|     std::size_t cbuf_cursor = 0; | ||||
|     std::array<GLuint, Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers * | ||||
|                            Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram> | ||||
|         cbufs; | ||||
| }; | ||||
| 
 | ||||
| } // namespace OpenGL
 | ||||
|  |  | |||
|  | @ -51,8 +51,11 @@ bool HasExtension(const std::vector<std::string_view>& images, std::string_view | |||
| } // Anonymous namespace
 | ||||
| 
 | ||||
| Device::Device() { | ||||
|     const std::string_view vendor = reinterpret_cast<const char*>(glGetString(GL_VENDOR)); | ||||
|     const std::vector extensions = GetExtensions(); | ||||
| 
 | ||||
|     const bool is_nvidia = vendor == "NVIDIA Corporation"; | ||||
| 
 | ||||
|     uniform_buffer_alignment = GetInteger<std::size_t>(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT); | ||||
|     shader_storage_alignment = GetInteger<std::size_t>(GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT); | ||||
|     max_vertex_attributes = GetInteger<u32>(GL_MAX_VERTEX_ATTRIBS); | ||||
|  | @ -64,6 +67,7 @@ Device::Device() { | |||
|     has_variable_aoffi = TestVariableAoffi(); | ||||
|     has_component_indexing_bug = TestComponentIndexingBug(); | ||||
|     has_precise_bug = TestPreciseBug(); | ||||
|     has_fast_buffer_sub_data = is_nvidia; | ||||
| 
 | ||||
|     LOG_INFO(Render_OpenGL, "Renderer_VariableAOFFI: {}", has_variable_aoffi); | ||||
|     LOG_INFO(Render_OpenGL, "Renderer_ComponentIndexingBug: {}", has_component_indexing_bug); | ||||
|  |  | |||
|  | @ -54,6 +54,10 @@ public: | |||
|         return has_precise_bug; | ||||
|     } | ||||
| 
 | ||||
|     bool HasFastBufferSubData() const { | ||||
|         return has_fast_buffer_sub_data; | ||||
|     } | ||||
| 
 | ||||
| private: | ||||
|     static bool TestVariableAoffi(); | ||||
|     static bool TestComponentIndexingBug(); | ||||
|  | @ -69,6 +73,7 @@ private: | |||
|     bool has_variable_aoffi{}; | ||||
|     bool has_component_indexing_bug{}; | ||||
|     bool has_precise_bug{}; | ||||
|     bool has_fast_buffer_sub_data{}; | ||||
| }; | ||||
| 
 | ||||
| } // namespace OpenGL
 | ||||
|  |  | |||
|  | @ -67,7 +67,7 @@ static std::size_t GetConstBufferSize(const Tegra::Engines::ConstBufferInfo& buf | |||
| RasterizerOpenGL::RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window, | ||||
|                                    ScreenInfo& info) | ||||
|     : texture_cache{system, *this, device}, shader_cache{*this, system, emu_window, device}, | ||||
|       system{system}, screen_info{info}, buffer_cache{*this, system, STREAM_BUFFER_SIZE} { | ||||
|       system{system}, screen_info{info}, buffer_cache{*this, system, device, STREAM_BUFFER_SIZE} { | ||||
|     shader_program_manager = std::make_unique<GLShader::ProgramManager>(); | ||||
|     state.draw.shader_program = 0; | ||||
|     state.Apply(); | ||||
|  | @ -558,6 +558,8 @@ void RasterizerOpenGL::DrawPrelude() { | |||
|     SyncPolygonOffset(); | ||||
|     SyncAlphaTest(); | ||||
| 
 | ||||
|     buffer_cache.Acquire(); | ||||
| 
 | ||||
|     // Draw the vertex batch
 | ||||
|     const bool is_indexed = accelerate_draw == AccelDraw::Indexed; | ||||
| 
 | ||||
|  | @ -573,9 +575,11 @@ void RasterizerOpenGL::DrawPrelude() { | |||
|                   (sizeof(GLShader::MaxwellUniformData) + device.GetUniformBufferAlignment()) * | ||||
|                       Maxwell::MaxShaderStage; | ||||
| 
 | ||||
|     // Add space for at least 18 constant buffers
 | ||||
|     buffer_size += Maxwell::MaxConstBuffers * | ||||
|                    (Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment()); | ||||
|     if (!device.HasFastBufferSubData()) { | ||||
|         // Add space for at least 18 constant buffers
 | ||||
|         buffer_size += Maxwell::MaxConstBuffers * | ||||
|                        (Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment()); | ||||
|     } | ||||
| 
 | ||||
|     // Prepare the vertex array.
 | ||||
|     buffer_cache.Map(buffer_size); | ||||
|  | @ -739,10 +743,12 @@ void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) { | |||
|     state.draw.shader_program = program; | ||||
|     state.draw.program_pipeline = 0; | ||||
| 
 | ||||
|     const std::size_t buffer_size = | ||||
|         Tegra::Engines::KeplerCompute::NumConstBuffers * | ||||
|         (Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment()); | ||||
|     buffer_cache.Map(buffer_size); | ||||
|     if (!device.HasFastBufferSubData()) { | ||||
|         const std::size_t buffer_size = | ||||
|             Tegra::Engines::KeplerCompute::NumConstBuffers * | ||||
|             (Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment()); | ||||
|         buffer_cache.Map(buffer_size); | ||||
|     } | ||||
| 
 | ||||
|     bind_ubo_pushbuffer.Setup(0); | ||||
|     bind_ssbo_pushbuffer.Setup(0); | ||||
|  | @ -750,7 +756,9 @@ void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) { | |||
|     SetupComputeConstBuffers(kernel); | ||||
|     SetupComputeGlobalMemory(kernel); | ||||
| 
 | ||||
|     buffer_cache.Unmap(); | ||||
|     if (!device.HasFastBufferSubData()) { | ||||
|         buffer_cache.Unmap(); | ||||
|     } | ||||
| 
 | ||||
|     bind_ubo_pushbuffer.Bind(); | ||||
|     bind_ssbo_pushbuffer.Bind(); | ||||
|  | @ -879,7 +887,8 @@ void RasterizerOpenGL::SetupConstBuffer(const Tegra::Engines::ConstBufferInfo& b | |||
|     const std::size_t size = Common::AlignUp(GetConstBufferSize(buffer, entry), sizeof(GLvec4)); | ||||
| 
 | ||||
|     const auto alignment = device.GetUniformBufferAlignment(); | ||||
|     const auto [cbuf, offset] = buffer_cache.UploadMemory(buffer.address, size, alignment); | ||||
|     const auto [cbuf, offset] = buffer_cache.UploadMemory(buffer.address, size, alignment, false, | ||||
|                                                           device.HasFastBufferSubData()); | ||||
|     bind_ubo_pushbuffer.Push(cbuf, offset, size); | ||||
| } | ||||
| 
 | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 ReinUsesLisp
						ReinUsesLisp