forked from eden-emu/eden
		
	gl_shader_cache: Specialize shader workgroup
Drop the usage of ARB_compute_variable_group_size and specialize compute shaders instead. This permits compute to run on AMD and Intel proprietary drivers.
This commit is contained in:
		
							parent
							
								
									b12a7dbace
								
							
						
					
					
						commit
						b0b505984d
					
				
					 6 changed files with 75 additions and 69 deletions
				
			
		|  | @ -140,7 +140,7 @@ public: | |||
| 
 | ||||
|         INSERT_PADDING_WORDS(0x3); | ||||
| 
 | ||||
|         BitField<0, 16, u32> shared_alloc; | ||||
|         BitField<0, 18, u32> shared_alloc; | ||||
| 
 | ||||
|         BitField<16, 16, u32> block_dim_x; | ||||
|         union { | ||||
|  |  | |||
|  | @ -273,8 +273,8 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { | |||
|         SetupDrawGlobalMemory(stage, shader); | ||||
|         SetupDrawTextures(stage, shader, base_bindings); | ||||
| 
 | ||||
|         const ProgramVariant variant{base_bindings, primitive_mode}; | ||||
|         const auto [program_handle, next_bindings] = shader->GetProgramHandle(variant); | ||||
|         const ProgramVariant variant(base_bindings, primitive_mode); | ||||
|         const auto [program_handle, next_bindings] = shader->GetHandle(variant); | ||||
| 
 | ||||
|         switch (program) { | ||||
|         case Maxwell::ShaderProgram::VertexA: | ||||
|  | @ -725,18 +725,14 @@ bool RasterizerOpenGL::DrawMultiBatch(bool is_indexed) { | |||
| } | ||||
| 
 | ||||
| void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) { | ||||
|     if (!GLAD_GL_ARB_compute_variable_group_size) { | ||||
|         LOG_ERROR(Render_OpenGL, "Compute is currently not supported on this device due to the " | ||||
|                                  "lack of GL_ARB_compute_variable_group_size"); | ||||
|         return; | ||||
|     } | ||||
| 
 | ||||
|     auto kernel = shader_cache.GetComputeKernel(code_addr); | ||||
|     SetupComputeTextures(kernel); | ||||
|     SetupComputeImages(kernel); | ||||
| 
 | ||||
|     const auto [program, next_bindings] = kernel->GetProgramHandle({}); | ||||
|     state.draw.shader_program = program; | ||||
|     const auto& launch_desc = system.GPU().KeplerCompute().launch_description; | ||||
|     const ProgramVariant variant(launch_desc.block_dim_x, launch_desc.block_dim_y, | ||||
|                                  launch_desc.block_dim_z); | ||||
|     std::tie(state.draw.shader_program, std::ignore) = kernel->GetHandle(variant); | ||||
|     state.draw.program_pipeline = 0; | ||||
| 
 | ||||
|     const std::size_t buffer_size = | ||||
|  | @ -760,10 +756,7 @@ void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) { | |||
|     state.ApplyShaderProgram(); | ||||
|     state.ApplyProgramPipeline(); | ||||
| 
 | ||||
|     const auto& launch_desc = system.GPU().KeplerCompute().launch_description; | ||||
|     glDispatchComputeGroupSizeARB(launch_desc.grid_dim_x, launch_desc.grid_dim_y, | ||||
|                                   launch_desc.grid_dim_z, launch_desc.block_dim_x, | ||||
|                                   launch_desc.block_dim_y, launch_desc.block_dim_z); | ||||
|     glDispatchCompute(launch_desc.grid_dim_x, launch_desc.grid_dim_y, launch_desc.grid_dim_z); | ||||
| } | ||||
| 
 | ||||
| void RasterizerOpenGL::FlushAll() {} | ||||
|  |  | |||
|  | @ -255,7 +255,7 @@ void FillLocker(ConstBufferLocker& locker, const ShaderDiskCacheUsage& usage) { | |||
| 
 | ||||
| CachedProgram BuildShader(const Device& device, u64 unique_identifier, ProgramType program_type, | ||||
|                           const ProgramCode& program_code, const ProgramCode& program_code_b, | ||||
|                           const ProgramVariant& variant, ConstBufferLocker& locker, | ||||
|                           ConstBufferLocker& locker, const ProgramVariant& variant, | ||||
|                           bool hint_retrievable = false) { | ||||
|     LOG_INFO(Render_OpenGL, "called. {}", GetShaderId(unique_identifier, program_type)); | ||||
| 
 | ||||
|  | @ -268,17 +268,11 @@ CachedProgram BuildShader(const Device& device, u64 unique_identifier, ProgramTy | |||
|     } | ||||
|     const auto entries = GLShader::GetEntries(ir); | ||||
| 
 | ||||
|     auto base_bindings{variant.base_bindings}; | ||||
|     const auto primitive_mode{variant.primitive_mode}; | ||||
| 
 | ||||
|     std::string source = fmt::format(R"(// {} | ||||
| #version 430 core | ||||
| #extension GL_ARB_separate_shader_objects : enable | ||||
| )", | ||||
|                                      GetShaderId(unique_identifier, program_type)); | ||||
|     if (is_compute) { | ||||
|         source += "#extension GL_ARB_compute_variable_group_size : require\n"; | ||||
|     } | ||||
|     if (device.HasShaderBallot()) { | ||||
|         source += "#extension GL_ARB_shader_ballot : require\n"; | ||||
|     } | ||||
|  | @ -295,6 +289,7 @@ CachedProgram BuildShader(const Device& device, u64 unique_identifier, ProgramTy | |||
|     } | ||||
|     source += '\n'; | ||||
| 
 | ||||
|     auto base_bindings = variant.base_bindings; | ||||
|     if (!is_compute) { | ||||
|         source += fmt::format("#define EMULATION_UBO_BINDING {}\n", base_bindings.cbuf++); | ||||
|     } | ||||
|  | @ -318,13 +313,15 @@ CachedProgram BuildShader(const Device& device, u64 unique_identifier, ProgramTy | |||
| 
 | ||||
|     if (program_type == ProgramType::Geometry) { | ||||
|         const auto [glsl_topology, debug_name, max_vertices] = | ||||
|             GetPrimitiveDescription(primitive_mode); | ||||
|             GetPrimitiveDescription(variant.primitive_mode); | ||||
| 
 | ||||
|         source += "layout (" + std::string(glsl_topology) + ") in;\n\n"; | ||||
|         source += "#define MAX_VERTEX_INPUT " + std::to_string(max_vertices) + '\n'; | ||||
|         source += fmt::format("layout ({}) in;\n\n", glsl_topology); | ||||
|         source += fmt::format("#define MAX_VERTEX_INPUT {}\n", max_vertices); | ||||
|     } | ||||
|     if (program_type == ProgramType::Compute) { | ||||
|         source += "layout (local_size_variable) in;\n"; | ||||
|         source += | ||||
|             fmt::format("layout (local_size_x = {}, local_size_y = {}, local_size_z = {}) in;\n", | ||||
|                         variant.block_x, variant.block_y, variant.block_z); | ||||
|     } | ||||
| 
 | ||||
|     source += '\n'; | ||||
|  | @ -422,58 +419,53 @@ Shader CachedShader::CreateFromCache(const ShaderParameters& params, | |||
|                                                           unspecialized.code_b)); | ||||
| } | ||||
| 
 | ||||
| std::tuple<GLuint, BaseBindings> CachedShader::GetProgramHandle(const ProgramVariant& variant) { | ||||
|     UpdateVariant(); | ||||
| std::tuple<GLuint, BaseBindings> CachedShader::GetHandle(const ProgramVariant& variant) { | ||||
|     EnsureValidLockerVariant(); | ||||
| 
 | ||||
|     const auto [entry, is_cache_miss] = curr_variant->programs.try_emplace(variant); | ||||
|     const auto [entry, is_cache_miss] = curr_locker_variant->programs.try_emplace(variant); | ||||
|     auto& program = entry->second; | ||||
|     if (is_cache_miss) { | ||||
|         program = BuildShader(device, unique_identifier, program_type, program_code, program_code_b, | ||||
|                               variant, *curr_variant->locker); | ||||
|         disk_cache.SaveUsage(GetUsage(variant, *curr_variant->locker)); | ||||
|                               *curr_locker_variant->locker, variant); | ||||
|         disk_cache.SaveUsage(GetUsage(variant, *curr_locker_variant->locker)); | ||||
| 
 | ||||
|         LabelGLObject(GL_PROGRAM, program->handle, cpu_addr); | ||||
|     } | ||||
| 
 | ||||
|     auto base_bindings = variant.base_bindings; | ||||
|     base_bindings.cbuf += static_cast<u32>(entries.const_buffers.size()); | ||||
|     if (program_type != ProgramType::Compute) { | ||||
|         base_bindings.cbuf += STAGE_RESERVED_UBOS; | ||||
|     } | ||||
|     base_bindings.cbuf += STAGE_RESERVED_UBOS; | ||||
|     base_bindings.gmem += static_cast<u32>(entries.global_memory_entries.size()); | ||||
|     base_bindings.sampler += static_cast<u32>(entries.samplers.size()); | ||||
| 
 | ||||
|     return {program->handle, base_bindings}; | ||||
| } | ||||
| 
 | ||||
| void CachedShader::UpdateVariant() { | ||||
|     if (curr_variant && !curr_variant->locker->IsConsistent()) { | ||||
|         curr_variant = nullptr; | ||||
| bool CachedShader::EnsureValidLockerVariant() { | ||||
|     const auto previous_variant = curr_locker_variant; | ||||
|     if (curr_locker_variant && !curr_locker_variant->locker->IsConsistent()) { | ||||
|         curr_locker_variant = nullptr; | ||||
|     } | ||||
|     if (!curr_variant) { | ||||
|     if (!curr_locker_variant) { | ||||
|         for (auto& variant : locker_variants) { | ||||
|             if (variant->locker->IsConsistent()) { | ||||
|                 curr_variant = variant.get(); | ||||
|                 curr_locker_variant = variant.get(); | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|     if (!curr_variant) { | ||||
|     if (!curr_locker_variant) { | ||||
|         auto& new_variant = locker_variants.emplace_back(); | ||||
|         new_variant = std::make_unique<LockerVariant>(); | ||||
|         new_variant->locker = MakeLocker(system, program_type); | ||||
|         curr_variant = new_variant.get(); | ||||
|         curr_locker_variant = new_variant.get(); | ||||
|     } | ||||
|     return previous_variant == curr_locker_variant; | ||||
| } | ||||
| 
 | ||||
| ShaderDiskCacheUsage CachedShader::GetUsage(const ProgramVariant& variant, | ||||
|                                             const ConstBufferLocker& locker) const { | ||||
|     ShaderDiskCacheUsage usage; | ||||
|     usage.unique_identifier = unique_identifier; | ||||
|     usage.variant = variant; | ||||
|     usage.keys = locker.GetKeys(); | ||||
|     usage.bound_samplers = locker.GetBoundSamplers(); | ||||
|     usage.bindless_samplers = locker.GetBindlessSamplers(); | ||||
|     return usage; | ||||
|     return ShaderDiskCacheUsage{unique_identifier, variant, locker.GetKeys(), | ||||
|                                 locker.GetBoundSamplers(), locker.GetBindlessSamplers()}; | ||||
| } | ||||
| 
 | ||||
| ShaderCacheOpenGL::ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system, | ||||
|  | @ -534,9 +526,10 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading, | |||
|             if (!shader) { | ||||
|                 auto locker{MakeLocker(system, unspecialized.program_type)}; | ||||
|                 FillLocker(*locker, usage); | ||||
| 
 | ||||
|                 shader = BuildShader(device, usage.unique_identifier, unspecialized.program_type, | ||||
|                                      unspecialized.code, unspecialized.code_b, usage.variant, | ||||
|                                      *locker, true); | ||||
|                                      unspecialized.code, unspecialized.code_b, *locker, | ||||
|                                      usage.variant, true); | ||||
|             } | ||||
| 
 | ||||
|             std::scoped_lock lock{mutex}; | ||||
|  |  | |||
|  | @ -86,7 +86,7 @@ public: | |||
|     } | ||||
| 
 | ||||
|     /// Gets the GL program handle for the shader
 | ||||
|     std::tuple<GLuint, BaseBindings> GetProgramHandle(const ProgramVariant& variant); | ||||
|     std::tuple<GLuint, BaseBindings> GetHandle(const ProgramVariant& variant); | ||||
| 
 | ||||
| private: | ||||
|     struct LockerVariant { | ||||
|  | @ -98,7 +98,7 @@ private: | |||
|                           GLShader::ShaderEntries entries, ProgramCode program_code, | ||||
|                           ProgramCode program_code_b); | ||||
| 
 | ||||
|     void UpdateVariant(); | ||||
|     bool EnsureValidLockerVariant(); | ||||
| 
 | ||||
|     ShaderDiskCacheUsage GetUsage(const ProgramVariant& variant, | ||||
|                                   const VideoCommon::Shader::ConstBufferLocker& locker) const; | ||||
|  | @ -117,7 +117,7 @@ private: | |||
|     ProgramCode program_code; | ||||
|     ProgramCode program_code_b; | ||||
| 
 | ||||
|     LockerVariant* curr_variant = nullptr; | ||||
|     LockerVariant* curr_locker_variant = nullptr; | ||||
|     std::vector<std::unique_ptr<LockerVariant>> locker_variants; | ||||
| }; | ||||
| 
 | ||||
|  |  | |||
|  | @ -52,11 +52,11 @@ struct BindlessSamplerKey { | |||
|     Tegra::Engines::SamplerDescriptor sampler{}; | ||||
| }; | ||||
| 
 | ||||
| constexpr u32 NativeVersion = 6; | ||||
| constexpr u32 NativeVersion = 7; | ||||
| 
 | ||||
| // Making sure sizes don't change by accident
 | ||||
| static_assert(sizeof(BaseBindings) == 16); | ||||
| static_assert(sizeof(ProgramVariant) == 20); | ||||
| static_assert(sizeof(ProgramVariant) == 28); | ||||
| 
 | ||||
| ShaderCacheVersionHash GetShaderCacheVersionHash() { | ||||
|     ShaderCacheVersionHash hash{}; | ||||
|  |  | |||
|  | @ -44,32 +44,49 @@ struct BaseBindings { | |||
|     u32 sampler{}; | ||||
|     u32 image{}; | ||||
| 
 | ||||
|     bool operator==(const BaseBindings& rhs) const { | ||||
|     bool operator==(const BaseBindings& rhs) const noexcept { | ||||
|         return std::tie(cbuf, gmem, sampler, image) == | ||||
|                std::tie(rhs.cbuf, rhs.gmem, rhs.sampler, rhs.image); | ||||
|     } | ||||
| 
 | ||||
|     bool operator!=(const BaseBindings& rhs) const { | ||||
|     bool operator!=(const BaseBindings& rhs) const noexcept { | ||||
|         return !operator==(rhs); | ||||
|     } | ||||
| }; | ||||
| static_assert(std::is_trivially_copyable_v<BaseBindings>); | ||||
| 
 | ||||
| /// Describes the different variants a single program can be compiled.
 | ||||
| struct ProgramVariant { | ||||
|     BaseBindings base_bindings; | ||||
|     GLenum primitive_mode{}; | ||||
| /// Describes the different variants a program can be compiled with.
 | ||||
| struct ProgramVariant final { | ||||
|     ProgramVariant() = default; | ||||
| 
 | ||||
|     bool operator==(const ProgramVariant& rhs) const { | ||||
|         return std::tie(base_bindings, primitive_mode) == | ||||
|                std::tie(rhs.base_bindings, rhs.primitive_mode); | ||||
|     /// Graphics constructor.
 | ||||
|     explicit constexpr ProgramVariant(BaseBindings base_bindings, GLenum primitive_mode) noexcept | ||||
|         : base_bindings{base_bindings}, primitive_mode{primitive_mode} {} | ||||
| 
 | ||||
|     /// Compute constructor.
 | ||||
|     explicit constexpr ProgramVariant(u32 block_x, u32 block_y, u32 block_z) noexcept | ||||
|         : block_x{block_x}, block_y{static_cast<u16>(block_y)}, block_z{static_cast<u16>(block_z)} { | ||||
|     } | ||||
| 
 | ||||
|     bool operator!=(const ProgramVariant& rhs) const { | ||||
|     // Graphics specific parameters.
 | ||||
|     BaseBindings base_bindings{}; | ||||
|     GLenum primitive_mode{}; | ||||
| 
 | ||||
|     // Compute specific parameters.
 | ||||
|     u32 block_x{}; | ||||
|     u16 block_y{}; | ||||
|     u16 block_z{}; | ||||
| 
 | ||||
|     bool operator==(const ProgramVariant& rhs) const noexcept { | ||||
|         return std::tie(base_bindings, primitive_mode, block_x, block_y, block_z) == | ||||
|                std::tie(rhs.base_bindings, rhs.primitive_mode, rhs.block_x, rhs.block_y, | ||||
|                         rhs.block_z); | ||||
|     } | ||||
| 
 | ||||
|     bool operator!=(const ProgramVariant& rhs) const noexcept { | ||||
|         return !operator==(rhs); | ||||
|     } | ||||
| }; | ||||
| 
 | ||||
| static_assert(std::is_trivially_copyable_v<ProgramVariant>); | ||||
| 
 | ||||
| /// Describes how a shader is used.
 | ||||
|  | @ -108,8 +125,11 @@ struct hash<OpenGL::BaseBindings> { | |||
| template <> | ||||
| struct hash<OpenGL::ProgramVariant> { | ||||
|     std::size_t operator()(const OpenGL::ProgramVariant& variant) const noexcept { | ||||
|         return std::hash<OpenGL::BaseBindings>()(variant.base_bindings) ^ | ||||
|                (static_cast<std::size_t>(variant.primitive_mode) << 6); | ||||
|         return std::hash<OpenGL::BaseBindings>{}(variant.base_bindings) ^ | ||||
|                (static_cast<std::size_t>(variant.primitive_mode) << 6) ^ | ||||
|                static_cast<std::size_t>(variant.block_x) ^ | ||||
|                (static_cast<std::size_t>(variant.block_y) << 32) ^ | ||||
|                (static_cast<std::size_t>(variant.block_z) << 48); | ||||
|     } | ||||
| }; | ||||
| 
 | ||||
|  | @ -117,7 +137,7 @@ template <> | |||
| struct hash<OpenGL::ShaderDiskCacheUsage> { | ||||
|     std::size_t operator()(const OpenGL::ShaderDiskCacheUsage& usage) const noexcept { | ||||
|         return static_cast<std::size_t>(usage.unique_identifier) ^ | ||||
|                std::hash<OpenGL::ProgramVariant>()(usage.variant); | ||||
|                std::hash<OpenGL::ProgramVariant>{}(usage.variant); | ||||
|     } | ||||
| }; | ||||
| 
 | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 ReinUsesLisp
						ReinUsesLisp