forked from eden-emu/eden
		
	Merge pull request #2092 from ReinUsesLisp/stg
shader/memory: Implement STG and global memory flushing
This commit is contained in:
		
						commit
						1614c97d78
					
				
					 11 changed files with 186 additions and 89 deletions
				
			
		|  | @ -791,6 +791,12 @@ union Instruction { | |||
|         BitField<20, 24, s64> immediate_offset; | ||||
|     } ldg; | ||||
| 
 | ||||
|     union { | ||||
|         BitField<48, 3, UniformType> type; | ||||
|         BitField<46, 2, u64> cache_mode; | ||||
|         BitField<20, 24, s64> immediate_offset; | ||||
|     } stg; | ||||
| 
 | ||||
|     union { | ||||
|         BitField<0, 3, u64> pred0; | ||||
|         BitField<3, 3, u64> pred3; | ||||
|  |  | |||
|  | @ -14,28 +14,28 @@ | |||
| 
 | ||||
| namespace OpenGL { | ||||
| 
 | ||||
| CachedGlobalRegion::CachedGlobalRegion(VAddr cpu_addr, u32 size, u8* host_ptr) | ||||
|     : RasterizerCacheObject{host_ptr}, cpu_addr{cpu_addr}, size{size} { | ||||
| CachedGlobalRegion::CachedGlobalRegion(VAddr cpu_addr, u8* host_ptr, u32 size, u32 max_size) | ||||
|     : RasterizerCacheObject{host_ptr}, cpu_addr{cpu_addr}, host_ptr{host_ptr}, size{size}, | ||||
|       max_size{max_size} { | ||||
|     buffer.Create(); | ||||
|     // Bind and unbind the buffer so it gets allocated by the driver
 | ||||
|     glBindBuffer(GL_SHADER_STORAGE_BUFFER, buffer.handle); | ||||
|     glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0); | ||||
|     LabelGLObject(GL_BUFFER, buffer.handle, cpu_addr, "GlobalMemory"); | ||||
| } | ||||
| 
 | ||||
| void CachedGlobalRegion::Reload(u32 size_) { | ||||
|     constexpr auto max_size = static_cast<u32>(RasterizerOpenGL::MaxGlobalMemorySize); | ||||
| CachedGlobalRegion::~CachedGlobalRegion() = default; | ||||
| 
 | ||||
| void CachedGlobalRegion::Reload(u32 size_) { | ||||
|     size = size_; | ||||
|     if (size > max_size) { | ||||
|         size = max_size; | ||||
|         LOG_CRITICAL(HW_GPU, "Global region size {} exceeded the expected size {}!", size_, | ||||
|         LOG_CRITICAL(HW_GPU, "Global region size {} exceeded the supported size {}!", size_, | ||||
|                      max_size); | ||||
|     } | ||||
|     glNamedBufferData(buffer.handle, size, host_ptr, GL_STREAM_DRAW); | ||||
| } | ||||
| 
 | ||||
|     // TODO(Rodrigo): Get rid of Memory::GetPointer with a staging buffer
 | ||||
|     glBindBuffer(GL_SHADER_STORAGE_BUFFER, buffer.handle); | ||||
|     glBufferData(GL_SHADER_STORAGE_BUFFER, size, GetHostPtr(), GL_DYNAMIC_DRAW); | ||||
| void CachedGlobalRegion::Flush() { | ||||
|     LOG_DEBUG(Render_OpenGL, "Flushing {} bytes to CPU memory address 0x{:16}", size, cpu_addr); | ||||
|     glGetNamedBufferSubData(buffer.handle, 0, static_cast<GLsizeiptr>(size), host_ptr); | ||||
| } | ||||
| 
 | ||||
| GlobalRegion GlobalRegionCacheOpenGL::TryGetReservedGlobalRegion(CacheAddr addr, u32 size) const { | ||||
|  | @ -46,14 +46,16 @@ GlobalRegion GlobalRegionCacheOpenGL::TryGetReservedGlobalRegion(CacheAddr addr, | |||
|     return search->second; | ||||
| } | ||||
| 
 | ||||
| GlobalRegion GlobalRegionCacheOpenGL::GetUncachedGlobalRegion(GPUVAddr addr, u32 size, | ||||
|                                                               u8* host_ptr) { | ||||
| GlobalRegion GlobalRegionCacheOpenGL::GetUncachedGlobalRegion(GPUVAddr addr, u8* host_ptr, | ||||
|                                                               u32 size) { | ||||
|     GlobalRegion region{TryGetReservedGlobalRegion(ToCacheAddr(host_ptr), size)}; | ||||
|     if (!region) { | ||||
|         // No reserved global region available, create a new one and reserve it
 | ||||
|         auto& memory_manager{Core::System::GetInstance().GPU().MemoryManager()}; | ||||
|         const auto cpu_addr = *memory_manager.GpuToCpuAddress(addr); | ||||
|         region = std::make_shared<CachedGlobalRegion>(cpu_addr, size, host_ptr); | ||||
|         const auto cpu_addr{memory_manager.GpuToCpuAddress(addr)}; | ||||
|         ASSERT(cpu_addr); | ||||
| 
 | ||||
|         region = std::make_shared<CachedGlobalRegion>(*cpu_addr, host_ptr, size, max_ssbo_size); | ||||
|         ReserveGlobalRegion(region); | ||||
|     } | ||||
|     region->Reload(size); | ||||
|  | @ -65,7 +67,11 @@ void GlobalRegionCacheOpenGL::ReserveGlobalRegion(GlobalRegion region) { | |||
| } | ||||
| 
 | ||||
| GlobalRegionCacheOpenGL::GlobalRegionCacheOpenGL(RasterizerOpenGL& rasterizer) | ||||
|     : RasterizerCache{rasterizer} {} | ||||
|     : RasterizerCache{rasterizer} { | ||||
|     GLint max_ssbo_size_; | ||||
|     glGetIntegerv(GL_MAX_SHADER_STORAGE_BLOCK_SIZE, &max_ssbo_size_); | ||||
|     max_ssbo_size = static_cast<u32>(max_ssbo_size_); | ||||
| } | ||||
| 
 | ||||
| GlobalRegion GlobalRegionCacheOpenGL::GetGlobalRegion( | ||||
|     const GLShader::GlobalMemoryEntry& global_region, | ||||
|  | @ -73,7 +79,7 @@ GlobalRegion GlobalRegionCacheOpenGL::GetGlobalRegion( | |||
| 
 | ||||
|     auto& gpu{Core::System::GetInstance().GPU()}; | ||||
|     auto& memory_manager{gpu.MemoryManager()}; | ||||
|     const auto cbufs{gpu.Maxwell3D().state.shader_stages[static_cast<u64>(stage)]}; | ||||
|     const auto cbufs{gpu.Maxwell3D().state.shader_stages[static_cast<std::size_t>(stage)]}; | ||||
|     const auto addr{cbufs.const_buffers[global_region.GetCbufIndex()].address + | ||||
|                     global_region.GetCbufOffset()}; | ||||
|     const auto actual_addr{memory_manager.Read<u64>(addr)}; | ||||
|  | @ -85,7 +91,7 @@ GlobalRegion GlobalRegionCacheOpenGL::GetGlobalRegion( | |||
| 
 | ||||
|     if (!region) { | ||||
|         // No global region found - create a new one
 | ||||
|         region = GetUncachedGlobalRegion(actual_addr, size, host_ptr); | ||||
|         region = GetUncachedGlobalRegion(actual_addr, host_ptr, size); | ||||
|         Register(region); | ||||
|     } | ||||
| 
 | ||||
|  |  | |||
|  | @ -19,7 +19,7 @@ namespace OpenGL { | |||
| 
 | ||||
| namespace GLShader { | ||||
| class GlobalMemoryEntry; | ||||
| } // namespace GLShader
 | ||||
| } | ||||
| 
 | ||||
| class RasterizerOpenGL; | ||||
| class CachedGlobalRegion; | ||||
|  | @ -27,7 +27,8 @@ using GlobalRegion = std::shared_ptr<CachedGlobalRegion>; | |||
| 
 | ||||
| class CachedGlobalRegion final : public RasterizerCacheObject { | ||||
| public: | ||||
|     explicit CachedGlobalRegion(VAddr cpu_addr, u32 size, u8* host_ptr); | ||||
|     explicit CachedGlobalRegion(VAddr cpu_addr, u8* host_ptr, u32 size, u32 max_size); | ||||
|     ~CachedGlobalRegion(); | ||||
| 
 | ||||
|     VAddr GetCpuAddr() const override { | ||||
|         return cpu_addr; | ||||
|  | @ -45,14 +46,14 @@ public: | |||
|     /// Reloads the global region from guest memory
 | ||||
|     void Reload(u32 size_); | ||||
| 
 | ||||
|     // TODO(Rodrigo): When global memory is written (STG), implement flushing
 | ||||
|     void Flush() override { | ||||
|         UNIMPLEMENTED(); | ||||
|     } | ||||
|     void Flush() override; | ||||
| 
 | ||||
| private: | ||||
|     VAddr cpu_addr{}; | ||||
|     u8* host_ptr{}; | ||||
|     u32 size{}; | ||||
|     u32 max_size{}; | ||||
| 
 | ||||
|     OGLBuffer buffer; | ||||
| }; | ||||
| 
 | ||||
|  | @ -66,10 +67,11 @@ public: | |||
| 
 | ||||
| private: | ||||
|     GlobalRegion TryGetReservedGlobalRegion(CacheAddr addr, u32 size) const; | ||||
|     GlobalRegion GetUncachedGlobalRegion(GPUVAddr addr, u32 size, u8* host_ptr); | ||||
|     GlobalRegion GetUncachedGlobalRegion(GPUVAddr addr, u8* host_ptr, u32 size); | ||||
|     void ReserveGlobalRegion(GlobalRegion region); | ||||
| 
 | ||||
|     std::unordered_map<CacheAddr, GlobalRegion> reserve; | ||||
|     u32 max_ssbo_size{}; | ||||
| }; | ||||
| 
 | ||||
| } // namespace OpenGL
 | ||||
|  |  | |||
|  | @ -756,6 +756,7 @@ void RasterizerOpenGL::FlushRegion(CacheAddr addr, u64 size) { | |||
|         return; | ||||
|     } | ||||
|     res_cache.FlushRegion(addr, size); | ||||
|     global_cache.FlushRegion(addr, size); | ||||
| } | ||||
| 
 | ||||
| void RasterizerOpenGL::InvalidateRegion(CacheAddr addr, u64 size) { | ||||
|  | @ -953,6 +954,9 @@ void RasterizerOpenGL::SetupGlobalRegions(Tegra::Engines::Maxwell3D::Regs::Shade | |||
|     for (std::size_t bindpoint = 0; bindpoint < entries.size(); ++bindpoint) { | ||||
|         const auto& entry{entries[bindpoint]}; | ||||
|         const auto& region{global_cache.GetGlobalRegion(entry, stage)}; | ||||
|         if (entry.IsWritten()) { | ||||
|             region->MarkAsModified(true, global_cache); | ||||
|         } | ||||
|         bind_ssbo_pushbuffer.Push(region->GetBufferHandle(), 0, | ||||
|                                   static_cast<GLsizeiptr>(region->GetSizeInBytes())); | ||||
|     } | ||||
|  |  | |||
|  | @ -71,10 +71,6 @@ public: | |||
|     static_assert(MaxConstbufferSize % sizeof(GLvec4) == 0, | ||||
|                   "The maximum size of a constbuffer must be a multiple of the size of GLvec4"); | ||||
| 
 | ||||
|     static constexpr std::size_t MaxGlobalMemorySize = 0x10000; | ||||
|     static_assert(MaxGlobalMemorySize % sizeof(float) == 0, | ||||
|                   "The maximum size of a global memory must be a multiple of the size of float"); | ||||
| 
 | ||||
| private: | ||||
|     class SamplerInfo { | ||||
|     public: | ||||
|  |  | |||
|  | @ -45,8 +45,6 @@ using TextureIR = std::variant<TextureAoffi, TextureArgument>; | |||
| enum : u32 { POSITION_VARYING_LOCATION = 0, GENERIC_VARYING_START_LOCATION = 1 }; | ||||
| constexpr u32 MAX_CONSTBUFFER_ELEMENTS = | ||||
|     static_cast<u32>(RasterizerOpenGL::MaxConstbufferSize) / (4 * sizeof(float)); | ||||
| constexpr u32 MAX_GLOBALMEMORY_ELEMENTS = | ||||
|     static_cast<u32>(RasterizerOpenGL::MaxGlobalMemorySize) / sizeof(float); | ||||
| 
 | ||||
| class ShaderWriter { | ||||
| public: | ||||
|  | @ -208,8 +206,10 @@ public: | |||
|         for (const auto& sampler : ir.GetSamplers()) { | ||||
|             entries.samplers.emplace_back(sampler); | ||||
|         } | ||||
|         for (const auto& gmem : ir.GetGlobalMemoryBases()) { | ||||
|             entries.global_memory_entries.emplace_back(gmem.cbuf_index, gmem.cbuf_offset); | ||||
|         for (const auto& gmem_pair : ir.GetGlobalMemory()) { | ||||
|             const auto& [base, usage] = gmem_pair; | ||||
|             entries.global_memory_entries.emplace_back(base.cbuf_index, base.cbuf_offset, | ||||
|                                                        usage.is_read, usage.is_written); | ||||
|         } | ||||
|         entries.clip_distances = ir.GetClipDistances(); | ||||
|         entries.shader_length = ir.GetLength(); | ||||
|  | @ -380,12 +380,22 @@ private: | |||
|     } | ||||
| 
 | ||||
|     void DeclareGlobalMemory() { | ||||
|         for (const auto& entry : ir.GetGlobalMemoryBases()) { | ||||
|         for (const auto& gmem : ir.GetGlobalMemory()) { | ||||
|             const auto& [base, usage] = gmem; | ||||
| 
 | ||||
|             // Since we don't know how the shader will use the global memory, hint the driver to
 | ||||
|             // disable as many optimizations as possible
 | ||||
|             std::string qualifier = "coherent volatile"; | ||||
|             if (usage.is_read && !usage.is_written) | ||||
|                 qualifier += " readonly"; | ||||
|             else if (usage.is_written && !usage.is_read) | ||||
|                 qualifier += " writeonly"; | ||||
| 
 | ||||
|             const std::string binding = | ||||
|                 fmt::format("GMEM_BINDING_{}_{}", entry.cbuf_index, entry.cbuf_offset); | ||||
|             code.AddLine("layout (std430, binding = " + binding + ") buffer " + | ||||
|                          GetGlobalMemoryBlock(entry) + " {"); | ||||
|             code.AddLine("    float " + GetGlobalMemory(entry) + "[MAX_GLOBALMEMORY_ELEMENTS];"); | ||||
|                 fmt::format("GMEM_BINDING_{}_{}", base.cbuf_index, base.cbuf_offset); | ||||
|             code.AddLine("layout (std430, binding = " + binding + ") " + qualifier + " buffer " + | ||||
|                          GetGlobalMemoryBlock(base) + " {"); | ||||
|             code.AddLine("    float " + GetGlobalMemory(base) + "[];"); | ||||
|             code.AddLine("};"); | ||||
|             code.AddNewLine(); | ||||
|         } | ||||
|  | @ -868,6 +878,12 @@ private: | |||
|         } else if (const auto lmem = std::get_if<LmemNode>(dest)) { | ||||
|             target = GetLocalMemory() + "[ftou(" + Visit(lmem->GetAddress()) + ") / 4]"; | ||||
| 
 | ||||
|         } else if (const auto gmem = std::get_if<GmemNode>(dest)) { | ||||
|             const std::string real = Visit(gmem->GetRealAddress()); | ||||
|             const std::string base = Visit(gmem->GetBaseAddress()); | ||||
|             const std::string final_offset = "(ftou(" + real + ") - ftou(" + base + ")) / 4"; | ||||
|             target = fmt::format("{}[{}]", GetGlobalMemory(gmem->GetDescriptor()), final_offset); | ||||
| 
 | ||||
|         } else { | ||||
|             UNREACHABLE_MSG("Assign called without a proper target"); | ||||
|         } | ||||
|  | @ -1621,9 +1637,7 @@ private: | |||
| 
 | ||||
| std::string GetCommonDeclarations() { | ||||
|     const auto cbuf = std::to_string(MAX_CONSTBUFFER_ELEMENTS); | ||||
|     const auto gmem = std::to_string(MAX_GLOBALMEMORY_ELEMENTS); | ||||
|     return "#define MAX_CONSTBUFFER_ELEMENTS " + cbuf + "\n" + | ||||
|            "#define MAX_GLOBALMEMORY_ELEMENTS " + gmem + "\n" + | ||||
|            "#define ftoi floatBitsToInt\n" | ||||
|            "#define ftou floatBitsToUint\n" | ||||
|            "#define itof intBitsToFloat\n" | ||||
|  |  | |||
|  | @ -39,8 +39,9 @@ private: | |||
| 
 | ||||
| class GlobalMemoryEntry { | ||||
| public: | ||||
|     explicit GlobalMemoryEntry(u32 cbuf_index, u32 cbuf_offset) | ||||
|         : cbuf_index{cbuf_index}, cbuf_offset{cbuf_offset} {} | ||||
|     explicit GlobalMemoryEntry(u32 cbuf_index, u32 cbuf_offset, bool is_read, bool is_written) | ||||
|         : cbuf_index{cbuf_index}, cbuf_offset{cbuf_offset}, is_read{is_read}, is_written{ | ||||
|                                                                                   is_written} {} | ||||
| 
 | ||||
|     u32 GetCbufIndex() const { | ||||
|         return cbuf_index; | ||||
|  | @ -50,9 +51,19 @@ public: | |||
|         return cbuf_offset; | ||||
|     } | ||||
| 
 | ||||
|     bool IsRead() const { | ||||
|         return is_read; | ||||
|     } | ||||
| 
 | ||||
|     bool IsWritten() const { | ||||
|         return is_written; | ||||
|     } | ||||
| 
 | ||||
| private: | ||||
|     u32 cbuf_index{}; | ||||
|     u32 cbuf_offset{}; | ||||
|     bool is_read{}; | ||||
|     bool is_written{}; | ||||
| }; | ||||
| 
 | ||||
| struct ShaderEntries { | ||||
|  |  | |||
|  | @ -337,11 +337,16 @@ std::optional<ShaderDiskCacheDecompiled> ShaderDiskCacheOpenGL::LoadDecompiledEn | |||
|     for (u32 i = 0; i < global_memory_count; ++i) { | ||||
|         u32 cbuf_index{}; | ||||
|         u32 cbuf_offset{}; | ||||
|         u8 is_read{}; | ||||
|         u8 is_written{}; | ||||
|         if (file.ReadBytes(&cbuf_index, sizeof(u32)) != sizeof(u32) || | ||||
|             file.ReadBytes(&cbuf_offset, sizeof(u32)) != sizeof(u32)) { | ||||
|             file.ReadBytes(&cbuf_offset, sizeof(u32)) != sizeof(u32) || | ||||
|             file.ReadBytes(&is_read, sizeof(u8)) != sizeof(u8) || | ||||
|             file.ReadBytes(&is_written, sizeof(u8)) != sizeof(u8)) { | ||||
|             return {}; | ||||
|         } | ||||
|         entry.entries.global_memory_entries.emplace_back(cbuf_index, cbuf_offset); | ||||
|         entry.entries.global_memory_entries.emplace_back(cbuf_index, cbuf_offset, is_read != 0, | ||||
|                                                          is_written != 0); | ||||
|     } | ||||
| 
 | ||||
|     for (auto& clip_distance : entry.entries.clip_distances) { | ||||
|  | @ -397,7 +402,9 @@ bool ShaderDiskCacheOpenGL::SaveDecompiledFile(FileUtil::IOFile& file, u64 uniqu | |||
|         return false; | ||||
|     for (const auto& gmem : entries.global_memory_entries) { | ||||
|         if (file.WriteObject(static_cast<u32>(gmem.GetCbufIndex())) != 1 || | ||||
|             file.WriteObject(static_cast<u32>(gmem.GetCbufOffset())) != 1) { | ||||
|             file.WriteObject(static_cast<u32>(gmem.GetCbufOffset())) != 1 || | ||||
|             file.WriteObject(static_cast<u8>(gmem.IsRead() ? 1 : 0)) != 1 || | ||||
|             file.WriteObject(static_cast<u8>(gmem.IsWritten() ? 1 : 0)) != 1) { | ||||
|             return false; | ||||
|         } | ||||
|     } | ||||
|  |  | |||
|  | @ -191,8 +191,9 @@ public: | |||
|         for (const auto& cbuf : ir.GetConstantBuffers()) { | ||||
|             entries.const_buffers.emplace_back(cbuf.second, cbuf.first); | ||||
|         } | ||||
|         for (const auto& gmem : ir.GetGlobalMemoryBases()) { | ||||
|             entries.global_buffers.emplace_back(gmem.cbuf_index, gmem.cbuf_offset); | ||||
|         for (const auto& gmem_pair : ir.GetGlobalMemory()) { | ||||
|             const auto& [base, usage] = gmem_pair; | ||||
|             entries.global_buffers.emplace_back(base.cbuf_index, base.cbuf_offset); | ||||
|         } | ||||
|         for (const auto& sampler : ir.GetSamplers()) { | ||||
|             entries.samplers.emplace_back(sampler); | ||||
|  | @ -225,7 +226,7 @@ private: | |||
|             return current_binding; | ||||
|         }; | ||||
|         const_buffers_base_binding = Allocate(ir.GetConstantBuffers().size()); | ||||
|         global_buffers_base_binding = Allocate(ir.GetGlobalMemoryBases().size()); | ||||
|         global_buffers_base_binding = Allocate(ir.GetGlobalMemory().size()); | ||||
|         samplers_base_binding = Allocate(ir.GetSamplers().size()); | ||||
| 
 | ||||
|         ASSERT_MSG(binding_iterator - binding_base < STAGE_BINDING_STRIDE, | ||||
|  | @ -390,14 +391,15 @@ private: | |||
| 
 | ||||
|     void DeclareGlobalBuffers() { | ||||
|         u32 binding = global_buffers_base_binding; | ||||
|         for (const auto& entry : ir.GetGlobalMemoryBases()) { | ||||
|         for (const auto& entry : ir.GetGlobalMemory()) { | ||||
|             const auto [base, usage] = entry; | ||||
|             const Id id = OpVariable(t_gmem_ssbo, spv::StorageClass::StorageBuffer); | ||||
|             AddGlobalVariable( | ||||
|                 Name(id, fmt::format("gmem_{}_{}", entry.cbuf_index, entry.cbuf_offset))); | ||||
|                 Name(id, fmt::format("gmem_{}_{}", base.cbuf_index, base.cbuf_offset))); | ||||
| 
 | ||||
|             Decorate(id, spv::Decoration::Binding, binding++); | ||||
|             Decorate(id, spv::Decoration::DescriptorSet, DESCRIPTOR_SET); | ||||
|             global_buffers.emplace(entry, id); | ||||
|             global_buffers.emplace(base, id); | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
|  |  | |||
|  | @ -18,6 +18,23 @@ using Tegra::Shader::Instruction; | |||
| using Tegra::Shader::OpCode; | ||||
| using Tegra::Shader::Register; | ||||
| 
 | ||||
| namespace { | ||||
| u32 GetUniformTypeElementsCount(Tegra::Shader::UniformType uniform_type) { | ||||
|     switch (uniform_type) { | ||||
|     case Tegra::Shader::UniformType::Single: | ||||
|         return 1; | ||||
|     case Tegra::Shader::UniformType::Double: | ||||
|         return 2; | ||||
|     case Tegra::Shader::UniformType::Quad: | ||||
|     case Tegra::Shader::UniformType::UnsignedQuad: | ||||
|         return 4; | ||||
|     default: | ||||
|         UNIMPLEMENTED_MSG("Unimplemented size={}!", static_cast<u32>(uniform_type)); | ||||
|         return 1; | ||||
|     } | ||||
| } | ||||
| } // namespace
 | ||||
| 
 | ||||
| u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) { | ||||
|     const Instruction instr = {program_code[pc]}; | ||||
|     const auto opcode = OpCode::Decode(instr); | ||||
|  | @ -126,45 +143,15 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) { | |||
|         break; | ||||
|     } | ||||
|     case OpCode::Id::LDG: { | ||||
|         const u32 count = [&]() { | ||||
|             switch (instr.ldg.type) { | ||||
|             case Tegra::Shader::UniformType::Single: | ||||
|                 return 1; | ||||
|             case Tegra::Shader::UniformType::Double: | ||||
|                 return 2; | ||||
|             case Tegra::Shader::UniformType::Quad: | ||||
|             case Tegra::Shader::UniformType::UnsignedQuad: | ||||
|                 return 4; | ||||
|             default: | ||||
|                 UNIMPLEMENTED_MSG("Unimplemented LDG size!"); | ||||
|                 return 1; | ||||
|             } | ||||
|         }(); | ||||
| 
 | ||||
|         const Node addr_register = GetRegister(instr.gpr8); | ||||
|         const Node base_address = | ||||
|             TrackCbuf(addr_register, global_code, static_cast<s64>(global_code.size())); | ||||
|         const auto cbuf = std::get_if<CbufNode>(base_address); | ||||
|         ASSERT(cbuf != nullptr); | ||||
|         const auto cbuf_offset_imm = std::get_if<ImmediateNode>(cbuf->GetOffset()); | ||||
|         ASSERT(cbuf_offset_imm != nullptr); | ||||
|         const auto cbuf_offset = cbuf_offset_imm->GetValue(); | ||||
| 
 | ||||
|         bb.push_back(Comment( | ||||
|             fmt::format("Base address is c[0x{:x}][0x{:x}]", cbuf->GetIndex(), cbuf_offset))); | ||||
| 
 | ||||
|         const GlobalMemoryBase descriptor{cbuf->GetIndex(), cbuf_offset}; | ||||
|         used_global_memory_bases.insert(descriptor); | ||||
| 
 | ||||
|         const Node immediate_offset = | ||||
|             Immediate(static_cast<u32>(instr.ldg.immediate_offset.Value())); | ||||
|         const Node base_real_address = | ||||
|             Operation(OperationCode::UAdd, NO_PRECISE, immediate_offset, addr_register); | ||||
|         const auto [real_address_base, base_address, descriptor] = | ||||
|             TrackAndGetGlobalMemory(bb, GetRegister(instr.gpr8), | ||||
|                                     static_cast<u32>(instr.ldg.immediate_offset.Value()), false); | ||||
| 
 | ||||
|         const u32 count = GetUniformTypeElementsCount(instr.ldg.type); | ||||
|         for (u32 i = 0; i < count; ++i) { | ||||
|             const Node it_offset = Immediate(i * 4); | ||||
|             const Node real_address = | ||||
|                 Operation(OperationCode::UAdd, NO_PRECISE, base_real_address, it_offset); | ||||
|                 Operation(OperationCode::UAdd, NO_PRECISE, real_address_base, it_offset); | ||||
|             const Node gmem = StoreNode(GmemNode(real_address, base_address, descriptor)); | ||||
| 
 | ||||
|             SetTemporal(bb, i, gmem); | ||||
|  | @ -174,6 +161,28 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) { | |||
|         } | ||||
|         break; | ||||
|     } | ||||
|     case OpCode::Id::STG: { | ||||
|         const auto [real_address_base, base_address, descriptor] = | ||||
|             TrackAndGetGlobalMemory(bb, GetRegister(instr.gpr8), | ||||
|                                     static_cast<u32>(instr.stg.immediate_offset.Value()), true); | ||||
| 
 | ||||
|         // Encode in temporary registers like this: real_base_address, {registers_to_be_written...}
 | ||||
|         SetTemporal(bb, 0, real_address_base); | ||||
| 
 | ||||
|         const u32 count = GetUniformTypeElementsCount(instr.stg.type); | ||||
|         for (u32 i = 0; i < count; ++i) { | ||||
|             SetTemporal(bb, i + 1, GetRegister(instr.gpr0.Value() + i)); | ||||
|         } | ||||
|         for (u32 i = 0; i < count; ++i) { | ||||
|             const Node it_offset = Immediate(i * 4); | ||||
|             const Node real_address = | ||||
|                 Operation(OperationCode::UAdd, NO_PRECISE, real_address_base, it_offset); | ||||
|             const Node gmem = StoreNode(GmemNode(real_address, base_address, descriptor)); | ||||
| 
 | ||||
|             bb.push_back(Operation(OperationCode::Assign, gmem, GetTemporal(i + 1))); | ||||
|         } | ||||
|         break; | ||||
|     } | ||||
|     case OpCode::Id::ST_A: { | ||||
|         UNIMPLEMENTED_IF_MSG(instr.gpr8.Value() != Register::ZeroIndex, | ||||
|                              "Indirect attribute loads are not supported"); | ||||
|  | @ -236,4 +245,34 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) { | |||
|     return pc; | ||||
| } | ||||
| 
 | ||||
| std::tuple<Node, Node, GlobalMemoryBase> ShaderIR::TrackAndGetGlobalMemory(NodeBlock& bb, | ||||
|                                                                            Node addr_register, | ||||
|                                                                            u32 immediate_offset, | ||||
|                                                                            bool is_write) { | ||||
|     const Node base_address{ | ||||
|         TrackCbuf(addr_register, global_code, static_cast<s64>(global_code.size()))}; | ||||
|     const auto cbuf = std::get_if<CbufNode>(base_address); | ||||
|     ASSERT(cbuf != nullptr); | ||||
|     const auto cbuf_offset_imm = std::get_if<ImmediateNode>(cbuf->GetOffset()); | ||||
|     ASSERT(cbuf_offset_imm != nullptr); | ||||
|     const auto cbuf_offset = cbuf_offset_imm->GetValue(); | ||||
| 
 | ||||
|     bb.push_back( | ||||
|         Comment(fmt::format("Base address is c[0x{:x}][0x{:x}]", cbuf->GetIndex(), cbuf_offset))); | ||||
| 
 | ||||
|     const GlobalMemoryBase descriptor{cbuf->GetIndex(), cbuf_offset}; | ||||
|     const auto& [entry, is_new] = used_global_memory.try_emplace(descriptor); | ||||
|     auto& usage = entry->second; | ||||
|     if (is_write) { | ||||
|         usage.is_written = true; | ||||
|     } else { | ||||
|         usage.is_read = true; | ||||
|     } | ||||
| 
 | ||||
|     const auto real_address = | ||||
|         Operation(OperationCode::UAdd, NO_PRECISE, Immediate(immediate_offset), addr_register); | ||||
| 
 | ||||
|     return {real_address, base_address, descriptor}; | ||||
| } | ||||
| 
 | ||||
| } // namespace VideoCommon::Shader
 | ||||
|  |  | |||
|  | @ -276,6 +276,11 @@ struct GlobalMemoryBase { | |||
|     } | ||||
| }; | ||||
| 
 | ||||
| struct GlobalMemoryUsage { | ||||
|     bool is_read{}; | ||||
|     bool is_written{}; | ||||
| }; | ||||
| 
 | ||||
| struct MetaArithmetic { | ||||
|     bool precise{}; | ||||
| }; | ||||
|  | @ -578,8 +583,8 @@ public: | |||
|         return used_clip_distances; | ||||
|     } | ||||
| 
 | ||||
|     const std::set<GlobalMemoryBase>& GetGlobalMemoryBases() const { | ||||
|         return used_global_memory_bases; | ||||
|     const std::map<GlobalMemoryBase, GlobalMemoryUsage>& GetGlobalMemory() const { | ||||
|         return used_global_memory; | ||||
|     } | ||||
| 
 | ||||
|     std::size_t GetLength() const { | ||||
|  | @ -781,6 +786,11 @@ private: | |||
| 
 | ||||
|     std::pair<Node, s64> TrackRegister(const GprNode* tracked, const NodeBlock& code, s64 cursor); | ||||
| 
 | ||||
|     std::tuple<Node, Node, GlobalMemoryBase> TrackAndGetGlobalMemory(NodeBlock& bb, | ||||
|                                                                      Node addr_register, | ||||
|                                                                      u32 immediate_offset, | ||||
|                                                                      bool is_write); | ||||
| 
 | ||||
|     template <typename... T> | ||||
|     Node Operation(OperationCode code, const T*... operands) { | ||||
|         return StoreNode(OperationNode(code, operands...)); | ||||
|  | @ -834,7 +844,7 @@ private: | |||
|     std::map<u32, ConstBuffer> used_cbufs; | ||||
|     std::set<Sampler> used_samplers; | ||||
|     std::array<bool, Tegra::Engines::Maxwell3D::Regs::NumClipDistances> used_clip_distances{}; | ||||
|     std::set<GlobalMemoryBase> used_global_memory_bases; | ||||
|     std::map<GlobalMemoryBase, GlobalMemoryUsage> used_global_memory; | ||||
| 
 | ||||
|     Tegra::Shader::Header header; | ||||
| }; | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 bunnei
						bunnei