forked from eden-emu/eden
		
	Merge pull request #9849 from ameerj/async-astc
texture_cache: Add asynchronous ASTC texture decoding
This commit is contained in:
		
						commit
						26c1edf2f0
					
				
					 15 changed files with 138 additions and 8 deletions
				
			
		|  | @ -23,6 +23,7 @@ public: | |||
|           buffer{Common::make_unique_for_overwrite<T[]>(initial_capacity)} {} | ||||
| 
 | ||||
|     ~ScratchBuffer() = default; | ||||
|     ScratchBuffer(ScratchBuffer&&) = default; | ||||
| 
 | ||||
|     /// This will only grow the buffer's capacity if size is greater than the current capacity.
 | ||||
|     /// The previously held data will remain intact.
 | ||||
|  |  | |||
|  | @ -59,6 +59,7 @@ void LogSettings() { | |||
|                 values.use_asynchronous_gpu_emulation.GetValue()); | ||||
|     log_setting("Renderer_NvdecEmulation", values.nvdec_emulation.GetValue()); | ||||
|     log_setting("Renderer_AccelerateASTC", values.accelerate_astc.GetValue()); | ||||
|     log_setting("Renderer_AsyncASTC", values.async_astc.GetValue()); | ||||
|     log_setting("Renderer_UseVsync", values.use_vsync.GetValue()); | ||||
|     log_setting("Renderer_ShaderBackend", values.shader_backend.GetValue()); | ||||
|     log_setting("Renderer_UseAsynchronousShaders", values.use_asynchronous_shaders.GetValue()); | ||||
|  | @ -219,6 +220,7 @@ void RestoreGlobalState(bool is_powered_on) { | |||
|     values.use_asynchronous_gpu_emulation.SetGlobal(true); | ||||
|     values.nvdec_emulation.SetGlobal(true); | ||||
|     values.accelerate_astc.SetGlobal(true); | ||||
|     values.async_astc.SetGlobal(true); | ||||
|     values.use_vsync.SetGlobal(true); | ||||
|     values.shader_backend.SetGlobal(true); | ||||
|     values.use_asynchronous_shaders.SetGlobal(true); | ||||
|  |  | |||
|  | @ -453,6 +453,7 @@ struct Values { | |||
|     SwitchableSetting<bool> use_asynchronous_gpu_emulation{true, "use_asynchronous_gpu_emulation"}; | ||||
|     SwitchableSetting<NvdecEmulation> nvdec_emulation{NvdecEmulation::GPU, "nvdec_emulation"}; | ||||
|     SwitchableSetting<bool> accelerate_astc{true, "accelerate_astc"}; | ||||
|     SwitchableSetting<bool> async_astc{false, "async_astc"}; | ||||
|     SwitchableSetting<bool> use_vsync{true, "use_vsync"}; | ||||
|     SwitchableSetting<ShaderBackend, true> shader_backend{ShaderBackend::GLSL, ShaderBackend::GLSL, | ||||
|                                                           ShaderBackend::SPIRV, "shader_backend"}; | ||||
|  |  | |||
|  | @ -228,8 +228,9 @@ void ApplySwizzle(GLuint handle, PixelFormat format, std::array<SwizzleSource, 4 | |||
| 
 | ||||
| [[nodiscard]] bool CanBeAccelerated(const TextureCacheRuntime& runtime, | ||||
|                                     const VideoCommon::ImageInfo& info) { | ||||
|     if (IsPixelFormatASTC(info.format)) { | ||||
|         return !runtime.HasNativeASTC() && Settings::values.accelerate_astc.GetValue(); | ||||
|     if (IsPixelFormatASTC(info.format) && !runtime.HasNativeASTC()) { | ||||
|         return Settings::values.accelerate_astc.GetValue() && | ||||
|                !Settings::values.async_astc.GetValue(); | ||||
|     } | ||||
|     // Disable other accelerated uploads for now as they don't implement swizzled uploads
 | ||||
|     return false; | ||||
|  | @ -258,6 +259,14 @@ void ApplySwizzle(GLuint handle, PixelFormat format, std::array<SwizzleSource, 4 | |||
|     return format_info.compatibility_class == store_class; | ||||
| } | ||||
| 
 | ||||
| [[nodiscard]] bool CanBeDecodedAsync(const TextureCacheRuntime& runtime, | ||||
|                                      const VideoCommon::ImageInfo& info) { | ||||
|     if (IsPixelFormatASTC(info.format) && !runtime.HasNativeASTC()) { | ||||
|         return Settings::values.async_astc.GetValue(); | ||||
|     } | ||||
|     return false; | ||||
| } | ||||
| 
 | ||||
| [[nodiscard]] CopyOrigin MakeCopyOrigin(VideoCommon::Offset3D offset, | ||||
|                                         VideoCommon::SubresourceLayers subresource, GLenum target) { | ||||
|     switch (target) { | ||||
|  | @ -721,7 +730,9 @@ std::optional<size_t> TextureCacheRuntime::StagingBuffers::FindBuffer(size_t req | |||
| Image::Image(TextureCacheRuntime& runtime_, const VideoCommon::ImageInfo& info_, GPUVAddr gpu_addr_, | ||||
|              VAddr cpu_addr_) | ||||
|     : VideoCommon::ImageBase(info_, gpu_addr_, cpu_addr_), runtime{&runtime_} { | ||||
|     if (CanBeAccelerated(*runtime, info)) { | ||||
|     if (CanBeDecodedAsync(*runtime, info)) { | ||||
|         flags |= ImageFlagBits::AsynchronousDecode; | ||||
|     } else if (CanBeAccelerated(*runtime, info)) { | ||||
|         flags |= ImageFlagBits::AcceleratedUpload; | ||||
|     } | ||||
|     if (IsConverted(runtime->device, info.format, info.type)) { | ||||
|  |  | |||
|  | @ -1256,11 +1256,12 @@ Image::Image(TextureCacheRuntime& runtime_, const ImageInfo& info_, GPUVAddr gpu | |||
|       commit(runtime_.memory_allocator.Commit(original_image, MemoryUsage::DeviceLocal)), | ||||
|       aspect_mask(ImageAspectMask(info.format)) { | ||||
|     if (IsPixelFormatASTC(info.format) && !runtime->device.IsOptimalAstcSupported()) { | ||||
|         if (Settings::values.accelerate_astc.GetValue()) { | ||||
|         if (Settings::values.async_astc.GetValue()) { | ||||
|             flags |= VideoCommon::ImageFlagBits::AsynchronousDecode; | ||||
|         } else if (Settings::values.accelerate_astc.GetValue()) { | ||||
|             flags |= VideoCommon::ImageFlagBits::AcceleratedUpload; | ||||
|         } else { | ||||
|             flags |= VideoCommon::ImageFlagBits::Converted; | ||||
|         } | ||||
|         flags |= VideoCommon::ImageFlagBits::Converted; | ||||
|         flags |= VideoCommon::ImageFlagBits::CostlyLoad; | ||||
|     } | ||||
|     if (runtime->device.HasDebuggingToolAttached()) { | ||||
|  |  | |||
|  | @ -38,6 +38,9 @@ enum class ImageFlagBits : u32 { | |||
|     Rescaled = 1 << 13, | ||||
|     CheckingRescalable = 1 << 14, | ||||
|     IsRescalable = 1 << 15, | ||||
| 
 | ||||
|     AsynchronousDecode = 1 << 16, | ||||
|     IsDecoding = 1 << 17, ///< Is currently being decoded asynchronously.
 | ||||
| }; | ||||
| DECLARE_ENUM_FLAG_OPERATORS(ImageFlagBits) | ||||
| 
 | ||||
|  |  | |||
|  | @ -85,6 +85,11 @@ void TextureCache<P>::RunGarbageCollector() { | |||
|         } | ||||
|         --num_iterations; | ||||
|         auto& image = slot_images[image_id]; | ||||
|         if (True(image.flags & ImageFlagBits::IsDecoding)) { | ||||
|             // This image is still being decoded; deleting it will invalidate the slot
 | ||||
|             // used by the async decoder thread.
 | ||||
|             return false; | ||||
|         } | ||||
|         const bool must_download = | ||||
|             image.IsSafeDownload() && False(image.flags & ImageFlagBits::BadOverlap); | ||||
|         if (!high_priority_mode && | ||||
|  | @ -133,6 +138,8 @@ void TextureCache<P>::TickFrame() { | |||
|     sentenced_images.Tick(); | ||||
|     sentenced_framebuffers.Tick(); | ||||
|     sentenced_image_view.Tick(); | ||||
|     TickAsyncDecode(); | ||||
| 
 | ||||
|     runtime.TickFrame(); | ||||
|     critical_gc = 0; | ||||
|     ++frame_tick; | ||||
|  | @ -777,6 +784,10 @@ void TextureCache<P>::RefreshContents(Image& image, ImageId image_id) { | |||
|         LOG_WARNING(HW_GPU, "MSAA image uploads are not implemented"); | ||||
|         return; | ||||
|     } | ||||
|     if (True(image.flags & ImageFlagBits::AsynchronousDecode)) { | ||||
|         QueueAsyncDecode(image, image_id); | ||||
|         return; | ||||
|     } | ||||
|     auto staging = runtime.UploadStagingBuffer(MapSizeBytes(image)); | ||||
|     UploadImageContents(image, staging); | ||||
|     runtime.InsertUploadMemoryBarrier(); | ||||
|  | @ -989,6 +1000,65 @@ u64 TextureCache<P>::GetScaledImageSizeBytes(const ImageBase& image) { | |||
|     return fitted_size; | ||||
| } | ||||
| 
 | ||||
| template <class P> | ||||
| void TextureCache<P>::QueueAsyncDecode(Image& image, ImageId image_id) { | ||||
|     UNIMPLEMENTED_IF(False(image.flags & ImageFlagBits::Converted)); | ||||
|     LOG_INFO(HW_GPU, "Queuing async texture decode"); | ||||
| 
 | ||||
|     image.flags |= ImageFlagBits::IsDecoding; | ||||
|     auto decode = std::make_unique<AsyncDecodeContext>(); | ||||
|     auto* decode_ptr = decode.get(); | ||||
|     decode->image_id = image_id; | ||||
|     async_decodes.push_back(std::move(decode)); | ||||
| 
 | ||||
|     Common::ScratchBuffer<u8> local_unswizzle_data_buffer(image.unswizzled_size_bytes); | ||||
|     const size_t guest_size_bytes = image.guest_size_bytes; | ||||
|     swizzle_data_buffer.resize_destructive(guest_size_bytes); | ||||
|     gpu_memory->ReadBlockUnsafe(image.gpu_addr, swizzle_data_buffer.data(), guest_size_bytes); | ||||
|     auto copies = UnswizzleImage(*gpu_memory, image.gpu_addr, image.info, swizzle_data_buffer, | ||||
|                                  local_unswizzle_data_buffer); | ||||
|     const size_t out_size = MapSizeBytes(image); | ||||
| 
 | ||||
|     auto func = [out_size, copies, info = image.info, | ||||
|                  input = std::move(local_unswizzle_data_buffer), | ||||
|                  async_decode = decode_ptr]() mutable { | ||||
|         async_decode->decoded_data.resize_destructive(out_size); | ||||
|         std::span copies_span{copies.data(), copies.size()}; | ||||
|         ConvertImage(input, info, async_decode->decoded_data, copies_span); | ||||
| 
 | ||||
|         // TODO: Do we need this lock?
 | ||||
|         std::unique_lock lock{async_decode->mutex}; | ||||
|         async_decode->copies = std::move(copies); | ||||
|         async_decode->complete = true; | ||||
|     }; | ||||
|     texture_decode_worker.QueueWork(std::move(func)); | ||||
| } | ||||
| 
 | ||||
| template <class P> | ||||
| void TextureCache<P>::TickAsyncDecode() { | ||||
|     bool has_uploads{}; | ||||
|     auto i = async_decodes.begin(); | ||||
|     while (i != async_decodes.end()) { | ||||
|         auto* async_decode = i->get(); | ||||
|         std::unique_lock lock{async_decode->mutex}; | ||||
|         if (!async_decode->complete) { | ||||
|             ++i; | ||||
|             continue; | ||||
|         } | ||||
|         Image& image = slot_images[async_decode->image_id]; | ||||
|         auto staging = runtime.UploadStagingBuffer(MapSizeBytes(image)); | ||||
|         std::memcpy(staging.mapped_span.data(), async_decode->decoded_data.data(), | ||||
|                     async_decode->decoded_data.size()); | ||||
|         image.UploadMemory(staging, async_decode->copies); | ||||
|         image.flags &= ~ImageFlagBits::IsDecoding; | ||||
|         has_uploads = true; | ||||
|         i = async_decodes.erase(i); | ||||
|     } | ||||
|     if (has_uploads) { | ||||
|         runtime.InsertUploadMemoryBarrier(); | ||||
|     } | ||||
| } | ||||
| 
 | ||||
| template <class P> | ||||
| bool TextureCache<P>::ScaleUp(Image& image) { | ||||
|     const bool has_copy = image.HasScaled(); | ||||
|  |  | |||
|  | @ -3,6 +3,7 @@ | |||
| 
 | ||||
| #pragma once | ||||
| 
 | ||||
| #include <atomic> | ||||
| #include <deque> | ||||
| #include <limits> | ||||
| #include <mutex> | ||||
|  | @ -18,6 +19,7 @@ | |||
| #include "common/lru_cache.h" | ||||
| #include "common/polyfill_ranges.h" | ||||
| #include "common/scratch_buffer.h" | ||||
| #include "common/thread_worker.h" | ||||
| #include "video_core/compatible_formats.h" | ||||
| #include "video_core/control/channel_state_cache.h" | ||||
| #include "video_core/delayed_destruction_ring.h" | ||||
|  | @ -54,6 +56,14 @@ struct ImageViewInOut { | |||
|     ImageViewId id{}; | ||||
| }; | ||||
| 
 | ||||
| struct AsyncDecodeContext { | ||||
|     ImageId image_id; | ||||
|     Common::ScratchBuffer<u8> decoded_data; | ||||
|     std::vector<BufferImageCopy> copies; | ||||
|     std::mutex mutex; | ||||
|     std::atomic_bool complete; | ||||
| }; | ||||
| 
 | ||||
| using TextureCacheGPUMap = std::unordered_map<u64, std::vector<ImageId>, Common::IdentityHash<u64>>; | ||||
| 
 | ||||
| class TextureCacheChannelInfo : public ChannelInfo { | ||||
|  | @ -377,6 +387,9 @@ private: | |||
|     bool ScaleDown(Image& image); | ||||
|     u64 GetScaledImageSizeBytes(const ImageBase& image); | ||||
| 
 | ||||
|     void QueueAsyncDecode(Image& image, ImageId image_id); | ||||
|     void TickAsyncDecode(); | ||||
| 
 | ||||
|     Runtime& runtime; | ||||
| 
 | ||||
|     VideoCore::RasterizerInterface& rasterizer; | ||||
|  | @ -430,6 +443,9 @@ private: | |||
| 
 | ||||
|     u64 modification_tick = 0; | ||||
|     u64 frame_tick = 0; | ||||
| 
 | ||||
|     Common::ThreadWorker texture_decode_worker{1, "TextureDecoder"}; | ||||
|     std::vector<std::unique_ptr<AsyncDecodeContext>> async_decodes; | ||||
| }; | ||||
| 
 | ||||
| } // namespace VideoCommon
 | ||||
|  |  | |||
|  | @ -1656,7 +1656,7 @@ void Decompress(std::span<const uint8_t> data, uint32_t width, uint32_t height, | |||
|     const u32 rows = Common::DivideUp(height, block_height); | ||||
|     const u32 cols = Common::DivideUp(width, block_width); | ||||
| 
 | ||||
|     Common::ThreadWorker workers{std::max(std::thread::hardware_concurrency(), 2U) / 2, | ||||
|     static Common::ThreadWorker workers{std::max(std::thread::hardware_concurrency(), 2U) / 2, | ||||
|                                         "ASTCDecompress"}; | ||||
| 
 | ||||
|     for (u32 z = 0; z < depth; ++z) { | ||||
|  |  | |||
|  | @ -702,6 +702,7 @@ void Config::ReadRendererValues() { | |||
|     ReadGlobalSetting(Settings::values.use_asynchronous_gpu_emulation); | ||||
|     ReadGlobalSetting(Settings::values.nvdec_emulation); | ||||
|     ReadGlobalSetting(Settings::values.accelerate_astc); | ||||
|     ReadGlobalSetting(Settings::values.async_astc); | ||||
|     ReadGlobalSetting(Settings::values.use_vsync); | ||||
|     ReadGlobalSetting(Settings::values.shader_backend); | ||||
|     ReadGlobalSetting(Settings::values.use_asynchronous_shaders); | ||||
|  | @ -1343,6 +1344,7 @@ void Config::SaveRendererValues() { | |||
|                  static_cast<u32>(Settings::values.nvdec_emulation.GetDefault()), | ||||
|                  Settings::values.nvdec_emulation.UsingGlobal()); | ||||
|     WriteGlobalSetting(Settings::values.accelerate_astc); | ||||
|     WriteGlobalSetting(Settings::values.async_astc); | ||||
|     WriteGlobalSetting(Settings::values.use_vsync); | ||||
|     WriteSetting(QString::fromStdString(Settings::values.shader_backend.GetLabel()), | ||||
|                  static_cast<u32>(Settings::values.shader_backend.GetValue(global)), | ||||
|  |  | |||
|  | @ -23,11 +23,13 @@ void ConfigureGraphicsAdvanced::SetConfiguration() { | |||
|     const bool runtime_lock = !system.IsPoweredOn(); | ||||
|     ui->use_vsync->setEnabled(runtime_lock); | ||||
|     ui->renderer_force_max_clock->setEnabled(runtime_lock); | ||||
|     ui->async_astc->setEnabled(runtime_lock); | ||||
|     ui->use_asynchronous_shaders->setEnabled(runtime_lock); | ||||
|     ui->anisotropic_filtering_combobox->setEnabled(runtime_lock); | ||||
| 
 | ||||
|     ui->renderer_force_max_clock->setChecked(Settings::values.renderer_force_max_clock.GetValue()); | ||||
|     ui->use_vsync->setChecked(Settings::values.use_vsync.GetValue()); | ||||
|     ui->async_astc->setChecked(Settings::values.async_astc.GetValue()); | ||||
|     ui->use_asynchronous_shaders->setChecked(Settings::values.use_asynchronous_shaders.GetValue()); | ||||
|     ui->use_fast_gpu_time->setChecked(Settings::values.use_fast_gpu_time.GetValue()); | ||||
|     ui->use_pessimistic_flushes->setChecked(Settings::values.use_pessimistic_flushes.GetValue()); | ||||
|  | @ -58,6 +60,8 @@ void ConfigureGraphicsAdvanced::ApplyConfiguration() { | |||
|     ConfigurationShared::ApplyPerGameSetting(&Settings::values.max_anisotropy, | ||||
|                                              ui->anisotropic_filtering_combobox); | ||||
|     ConfigurationShared::ApplyPerGameSetting(&Settings::values.use_vsync, ui->use_vsync, use_vsync); | ||||
|     ConfigurationShared::ApplyPerGameSetting(&Settings::values.async_astc, ui->async_astc, | ||||
|                                              async_astc); | ||||
|     ConfigurationShared::ApplyPerGameSetting(&Settings::values.use_asynchronous_shaders, | ||||
|                                              ui->use_asynchronous_shaders, | ||||
|                                              use_asynchronous_shaders); | ||||
|  | @ -89,6 +93,7 @@ void ConfigureGraphicsAdvanced::SetupPerGameUI() { | |||
|         ui->renderer_force_max_clock->setEnabled( | ||||
|             Settings::values.renderer_force_max_clock.UsingGlobal()); | ||||
|         ui->use_vsync->setEnabled(Settings::values.use_vsync.UsingGlobal()); | ||||
|         ui->async_astc->setEnabled(Settings::values.async_astc.UsingGlobal()); | ||||
|         ui->use_asynchronous_shaders->setEnabled( | ||||
|             Settings::values.use_asynchronous_shaders.UsingGlobal()); | ||||
|         ui->use_fast_gpu_time->setEnabled(Settings::values.use_fast_gpu_time.UsingGlobal()); | ||||
|  | @ -106,6 +111,8 @@ void ConfigureGraphicsAdvanced::SetupPerGameUI() { | |||
|                                             Settings::values.renderer_force_max_clock, | ||||
|                                             renderer_force_max_clock); | ||||
|     ConfigurationShared::SetColoredTristate(ui->use_vsync, Settings::values.use_vsync, use_vsync); | ||||
|     ConfigurationShared::SetColoredTristate(ui->async_astc, Settings::values.async_astc, | ||||
|                                             async_astc); | ||||
|     ConfigurationShared::SetColoredTristate(ui->use_asynchronous_shaders, | ||||
|                                             Settings::values.use_asynchronous_shaders, | ||||
|                                             use_asynchronous_shaders); | ||||
|  |  | |||
|  | @ -38,6 +38,7 @@ private: | |||
| 
 | ||||
|     ConfigurationShared::CheckState renderer_force_max_clock; | ||||
|     ConfigurationShared::CheckState use_vsync; | ||||
|     ConfigurationShared::CheckState async_astc; | ||||
|     ConfigurationShared::CheckState use_asynchronous_shaders; | ||||
|     ConfigurationShared::CheckState use_fast_gpu_time; | ||||
|     ConfigurationShared::CheckState use_pessimistic_flushes; | ||||
|  |  | |||
|  | @ -89,6 +89,16 @@ | |||
|           </property> | ||||
|          </widget> | ||||
|         </item> | ||||
|         <item> | ||||
|          <widget class="QCheckBox" name="async_astc"> | ||||
|           <property name="toolTip"> | ||||
|            <string>Enables asynchronous ASTC texture decoding, which may reduce load time stutter. This feature is experimental.</string> | ||||
|           </property> | ||||
|           <property name="text"> | ||||
|            <string>Decode ASTC textures asynchronously (Hack)</string> | ||||
|           </property> | ||||
|          </widget> | ||||
|         </item> | ||||
|         <item> | ||||
|          <widget class="QCheckBox" name="use_asynchronous_shaders"> | ||||
|           <property name="toolTip"> | ||||
|  |  | |||
|  | @ -324,6 +324,7 @@ void Config::ReadValues() { | |||
|     ReadSetting("Renderer", Settings::values.use_asynchronous_shaders); | ||||
|     ReadSetting("Renderer", Settings::values.nvdec_emulation); | ||||
|     ReadSetting("Renderer", Settings::values.accelerate_astc); | ||||
|     ReadSetting("Renderer", Settings::values.async_astc); | ||||
|     ReadSetting("Renderer", Settings::values.use_fast_gpu_time); | ||||
|     ReadSetting("Renderer", Settings::values.use_pessimistic_flushes); | ||||
|     ReadSetting("Renderer", Settings::values.use_vulkan_driver_pipeline_cache); | ||||
|  |  | |||
|  | @ -342,6 +342,10 @@ nvdec_emulation = | |||
| # 0: Off, 1 (default): On | ||||
| accelerate_astc = | ||||
| 
 | ||||
| # Decode ASTC textures asynchronously. | ||||
| # 0 (default): Off, 1: On | ||||
| async_astc = | ||||
| 
 | ||||
| # Turns on the speed limiter, which will limit the emulation speed to the desired speed limit value | ||||
| # 0: Off, 1: On (default) | ||||
| use_speed_limit = | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 liamwhite
						liamwhite