From 435483f7c12f4c08df0bb119ab7980587f5084bd Mon Sep 17 00:00:00 2001 From: Gamer64 <76565986+Gamer64ytb@users.noreply.github.com> Date: Thu, 24 Jul 2025 00:40:58 +0200 Subject: [PATCH 1/5] [GPU]: Implement Fast GPU Path --- .../features/settings/model/BooleanSetting.kt | 1 + .../settings/model/view/SettingsItem.kt | 7 ++++ .../settings/ui/SettingsFragmentPresenter.kt | 1 + .../app/src/main/res/values/strings.xml | 2 ++ src/common/settings.h | 7 ++++ src/video_core/gpu.cpp | 33 ++++++++++++++++++- src/yuzu/configuration/shared_translation.cpp | 5 +++ 7 files changed, 55 insertions(+), 1 deletion(-) diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/model/BooleanSetting.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/model/BooleanSetting.kt index 92a49a1de7..ec2984e434 100644 --- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/model/BooleanSetting.kt +++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/model/BooleanSetting.kt @@ -17,6 +17,7 @@ enum class BooleanSetting(override val key: String) : AbstractBooleanSetting { RENDERER_USE_SPEED_LIMIT("use_speed_limit"), USE_FAST_CPU_TIME("use_fast_cpu_time"), USE_CUSTOM_CPU_TICKS("use_custom_cpu_ticks"), + FAST_GPU_PATH("fast_gpu_path"), SKIP_CPU_INNER_INVALIDATION("skip_cpu_inner_invalidation"), USE_DOCKED_MODE("use_docked_mode"), USE_AUTO_STUB("use_auto_stub"), diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/model/view/SettingsItem.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/model/view/SettingsItem.kt index d4335ddcd8..589efd5c58 100644 --- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/model/view/SettingsItem.kt +++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/model/view/SettingsItem.kt @@ -652,6 +652,13 @@ abstract class SettingsItem( max = 65535 ) ) + put( + SwitchSetting( + BooleanSetting.FAST_GPU_PATH, + titleId = R.string.fast_gpu_path, + descriptionId = R.string.fast_gpu_path_description + ) + ) put( SwitchSetting( BooleanSetting.SKIP_CPU_INNER_INVALIDATION, diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/ui/SettingsFragmentPresenter.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/ui/SettingsFragmentPresenter.kt index 8555b334ee..adabf67744 100644 --- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/ui/SettingsFragmentPresenter.kt +++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/ui/SettingsFragmentPresenter.kt @@ -464,6 +464,7 @@ class SettingsFragmentPresenter( add(IntSetting.FAST_CPU_TIME.key) add(BooleanSetting.USE_CUSTOM_CPU_TICKS.key) add(IntSetting.CPU_TICKS.key) + add(BooleanSetting.FAST_GPU_PATH.key) add(BooleanSetting.SKIP_CPU_INNER_INVALIDATION.key) add(BooleanSetting.USE_LRU_CACHE.key) add(BooleanSetting.CORE_SYNC_CORE_SPEED.key) diff --git a/src/android/app/src/main/res/values/strings.xml b/src/android/app/src/main/res/values/strings.xml index c78487e327..a73f0a1a15 100644 --- a/src/android/app/src/main/res/values/strings.xml +++ b/src/android/app/src/main/res/values/strings.xml @@ -101,6 +101,8 @@ Custom CPU Ticks Set a custom value of CPU ticks. Higher values can increase performance, but may also cause the game to freeze. A range of 77–21000 is recommended. Ticks + Fast GPU Path + Bypasses all CPU–GPU synchronization and fence handling, reducing overhead and improving the performance. This may cause glitches or crashes on some games. Skip CPU Inner Invalidation Skips certain CPU-side cache invalidations during memory updates, reducing CPU usage and improving it\'s performance. This may cause glitches or crashes on some games. CPU Clock diff --git a/src/common/settings.h b/src/common/settings.h index e3c2bd57cc..9ac06e526e 100644 --- a/src/common/settings.h +++ b/src/common/settings.h @@ -450,6 +450,13 @@ struct Values { VramUsageMode::Aggressive, "vram_usage_mode", Category::RendererAdvanced}; + SwitchableSetting fast_gpu_path{linkage, + false, + "fast_gpu_path", + Category::RendererAdvanced, + Specialization::Default, + true, + true}; SwitchableSetting skip_cpu_inner_invalidation{linkage, true, "skip_cpu_inner_invalidation", diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp index 7c34005a12..e99ead284c 100644 --- a/src/video_core/gpu.cpp +++ b/src/video_core/gpu.cpp @@ -40,7 +40,8 @@ struct GPU::Impl { explicit Impl(GPU& gpu_, Core::System& system_, bool is_async_, bool use_nvdec_) : gpu{gpu_}, system{system_}, host1x{system.Host1x()}, use_nvdec{use_nvdec_}, shader_notify{std::make_unique()}, is_async{is_async_}, - gpu_thread{system_, is_async_}, scheduler{std::make_unique(gpu)} {} + gpu_thread{system_, is_async_}, scheduler{std::make_unique(gpu)}, + fast_path{Settings::values.fast_gpu_path.GetValue()} {} ~Impl() = default; @@ -110,6 +111,11 @@ struct GPU::Impl { /// Request a host GPU memory flush from the CPU. template [[nodiscard]] u64 RequestSyncOperation(Func&& action) { + if (fast_path) { + // Just bump the fence counter, but do NOT enqueue + return ++last_sync_fence; + } + std::unique_lock lck{sync_request_mutex}; const u64 fence = ++last_sync_fence; sync_requests.emplace_back(action); @@ -122,12 +128,25 @@ struct GPU::Impl { } void WaitForSyncOperation(const u64 fence) { + if (fast_path) { + // Never block + return; + } + std::unique_lock lck{sync_request_mutex}; sync_request_cv.wait(lck, [this, fence] { return CurrentSyncRequestFence() >= fence; }); } /// Tick pending requests within the GPU. void TickWork() { + if (fast_path) { + // Drop all pending requests in one go + sync_requests.clear(); + current_sync_fence.store(last_sync_fence, std::memory_order_relaxed); + sync_request_cv.notify_all(); + return; + } + std::unique_lock lck{sync_request_mutex}; while (!sync_requests.empty()) { auto request = std::move(sync_requests.front()); @@ -289,6 +308,11 @@ struct GPU::Impl { void RequestComposite(std::vector&& layers, std::vector&& fences) { + if (fast_path) { + renderer->Composite(layers); + return; + } + size_t num_fences{fences.size()}; size_t current_request_counter{}; { @@ -327,6 +351,10 @@ struct GPU::Impl { } std::vector GetAppletCaptureBuffer() { + if (fast_path) { + return renderer->GetAppletCaptureBuffer(); + } + std::vector out; const auto wait_fence = @@ -372,6 +400,9 @@ struct GPU::Impl { std::unique_ptr cpu_context; std::unique_ptr scheduler; + + const bool fast_path; + std::unordered_map> channels; Tegra::Control::ChannelState* current_channel; s32 bound_channel{-1}; diff --git a/src/yuzu/configuration/shared_translation.cpp b/src/yuzu/configuration/shared_translation.cpp index 770a16a481..9af0b71210 100644 --- a/src/yuzu/configuration/shared_translation.cpp +++ b/src/yuzu/configuration/shared_translation.cpp @@ -250,6 +250,11 @@ std::unique_ptr InitializeTranslations(QWidget* parent) "of available video memory for performance. Has no effect on integrated graphics. " "Aggressive mode may severely impact the performance of other applications such as " "recording software.")); + INSERT(Settings, + fast_gpu_path, + tr("Fast GPU Path"), + tr("Bypasses all CPU–GPU synchronization and fence handling, reducing overhead and improving " + "the performance. This may cause glitches or crashes on some games.")); INSERT(Settings, skip_cpu_inner_invalidation, tr("Skip CPU Inner Invalidation"), From 50f6bfb6bd9e8c9ac4aec0d0001c08da8933e5be Mon Sep 17 00:00:00 2001 From: Gamer64 <76565986+Gamer64ytb@users.noreply.github.com> Date: Thu, 24 Jul 2025 00:51:55 +0200 Subject: [PATCH 2/5] [GPU]: Add license header --- src/video_core/gpu.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp index e99ead284c..0e0126285b 100644 --- a/src/video_core/gpu.cpp +++ b/src/video_core/gpu.cpp @@ -1,3 +1,6 @@ +// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + // SPDX-FileCopyrightText: Copyright 2018 yuzu Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later From 6103d2b7f45210f43ee47c76a141974867ccdf75 Mon Sep 17 00:00:00 2001 From: Gamer64 <76565986+Gamer64ytb@users.noreply.github.com> Date: Thu, 24 Jul 2025 01:48:39 +0200 Subject: [PATCH 3/5] [GPU]: Hack rewrite Hopefully this works properly now --- src/video_core/gpu.cpp | 37 +++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp index 0e0126285b..a4635a8ad7 100644 --- a/src/video_core/gpu.cpp +++ b/src/video_core/gpu.cpp @@ -100,6 +100,7 @@ struct GPU::Impl { /// Synchronizes CPU writes with Host GPU memory. void InvalidateGPUCache() { + if (fast_path) return; std::function callback_writes( [this](PAddr address, size_t size) { rasterizer->OnCacheInvalidation(address, size); }); system.GatherGPUDirtyMemory(callback_writes); @@ -115,10 +116,10 @@ struct GPU::Impl { template [[nodiscard]] u64 RequestSyncOperation(Func&& action) { if (fast_path) { - // Just bump the fence counter, but do NOT enqueue + // Execute immediately, increment fence, skip queueing + action(); return ++last_sync_fence; } - std::unique_lock lck{sync_request_mutex}; const u64 fence = ++last_sync_fence; sync_requests.emplace_back(action); @@ -131,11 +132,6 @@ struct GPU::Impl { } void WaitForSyncOperation(const u64 fence) { - if (fast_path) { - // Never block - return; - } - std::unique_lock lck{sync_request_mutex}; sync_request_cv.wait(lck, [this, fence] { return CurrentSyncRequestFence() >= fence; }); } @@ -143,10 +139,13 @@ struct GPU::Impl { /// Tick pending requests within the GPU. void TickWork() { if (fast_path) { - // Drop all pending requests in one go - sync_requests.clear(); - current_sync_fence.store(last_sync_fence, std::memory_order_relaxed); - sync_request_cv.notify_all(); + // Drain queue without waiting on condition variables + while (!sync_requests.empty()) { + auto req = std::move(sync_requests.front()); + sync_requests.pop_front(); + req(); + current_sync_fence.fetch_add(1, std::memory_order_release); + } return; } @@ -281,6 +280,13 @@ struct GPU::Impl { } VideoCore::RasterizerDownloadArea OnCPURead(DAddr addr, u64 size) { + if (fast_path) { + // Bypass fence/tick entirely + auto raster_area = rasterizer->GetFlushArea(addr, size); + rasterizer->FlushRegion(raster_area.start_address, raster_area.end_address - raster_area.start_address); + raster_area.preemtive = true; + return raster_area; + } auto raster_area = rasterizer->GetFlushArea(addr, size); if (raster_area.preemtive) { return raster_area; @@ -311,11 +317,6 @@ struct GPU::Impl { void RequestComposite(std::vector&& layers, std::vector&& fences) { - if (fast_path) { - renderer->Composite(layers); - return; - } - size_t num_fences{fences.size()}; size_t current_request_counter{}; { @@ -354,10 +355,6 @@ struct GPU::Impl { } std::vector GetAppletCaptureBuffer() { - if (fast_path) { - return renderer->GetAppletCaptureBuffer(); - } - std::vector out; const auto wait_fence = From eed010964ec8f72951340b190de959df6913270c Mon Sep 17 00:00:00 2001 From: Gamer64 <76565986+Gamer64ytb@users.noreply.github.com> Date: Thu, 24 Jul 2025 14:46:13 +0200 Subject: [PATCH 4/5] [GPU]: Try to fix deadlock --- src/video_core/gpu.cpp | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp index a4635a8ad7..a8bb584114 100644 --- a/src/video_core/gpu.cpp +++ b/src/video_core/gpu.cpp @@ -116,9 +116,13 @@ struct GPU::Impl { template [[nodiscard]] u64 RequestSyncOperation(Func&& action) { if (fast_path) { - // Execute immediately, increment fence, skip queueing + // Execute immediately and publish the result action(); - return ++last_sync_fence; + const u64 fence = ++last_sync_fence; + // Mirror the normal path: advance current and wake any waiters + current_sync_fence.store(fence, std::memory_order_release); + sync_request_cv.notify_all(); + return fence; } std::unique_lock lck{sync_request_mutex}; const u64 fence = ++last_sync_fence; @@ -132,6 +136,10 @@ struct GPU::Impl { } void WaitForSyncOperation(const u64 fence) { + if (fast_path) { + // Don’t block when the hack is on + return; + } std::unique_lock lck{sync_request_mutex}; sync_request_cv.wait(lck, [this, fence] { return CurrentSyncRequestFence() >= fence; }); } @@ -141,9 +149,9 @@ struct GPU::Impl { if (fast_path) { // Drain queue without waiting on condition variables while (!sync_requests.empty()) { - auto req = std::move(sync_requests.front()); + auto request = std::move(sync_requests.front()); sync_requests.pop_front(); - req(); + request(); current_sync_fence.fetch_add(1, std::memory_order_release); } return; From c0a1818bbf8e4d91a2565fa988af7d657381471b Mon Sep 17 00:00:00 2001 From: Gamer64 <76565986+Gamer64ytb@users.noreply.github.com> Date: Thu, 24 Jul 2025 15:52:13 +0200 Subject: [PATCH 5/5] [GPU]: Call TickGPU This is my last try lol --- src/video_core/gpu.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp index a8bb584114..59d26dba6e 100644 --- a/src/video_core/gpu.cpp +++ b/src/video_core/gpu.cpp @@ -118,6 +118,7 @@ struct GPU::Impl { if (fast_path) { // Execute immediately and publish the result action(); + gpu_thread.TickGPU(); const u64 fence = ++last_sync_fence; // Mirror the normal path: advance current and wake any waiters current_sync_fence.store(fence, std::memory_order_release); @@ -293,6 +294,8 @@ struct GPU::Impl { auto raster_area = rasterizer->GetFlushArea(addr, size); rasterizer->FlushRegion(raster_area.start_address, raster_area.end_address - raster_area.start_address); raster_area.preemtive = true; + // Give GPU thread a chance to run that flush + gpu_thread.TickGPU(); return raster_area; } auto raster_area = rasterizer->GetFlushArea(addr, size);