From 435483f7c12f4c08df0bb119ab7980587f5084bd Mon Sep 17 00:00:00 2001
From: Gamer64 <76565986+Gamer64ytb@users.noreply.github.com>
Date: Thu, 24 Jul 2025 00:40:58 +0200
Subject: [PATCH 1/5] [GPU]: Implement Fast GPU Path

---
 .../features/settings/model/BooleanSetting.kt |  1 +
 .../settings/model/view/SettingsItem.kt       |  7 ++++
 .../settings/ui/SettingsFragmentPresenter.kt  |  1 +
 .../app/src/main/res/values/strings.xml       |  2 ++
 src/common/settings.h                         |  7 ++++
 src/video_core/gpu.cpp                        | 33 ++++++++++++++++++-
 src/yuzu/configuration/shared_translation.cpp |  5 +++
 7 files changed, 55 insertions(+), 1 deletion(-)
diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/model/BooleanSetting.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/model/BooleanSetting.kt
index 92a49a1de7..ec2984e434 100644
--- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/model/BooleanSetting.kt
+++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/model/BooleanSetting.kt
@@ -17,6 +17,7 @@ enum class BooleanSetting(override val key: String) : AbstractBooleanSetting {
     RENDERER_USE_SPEED_LIMIT("use_speed_limit"),
     USE_FAST_CPU_TIME("use_fast_cpu_time"),
     USE_CUSTOM_CPU_TICKS("use_custom_cpu_ticks"),
+    FAST_GPU_PATH("fast_gpu_path"),
     SKIP_CPU_INNER_INVALIDATION("skip_cpu_inner_invalidation"),
     USE_DOCKED_MODE("use_docked_mode"),
     USE_AUTO_STUB("use_auto_stub"),
diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/model/view/SettingsItem.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/model/view/SettingsItem.kt
index d4335ddcd8..589efd5c58 100644
--- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/model/view/SettingsItem.kt
+++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/model/view/SettingsItem.kt
@@ -652,6 +652,13 @@ abstract class SettingsItem(
                     max = 65535
                 )
             )
+            put(
+                SwitchSetting(
+                    BooleanSetting.FAST_GPU_PATH,
+                    titleId = R.string.fast_gpu_path,
+                    descriptionId = R.string.fast_gpu_path_description
+                )
+            )
             put(
                 SwitchSetting(
                     BooleanSetting.SKIP_CPU_INNER_INVALIDATION,
diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/ui/SettingsFragmentPresenter.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/ui/SettingsFragmentPresenter.kt
index 8555b334ee..adabf67744 100644
--- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/ui/SettingsFragmentPresenter.kt
+++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/ui/SettingsFragmentPresenter.kt
@@ -464,6 +464,7 @@ class SettingsFragmentPresenter(
             add(IntSetting.FAST_CPU_TIME.key)
             add(BooleanSetting.USE_CUSTOM_CPU_TICKS.key)
             add(IntSetting.CPU_TICKS.key)
+            add(BooleanSetting.FAST_GPU_PATH.key)
             add(BooleanSetting.SKIP_CPU_INNER_INVALIDATION.key)
             add(BooleanSetting.USE_LRU_CACHE.key)
             add(BooleanSetting.CORE_SYNC_CORE_SPEED.key)
diff --git a/src/android/app/src/main/res/values/strings.xml b/src/android/app/src/main/res/values/strings.xml
index c78487e327..a73f0a1a15 100644
--- a/src/android/app/src/main/res/values/strings.xml
+++ b/src/android/app/src/main/res/values/strings.xml
@@ -101,6 +101,8 @@
     <string name="custom_cpu_ticks">Custom CPU Ticks</string>
     <string name="custom_cpu_ticks_description">Set a custom value of CPU ticks. Higher values can increase performance, but may also cause the game to freeze. A range of 77–21000 is recommended.</string>
     <string name="cpu_ticks">Ticks</string>
+    <string name="fast_gpu_path">Fast GPU Path</string>
+    <string name="fast_gpu_path_description">Bypasses all CPU–GPU synchronization and fence handling, reducing overhead and improving the performance. This may cause glitches or crashes on some games.</string>
     <string name="skip_cpu_inner_invalidation">Skip CPU Inner Invalidation</string>
     <string name="skip_cpu_inner_invalidation_description">Skips certain CPU-side cache invalidations during memory updates, reducing CPU usage and improving it\'s performance. This may cause glitches or crashes on some games.</string>
     <string name="fast_cpu_time">CPU Clock</string>
diff --git a/src/common/settings.h b/src/common/settings.h
index e3c2bd57cc..9ac06e526e 100644
--- a/src/common/settings.h
+++ b/src/common/settings.h
@@ -450,6 +450,13 @@ struct Values {
                                                            VramUsageMode::Aggressive,
                                                            "vram_usage_mode",
                                                            Category::RendererAdvanced};
+    SwitchableSetting<bool> fast_gpu_path{linkage,
+                                          false,
+                                          "fast_gpu_path",
+                                          Category::RendererAdvanced,
+                                          Specialization::Default,
+                                          true,
+                                          true};
     SwitchableSetting<bool> skip_cpu_inner_invalidation{linkage,
                                                         true,
                                                         "skip_cpu_inner_invalidation",
diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp
index 7c34005a12..e99ead284c 100644
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -40,7 +40,8 @@ struct GPU::Impl {
     explicit Impl(GPU& gpu_, Core::System& system_, bool is_async_, bool use_nvdec_)
         : gpu{gpu_}, system{system_}, host1x{system.Host1x()}, use_nvdec{use_nvdec_},
           shader_notify{std::make_unique<VideoCore::ShaderNotify>()}, is_async{is_async_},
-          gpu_thread{system_, is_async_}, scheduler{std::make_unique<Control::Scheduler>(gpu)} {}
+          gpu_thread{system_, is_async_}, scheduler{std::make_unique<Control::Scheduler>(gpu)},
+          fast_path{Settings::values.fast_gpu_path.GetValue()} {}
 
     ~Impl() = default;
 
@@ -110,6 +111,11 @@ struct GPU::Impl {
     /// Request a host GPU memory flush from the CPU.
     template <typename Func>
     [[nodiscard]] u64 RequestSyncOperation(Func&& action) {
+        if (fast_path) {
+            // Just bump the fence counter, but do NOT enqueue
+            return ++last_sync_fence;
+        }
+
         std::unique_lock lck{sync_request_mutex};
         const u64 fence = ++last_sync_fence;
         sync_requests.emplace_back(action);
@@ -122,12 +128,25 @@ struct GPU::Impl {
     }
 
     void WaitForSyncOperation(const u64 fence) {
+        if (fast_path) {
+            // Never block
+            return;
+        }
+
         std::unique_lock lck{sync_request_mutex};
         sync_request_cv.wait(lck, [this, fence] { return CurrentSyncRequestFence() >= fence; });
     }
 
     /// Tick pending requests within the GPU.
     void TickWork() {
+        if (fast_path) {
+            // Drop all pending requests in one go
+            sync_requests.clear();
+            current_sync_fence.store(last_sync_fence, std::memory_order_relaxed);
+            sync_request_cv.notify_all();
+            return;
+        }
+
         std::unique_lock lck{sync_request_mutex};
         while (!sync_requests.empty()) {
             auto request = std::move(sync_requests.front());
@@ -289,6 +308,11 @@ struct GPU::Impl {
 
     void RequestComposite(std::vector<Tegra::FramebufferConfig>&& layers,
                           std::vector<Service::Nvidia::NvFence>&& fences) {
+        if (fast_path) {
+            renderer->Composite(layers);
+            return;
+        }
+
         size_t num_fences{fences.size()};
         size_t current_request_counter{};
         {
@@ -327,6 +351,10 @@ struct GPU::Impl {
     }
 
     std::vector<u8> GetAppletCaptureBuffer() {
+        if (fast_path) {
+            return renderer->GetAppletCaptureBuffer();
+        }
+
         std::vector<u8> out;
 
         const auto wait_fence =
@@ -372,6 +400,9 @@ struct GPU::Impl {
     std::unique_ptr<Core::Frontend::GraphicsContext> cpu_context;
 
     std::unique_ptr<Tegra::Control::Scheduler> scheduler;
+
+    const bool fast_path;
+
     std::unordered_map<s32, std::shared_ptr<Tegra::Control::ChannelState>> channels;
     Tegra::Control::ChannelState* current_channel;
     s32 bound_channel{-1};
diff --git a/src/yuzu/configuration/shared_translation.cpp b/src/yuzu/configuration/shared_translation.cpp
index 770a16a481..9af0b71210 100644
--- a/src/yuzu/configuration/shared_translation.cpp
+++ b/src/yuzu/configuration/shared_translation.cpp
@@ -250,6 +250,11 @@ std::unique_ptr<TranslationMap> InitializeTranslations(QWidget* parent)
               "of available video memory for performance. Has no effect on integrated graphics. "
               "Aggressive mode may severely impact the performance of other applications such as "
               "recording software."));
+    INSERT(Settings,
+           fast_gpu_path,
+           tr("Fast GPU Path"),
+           tr("Bypasses all CPU–GPU synchronization and fence handling, reducing overhead and improving "
+              "the performance. This may cause glitches or crashes on some games."));
     INSERT(Settings,
            skip_cpu_inner_invalidation,
            tr("Skip CPU Inner Invalidation"),

From 50f6bfb6bd9e8c9ac4aec0d0001c08da8933e5be Mon Sep 17 00:00:00 2001
From: Gamer64 <76565986+Gamer64ytb@users.noreply.github.com>
Date: Thu, 24 Jul 2025 00:51:55 +0200
Subject: [PATCH 2/5] [GPU]: Add license header

---
 src/video_core/gpu.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp
index e99ead284c..0e0126285b 100644
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -1,3 +1,6 @@
+// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
+
 // SPDX-FileCopyrightText: Copyright 2018 yuzu Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later
 

From 6103d2b7f45210f43ee47c76a141974867ccdf75 Mon Sep 17 00:00:00 2001
From: Gamer64 <76565986+Gamer64ytb@users.noreply.github.com>
Date: Thu, 24 Jul 2025 01:48:39 +0200
Subject: [PATCH 3/5] [GPU]: Hack rewrite

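Rework the fast GPU path so that sync operations are no longer dropped:
RequestSyncOperation now runs the action immediately, TickWork drains any
queued requests, and OnCPURead flushes the requested region directly.
InvalidateGPUCache returns early when the fast path is enabled, and the
fast-path shortcuts in WaitForSyncOperation, RequestComposite and
GetAppletCaptureBuffer are removed.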
---
 src/video_core/gpu.cpp | 37 +++++++++++++++++--------------------
 1 file changed, 17 insertions(+), 20 deletions(-)

diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp
index 0e0126285b..a4635a8ad7 100644
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -100,6 +100,7 @@ struct GPU::Impl {
 
     /// Synchronizes CPU writes with Host GPU memory.
     void InvalidateGPUCache() {
+        if (fast_path) return;
         std::function<void(PAddr, size_t)> callback_writes(
             [this](PAddr address, size_t size) { rasterizer->OnCacheInvalidation(address, size); });
         system.GatherGPUDirtyMemory(callback_writes);
@@ -115,10 +116,10 @@ struct GPU::Impl {
     template <typename Func>
     [[nodiscard]] u64 RequestSyncOperation(Func&& action) {
         if (fast_path) {
-            // Just bump the fence counter, but do NOT enqueue
+            // Execute immediately, increment fence, skip queueing
+            action();
             return ++last_sync_fence;
         }
-
         std::unique_lock lck{sync_request_mutex};
         const u64 fence = ++last_sync_fence;
         sync_requests.emplace_back(action);
@@ -131,11 +132,6 @@ struct GPU::Impl {
     }
 
     void WaitForSyncOperation(const u64 fence) {
-        if (fast_path) {
-            // Never block
-            return;
-        }
-
         std::unique_lock lck{sync_request_mutex};
         sync_request_cv.wait(lck, [this, fence] { return CurrentSyncRequestFence() >= fence; });
     }
@@ -143,10 +139,13 @@ struct GPU::Impl {
     /// Tick pending requests within the GPU.
     void TickWork() {
         if (fast_path) {
-            // Drop all pending requests in one go
-            sync_requests.clear();
-            current_sync_fence.store(last_sync_fence, std::memory_order_relaxed);
-            sync_request_cv.notify_all();
+            // Drain queue without waiting on condition variables
+            while (!sync_requests.empty()) {
+                auto req = std::move(sync_requests.front());
+                sync_requests.pop_front();
+                req();
+                current_sync_fence.fetch_add(1, std::memory_order_release);
+            }
             return;
         }
 
@@ -281,6 +280,13 @@ struct GPU::Impl {
     }
 
     VideoCore::RasterizerDownloadArea OnCPURead(DAddr addr, u64 size) {
+        if (fast_path) {
+            // Bypass fence/tick entirely
+            auto raster_area = rasterizer->GetFlushArea(addr, size);
+            rasterizer->FlushRegion(raster_area.start_address, raster_area.end_address - raster_area.start_address);
+            raster_area.preemtive = true;
+            return raster_area;
+        }
         auto raster_area = rasterizer->GetFlushArea(addr, size);
         if (raster_area.preemtive) {
             return raster_area;
@@ -311,11 +317,6 @@ struct GPU::Impl {
 
     void RequestComposite(std::vector<Tegra::FramebufferConfig>&& layers,
                           std::vector<Service::Nvidia::NvFence>&& fences) {
-        if (fast_path) {
-            renderer->Composite(layers);
-            return;
-        }
-
         size_t num_fences{fences.size()};
         size_t current_request_counter{};
         {
@@ -354,10 +355,6 @@ struct GPU::Impl {
     }
 
     std::vector<u8> GetAppletCaptureBuffer() {
-        if (fast_path) {
-            return renderer->GetAppletCaptureBuffer();
-        }
-
         std::vector<u8> out;
 
         const auto wait_fence =

From eed010964ec8f72951340b190de959df6913270c Mon Sep 17 00:00:00 2001
From: Gamer64 <76565986+Gamer64ytb@users.noreply.github.com>
Date: Thu, 24 Jul 2025 14:46:13 +0200
Subject: [PATCH 4/5] [GPU]: Try to fix deadlock

---
 src/video_core/gpu.cpp | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp
index a4635a8ad7..a8bb584114 100644
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -116,9 +116,13 @@ struct GPU::Impl {
     template <typename Func>
     [[nodiscard]] u64 RequestSyncOperation(Func&& action) {
         if (fast_path) {
-            // Execute immediately, increment fence, skip queueing
+            // Execute immediately and publish the result
             action();
-            return ++last_sync_fence;
+            const u64 fence = ++last_sync_fence;
+            // Mirror the normal path: advance current and wake any waiters
+            current_sync_fence.store(fence, std::memory_order_release);
+            sync_request_cv.notify_all();
+            return fence;
         }
         std::unique_lock lck{sync_request_mutex};
         const u64 fence = ++last_sync_fence;
@@ -132,6 +136,10 @@ struct GPU::Impl {
     }
 
     void WaitForSyncOperation(const u64 fence) {
+        if (fast_path) {
+            // Don't block when the hack is on
+            return;
+        }
         std::unique_lock lck{sync_request_mutex};
         sync_request_cv.wait(lck, [this, fence] { return CurrentSyncRequestFence() >= fence; });
     }
@@ -141,9 +149,9 @@ struct GPU::Impl {
         if (fast_path) {
             // Drain queue without waiting on condition variables
             while (!sync_requests.empty()) {
-                auto req = std::move(sync_requests.front());
+                auto request = std::move(sync_requests.front());
                 sync_requests.pop_front();
-                req();
+                request();
                 current_sync_fence.fetch_add(1, std::memory_order_release);
             }
             return;

From c0a1818bbf8e4d91a2565fa988af7d657381471b Mon Sep 17 00:00:00 2001
From: Gamer64 <76565986+Gamer64ytb@users.noreply.github.com>
Date: Thu, 24 Jul 2025 15:52:13 +0200
Subject: [PATCH 5/5] [GPU]: Call TickGPU

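When the fast path is enabled, call gpu_thread.TickGPU() after running a
sync action inline in RequestSyncOperation and after flushing the region
in OnCPURead, to give the GPU thread a chance to run that work.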
---
 src/video_core/gpu.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp
index a8bb584114..59d26dba6e 100644
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -118,6 +118,7 @@ struct GPU::Impl {
         if (fast_path) {
             // Execute immediately and publish the result
             action();
+            gpu_thread.TickGPU();
             const u64 fence = ++last_sync_fence;
             // Mirror the normal path: advance current and wake any waiters
             current_sync_fence.store(fence, std::memory_order_release);
@@ -293,6 +294,8 @@ struct GPU::Impl {
             auto raster_area = rasterizer->GetFlushArea(addr, size);
             rasterizer->FlushRegion(raster_area.start_address, raster_area.end_address - raster_area.start_address);
             raster_area.preemtive = true;
+            // Give GPU thread a chance to run that flush
+            gpu_thread.TickGPU();
             return raster_area;
         }
         auto raster_area = rasterizer->GetFlushArea(addr, size);