[GPU]: Implement Fast GPU Path

2025-07-24 00:40:58 +02:00 · 2025-07-24 00:40:58 +02:00 · 435483f7c1
commit 435483f7c1
parent bdfcb6c950
7 changed files with 55 additions and 1 deletions
--- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/model/BooleanSetting.kt
+++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/model/BooleanSetting.kt
@ -17,6 +17,7 @@ enum class BooleanSetting(override val key: String) : AbstractBooleanSetting {
    RENDERER_USE_SPEED_LIMIT("use_speed_limit"),
    USE_FAST_CPU_TIME("use_fast_cpu_time"),
    USE_CUSTOM_CPU_TICKS("use_custom_cpu_ticks"),
+    FAST_GPU_PATH("fast_gpu_path"),
    SKIP_CPU_INNER_INVALIDATION("skip_cpu_inner_invalidation"),
    USE_DOCKED_MODE("use_docked_mode"),
    USE_AUTO_STUB("use_auto_stub"),
--- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/model/view/SettingsItem.kt
+++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/model/view/SettingsItem.kt
@ -652,6 +652,13 @@ abstract class SettingsItem(
                    max = 65535
                )
            )
+            put(
+                SwitchSetting(
+                    BooleanSetting.FAST_GPU_PATH,
+                    titleId = R.string.fast_gpu_path,
+                    descriptionId = R.string.fast_gpu_path_description
+                )
+            )
            put(
                SwitchSetting(
                    BooleanSetting.SKIP_CPU_INNER_INVALIDATION,
--- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/ui/SettingsFragmentPresenter.kt
+++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/ui/SettingsFragmentPresenter.kt
@ -464,6 +464,7 @@ class SettingsFragmentPresenter(
            add(IntSetting.FAST_CPU_TIME.key)
            add(BooleanSetting.USE_CUSTOM_CPU_TICKS.key)
            add(IntSetting.CPU_TICKS.key)
+            add(BooleanSetting.FAST_GPU_PATH.key)
            add(BooleanSetting.SKIP_CPU_INNER_INVALIDATION.key)
            add(BooleanSetting.USE_LRU_CACHE.key)
            add(BooleanSetting.CORE_SYNC_CORE_SPEED.key)
--- a/src/android/app/src/main/res/values/strings.xml
+++ b/src/android/app/src/main/res/values/strings.xml
@ -101,6 +101,8 @@
    <string name="custom_cpu_ticks">Custom CPU Ticks</string>
    <string name="custom_cpu_ticks_description">Set a custom value of CPU ticks. Higher values can increase performance, but may also cause the game to freeze. A range of 77–21000 is recommended.</string>
    <string name="cpu_ticks">Ticks</string>
+    <string name="fast_gpu_path">Fast GPU Path</string>
+    <string name="fast_gpu_path_description">Bypasses all CPU–GPU synchronization and fence handling, reducing overhead and improving the performance. This may cause glitches or crashes on some games.</string>
    <string name="skip_cpu_inner_invalidation">Skip CPU Inner Invalidation</string>
    <string name="skip_cpu_inner_invalidation_description">Skips certain CPU-side cache invalidations during memory updates, reducing CPU usage and improving its performance. This may cause glitches or crashes on some games.</string>
    <string name="fast_cpu_time">CPU Clock</string>
--- a/src/common/settings.h
+++ b/src/common/settings.h
@ -450,6 +450,13 @@ struct Values {
                                                           VramUsageMode::Aggressive,
                                                           "vram_usage_mode",
                                                           Category::RendererAdvanced};
+    SwitchableSetting<bool> fast_gpu_path{linkage,
+                                          false,
+                                          "fast_gpu_path",
+                                          Category::RendererAdvanced,
+                                          Specialization::Default,
+                                          true,
+                                          true};
    SwitchableSetting<bool> skip_cpu_inner_invalidation{linkage,
                                                        true,
                                                        "skip_cpu_inner_invalidation",
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@ -40,7 +40,8 @@ struct GPU::Impl {
    explicit Impl(GPU& gpu_, Core::System& system_, bool is_async_, bool use_nvdec_)
        : gpu{gpu_}, system{system_}, host1x{system.Host1x()}, use_nvdec{use_nvdec_},
          shader_notify{std::make_unique<VideoCore::ShaderNotify>()}, is_async{is_async_},
-          gpu_thread{system_, is_async_}, scheduler{std::make_unique<Control::Scheduler>(gpu)} {}
+          gpu_thread{system_, is_async_}, scheduler{std::make_unique<Control::Scheduler>(gpu)},
+          fast_path{Settings::values.fast_gpu_path.GetValue()} {}

    ~Impl() = default;

@ -110,6 +111,11 @@ struct GPU::Impl {
    /// Request a host GPU memory flush from the CPU.
    template <typename Func>
    [[nodiscard]] u64 RequestSyncOperation(Func&& action) {
+        if (fast_path) {
+            // Just bump the fence counter, but do NOT enqueue
+            return ++last_sync_fence;
+        }
+
        std::unique_lock lck{sync_request_mutex};
        const u64 fence = ++last_sync_fence;
        sync_requests.emplace_back(action);
@ -122,12 +128,25 @@ struct GPU::Impl {
    }

    void WaitForSyncOperation(const u64 fence) {
+        if (fast_path) {
+            // Never block: the fast path enqueues no sync request to wait for
+            return;
+        }
+
        std::unique_lock lck{sync_request_mutex};
        sync_request_cv.wait(lck, [this, fence] { return CurrentSyncRequestFence() >= fence; });
    }

    /// Tick pending requests within the GPU.
    void TickWork() {
+        if (fast_path) {
+            // Drop all pending requests in one go
+            sync_requests.clear();
+            current_sync_fence.store(last_sync_fence, std::memory_order_relaxed);
+            sync_request_cv.notify_all();
+            return;
+        }
+
        std::unique_lock lck{sync_request_mutex};
        while (!sync_requests.empty()) {
            auto request = std::move(sync_requests.front());
@ -289,6 +308,11 @@ struct GPU::Impl {

    void RequestComposite(std::vector<Tegra::FramebufferConfig>&& layers,
                          std::vector<Service::Nvidia::NvFence>&& fences) {
+        if (fast_path) {
+            renderer->Composite(layers);
+            return;
+        }
+
        size_t num_fences{fences.size()};
        size_t current_request_counter{};
        {
@ -327,6 +351,10 @@ struct GPU::Impl {
    }

    std::vector<u8> GetAppletCaptureBuffer() {
+        if (fast_path) {
+            return renderer->GetAppletCaptureBuffer();
+        }
+
        std::vector<u8> out;

        const auto wait_fence =
@ -372,6 +400,9 @@ struct GPU::Impl {
    std::unique_ptr<Core::Frontend::GraphicsContext> cpu_context;

    std::unique_ptr<Tegra::Control::Scheduler> scheduler;
+
+    const bool fast_path;
+
    std::unordered_map<s32, std::shared_ptr<Tegra::Control::ChannelState>> channels;
    Tegra::Control::ChannelState* current_channel;
    s32 bound_channel{-1};
--- a/src/yuzu/configuration/shared_translation.cpp
+++ b/src/yuzu/configuration/shared_translation.cpp
@ -250,6 +250,11 @@ std::unique_ptr<TranslationMap> InitializeTranslations(QWidget* parent)
              "of available video memory for performance. Has no effect on integrated graphics. "
              "Aggressive mode may severely impact the performance of other applications such as "
              "recording software."));
+    INSERT(Settings,
+           fast_gpu_path,
+           tr("Fast GPU Path"),
+           tr("Bypasses all CPU–GPU synchronization and fence handling, reducing overhead and improving "
+              "the performance. This may cause glitches or crashes on some games."));
    INSERT(Settings,
           skip_cpu_inner_invalidation,
           tr("Skip CPU Inner Invalidation"),