[GPU]: Implement Fast GPU Path

This commit is contained in:
Gamer64 2025-07-24 00:40:58 +02:00 committed by crueter
parent bdfcb6c950
commit 435483f7c1
7 changed files with 55 additions and 1 deletions

View file

@ -17,6 +17,7 @@ enum class BooleanSetting(override val key: String) : AbstractBooleanSetting {
RENDERER_USE_SPEED_LIMIT("use_speed_limit"),
USE_FAST_CPU_TIME("use_fast_cpu_time"),
USE_CUSTOM_CPU_TICKS("use_custom_cpu_ticks"),
FAST_GPU_PATH("fast_gpu_path"),
SKIP_CPU_INNER_INVALIDATION("skip_cpu_inner_invalidation"),
USE_DOCKED_MODE("use_docked_mode"),
USE_AUTO_STUB("use_auto_stub"),

View file

@ -652,6 +652,13 @@ abstract class SettingsItem(
max = 65535
)
)
// Registers the Fast GPU Path toggle as a switch item backed by
// BooleanSetting.FAST_GPU_PATH, with its title and description taken from
// the fast_gpu_path / fast_gpu_path_description string resources.
put(
SwitchSetting(
BooleanSetting.FAST_GPU_PATH,
titleId = R.string.fast_gpu_path,
descriptionId = R.string.fast_gpu_path_description
)
)
put(
SwitchSetting(
BooleanSetting.SKIP_CPU_INNER_INVALIDATION,

View file

@ -464,6 +464,7 @@ class SettingsFragmentPresenter(
add(IntSetting.FAST_CPU_TIME.key)
add(BooleanSetting.USE_CUSTOM_CPU_TICKS.key)
add(IntSetting.CPU_TICKS.key)
add(BooleanSetting.FAST_GPU_PATH.key)
add(BooleanSetting.SKIP_CPU_INNER_INVALIDATION.key)
add(BooleanSetting.USE_LRU_CACHE.key)
add(BooleanSetting.CORE_SYNC_CORE_SPEED.key)

View file

@ -101,6 +101,8 @@
<string name="custom_cpu_ticks">Custom CPU Ticks</string>
<string name="custom_cpu_ticks_description">Set a custom value of CPU ticks. Higher values can increase performance, but may also cause the game to freeze. A range of 77-21000 is recommended.</string>
<string name="cpu_ticks">Ticks</string>
<string name="fast_gpu_path">Fast GPU Path</string>
<string name="fast_gpu_path_description">Bypasses all CPU-GPU synchronization and fence handling, reducing overhead and improving the performance. This may cause glitches or crashes on some games.</string>
<string name="skip_cpu_inner_invalidation">Skip CPU Inner Invalidation</string>
<string name="skip_cpu_inner_invalidation_description">Skips certain CPU-side cache invalidations during memory updates, reducing CPU usage and improving its performance. This may cause glitches or crashes on some games.</string>
<string name="fast_cpu_time">CPU Clock</string>

View file

@ -450,6 +450,13 @@ struct Values {
VramUsageMode::Aggressive,
"vram_usage_mode",
Category::RendererAdvanced};
// Boolean setting stored under the key "fast_gpu_path" in the
// RendererAdvanced category. Defaults to false, i.e. the fast path is off
// unless explicitly enabled.
// NOTE(review): the two trailing `true` arguments are positional flags whose
// meaning is defined by the SwitchableSetting constructor, which is not
// visible here — confirm what they enable before changing them.
SwitchableSetting<bool> fast_gpu_path{linkage,
false,
"fast_gpu_path",
Category::RendererAdvanced,
Specialization::Default,
true,
true};
SwitchableSetting<bool> skip_cpu_inner_invalidation{linkage,
true,
"skip_cpu_inner_invalidation",

View file

@ -40,7 +40,8 @@ struct GPU::Impl {
/// Constructs the GPU implementation state.
///
/// @param gpu_       Owning GPU front-end object; also handed to the scheduler.
/// @param system_    System instance; supplies the Host1x reference and is
///                   passed on to the GPU thread.
/// @param is_async_  Stored in `is_async` and forwarded to the GPU thread.
/// @param use_nvdec_ Stored in `use_nvdec`.
///
/// `fast_path` is read from Settings::values.fast_gpu_path exactly once here,
/// so this instance keeps the value that was configured at construction time.
explicit Impl(GPU& gpu_, Core::System& system_, bool is_async_, bool use_nvdec_)
    : gpu{gpu_}, system{system_}, host1x{system.Host1x()}, use_nvdec{use_nvdec_},
      shader_notify{std::make_unique<VideoCore::ShaderNotify>()}, is_async{is_async_},
      gpu_thread{system_, is_async_}, scheduler{std::make_unique<Control::Scheduler>(gpu)},
      fast_path{Settings::values.fast_gpu_path.GetValue()} {}
~Impl() = default;
@ -110,6 +111,11 @@ struct GPU::Impl {
/// Request a host GPU memory flush from the CPU.
template <typename Func>
[[nodiscard]] u64 RequestSyncOperation(Func&& action) {
if (fast_path) {
// Just bump the fence counter, but do NOT enqueue
return ++last_sync_fence;
}
std::unique_lock lck{sync_request_mutex};
const u64 fence = ++last_sync_fence;
sync_requests.emplace_back(action);
@ -122,12 +128,25 @@ struct GPU::Impl {
}
/// Waits until CurrentSyncRequestFence() has reached at least `fence`.
///
/// When the fast GPU path is enabled the call returns immediately and never
/// blocks; otherwise it sleeps on `sync_request_cv` while holding
/// `sync_request_mutex`.
///
/// @param fence Fence value to wait for.
void WaitForSyncOperation(const u64 fence) {
    if (!fast_path) {
        std::unique_lock guard{sync_request_mutex};
        const auto fence_reached = [this, fence] { return CurrentSyncRequestFence() >= fence; };
        sync_request_cv.wait(guard, fence_reached);
    }
}
/// Tick pending requests within the GPU.
void TickWork() {
if (fast_path) {
// Drop all pending requests in one go
sync_requests.clear();
current_sync_fence.store(last_sync_fence, std::memory_order_relaxed);
sync_request_cv.notify_all();
return;
}
std::unique_lock lck{sync_request_mutex};
while (!sync_requests.empty()) {
auto request = std::move(sync_requests.front());
@ -289,6 +308,11 @@ struct GPU::Impl {
void RequestComposite(std::vector<Tegra::FramebufferConfig>&& layers,
std::vector<Service::Nvidia::NvFence>&& fences) {
if (fast_path) {
renderer->Composite(layers);
return;
}
size_t num_fences{fences.size()};
size_t current_request_counter{};
{
@ -327,6 +351,10 @@ struct GPU::Impl {
}
std::vector<u8> GetAppletCaptureBuffer() {
if (fast_path) {
return renderer->GetAppletCaptureBuffer();
}
std::vector<u8> out;
const auto wait_fence =
@ -372,6 +400,9 @@ struct GPU::Impl {
std::unique_ptr<Core::Frontend::GraphicsContext> cpu_context;
std::unique_ptr<Tegra::Control::Scheduler> scheduler;
/// Snapshot of Settings::values.fast_gpu_path taken in the constructor.
/// When true, the guarded methods of this struct take their early-return
/// branches instead of the regular sync-request / fence handling.
const bool fast_path;
std::unordered_map<s32, std::shared_ptr<Tegra::Control::ChannelState>> channels;
Tegra::Control::ChannelState* current_channel;
s32 bound_channel{-1};

View file

@ -250,6 +250,11 @@ std::unique_ptr<TranslationMap> InitializeTranslations(QWidget* parent)
"of available video memory for performance. Has no effect on integrated graphics. "
"Aggressive mode may severely impact the performance of other applications such as "
"recording software."));
// Translated display name and descriptive text for the fast_gpu_path setting.
INSERT(Settings,
fast_gpu_path,
tr("Fast GPU Path"),
tr("Bypasses all CPU-GPU synchronization and fence handling, reducing overhead and improving "
"the performance. This may cause glitches or crashes on some games."));
INSERT(Settings,
skip_cpu_inner_invalidation,
tr("Skip CPU Inner Invalidation"),