From 00de315a8c417c137b5c668cae3ace5cf8355941 Mon Sep 17 00:00:00 2001 From: lizzie Date: Sat, 20 Sep 2025 03:17:42 +0000 Subject: [PATCH 01/17] [vk, opengl] add lanczo filtering Signed-off-by: lizzie --- src/common/settings_enums.h | 2 +- src/qt_common/shared_translation.cpp | 1 + src/qt_common/shared_translation.h | 2 + src/video_core/host_shaders/CMakeLists.txt | 5 ++- .../host_shaders/present_lanczo.frag | 43 +++++++++++++++++++ .../renderer_opengl/gl_blit_screen.cpp | 3 ++ .../renderer_opengl/present/filters.cpp | 5 +++ .../renderer_opengl/present/filters.h | 1 + .../renderer_vulkan/present/filters.cpp | 6 +++ .../renderer_vulkan/present/filters.h | 1 + .../renderer_vulkan/vk_blit_screen.cpp | 3 ++ 11 files changed, 69 insertions(+), 3 deletions(-) create mode 100644 src/video_core/host_shaders/present_lanczo.frag diff --git a/src/common/settings_enums.h b/src/common/settings_enums.h index 41133a7819..8d93c61ec1 100644 --- a/src/common/settings_enums.h +++ b/src/common/settings_enums.h @@ -166,7 +166,7 @@ ENUM(ResolutionSetup, Res7X, Res8X); -ENUM(ScalingFilter, NearestNeighbor, Bilinear, Bicubic, Gaussian, ScaleForce, Fsr, Area, MaxEnum); +ENUM(ScalingFilter, NearestNeighbor, Bilinear, Bicubic, Gaussian, Lanczo, ScaleForce, Fsr, Area, MaxEnum); ENUM(AntiAliasing, None, Fxaa, Smaa, MaxEnum); diff --git a/src/qt_common/shared_translation.cpp b/src/qt_common/shared_translation.cpp index cdc05e60e0..74d4dafc59 100644 --- a/src/qt_common/shared_translation.cpp +++ b/src/qt_common/shared_translation.cpp @@ -576,6 +576,7 @@ std::unique_ptr ComboboxEnumeration(QObject* parent) PAIR(ScalingFilter, Bilinear, tr("Bilinear")), PAIR(ScalingFilter, Bicubic, tr("Bicubic")), PAIR(ScalingFilter, Gaussian, tr("Gaussian")), + PAIR(ScalingFilter, Lanczo, tr("Lanczo")), PAIR(ScalingFilter, ScaleForce, tr("ScaleForce")), PAIR(ScalingFilter, Fsr, tr("AMD FidelityFX™️ Super Resolution")), PAIR(ScalingFilter, Area, tr("Area")), diff --git a/src/qt_common/shared_translation.h 
b/src/qt_common/shared_translation.h index 48a2cb5205..a894da290a 100644 --- a/src/qt_common/shared_translation.h +++ b/src/qt_common/shared_translation.h @@ -40,6 +40,8 @@ static const std::map scaling_filter_texts_map {Settings::ScalingFilter::Bicubic, QStringLiteral(QT_TRANSLATE_NOOP("GMainWindow", "Bicubic"))}, {Settings::ScalingFilter::Gaussian, QStringLiteral(QT_TRANSLATE_NOOP("GMainWindow", "Gaussian"))}, + {Settings::ScalingFilter::Lanczo, + QStringLiteral(QT_TRANSLATE_NOOP("GMainWindow", "Lanczo"))}, {Settings::ScalingFilter::ScaleForce, QStringLiteral(QT_TRANSLATE_NOOP("GMainWindow", "ScaleForce"))}, {Settings::ScalingFilter::Fsr, QStringLiteral(QT_TRANSLATE_NOOP("GMainWindow", "FSR"))}, diff --git a/src/video_core/host_shaders/CMakeLists.txt b/src/video_core/host_shaders/CMakeLists.txt index 688e10d2e4..e7dac21f98 100644 --- a/src/video_core/host_shaders/CMakeLists.txt +++ b/src/video_core/host_shaders/CMakeLists.txt @@ -1,5 +1,5 @@ -# SPDX-FileCopyrightText: 2018 yuzu Emulator Project -# SPDX-License-Identifier: GPL-2.0-or-later +# SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project +# SPDX-License-Identifier: GPL-3.0-or-later set(FIDELITYFX_INCLUDE_DIR ${CMAKE_SOURCE_DIR}/externals/FidelityFX-FSR/ffx-fsr) @@ -45,6 +45,7 @@ set(SHADER_FILES present_area.frag present_bicubic.frag present_gaussian.frag + present_lanczo.frag queries_prefix_scan_sum.comp queries_prefix_scan_sum_nosubgroups.comp resolve_conditional_render.comp diff --git a/src/video_core/host_shaders/present_lanczo.frag b/src/video_core/host_shaders/present_lanczo.frag new file mode 100644 index 0000000000..5afc985bc3 --- /dev/null +++ b/src/video_core/host_shaders/present_lanczo.frag @@ -0,0 +1,43 @@ +// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + +// https://en.wikipedia.org/wiki/Lanczos_resampling + +#version 460 core + +layout (location = 0) in vec2 frag_tex_coord; +layout (location = 0) out vec4 color; +layout 
(binding = 0) uniform sampler2D color_texture; + +#define PI 3.1415926535897932384626433 + +float sinc(float x) { + return x == 0.0f ? 1.0f : sin(PI * x) / (PI * x); +} + +float lanczos(vec2 v, float a) { + float d = sqrt(v.x * v.x + v.y * v.y); + return sinc(d) / sinc(d / a); +} + +vec4 textureLanczos(sampler2D textureSampler, vec2 p) { + const int r = 1; //radius (1 = 3 steps) + vec3 c_sum = vec3(0.0f); + float w_sum = 0.0f; + vec2 res = vec2(textureSize(textureSampler, 0)); + vec2 cc = floor(p * res) / res; + // kernel size = (r * 2 + 1) * (r * 2 + 1) + for (int x = -r; x <= r; x++) + for (int y = -r; y <= r; y++) { + vec2 kp = 0.5f * (vec2(x, y) / res); // 0.5 = half-pixel level resampling + vec2 uv = cc + kp; + float w = lanczos(kp, float(r)); + c_sum += w * texture(textureSampler, p + kp).rgb; + w_sum += w; + } + return vec4(c_sum / w_sum, 1.0f); +} + +void main() { + color = textureLanczos(color_texture, frag_tex_coord); +} diff --git a/src/video_core/renderer_opengl/gl_blit_screen.cpp b/src/video_core/renderer_opengl/gl_blit_screen.cpp index 2071fe8d15..9fff39143e 100644 --- a/src/video_core/renderer_opengl/gl_blit_screen.cpp +++ b/src/video_core/renderer_opengl/gl_blit_screen.cpp @@ -89,6 +89,9 @@ void BlitScreen::CreateWindowAdapt() { case Settings::ScalingFilter::Gaussian: window_adapt = MakeGaussian(device); break; + case Settings::ScalingFilter::Lanczo: + window_adapt = MakeLanczo(device); + break; case Settings::ScalingFilter::ScaleForce: window_adapt = MakeScaleForce(device); break; diff --git a/src/video_core/renderer_opengl/present/filters.cpp b/src/video_core/renderer_opengl/present/filters.cpp index c5ac8e7823..a9b3cdd0d9 100644 --- a/src/video_core/renderer_opengl/present/filters.cpp +++ b/src/video_core/renderer_opengl/present/filters.cpp @@ -37,6 +37,11 @@ std::unique_ptr MakeGaussian(const Device& device) { HostShaders::PRESENT_GAUSSIAN_FRAG); } +std::unique_ptr MakeLanczo(const Device& device) { + return std::make_unique(device, 
CreateBilinearSampler(), + HostShaders::PRESENT_LANCZO_FRAG); +} + std::unique_ptr MakeScaleForce(const Device& device) { return std::make_unique( device, CreateBilinearSampler(), diff --git a/src/video_core/renderer_opengl/present/filters.h b/src/video_core/renderer_opengl/present/filters.h index be2ce24842..c098d0da2e 100644 --- a/src/video_core/renderer_opengl/present/filters.h +++ b/src/video_core/renderer_opengl/present/filters.h @@ -18,6 +18,7 @@ std::unique_ptr MakeNearestNeighbor(const Device& device); std::unique_ptr MakeBilinear(const Device& device); std::unique_ptr MakeBicubic(const Device& device); std::unique_ptr MakeGaussian(const Device& device); +std::unique_ptr MakeLanczo(const Device& device); std::unique_ptr MakeScaleForce(const Device& device); std::unique_ptr MakeArea(const Device& device); diff --git a/src/video_core/renderer_vulkan/present/filters.cpp b/src/video_core/renderer_vulkan/present/filters.cpp index 7843f38d2c..a3a6bfc2f6 100644 --- a/src/video_core/renderer_vulkan/present/filters.cpp +++ b/src/video_core/renderer_vulkan/present/filters.cpp @@ -12,6 +12,7 @@ #include "video_core/host_shaders/present_area_frag_spv.h" #include "video_core/host_shaders/present_bicubic_frag_spv.h" #include "video_core/host_shaders/present_gaussian_frag_spv.h" +#include "video_core/host_shaders/present_lanczso_frag_spv.h" #include "video_core/host_shaders/vulkan_present_frag_spv.h" #include "video_core/host_shaders/vulkan_present_scaleforce_fp16_frag_spv.h" #include "video_core/host_shaders/vulkan_present_scaleforce_fp32_frag_spv.h" @@ -59,6 +60,11 @@ std::unique_ptr MakeGaussian(const Device& device, VkFormat fra BuildShader(device, PRESENT_GAUSSIAN_FRAG_SPV)); } +std::unique_ptr MakeLanczo(const Device& device, VkFormat frame_format) { + return std::make_unique(device, frame_format, CreateBilinearSampler(device), + BuildShader(device, PRESENT_LANCZO_FRAG_SPV)); +} + std::unique_ptr MakeScaleForce(const Device& device, VkFormat frame_format) { return 
std::make_unique(device, frame_format, CreateBilinearSampler(device), SelectScaleForceShader(device)); diff --git a/src/video_core/renderer_vulkan/present/filters.h b/src/video_core/renderer_vulkan/present/filters.h index c8259487f8..c51938db24 100644 --- a/src/video_core/renderer_vulkan/present/filters.h +++ b/src/video_core/renderer_vulkan/present/filters.h @@ -19,6 +19,7 @@ std::unique_ptr MakeNearestNeighbor(const Device& device, VkFor std::unique_ptr MakeBilinear(const Device& device, VkFormat frame_format); std::unique_ptr MakeBicubic(const Device& device, VkFormat frame_format); std::unique_ptr MakeGaussian(const Device& device, VkFormat frame_format); +std::unique_ptr MakeLanczo(const Device& device, VkFormat frame_format); std::unique_ptr MakeScaleForce(const Device& device, VkFormat frame_format); std::unique_ptr MakeArea(const Device& device, VkFormat frame_format); diff --git a/src/video_core/renderer_vulkan/vk_blit_screen.cpp b/src/video_core/renderer_vulkan/vk_blit_screen.cpp index 39f07b966d..b398062dae 100644 --- a/src/video_core/renderer_vulkan/vk_blit_screen.cpp +++ b/src/video_core/renderer_vulkan/vk_blit_screen.cpp @@ -46,6 +46,9 @@ void BlitScreen::SetWindowAdaptPass() { case Settings::ScalingFilter::Gaussian: window_adapt = MakeGaussian(device, swapchain_view_format); break; + case Settings::ScalingFilter::Lanczo: + window_adapt = MakeLanczo(device, swapchain_view_format); + break; case Settings::ScalingFilter::ScaleForce: window_adapt = MakeScaleForce(device, swapchain_view_format); break; From 8abe4b269705751df46ff8876752830df24fa58b Mon Sep 17 00:00:00 2001 From: lizzie Date: Sat, 20 Sep 2025 03:26:07 +0000 Subject: [PATCH 02/17] fix mispell Signed-off-by: lizzie --- src/common/settings_enums.h | 2 +- src/qt_common/shared_translation.cpp | 2 +- src/qt_common/shared_translation.h | 4 ++-- src/video_core/host_shaders/CMakeLists.txt | 2 +- .../{present_lanczo.frag => present_lanczos.frag} | 0 src/video_core/renderer_opengl/gl_blit_screen.cpp | 
4 ++-- src/video_core/renderer_opengl/present/filters.cpp | 3 ++- src/video_core/renderer_opengl/present/filters.h | 2 +- src/video_core/renderer_vulkan/present/filters.cpp | 4 ++-- src/video_core/renderer_vulkan/present/filters.h | 2 +- src/video_core/renderer_vulkan/vk_blit_screen.cpp | 4 ++-- 11 files changed, 15 insertions(+), 14 deletions(-) rename src/video_core/host_shaders/{present_lanczo.frag => present_lanczos.frag} (100%) diff --git a/src/common/settings_enums.h b/src/common/settings_enums.h index 8d93c61ec1..c768c23cda 100644 --- a/src/common/settings_enums.h +++ b/src/common/settings_enums.h @@ -166,7 +166,7 @@ ENUM(ResolutionSetup, Res7X, Res8X); -ENUM(ScalingFilter, NearestNeighbor, Bilinear, Bicubic, Gaussian, Lanczo, ScaleForce, Fsr, Area, MaxEnum); +ENUM(ScalingFilter, NearestNeighbor, Bilinear, Bicubic, Gaussian, Lanczos, ScaleForce, Fsr, Area, MaxEnum); ENUM(AntiAliasing, None, Fxaa, Smaa, MaxEnum); diff --git a/src/qt_common/shared_translation.cpp b/src/qt_common/shared_translation.cpp index 74d4dafc59..dfda88ba74 100644 --- a/src/qt_common/shared_translation.cpp +++ b/src/qt_common/shared_translation.cpp @@ -576,7 +576,7 @@ std::unique_ptr ComboboxEnumeration(QObject* parent) PAIR(ScalingFilter, Bilinear, tr("Bilinear")), PAIR(ScalingFilter, Bicubic, tr("Bicubic")), PAIR(ScalingFilter, Gaussian, tr("Gaussian")), - PAIR(ScalingFilter, Lanczo, tr("Lanczo")), + PAIR(ScalingFilter, Lanczos, tr("Lanczos")), PAIR(ScalingFilter, ScaleForce, tr("ScaleForce")), PAIR(ScalingFilter, Fsr, tr("AMD FidelityFX™️ Super Resolution")), PAIR(ScalingFilter, Area, tr("Area")), diff --git a/src/qt_common/shared_translation.h b/src/qt_common/shared_translation.h index a894da290a..ea8e7fe1bd 100644 --- a/src/qt_common/shared_translation.h +++ b/src/qt_common/shared_translation.h @@ -40,8 +40,8 @@ static const std::map scaling_filter_texts_map {Settings::ScalingFilter::Bicubic, QStringLiteral(QT_TRANSLATE_NOOP("GMainWindow", "Bicubic"))}, 
{Settings::ScalingFilter::Gaussian, QStringLiteral(QT_TRANSLATE_NOOP("GMainWindow", "Gaussian"))}, - {Settings::ScalingFilter::Lanczo, - QStringLiteral(QT_TRANSLATE_NOOP("GMainWindow", "Lanczo"))}, + {Settings::ScalingFilter::Lanczos, + QStringLiteral(QT_TRANSLATE_NOOP("GMainWindow", "Lanczos"))}, {Settings::ScalingFilter::ScaleForce, QStringLiteral(QT_TRANSLATE_NOOP("GMainWindow", "ScaleForce"))}, {Settings::ScalingFilter::Fsr, QStringLiteral(QT_TRANSLATE_NOOP("GMainWindow", "FSR"))}, diff --git a/src/video_core/host_shaders/CMakeLists.txt b/src/video_core/host_shaders/CMakeLists.txt index e7dac21f98..d8ea826498 100644 --- a/src/video_core/host_shaders/CMakeLists.txt +++ b/src/video_core/host_shaders/CMakeLists.txt @@ -45,7 +45,7 @@ set(SHADER_FILES present_area.frag present_bicubic.frag present_gaussian.frag - present_lanczo.frag + present_lanczos.frag queries_prefix_scan_sum.comp queries_prefix_scan_sum_nosubgroups.comp resolve_conditional_render.comp diff --git a/src/video_core/host_shaders/present_lanczo.frag b/src/video_core/host_shaders/present_lanczos.frag similarity index 100% rename from src/video_core/host_shaders/present_lanczo.frag rename to src/video_core/host_shaders/present_lanczos.frag diff --git a/src/video_core/renderer_opengl/gl_blit_screen.cpp b/src/video_core/renderer_opengl/gl_blit_screen.cpp index 9fff39143e..5d2246ada1 100644 --- a/src/video_core/renderer_opengl/gl_blit_screen.cpp +++ b/src/video_core/renderer_opengl/gl_blit_screen.cpp @@ -89,8 +89,8 @@ void BlitScreen::CreateWindowAdapt() { case Settings::ScalingFilter::Gaussian: window_adapt = MakeGaussian(device); break; - case Settings::ScalingFilter::Lanczo: - window_adapt = MakeLanczo(device); + case Settings::ScalingFilter::Lanczos: + window_adapt = MakeLanczos(device); break; case Settings::ScalingFilter::ScaleForce: window_adapt = MakeScaleForce(device); diff --git a/src/video_core/renderer_opengl/present/filters.cpp b/src/video_core/renderer_opengl/present/filters.cpp index 
a9b3cdd0d9..8464123be0 100644 --- a/src/video_core/renderer_opengl/present/filters.cpp +++ b/src/video_core/renderer_opengl/present/filters.cpp @@ -12,6 +12,7 @@ #include "video_core/host_shaders/present_area_frag.h" #include "video_core/host_shaders/present_bicubic_frag.h" #include "video_core/host_shaders/present_gaussian_frag.h" +#include "video_core/host_shaders/present_lanczos_frag.h" #include "video_core/renderer_opengl/present/filters.h" #include "video_core/renderer_opengl/present/util.h" @@ -37,7 +38,7 @@ std::unique_ptr MakeGaussian(const Device& device) { HostShaders::PRESENT_GAUSSIAN_FRAG); } -std::unique_ptr MakeLanczo(const Device& device) { +std::unique_ptr MakeLanczos(const Device& device) { return std::make_unique(device, CreateBilinearSampler(), HostShaders::PRESENT_LANCZO_FRAG); } diff --git a/src/video_core/renderer_opengl/present/filters.h b/src/video_core/renderer_opengl/present/filters.h index c098d0da2e..f71b5f93d3 100644 --- a/src/video_core/renderer_opengl/present/filters.h +++ b/src/video_core/renderer_opengl/present/filters.h @@ -18,7 +18,7 @@ std::unique_ptr MakeNearestNeighbor(const Device& device); std::unique_ptr MakeBilinear(const Device& device); std::unique_ptr MakeBicubic(const Device& device); std::unique_ptr MakeGaussian(const Device& device); -std::unique_ptr MakeLanczo(const Device& device); +std::unique_ptr MakeLanczos(const Device& device); std::unique_ptr MakeScaleForce(const Device& device); std::unique_ptr MakeArea(const Device& device); diff --git a/src/video_core/renderer_vulkan/present/filters.cpp b/src/video_core/renderer_vulkan/present/filters.cpp index a3a6bfc2f6..5ab3ac3114 100644 --- a/src/video_core/renderer_vulkan/present/filters.cpp +++ b/src/video_core/renderer_vulkan/present/filters.cpp @@ -12,7 +12,7 @@ #include "video_core/host_shaders/present_area_frag_spv.h" #include "video_core/host_shaders/present_bicubic_frag_spv.h" #include "video_core/host_shaders/present_gaussian_frag_spv.h" -#include 
"video_core/host_shaders/present_lanczso_frag_spv.h" +#include "video_core/host_shaders/present_lanczos_frag_spv.h" #include "video_core/host_shaders/vulkan_present_frag_spv.h" #include "video_core/host_shaders/vulkan_present_scaleforce_fp16_frag_spv.h" #include "video_core/host_shaders/vulkan_present_scaleforce_fp32_frag_spv.h" @@ -60,7 +60,7 @@ std::unique_ptr MakeGaussian(const Device& device, VkFormat fra BuildShader(device, PRESENT_GAUSSIAN_FRAG_SPV)); } -std::unique_ptr MakeLanczo(const Device& device, VkFormat frame_format) { +std::unique_ptr MakeLanczos(const Device& device, VkFormat frame_format) { return std::make_unique(device, frame_format, CreateBilinearSampler(device), BuildShader(device, PRESENT_LANCZO_FRAG_SPV)); } diff --git a/src/video_core/renderer_vulkan/present/filters.h b/src/video_core/renderer_vulkan/present/filters.h index c51938db24..8b0630e748 100644 --- a/src/video_core/renderer_vulkan/present/filters.h +++ b/src/video_core/renderer_vulkan/present/filters.h @@ -19,7 +19,7 @@ std::unique_ptr MakeNearestNeighbor(const Device& device, VkFor std::unique_ptr MakeBilinear(const Device& device, VkFormat frame_format); std::unique_ptr MakeBicubic(const Device& device, VkFormat frame_format); std::unique_ptr MakeGaussian(const Device& device, VkFormat frame_format); -std::unique_ptr MakeLanczo(const Device& device, VkFormat frame_format); +std::unique_ptr MakeLanczos(const Device& device, VkFormat frame_format); std::unique_ptr MakeScaleForce(const Device& device, VkFormat frame_format); std::unique_ptr MakeArea(const Device& device, VkFormat frame_format); diff --git a/src/video_core/renderer_vulkan/vk_blit_screen.cpp b/src/video_core/renderer_vulkan/vk_blit_screen.cpp index b398062dae..3a003a871e 100644 --- a/src/video_core/renderer_vulkan/vk_blit_screen.cpp +++ b/src/video_core/renderer_vulkan/vk_blit_screen.cpp @@ -46,8 +46,8 @@ void BlitScreen::SetWindowAdaptPass() { case Settings::ScalingFilter::Gaussian: window_adapt = MakeGaussian(device, 
swapchain_view_format); break; - case Settings::ScalingFilter::Lanczo: - window_adapt = MakeLanczo(device, swapchain_view_format); + case Settings::ScalingFilter::Lanczos: + window_adapt = MakeLanczos(device, swapchain_view_format); break; case Settings::ScalingFilter::ScaleForce: window_adapt = MakeScaleForce(device, swapchain_view_format); From 5ec5c5e19ba3af62fb7485144de45b271d421e5d Mon Sep 17 00:00:00 2001 From: lizzie Date: Sat, 20 Sep 2025 03:39:37 +0000 Subject: [PATCH 03/17] fix Signed-off-by: lizzie --- src/video_core/renderer_opengl/present/filters.cpp | 2 +- src/video_core/renderer_vulkan/present/filters.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/video_core/renderer_opengl/present/filters.cpp b/src/video_core/renderer_opengl/present/filters.cpp index 8464123be0..3424a52d80 100644 --- a/src/video_core/renderer_opengl/present/filters.cpp +++ b/src/video_core/renderer_opengl/present/filters.cpp @@ -40,7 +40,7 @@ std::unique_ptr MakeGaussian(const Device& device) { std::unique_ptr MakeLanczos(const Device& device) { return std::make_unique(device, CreateBilinearSampler(), - HostShaders::PRESENT_LANCZO_FRAG); + HostShaders::PRESENT_LANCZOS_FRAG); } std::unique_ptr MakeScaleForce(const Device& device) { diff --git a/src/video_core/renderer_vulkan/present/filters.cpp b/src/video_core/renderer_vulkan/present/filters.cpp index 5ab3ac3114..8fed222504 100644 --- a/src/video_core/renderer_vulkan/present/filters.cpp +++ b/src/video_core/renderer_vulkan/present/filters.cpp @@ -62,7 +62,7 @@ std::unique_ptr MakeGaussian(const Device& device, VkFormat fra std::unique_ptr MakeLanczos(const Device& device, VkFormat frame_format) { return std::make_unique(device, frame_format, CreateBilinearSampler(device), - BuildShader(device, PRESENT_LANCZO_FRAG_SPV)); + BuildShader(device, PRESENT_LANCZOS_FRAG_SPV)); } std::unique_ptr MakeScaleForce(const Device& device, VkFormat frame_format) { From 725407b989b5f9021ae7f1aae664f7204cc94a47 Mon Sep 17 
00:00:00 2001 From: MaranBr Date: Sat, 20 Sep 2025 14:17:07 +0200 Subject: [PATCH 04/17] [video_core] Add ability for integrated devices to control the amount of memory used by the emulator (#2528) This adds the ability for integrated devices to control the amount of memory used by the emulator. Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/2528 Reviewed-by: Lizzie Reviewed-by: Shinmegumi Co-authored-by: MaranBr Co-committed-by: MaranBr --- src/qt_common/shared_translation.cpp | 5 +---- src/video_core/vulkan_common/vulkan_device.cpp | 15 ++++++--------- 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/src/qt_common/shared_translation.cpp b/src/qt_common/shared_translation.cpp index cdc05e60e0..eb413f28e9 100644 --- a/src/qt_common/shared_translation.cpp +++ b/src/qt_common/shared_translation.cpp @@ -246,10 +246,7 @@ std::unique_ptr InitializeTranslations(QObject* parent) INSERT(Settings, vram_usage_mode, tr("VRAM Usage Mode:"), - tr("Selects whether the emulator should prefer to conserve memory or make maximum usage " - "of available video memory for performance.\nHas no effect on integrated graphics. 
" - "Aggressive mode may severely impact the performance of other applications such as " - "recording software.")); + tr("Selects whether the emulator should prefer to conserve memory or make maximum usage of available video memory for performance.\nAggressive mode may severely impact the performance of other applications such as recording software.")); INSERT(Settings, skip_cpu_inner_invalidation, tr("Skip CPU Inner Invalidation"), diff --git a/src/video_core/vulkan_common/vulkan_device.cpp b/src/video_core/vulkan_common/vulkan_device.cpp index 6d7c33099b..41917a1b90 100644 --- a/src/video_core/vulkan_common/vulkan_device.cpp +++ b/src/video_core/vulkan_common/vulkan_device.cpp @@ -1395,23 +1395,20 @@ void Device::CollectPhysicalMemoryInfo() { } device_access_memory += mem_properties.memoryHeaps[element].size; } - if (!is_integrated) { + if (is_integrated) { + const s64 available_memory = static_cast(device_access_memory - device_initial_usage); + const u64 memory_size = Settings::values.vram_usage_mode.GetValue() == Settings::VramUsageMode::Aggressive ? 
6_GiB : 4_GiB; + device_access_memory = static_cast(std::max(std::min(available_memory - 8_GiB, memory_size), std::min(local_memory, memory_size))); + } else { const u64 reserve_memory = std::min(device_access_memory / 8, 1_GiB); device_access_memory -= reserve_memory; - if (Settings::values.vram_usage_mode.GetValue() != Settings::VramUsageMode::Aggressive) { // Account for resolution scaling in memory limits const size_t normal_memory = 6_GiB; const size_t scaler_memory = 1_GiB * Settings::values.resolution_info.ScaleUp(1); - device_access_memory = - std::min(device_access_memory, normal_memory + scaler_memory); + device_access_memory = std::min(device_access_memory, normal_memory + scaler_memory); } - - return; } - const s64 available_memory = static_cast(device_access_memory - device_initial_usage); - device_access_memory = static_cast(std::max( - std::min(available_memory - 8_GiB, 6_GiB), std::min(local_memory, 6_GiB))); } void Device::CollectToolingInfo() { From 4436a87d31f854ceec1e374b4f5fd60605a0ffec Mon Sep 17 00:00:00 2001 From: lizzie Date: Sat, 20 Sep 2025 13:00:46 +0000 Subject: [PATCH 05/17] optimize with precomputed kernel Signed-off-by: lizzie --- .../host_shaders/present_lanczos.frag | 35 +++++--------- tools/lanczos_gen.c | 48 +++++++++++++++++++ 2 files changed, 61 insertions(+), 22 deletions(-) create mode 100644 tools/lanczos_gen.c diff --git a/src/video_core/host_shaders/present_lanczos.frag b/src/video_core/host_shaders/present_lanczos.frag index 5afc985bc3..9501b7ca33 100644 --- a/src/video_core/host_shaders/present_lanczos.frag +++ b/src/video_core/host_shaders/present_lanczos.frag @@ -9,32 +9,23 @@ layout (location = 0) in vec2 frag_tex_coord; layout (location = 0) out vec4 color; layout (binding = 0) uniform sampler2D color_texture; -#define PI 3.1415926535897932384626433 - -float sinc(float x) { - return x == 0.0f ? 
1.0f : sin(PI * x) / (PI * x); -} - -float lanczos(vec2 v, float a) { - float d = sqrt(v.x * v.x + v.y * v.y); - return sinc(d) / sinc(d / a); -} - +// precomputed kernel +const float w_kernel[49] = float[] ( + -0.238811f, 0.531959f, 0.961865f, 1.000000f, 0.961865f, 0.531959f, -0.238811f, 0.531959f, 0.957419f, 0.313883f, -0.000000f, 0.313883f, 0.957419f, 0.531959f, 0.961865f, 0.313883f, -0.322602f, 0.000000f, -0.322602f, 0.313883f, 0.961865f, 1.000000f, -0.000000f, 0.000000f, 1.000000f, 0.000000f, -0.000000f, 1.000000f, 0.961865f, 0.313883f, -0.322602f, 0.000000f, -0.322602f, 0.313883f, 0.961865f, 0.531959f, 0.957419f, 0.313883f, -0.000000f, 0.313883f, 0.957419f, 0.531959f, -0.238811f, 0.531959f, 0.961865f, 1.000000f, 0.961865f, 0.531959f, -0.238811f +); +const vec2 w_pos[49] = vec2[] ( + vec2(-0.750000f, -0.750000f), vec2(-0.750000f, -0.500000f), vec2(-0.750000f, -0.250000f), vec2(-0.750000f, 0.000000f), vec2(-0.750000f, 0.250000f), vec2(-0.750000f, 0.500000f), vec2(-0.750000f, 0.750000f), vec2(-0.500000f, -0.750000f), vec2(-0.500000f, -0.500000f), vec2(-0.500000f, -0.250000f), vec2(-0.500000f, 0.000000f), vec2(-0.500000f, 0.250000f), vec2(-0.500000f, 0.500000f), vec2(-0.500000f, 0.750000f), vec2(-0.250000f, -0.750000f), vec2(-0.250000f, -0.500000f), vec2(-0.250000f, -0.250000f), vec2(-0.250000f, 0.000000f), vec2(-0.250000f, 0.250000f), vec2(-0.250000f, 0.500000f), vec2(-0.250000f, 0.750000f), vec2(0.000000f, -0.750000f), vec2(0.000000f, -0.500000f), vec2(0.000000f, -0.250000f), vec2(0.000000f, 0.000000f), vec2(0.000000f, 0.250000f), vec2(0.000000f, 0.500000f), vec2(0.000000f, 0.750000f), vec2(0.250000f, -0.750000f), vec2(0.250000f, -0.500000f), vec2(0.250000f, -0.250000f), vec2(0.250000f, 0.000000f), vec2(0.250000f, 0.250000f), vec2(0.250000f, 0.500000f), vec2(0.250000f, 0.750000f), vec2(0.500000f, -0.750000f), vec2(0.500000f, -0.500000f), vec2(0.500000f, -0.250000f), vec2(0.500000f, 0.000000f), vec2(0.500000f, 0.250000f), vec2(0.500000f, 0.500000f), 
vec2(0.500000f, 0.750000f), vec2(0.750000f, -0.750000f), vec2(0.750000f, -0.500000f), vec2(0.750000f, -0.250000f), vec2(0.750000f, 0.000000f), vec2(0.750000f, 0.250000f), vec2(0.750000f, 0.500000f), vec2(0.750000f, 0.750000f) +); +const float w_sum = 21.045683f; vec4 textureLanczos(sampler2D textureSampler, vec2 p) { - const int r = 1; //radius (1 = 3 steps) vec3 c_sum = vec3(0.0f); - float w_sum = 0.0f; vec2 res = vec2(textureSize(textureSampler, 0)); vec2 cc = floor(p * res) / res; - // kernel size = (r * 2 + 1) * (r * 2 + 1) - for (int x = -r; x <= r; x++) - for (int y = -r; y <= r; y++) { - vec2 kp = 0.5f * (vec2(x, y) / res); // 0.5 = half-pixel level resampling - vec2 uv = cc + kp; - float w = lanczos(kp, float(r)); - c_sum += w * texture(textureSampler, p + kp).rgb; - w_sum += w; - } + for (int i = 0; i < 49; i++) { // kernel size = (r * 2 + 1) ^ 2 + vec2 kp = w_pos[i] / res; + vec2 uv = cc + kp; + c_sum += w_kernel[i] * texture(textureSampler, p + kp).rgb; + } return vec4(c_sum / w_sum, 1.0f); } diff --git a/tools/lanczos_gen.c b/tools/lanczos_gen.c new file mode 100644 index 0000000000..6d7be3cb0e --- /dev/null +++ b/tools/lanczos_gen.c @@ -0,0 +1,48 @@ +// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + +// clang -lm tools/lanczos_gen.c -o tools/lanczos_gen && ./tools/lanczos_gen +#include +#include + +double sinc(double x) { + return x == 0.0f ? 
1.0f : sin(M_PI * x) / (M_PI * x); +} + +typedef struct vec2 { + double x; + double y; +} vec2; + +double lanczos(vec2 v, float a) { + double d = sqrt(v.x * v.x + v.y * v.y); + return sinc(d) / sinc(d / a); +} + +int main(int argc, char* argv[]) { + const int r = 3; //radius (1 = 3 steps) + const int k_size = (r * 2 + 1) * (r * 2 + 1); + double w_sum = 0.0f; + // kernel size = (r * 2 + 1) ^ 2 + printf("const float w_kernel[%i] = float[] (\n ", k_size); + double factor = 1.0f / ((double)r + 1.0f); + for (int x = -r; x <= r; x++) + for (int y = -r; y <= r; y++) { + double w = lanczos((vec2){ .x = x, .y = y }, (double)r); + printf("%lff, ", w); + w_sum += w; + } + printf("\n);\n"); + printf("const vec2 w_pos[%i] = vec2[] (\n ", k_size); + for (int x = -r; x <= r; x++) + for (int y = -r; y <= r; y++) { + vec2 kp = (vec2){ + .x = x * factor, + .y = y * factor + }; + printf("vec2(%lff, %lff), ", kp.x, kp.y); + } + printf("\n);\n"); + printf("const float w_sum = %lff;\n", w_sum); + return 0; +} From 2e8d71b4063d297c43c18eeaa252780b45cb12e0 Mon Sep 17 00:00:00 2001 From: lizzie Date: Sat, 20 Sep 2025 14:26:53 +0000 Subject: [PATCH 06/17] actually memory is bad Signed-off-by: lizzie --- .../host_shaders/present_lanczos.frag | 32 +++++++++++-------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/src/video_core/host_shaders/present_lanczos.frag b/src/video_core/host_shaders/present_lanczos.frag index 9501b7ca33..b69b329c1b 100644 --- a/src/video_core/host_shaders/present_lanczos.frag +++ b/src/video_core/host_shaders/present_lanczos.frag @@ -9,23 +9,29 @@ layout (location = 0) in vec2 frag_tex_coord; layout (location = 0) out vec4 color; layout (binding = 0) uniform sampler2D color_texture; -// precomputed kernel -const float w_kernel[49] = float[] ( - -0.238811f, 0.531959f, 0.961865f, 1.000000f, 0.961865f, 0.531959f, -0.238811f, 0.531959f, 0.957419f, 0.313883f, -0.000000f, 0.313883f, 0.957419f, 0.531959f, 0.961865f, 0.313883f, -0.322602f, 0.000000f, 
-0.322602f, 0.313883f, 0.961865f, 1.000000f, -0.000000f, 0.000000f, 1.000000f, 0.000000f, -0.000000f, 1.000000f, 0.961865f, 0.313883f, -0.322602f, 0.000000f, -0.322602f, 0.313883f, 0.961865f, 0.531959f, 0.957419f, 0.313883f, -0.000000f, 0.313883f, 0.957419f, 0.531959f, -0.238811f, 0.531959f, 0.961865f, 1.000000f, 0.961865f, 0.531959f, -0.238811f -); -const vec2 w_pos[49] = vec2[] ( - vec2(-0.750000f, -0.750000f), vec2(-0.750000f, -0.500000f), vec2(-0.750000f, -0.250000f), vec2(-0.750000f, 0.000000f), vec2(-0.750000f, 0.250000f), vec2(-0.750000f, 0.500000f), vec2(-0.750000f, 0.750000f), vec2(-0.500000f, -0.750000f), vec2(-0.500000f, -0.500000f), vec2(-0.500000f, -0.250000f), vec2(-0.500000f, 0.000000f), vec2(-0.500000f, 0.250000f), vec2(-0.500000f, 0.500000f), vec2(-0.500000f, 0.750000f), vec2(-0.250000f, -0.750000f), vec2(-0.250000f, -0.500000f), vec2(-0.250000f, -0.250000f), vec2(-0.250000f, 0.000000f), vec2(-0.250000f, 0.250000f), vec2(-0.250000f, 0.500000f), vec2(-0.250000f, 0.750000f), vec2(0.000000f, -0.750000f), vec2(0.000000f, -0.500000f), vec2(0.000000f, -0.250000f), vec2(0.000000f, 0.000000f), vec2(0.000000f, 0.250000f), vec2(0.000000f, 0.500000f), vec2(0.000000f, 0.750000f), vec2(0.250000f, -0.750000f), vec2(0.250000f, -0.500000f), vec2(0.250000f, -0.250000f), vec2(0.250000f, 0.000000f), vec2(0.250000f, 0.250000f), vec2(0.250000f, 0.500000f), vec2(0.250000f, 0.750000f), vec2(0.500000f, -0.750000f), vec2(0.500000f, -0.500000f), vec2(0.500000f, -0.250000f), vec2(0.500000f, 0.000000f), vec2(0.500000f, 0.250000f), vec2(0.500000f, 0.500000f), vec2(0.500000f, 0.750000f), vec2(0.750000f, -0.750000f), vec2(0.750000f, -0.500000f), vec2(0.750000f, -0.250000f), vec2(0.750000f, 0.000000f), vec2(0.750000f, 0.250000f), vec2(0.750000f, 0.500000f), vec2(0.750000f, 0.750000f) -); -const float w_sum = 21.045683f; +#define PI 3.1415926535897932384626433 +float sinc(float x) { + return x == 0.0f ? 
1.0f : sin(PI * x) / (PI * x); +} +float lanczos(vec2 v, float a) { + float d = length(v); + return sinc(d) / sinc(d / a); +} vec4 textureLanczos(sampler2D textureSampler, vec2 p) { vec3 c_sum = vec3(0.0f); + float w_sum = 0.0f; vec2 res = vec2(textureSize(textureSampler, 0)); vec2 cc = floor(p * res) / res; - for (int i = 0; i < 49; i++) { // kernel size = (r * 2 + 1) ^ 2 - vec2 kp = w_pos[i] / res; - vec2 uv = cc + kp; - c_sum += w_kernel[i] * texture(textureSampler, p + kp).rgb; - } + // kernel size = (2r + 1)^2 + const int r = 3; //radius (1 = 3 steps) + for (int x = -r; x <= r; x++) + for (int y = -r; y <= r; y++) { + vec2 kp = 0.5f * (vec2(x, y) / res); // 0.5 = half-pixel level resampling + vec2 uv = cc + kp; + float w = lanczos(kp, float(r)); + c_sum += w * texture(textureSampler, p + kp).rgb; + w_sum += w; + } return vec4(c_sum / w_sum, 1.0f); } From 87d42cf542a9d7c9c6222459bd71927958a78359 Mon Sep 17 00:00:00 2001 From: lizzie Date: Sat, 20 Sep 2025 17:43:59 +0200 Subject: [PATCH 07/17] [fs] remove usage of subpar PooledBuffer (#342) PoolBuffer is a subpar "reimplementation" of an equivalent std::vector Signed-off-by: lizzie Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/342 Reviewed-by: crueter Reviewed-by: Shinmegumi Co-authored-by: lizzie Co-committed-by: lizzie --- src/core/CMakeLists.txt | 3 +- .../fssystem/fssystem_aes_ctr_storage.cpp | 30 ++---- .../fssystem/fssystem_aes_xts_storage.cpp | 16 ++-- .../fssystem_alignment_matching_storage.h | 28 ++---- .../fssystem/fssystem_bucket_tree.cpp | 24 +---- .../fssystem_bucket_tree_template_impl.h | 32 +++---- .../fssystem/fssystem_compressed_storage.h | 36 +++---- .../fssystem/fssystem_pooled_buffer.cpp | 61 ------------ .../fssystem/fssystem_pooled_buffer.h | 95 ------------------- 9 files changed, 54 insertions(+), 271 deletions(-) delete mode 100644 src/core/file_sys/fssystem/fssystem_pooled_buffer.cpp delete mode 100644 src/core/file_sys/fssystem/fssystem_pooled_buffer.h diff --git 
a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt index 4d9566a60f..3c28ebd911 100644 --- a/src/core/CMakeLists.txt +++ b/src/core/CMakeLists.txt @@ -107,8 +107,7 @@ add_library(core STATIC file_sys/fssystem/fssystem_nca_header.cpp file_sys/fssystem/fssystem_nca_header.h file_sys/fssystem/fssystem_nca_reader.cpp - file_sys/fssystem/fssystem_pooled_buffer.cpp - file_sys/fssystem/fssystem_pooled_buffer.h + file_sys/fssystem/fssystem_passthrough_storage.h file_sys/fssystem/fssystem_sparse_storage.cpp file_sys/fssystem/fssystem_sparse_storage.h file_sys/fssystem/fssystem_switch_storage.h diff --git a/src/core/file_sys/fssystem/fssystem_aes_ctr_storage.cpp b/src/core/file_sys/fssystem/fssystem_aes_ctr_storage.cpp index c18fde18f4..aaf7788801 100644 --- a/src/core/file_sys/fssystem/fssystem_aes_ctr_storage.cpp +++ b/src/core/file_sys/fssystem/fssystem_aes_ctr_storage.cpp @@ -1,10 +1,12 @@ +// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + // SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later #include "common/alignment.h" #include "common/swap.h" #include "core/file_sys/fssystem/fssystem_aes_ctr_storage.h" -#include "core/file_sys/fssystem/fssystem_pooled_buffer.h" #include "core/file_sys/fssystem/fssystem_utility.h" namespace FileSys { @@ -76,13 +78,6 @@ size_t AesCtrStorage::Write(const u8* buffer, size_t size, size_t offset) { ASSERT(Common::IsAligned(offset, BlockSize)); ASSERT(Common::IsAligned(size, BlockSize)); - // Get a pooled buffer. - PooledBuffer pooled_buffer; - const bool use_work_buffer = true; - if (use_work_buffer) { - pooled_buffer.Allocate(size, BlockSize); - } - // Setup the counter. std::array ctr; std::memcpy(ctr.data(), m_iv.data(), IvSize); @@ -91,25 +86,20 @@ size_t AesCtrStorage::Write(const u8* buffer, size_t size, size_t offset) { // Loop until all data is written. 
size_t remaining = size; s64 cur_offset = 0; + + // Get a pooled buffer. + std::vector pooled_buffer(BlockSize); while (remaining > 0) { // Determine data we're writing and where. - const size_t write_size = - use_work_buffer ? (std::min)(pooled_buffer.GetSize(), remaining) : remaining; - - void* write_buf; - if (use_work_buffer) { - write_buf = pooled_buffer.GetBuffer(); - } else { - write_buf = const_cast(buffer); - } + const size_t write_size = std::min(pooled_buffer.size(), remaining); + u8* write_buf = reinterpret_cast(pooled_buffer.data()); // Encrypt the data. m_cipher->SetIV(ctr); - m_cipher->Transcode(buffer, write_size, reinterpret_cast(write_buf), - Core::Crypto::Op::Encrypt); + m_cipher->Transcode(buffer, write_size, write_buf, Core::Crypto::Op::Encrypt); // Write the encrypted data. - m_base_storage->Write(reinterpret_cast(write_buf), write_size, offset + cur_offset); + m_base_storage->Write(write_buf, write_size, offset + cur_offset); // Advance. cur_offset += write_size; diff --git a/src/core/file_sys/fssystem/fssystem_aes_xts_storage.cpp b/src/core/file_sys/fssystem/fssystem_aes_xts_storage.cpp index 5ef2544dfb..9e7a104c89 100644 --- a/src/core/file_sys/fssystem/fssystem_aes_xts_storage.cpp +++ b/src/core/file_sys/fssystem/fssystem_aes_xts_storage.cpp @@ -1,11 +1,12 @@ +// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + // SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later #include "common/alignment.h" #include "common/swap.h" -#include "core/file_sys/errors.h" #include "core/file_sys/fssystem/fssystem_aes_xts_storage.h" -#include "core/file_sys/fssystem/fssystem_pooled_buffer.h" #include "core/file_sys/fssystem/fssystem_utility.h" namespace FileSys { @@ -69,17 +70,14 @@ size_t AesXtsStorage::Read(u8* buffer, size_t size, size_t offset) const { // Decrypt into a pooled buffer. 
{ - PooledBuffer tmp_buf(m_block_size, m_block_size); - ASSERT(tmp_buf.GetSize() >= m_block_size); - - std::memset(tmp_buf.GetBuffer(), 0, skip_size); - std::memcpy(tmp_buf.GetBuffer() + skip_size, buffer, data_size); + std::vector tmp_buf(m_block_size, 0); + std::memcpy(tmp_buf.data() + skip_size, buffer, data_size); m_cipher->SetIV(ctr); - m_cipher->Transcode(tmp_buf.GetBuffer(), m_block_size, tmp_buf.GetBuffer(), + m_cipher->Transcode(tmp_buf.data(), m_block_size, tmp_buf.data(), Core::Crypto::Op::Decrypt); - std::memcpy(buffer, tmp_buf.GetBuffer() + skip_size, data_size); + std::memcpy(buffer, tmp_buf.data() + skip_size, data_size); } AddCounter(ctr.data(), IvSize, 1); diff --git a/src/core/file_sys/fssystem/fssystem_alignment_matching_storage.h b/src/core/file_sys/fssystem/fssystem_alignment_matching_storage.h index f96691d03d..60a6d24435 100644 --- a/src/core/file_sys/fssystem/fssystem_alignment_matching_storage.h +++ b/src/core/file_sys/fssystem/fssystem_alignment_matching_storage.h @@ -1,13 +1,14 @@ +// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + // SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later #pragma once #include "common/alignment.h" -#include "core/file_sys/errors.h" #include "core/file_sys/fssystem/fs_i_storage.h" #include "core/file_sys/fssystem/fssystem_alignment_matching_storage_impl.h" -#include "core/file_sys/fssystem/fssystem_pooled_buffer.h" namespace FileSys { @@ -89,10 +90,11 @@ private: VirtualFile m_base_storage; s64 m_base_storage_size; size_t m_data_align; + mutable std::vector work_buffer; public: explicit AlignmentMatchingStoragePooledBuffer(VirtualFile bs, size_t da) - : m_base_storage(std::move(bs)), m_data_align(da) { + : m_base_storage(std::move(bs)), m_data_align(da), work_buffer(da) { ASSERT(Common::IsPowerOfTwo(da)); } @@ -104,16 +106,10 @@ public: // Validate arguments. 
ASSERT(buffer != nullptr); - s64 bs_size = this->GetSize(); ASSERT(R_SUCCEEDED(IStorage::CheckAccessRange(offset, size, bs_size))); - - // Allocate a pooled buffer. - PooledBuffer pooled_buffer; - pooled_buffer.AllocateParticularlyLarge(m_data_align, m_data_align); - - return AlignmentMatchingStorageImpl::Read(m_base_storage, pooled_buffer.GetBuffer(), - pooled_buffer.GetSize(), m_data_align, + return AlignmentMatchingStorageImpl::Read(m_base_storage, work_buffer.data(), + work_buffer.size(), m_data_align, BufferAlign, offset, buffer, size); } @@ -125,16 +121,10 @@ public: // Validate arguments. ASSERT(buffer != nullptr); - s64 bs_size = this->GetSize(); ASSERT(R_SUCCEEDED(IStorage::CheckAccessRange(offset, size, bs_size))); - - // Allocate a pooled buffer. - PooledBuffer pooled_buffer; - pooled_buffer.AllocateParticularlyLarge(m_data_align, m_data_align); - - return AlignmentMatchingStorageImpl::Write(m_base_storage, pooled_buffer.GetBuffer(), - pooled_buffer.GetSize(), m_data_align, + return AlignmentMatchingStorageImpl::Write(m_base_storage, work_buffer.data(), + work_buffer.size(), m_data_align, BufferAlign, offset, buffer, size); } diff --git a/src/core/file_sys/fssystem/fssystem_bucket_tree.cpp b/src/core/file_sys/fssystem/fssystem_bucket_tree.cpp index f58b154968..ce3b62f26d 100644 --- a/src/core/file_sys/fssystem/fssystem_bucket_tree.cpp +++ b/src/core/file_sys/fssystem/fssystem_bucket_tree.cpp @@ -7,7 +7,6 @@ #include "core/file_sys/errors.h" #include "core/file_sys/fssystem/fssystem_bucket_tree.h" #include "core/file_sys/fssystem/fssystem_bucket_tree_utils.h" -#include "core/file_sys/fssystem/fssystem_pooled_buffer.h" namespace FileSys { @@ -465,16 +464,8 @@ Result BucketTree::Visitor::Find(s64 virtual_address) { } Result BucketTree::Visitor::FindEntrySet(s32* out_index, s64 virtual_address, s32 node_index) { - const auto node_size = m_tree->m_node_size; - - PooledBuffer pool(node_size, 1); - if (node_size <= pool.GetSize()) { - R_RETURN( - 
this->FindEntrySetWithBuffer(out_index, virtual_address, node_index, pool.GetBuffer())); - } else { - pool.Deallocate(); - R_RETURN(this->FindEntrySetWithoutBuffer(out_index, virtual_address, node_index)); - } + std::vector pool(m_tree->m_node_size); + R_RETURN(FindEntrySetWithBuffer(out_index, virtual_address, node_index, pool.data())); } Result BucketTree::Visitor::FindEntrySetWithBuffer(s32* out_index, s64 virtual_address, @@ -525,15 +516,8 @@ Result BucketTree::Visitor::FindEntrySetWithoutBuffer(s32* out_index, s64 virtua } Result BucketTree::Visitor::FindEntry(s64 virtual_address, s32 entry_set_index) { - const auto entry_set_size = m_tree->m_node_size; - - PooledBuffer pool(entry_set_size, 1); - if (entry_set_size <= pool.GetSize()) { - R_RETURN(this->FindEntryWithBuffer(virtual_address, entry_set_index, pool.GetBuffer())); - } else { - pool.Deallocate(); - R_RETURN(this->FindEntryWithoutBuffer(virtual_address, entry_set_index)); - } + std::vector pool(m_tree->m_node_size); + R_RETURN(FindEntryWithBuffer(virtual_address, entry_set_index, pool.data())); } Result BucketTree::Visitor::FindEntryWithBuffer(s64 virtual_address, s32 entry_set_index, diff --git a/src/core/file_sys/fssystem/fssystem_bucket_tree_template_impl.h b/src/core/file_sys/fssystem/fssystem_bucket_tree_template_impl.h index 030b2916b0..fac6c37214 100644 --- a/src/core/file_sys/fssystem/fssystem_bucket_tree_template_impl.h +++ b/src/core/file_sys/fssystem/fssystem_bucket_tree_template_impl.h @@ -1,3 +1,6 @@ +// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + // SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later @@ -6,7 +9,6 @@ #include "core/file_sys/errors.h" #include "core/file_sys/fssystem/fssystem_bucket_tree.h" #include "core/file_sys/fssystem/fssystem_bucket_tree_utils.h" -#include "core/file_sys/fssystem/fssystem_pooled_buffer.h" namespace FileSys { @@ -35,23 +37,19 @@ 
Result BucketTree::ScanContinuousReading(ContinuousReadingInfo* out_info, R_UNLESS(entry.GetVirtualOffset() <= cur_offset, ResultOutOfRange); // Create a pooled buffer for our scan. - PooledBuffer pool(m_node_size, 1); - char* buffer = nullptr; - + std::vector pool(m_node_size); s64 entry_storage_size = m_entry_storage->GetSize(); // Read the node. - if (m_node_size <= pool.GetSize()) { - buffer = pool.GetBuffer(); - const auto ofs = param.entry_set.index * static_cast(m_node_size); - R_UNLESS(m_node_size + ofs <= static_cast(entry_storage_size), - ResultInvalidBucketTreeNodeEntryCount); + u8* buffer = reinterpret_cast(pool.data()); + const auto ofs = param.entry_set.index * s64(m_node_size); + R_UNLESS(m_node_size + ofs <= size_t(entry_storage_size), + ResultInvalidBucketTreeNodeEntryCount); - m_entry_storage->Read(reinterpret_cast(buffer), m_node_size, ofs); - } + m_entry_storage->Read(buffer, m_node_size, ofs); // Calculate extents. - const auto end_offset = cur_offset + static_cast(param.size); + const auto end_offset = cur_offset + s64(param.size); s64 phys_offset = entry.GetPhysicalOffset(); // Start merge tracking. 
@@ -76,14 +74,8 @@ Result BucketTree::ScanContinuousReading(ContinuousReadingInfo* out_info, s64 next_entry_offset; if (entry_index + 1 < entry_count) { - if (buffer != nullptr) { - const auto ofs = impl::GetBucketTreeEntryOffset(0, m_entry_size, entry_index + 1); - std::memcpy(std::addressof(next_entry), buffer + ofs, m_entry_size); - } else { - const auto ofs = impl::GetBucketTreeEntryOffset(param.entry_set.index, m_node_size, - m_entry_size, entry_index + 1); - m_entry_storage->ReadObject(std::addressof(next_entry), ofs); - } + const auto offset = impl::GetBucketTreeEntryOffset(0, m_entry_size, entry_index + 1); + std::memcpy(std::addressof(next_entry), buffer + offset, m_entry_size); next_entry_offset = next_entry.GetVirtualOffset(); R_UNLESS(param.offsets.IsInclude(next_entry_offset), ResultInvalidIndirectEntryOffset); diff --git a/src/core/file_sys/fssystem/fssystem_compressed_storage.h b/src/core/file_sys/fssystem/fssystem_compressed_storage.h index 74c98630ec..223d51647e 100644 --- a/src/core/file_sys/fssystem/fssystem_compressed_storage.h +++ b/src/core/file_sys/fssystem/fssystem_compressed_storage.h @@ -1,3 +1,6 @@ +// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + // SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later @@ -9,8 +12,6 @@ #include "core/file_sys/fssystem/fs_i_storage.h" #include "core/file_sys/fssystem/fssystem_bucket_tree.h" #include "core/file_sys/fssystem/fssystem_compression_common.h" -#include "core/file_sys/fssystem/fssystem_pooled_buffer.h" -#include "core/file_sys/vfs/vfs.h" namespace FileSys { @@ -317,23 +318,11 @@ private: R_SUCCEED_IF(entry_count == 0); // Get the remaining size in a convenient form. 
- const size_t total_required_size = - static_cast(required_access_physical_size); + const size_t total_required_size = size_t(required_access_physical_size); // Perform the read based on whether we need to allocate a buffer. if (will_allocate_pooled_buffer) { - // Allocate a pooled buffer. - PooledBuffer pooled_buffer; - if (pooled_buffer.GetAllocatableSizeMax() >= total_required_size) { - pooled_buffer.Allocate(total_required_size, m_block_size_max); - } else { - pooled_buffer.AllocateParticularlyLarge( - std::min( - total_required_size, - PooledBuffer::GetAllocatableParticularlyLargeSizeMax()), - m_block_size_max); - } - + std::vector pooled_buffer(std::max(m_block_size_max, total_required_size)); // Read each of the entries. for (s32 entry_idx = 0; entry_idx < entry_count; ++entry_idx) { // Determine the current read size. @@ -342,13 +331,13 @@ private: if (const size_t target_entry_size = static_cast(entries[entry_idx].physical_size) + static_cast(entries[entry_idx].gap_from_prev); - target_entry_size <= pooled_buffer.GetSize()) { + target_entry_size <= pooled_buffer.size()) { // We'll be using the pooled buffer. will_use_pooled_buffer = true; // Determine how much we can read. const size_t max_size = std::min( - required_access_physical_size, pooled_buffer.GetSize()); + required_access_physical_size, pooled_buffer.size()); size_t read_size = 0; for (auto n = entry_idx; n < entry_count; ++n) { @@ -376,7 +365,7 @@ private: // Perform the read based on whether or not we'll use the pooled buffer. if (will_use_pooled_buffer) { // Read the compressed data into the pooled buffer. - auto* const buffer = pooled_buffer.GetBuffer(); + auto* const buffer = pooled_buffer.data(); m_data_storage->Read(reinterpret_cast(buffer), cur_read_size, required_access_physical_offset); @@ -863,11 +852,9 @@ private: static_cast(unaligned_range->virtual_size)); // Get a pooled buffer for our read. 
- PooledBuffer pooled_buffer; - pooled_buffer.Allocate(size_buffer_required, size_buffer_required); - + std::vector pooled_buffer(size_buffer_required); // Perform read. - Result rc = read_impl(pooled_buffer.GetBuffer(), size_buffer_required); + Result rc = read_impl(pooled_buffer.data(), size_buffer_required); if (R_FAILED(rc)) { R_THROW(rc); } @@ -876,8 +863,7 @@ private: const size_t skip_size = cur_offset - unaligned_range->virtual_offset; const size_t copy_size = std::min( cur_size, unaligned_range->GetEndVirtualOffset() - cur_offset); - - std::memcpy(cur_dst, pooled_buffer.GetBuffer() + skip_size, copy_size); + std::memcpy(cur_dst, pooled_buffer.data() + skip_size, copy_size); // Advance. cur_dst += copy_size; diff --git a/src/core/file_sys/fssystem/fssystem_pooled_buffer.cpp b/src/core/file_sys/fssystem/fssystem_pooled_buffer.cpp deleted file mode 100644 index dcd08dac3e..0000000000 --- a/src/core/file_sys/fssystem/fssystem_pooled_buffer.cpp +++ /dev/null @@ -1,61 +0,0 @@ -// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project -// SPDX-License-Identifier: GPL-2.0-or-later - -#include "common/alignment.h" -#include "core/file_sys/fssystem/fssystem_pooled_buffer.h" - -namespace FileSys { - -namespace { - -constexpr size_t HeapBlockSize = BufferPoolAlignment; -static_assert(HeapBlockSize == 4_KiB); - -// A heap block is 4KiB. An order is a power of two. -// This gives blocks of the order 32KiB, 512KiB, 4MiB. -constexpr s32 HeapOrderMax = 7; -constexpr s32 HeapOrderMaxForLarge = HeapOrderMax + 3; - -constexpr size_t HeapAllocatableSizeMax = HeapBlockSize * (static_cast(1) << HeapOrderMax); -constexpr size_t HeapAllocatableSizeMaxForLarge = - HeapBlockSize * (static_cast(1) << HeapOrderMaxForLarge); - -} // namespace - -size_t PooledBuffer::GetAllocatableSizeMaxCore(bool large) { - return large ? 
HeapAllocatableSizeMaxForLarge : HeapAllocatableSizeMax; -} - -void PooledBuffer::AllocateCore(size_t ideal_size, size_t required_size, bool large) { - // Ensure preconditions. - ASSERT(m_buffer == nullptr); - - // Check that we can allocate this size. - ASSERT(required_size <= GetAllocatableSizeMaxCore(large)); - - const size_t target_size = - (std::min)((std::max)(ideal_size, required_size), GetAllocatableSizeMaxCore(large)); - - // Dummy implementation for allocate. - if (target_size > 0) { - m_buffer = - reinterpret_cast(::operator new(target_size, std::align_val_t{HeapBlockSize})); - m_size = target_size; - - // Ensure postconditions. - ASSERT(m_buffer != nullptr); - } -} - -void PooledBuffer::Shrink(size_t ideal_size) { - ASSERT(ideal_size <= GetAllocatableSizeMaxCore(true)); - - // Shrinking to zero means that we have no buffer. - if (ideal_size == 0) { - ::operator delete(m_buffer, std::align_val_t{HeapBlockSize}); - m_buffer = nullptr; - m_size = ideal_size; - } -} - -} // namespace FileSys diff --git a/src/core/file_sys/fssystem/fssystem_pooled_buffer.h b/src/core/file_sys/fssystem/fssystem_pooled_buffer.h deleted file mode 100644 index 9a6adbcb5a..0000000000 --- a/src/core/file_sys/fssystem/fssystem_pooled_buffer.h +++ /dev/null @@ -1,95 +0,0 @@ -// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project -// SPDX-License-Identifier: GPL-2.0-or-later - -#pragma once - -#include "common/common_funcs.h" -#include "common/common_types.h" -#include "common/literals.h" -#include "core/hle/result.h" - -namespace FileSys { - -using namespace Common::Literals; - -constexpr inline size_t BufferPoolAlignment = 4_KiB; -constexpr inline size_t BufferPoolWorkSize = 320; - -class PooledBuffer { - YUZU_NON_COPYABLE(PooledBuffer); - -public: - // Constructor/Destructor. 
- constexpr PooledBuffer() : m_buffer(), m_size() {} - - PooledBuffer(size_t ideal_size, size_t required_size) : m_buffer(), m_size() { - this->Allocate(ideal_size, required_size); - } - - ~PooledBuffer() { - this->Deallocate(); - } - - // Move and assignment. - explicit PooledBuffer(PooledBuffer&& rhs) : m_buffer(rhs.m_buffer), m_size(rhs.m_size) { - rhs.m_buffer = nullptr; - rhs.m_size = 0; - } - - PooledBuffer& operator=(PooledBuffer&& rhs) { - PooledBuffer(std::move(rhs)).Swap(*this); - return *this; - } - - // Allocation API. - void Allocate(size_t ideal_size, size_t required_size) { - return this->AllocateCore(ideal_size, required_size, false); - } - - void AllocateParticularlyLarge(size_t ideal_size, size_t required_size) { - return this->AllocateCore(ideal_size, required_size, true); - } - - void Shrink(size_t ideal_size); - - void Deallocate() { - // Shrink the buffer to empty. - this->Shrink(0); - ASSERT(m_buffer == nullptr); - } - - char* GetBuffer() const { - ASSERT(m_buffer != nullptr); - return m_buffer; - } - - size_t GetSize() const { - ASSERT(m_buffer != nullptr); - return m_size; - } - -public: - static size_t GetAllocatableSizeMax() { - return GetAllocatableSizeMaxCore(false); - } - static size_t GetAllocatableParticularlyLargeSizeMax() { - return GetAllocatableSizeMaxCore(true); - } - -private: - static size_t GetAllocatableSizeMaxCore(bool large); - -private: - void Swap(PooledBuffer& rhs) { - std::swap(m_buffer, rhs.m_buffer); - std::swap(m_size, rhs.m_size); - } - - void AllocateCore(size_t ideal_size, size_t required_size, bool large); - -private: - char* m_buffer; - size_t m_size; -}; - -} // namespace FileSys From 28b8159da1d5c9a82042cf43a86cec01d5758254 Mon Sep 17 00:00:00 2001 From: wildcard Date: Sat, 20 Sep 2025 17:52:40 +0200 Subject: [PATCH 08/17] [VK] Change barrier to transfer in present manager (#315) There is no Color_attachment happening here only transfer operation and hence the gpu should only wait for transfer not 
color_attachment_output_bit(may fix async presentation, not likely though) Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/315 Reviewed-by: Shinmegumi Reviewed-by: MaranBr Co-authored-by: wildcard Co-committed-by: wildcard --- src/video_core/renderer_vulkan/vk_present_manager.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/video_core/renderer_vulkan/vk_present_manager.cpp b/src/video_core/renderer_vulkan/vk_present_manager.cpp index 2c76584c72..23279e49b9 100644 --- a/src/video_core/renderer_vulkan/vk_present_manager.cpp +++ b/src/video_core/renderer_vulkan/vk_present_manager.cpp @@ -470,8 +470,8 @@ void PresentManager::CopyToSwapchainImpl(Frame* frame) { const std::array wait_semaphores = {present_semaphore, *frame->render_ready}; static constexpr std::array wait_stage_masks{ - VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, - VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, + VK_PIPELINE_STAGE_TRANSFER_BIT, + VK_PIPELINE_STAGE_TRANSFER_BIT, }; const VkSubmitInfo submit_info{ From 4b558e530324492c8321ba7e582a40442331d0d8 Mon Sep 17 00:00:00 2001 From: Gamer64 Date: Sat, 20 Sep 2025 17:54:14 +0200 Subject: [PATCH 09/17] [hw_composer]: Add some enhancements to improve its performance and logic (#225) These changes should mostly improve the performance for most of games and reduce reallocations from framebuffer releases. 
Co-authored-by: Gamer64 <76565986+Gamer64ytb@users.noreply.github.com> Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/225 Reviewed-by: MaranBr Reviewed-by: Lizzie Co-authored-by: Gamer64 Co-committed-by: Gamer64 --- .../service/nvnflinger/hardware_composer.cpp | 28 ++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/src/core/hle/service/nvnflinger/hardware_composer.cpp b/src/core/hle/service/nvnflinger/hardware_composer.cpp index a262a3dcd5..5c0515d473 100644 --- a/src/core/hle/service/nvnflinger/hardware_composer.cpp +++ b/src/core/hle/service/nvnflinger/hardware_composer.cpp @@ -53,6 +53,19 @@ u32 HardwareComposer::ComposeLocked(f32* out_speed_scale, Display& display, // Set default speed limit to 100%. *out_speed_scale = 1.0f; + // If no layers are available, skip the logic. + bool any_visible = false; + for (auto& layer : display.stack.layers) { + if (layer->visible) { + any_visible = true; + break; + } + } + if (!any_visible) { + *out_speed_scale = 1.0f; + return 1; + } + // Determine the number of vsync periods to wait before composing again. std::optional swap_interval{}; bool has_acquired_buffer{}; @@ -110,7 +123,7 @@ u32 HardwareComposer::ComposeLocked(f32* out_speed_scale, Display& display, } // If any new buffers were acquired, we can present. - if (has_acquired_buffer) { + if (has_acquired_buffer && !composition_stack.empty()) { // Sort by Z-index. std::stable_sort(composition_stack.begin(), composition_stack.end(), [&](auto& l, auto& r) { return l.z_index < r.z_index; }); @@ -119,6 +132,19 @@ u32 HardwareComposer::ComposeLocked(f32* out_speed_scale, Display& display, nvdisp.Composite(composition_stack); } + // Batch framebuffer releases, instead of one-into-one. 
+ std::vector> to_release; + for (auto& [layer_id, framebuffer] : m_framebuffers) { + if (framebuffer.release_frame_number > m_frame_number || !framebuffer.is_acquired) + continue; + if (auto layer = display.stack.FindLayer(layer_id); layer) + to_release.emplace_back(layer.get(), &framebuffer); + } + for (auto& [layer, framebuffer] : to_release) { + layer->buffer_item_consumer->ReleaseBuffer(framebuffer->item, android::Fence::NoFence()); + framebuffer->is_acquired = false; + } + // Advance by at least one frame. const u32 frame_advance = swap_interval.value_or(1); m_frame_number += frame_advance; From d623e0460699adc2f8a70e8a52b549b20477ef30 Mon Sep 17 00:00:00 2001 From: Shinmegumi Date: Sat, 20 Sep 2025 18:19:44 +0200 Subject: [PATCH 10/17] Fix src/core/cmakelists.txt (#2537) Removed entry that was added back trying to fix a conflict in a PR. Signed-off-by: Shinmegumi Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/2537 Reviewed-by: Lizzie Reviewed-by: MaranBr Co-authored-by: Shinmegumi Co-committed-by: Shinmegumi --- src/core/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt index 3c28ebd911..6b64ab7820 100644 --- a/src/core/CMakeLists.txt +++ b/src/core/CMakeLists.txt @@ -107,7 +107,6 @@ add_library(core STATIC file_sys/fssystem/fssystem_nca_header.cpp file_sys/fssystem/fssystem_nca_header.h file_sys/fssystem/fssystem_nca_reader.cpp - file_sys/fssystem/fssystem_passthrough_storage.h file_sys/fssystem/fssystem_sparse_storage.cpp file_sys/fssystem/fssystem_sparse_storage.h file_sys/fssystem/fssystem_switch_storage.h From 8c9cdf0d70eebd92933f7120c36069fec996f85d Mon Sep 17 00:00:00 2001 From: lizzie Date: Sat, 20 Sep 2025 21:49:25 +0200 Subject: [PATCH 11/17] [dynarmic] update docs for fastmem (#2539) Signed-off-by: lizzie Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/2539 Reviewed-by: crueter Co-authored-by: lizzie Co-committed-by: lizzie --- src/dynarmic/docs/Design.md | 49 
++++-- src/dynarmic/docs/FastMemory.md | 19 ++ src/dynarmic/docs/Fastmem.svg | 4 + src/dynarmic/docs/HostToGuest.svg | 4 + src/dynarmic/docs/RegisterAllocator.md | 43 +++-- .../docs/ReturnStackBufferOptimization.md | 162 ++++++++++-------- 6 files changed, 181 insertions(+), 100 deletions(-) create mode 100644 src/dynarmic/docs/FastMemory.md create mode 100644 src/dynarmic/docs/Fastmem.svg create mode 100644 src/dynarmic/docs/HostToGuest.svg diff --git a/src/dynarmic/docs/Design.md b/src/dynarmic/docs/Design.md index 3c0deb5972..ffa8ccecdb 100644 --- a/src/dynarmic/docs/Design.md +++ b/src/dynarmic/docs/Design.md @@ -273,52 +273,73 @@ Exclusive OR (i.e.: XOR) ### Callback: {Read,Write}Memory{8,16,32,64} - ReadMemory8( vaddr) - ReadMemory16( vaddr) - ReadMemory32( vaddr) - ReadMemory64( vaddr) - WriteMemory8( vaddr, value_to_store) - WriteMemory16( vaddr, value_to_store) - WriteMemory32( vaddr, value_to_store) - WriteMemory64( vaddr, value_to_store) +```c++ + ReadMemory8( vaddr) + ReadMemory16( vaddr) + ReadMemory32( vaddr) + ReadMemory64( vaddr) + WriteMemory8( vaddr, value_to_store) + WriteMemory16( vaddr, value_to_store) + WriteMemory32( vaddr, value_to_store) + WriteMemory64( vaddr, value_to_store) +``` Memory access. ### Terminal: Interpret - SetTerm(IR::Term::Interpret{next}) +```c++ +SetTerm(IR::Term::Interpret{next}) +``` This terminal instruction calls the interpreter, starting at `next`. The interpreter must interpret exactly one instruction. ### Terminal: ReturnToDispatch - SetTerm(IR::Term::ReturnToDispatch{}) +```c++ +SetTerm(IR::Term::ReturnToDispatch{}) +``` This terminal instruction returns control to the dispatcher. The dispatcher will use the value in R15 to determine what comes next. ### Terminal: LinkBlock - SetTerm(IR::Term::LinkBlock{next}) +```c++ +SetTerm(IR::Term::LinkBlock{next}) +``` This terminal instruction jumps to the basic block described by `next` if we have enough cycles remaining. 
If we do not have enough cycles remaining, we return to the dispatcher, which will return control to the host. +### Terminal: LinkBlockFast + +```c++ +SetTerm(IR::Term::LinkBlockFast{next}) +``` + +This terminal instruction jumps to the basic block described by `next` unconditionally. +This relies on guarantees that must hold at runtime - i.e. that the program won't hang. + ### Terminal: PopRSBHint - SetTerm(IR::Term::PopRSBHint{}) +```c++ +SetTerm(IR::Term::PopRSBHint{}) +``` This terminal instruction checks the top of the Return Stack Buffer against R15. If RSB lookup fails, control is returned to the dispatcher. This is an optimization for faster function calls. A backend that doesn't support this optimization or doesn't have a RSB may choose to implement this exactly as -ReturnToDispatch. +`ReturnToDispatch`. ### Terminal: If - SetTerm(IR::Term::If{cond, term_then, term_else}) +```c++ +SetTerm(IR::Term::If{cond, term_then, term_else}) +``` This terminal instruction conditionally executes one terminal or another depending on the run-time state of the ARM flags. diff --git a/src/dynarmic/docs/FastMemory.md b/src/dynarmic/docs/FastMemory.md new file mode 100644 index 0000000000..c4f57996ba --- /dev/null +++ b/src/dynarmic/docs/FastMemory.md @@ -0,0 +1,19 @@ +# Fast memory (Fastmem) + +The main way of accessing memory in JITed programs is via an invoked function, say "Read()" and "Write()". On our translator, such functions usually take a sizable amount of code space (push + call + pop), trash the i-cache (due to an indirect call) and overall make code emission more bloated. + +The solution? Delegate invalid accesses to a dedicated arena, similar to a swap. The main idea behind such a mechanism is to allow the OS to transmit page faults from invalid accesses into the JIT translator directly, bypassing address space calls. While this sacrifices i-cache coherency, it allows for smaller code size and "faster" throughput.
+ +Many kernels, however, do not support fast signal dispatching (Solaris, OpenBSD, FreeBSD). Only Linux and Windows support relatively "fast" signal dispatching. Hence this feature is better suited for them only. + +![Host to guest translation](./HostToGuest.svg) + +![Fastmem translation](./Fastmem.svg) + +In x86_64 for example, when a page fault occurs, the CPU will transmit via control registers and the stack (see `IRETQ`) the appropriate arguments for a page fault handler; the OS will then transform that into something that can be sent into userspace. + +Most modern OSes implement kernel-page-table-isolation, which means a set of system calls will invoke a context switch (not often used syscalls), whereas others are handled by the same process address space (the smaller kernel portion, often used syscalls) without needing a context switch. This effect can be negated on systems with PCID (up to 4096 unique IDs). + +Signal dispatching takes a performance hit from reloading `%cr3` - but Linux does something more clever to avoid reloads: VDSO will take care of the entire thing in the same address space, making dispatching as costly as an indirect call - without the hazards of increased code size. + +The main downside from this is the constant i-cache trashing and pipeline hazards introduced by the VDSO signal handlers. However, on most benchmarks fastmem does perform faster than without it (Linux only). This also abuses the fact of continuous address space emulation by using an arena - which can then be potentially transparently mapped into a hugepage, reducing TLB walk times. diff --git a/src/dynarmic/docs/Fastmem.svg b/src/dynarmic/docs/Fastmem.svg new file mode 100644 index 0000000000..a3ed0bb68b --- /dev/null +++ b/src/dynarmic/docs/Fastmem.svg @@ -0,0 +1,4 @@ + + + +
Emulator
Address Space
Guest Address Space
SIGSEGV Trap
Fastmem
Only needs to linearly offset from fastmem arena
Less codegen (SIGSEGV traps)
Is fast only if SIGSEGV handlers are sufficiently fast
\ No newline at end of file diff --git a/src/dynarmic/docs/HostToGuest.svg b/src/dynarmic/docs/HostToGuest.svg new file mode 100644 index 0000000000..6a15a44b46 --- /dev/null +++ b/src/dynarmic/docs/HostToGuest.svg @@ -0,0 +1,4 @@ + + + +
Emulator
Address Space
Guest Address Space
Resolver
Host to Guest translation
Looks up correct PTE
Translates each address 
Is slow
\ No newline at end of file diff --git a/src/dynarmic/docs/RegisterAllocator.md b/src/dynarmic/docs/RegisterAllocator.md index fea6f19e6a..f5bbaaf168 100644 --- a/src/dynarmic/docs/RegisterAllocator.md +++ b/src/dynarmic/docs/RegisterAllocator.md @@ -16,19 +16,34 @@ Note that `Use`ing a value decrements its `use_count` by one. When the `use_coun The member functions on `RegAlloc` are just a combination of the above concepts. +The following registers are reserved for internal use and should NOT participate in register allocation: +- `%xmm0`, `%xmm1`, `%xmm2`: Used as scratch in exclusive memory access. +- `%rsp`: Stack pointer. +- `%r15`: JIT state pointer. +- `%r14`: Page table pointer. +- `%r13`: Fastmem pointer. + +The layout reserves `%r15` as the JIT state pointer - while it may be tempting to turn it into a synthetic pointer, keeping an entire register (out of 12 available) is preferable over inlining a directly computed immediate. + +NEVER modify `%r15`: it must be clear that this register is "immutable" for the entire duration of the JIT block. + ### `Scratch` - Xbyak::Reg64 ScratchGpr(HostLocList desired_locations = any_gpr) - Xbyak::Xmm ScratchXmm(HostLocList desired_locations = any_xmm) +```c++ +Xbyak::Reg64 ScratchGpr(HostLocList desired_locations = any_gpr); +Xbyak::Xmm ScratchXmm(HostLocList desired_locations = any_xmm); +``` At runtime, allocate one of the registers in `desired_locations`. You are free to modify the register. The register is discarded at the end of the allocation scope. ### Pure `Use` - Xbyak::Reg64 UseGpr(Argument& arg); - Xbyak::Xmm UseXmm(Argument& arg); - OpArg UseOpArg(Argument& arg); - void Use(Argument& arg, HostLoc host_loc); +```c++ +Xbyak::Reg64 UseGpr(Argument& arg); +Xbyak::Xmm UseXmm(Argument& arg); +OpArg UseOpArg(Argument& arg); +void Use(Argument& arg, HostLoc host_loc); +``` At runtime, the value corresponding to `arg` will be placed a register.
The actual register is determined by which one of the above functions is called. `UseGpr` places it in an unused GPR, `UseXmm` places it @@ -39,9 +54,11 @@ This register **must not** have it's value changed. ### `UseScratch` - Xbyak::Reg64 UseScratchGpr(Argument& arg); - Xbyak::Xmm UseScratchXmm(Argument& arg); - void UseScratch(Argument& arg, HostLoc host_loc); +```c++ +Xbyak::Reg64 UseScratchGpr(Argument& arg); +Xbyak::Xmm UseScratchXmm(Argument& arg); +void UseScratch(Argument& arg, HostLoc host_loc); +``` At runtime, the value corresponding to `arg` will be placed a register. The actual register is determined by which one of the above functions is called. `UseScratchGpr` places it in an unused GPR, `UseScratchXmm` places it @@ -55,7 +72,9 @@ You are free to modify the value in the register. The register is discarded at t A `Define` is the defintion of a value. This is the only time when a value may be set. - void DefineValue(IR::Inst* inst, const Xbyak::Reg& reg); +```c++ +void DefineValue(IR::Inst* inst, const Xbyak::Reg& reg); +``` By calling `DefineValue`, you are stating that you wish to define the value for `inst`, and you have written the value to the specified register `reg`. @@ -64,7 +83,9 @@ value to the specified register `reg`. Adding a `Define` to an existing value. - void DefineValue(IR::Inst* inst, Argument& arg); +```c++ +void DefineValue(IR::Inst* inst, Argument& arg); +``` You are declaring that the value for `inst` is the same as the value for `arg`. No host machine instructions are emitted. diff --git a/src/dynarmic/docs/ReturnStackBufferOptimization.md b/src/dynarmic/docs/ReturnStackBufferOptimization.md index 6ffe41bcc6..0e72c3bce8 100644 --- a/src/dynarmic/docs/ReturnStackBufferOptimization.md +++ b/src/dynarmic/docs/ReturnStackBufferOptimization.md @@ -23,15 +23,17 @@ One complication dynarmic has is that a compiled block is not uniquely identifia the PC alone, but bits in the FPSCR and CPSR are also relevant. 
We resolve this by computing a 64-bit `UniqueHash` that is guaranteed to uniquely identify a block. - u64 LocationDescriptor::UniqueHash() const { - // This value MUST BE UNIQUE. - // This calculation has to match up with EmitX64::EmitTerminalPopRSBHint - u64 pc_u64 = u64(arm_pc) << 32; - u64 fpscr_u64 = u64(fpscr.Value()); - u64 t_u64 = cpsr.T() ? 1 : 0; - u64 e_u64 = cpsr.E() ? 2 : 0; - return pc_u64 | fpscr_u64 | t_u64 | e_u64; - } +```c++ +u64 LocationDescriptor::UniqueHash() const { + // This value MUST BE UNIQUE. + // This calculation has to match up with EmitX64::EmitTerminalPopRSBHint + u64 pc_u64 = u64(arm_pc) << 32; + u64 fpscr_u64 = u64(fpscr.Value()); + u64 t_u64 = cpsr.T() ? 1 : 0; + u64 e_u64 = cpsr.E() ? 2 : 0; + return pc_u64 | fpscr_u64 | t_u64 | e_u64; +} +``` ## Our implementation isn't actually a stack @@ -49,97 +51,107 @@ host addresses for the corresponding the compiled blocks. size of the real RSB in hardware (which has 3 entries). Larger RSBs than 8 showed degraded performance. - struct JitState { - // ... +```c++ +struct JitState { + // ... - static constexpr size_t RSBSize = 8; // MUST be a power of 2. - u32 rsb_ptr = 0; - std::array rsb_location_descriptors; - std::array rsb_codeptrs; - void ResetRSB(); + static constexpr size_t RSBSize = 8; // MUST be a power of 2. + u32 rsb_ptr = 0; + std::array rsb_location_descriptors; + std::array rsb_codeptrs; + void ResetRSB(); - // ... - }; + // ... +}; +``` ### RSB Push We insert our prediction at the insertion point iff the RSB doesn't already contain a prediction with the same `UniqueHash`. 
- void EmitX64::EmitPushRSB(IR::Block&, IR::Inst* inst) { - using namespace Xbyak::util; +```c++ +void EmitX64::EmitPushRSB(IR::Block&, IR::Inst* inst) { + using namespace Xbyak::util; - ASSERT(inst->GetArg(0).IsImmediate()); - u64 imm64 = inst->GetArg(0).GetU64(); + ASSERT(inst->GetArg(0).IsImmediate()); + u64 imm64 = inst->GetArg(0).GetU64(); - Xbyak::Reg64 code_ptr_reg = reg_alloc.ScratchGpr({HostLoc::RCX}); - Xbyak::Reg64 loc_desc_reg = reg_alloc.ScratchGpr(); - Xbyak::Reg32 index_reg = reg_alloc.ScratchGpr().cvt32(); - u64 code_ptr = unique_hash_to_code_ptr.find(imm64) != unique_hash_to_code_ptr.end() - ? u64(unique_hash_to_code_ptr[imm64]) - : u64(code->GetReturnFromRunCodeAddress()); + Xbyak::Reg64 code_ptr_reg = reg_alloc.ScratchGpr({HostLoc::RCX}); + Xbyak::Reg64 loc_desc_reg = reg_alloc.ScratchGpr(); + Xbyak::Reg32 index_reg = reg_alloc.ScratchGpr().cvt32(); + u64 code_ptr = unique_hash_to_code_ptr.find(imm64) != unique_hash_to_code_ptr.end() + ? u64(unique_hash_to_code_ptr[imm64]) + : u64(code->GetReturnFromRunCodeAddress()); - code->mov(index_reg, dword[code.ABI_JIT_PTR + offsetof(JitState, rsb_ptr)]); - code->add(index_reg, 1); - code->and_(index_reg, u32(JitState::RSBSize - 1)); + code->mov(index_reg, dword[code.ABI_JIT_PTR + offsetof(JitState, rsb_ptr)]); + code->add(index_reg, 1); + code->and_(index_reg, u32(JitState::RSBSize - 1)); - code->mov(loc_desc_reg, u64(imm64)); - CodePtr patch_location = code->getCurr(); - patch_unique_hash_locations[imm64].emplace_back(patch_location); - code->mov(code_ptr_reg, u64(code_ptr)); // This line has to match up with EmitX64::Patch. - code->EnsurePatchLocationSize(patch_location, 10); + code->mov(loc_desc_reg, u64(imm64)); + CodePtr patch_location = code->getCurr(); + patch_unique_hash_locations[imm64].emplace_back(patch_location); + code->mov(code_ptr_reg, u64(code_ptr)); // This line has to match up with EmitX64::Patch. 
+ code->EnsurePatchLocationSize(patch_location, 10); - Xbyak::Label label; - for (size_t i = 0; i < JitState::RSBSize; ++i) { - code->cmp(loc_desc_reg, qword[code.ABI_JIT_PTR + offsetof(JitState, rsb_location_descriptors) + i * sizeof(u64)]); - code->je(label, code->T_SHORT); - } - - code->mov(dword[code.ABI_JIT_PTR + offsetof(JitState, rsb_ptr)], index_reg); - code->mov(qword[code.ABI_JIT_PTR + index_reg.cvt64() * 8 + offsetof(JitState, rsb_location_descriptors)], loc_desc_reg); - code->mov(qword[code.ABI_JIT_PTR + index_reg.cvt64() * 8 + offsetof(JitState, rsb_codeptrs)], code_ptr_reg); - code->L(label); + Xbyak::Label label; + for (size_t i = 0; i < JitState::RSBSize; ++i) { + code->cmp(loc_desc_reg, qword[code.ABI_JIT_PTR + offsetof(JitState, rsb_location_descriptors) + i * sizeof(u64)]); + code->je(label, code->T_SHORT); } + code->mov(dword[code.ABI_JIT_PTR + offsetof(JitState, rsb_ptr)], index_reg); + code->mov(qword[code.ABI_JIT_PTR + index_reg.cvt64() * 8 + offsetof(JitState, rsb_location_descriptors)], loc_desc_reg); + code->mov(qword[code.ABI_JIT_PTR + index_reg.cvt64() * 8 + offsetof(JitState, rsb_codeptrs)], code_ptr_reg); + code->L(label); +} +``` + In pseudocode: - for (i := 0 .. RSBSize-1) - if (rsb_location_descriptors[i] == imm64) - goto label; - rsb_ptr++; - rsb_ptr %= RSBSize; - rsb_location_desciptors[rsb_ptr] = imm64; //< The UniqueHash - rsb_codeptr[rsb_ptr] = /* codeptr corresponding to the UniqueHash */; - label: +```c++ + for (i := 0 .. RSBSize-1) + if (rsb_location_descriptors[i] == imm64) + goto label; + rsb_ptr++; + rsb_ptr %= RSBSize; + rsb_location_desciptors[rsb_ptr] = imm64; //< The UniqueHash + rsb_codeptr[rsb_ptr] = /* codeptr corresponding to the UniqueHash */; +label: +``` ## RSB Pop To check if a predicition is in the RSB, we linearly scan the RSB. 
- void EmitX64::EmitTerminalPopRSBHint(IR::Term::PopRSBHint, IR::LocationDescriptor initial_location) { - using namespace Xbyak::util; +```c++ +void EmitX64::EmitTerminalPopRSBHint(IR::Term::PopRSBHint, IR::LocationDescriptor initial_location) { + using namespace Xbyak::util; - // This calculation has to match up with IREmitter::PushRSB - code->mov(ecx, MJitStateReg(Arm::Reg::PC)); - code->shl(rcx, 32); - code->mov(ebx, dword[code.ABI_JIT_PTR + offsetof(JitState, FPSCR_mode)]); - code->or_(ebx, dword[code.ABI_JIT_PTR + offsetof(JitState, CPSR_et)]); - code->or_(rbx, rcx); + // This calculation has to match up with IREmitter::PushRSB + code->mov(ecx, MJitStateReg(Arm::Reg::PC)); + code->shl(rcx, 32); + code->mov(ebx, dword[code.ABI_JIT_PTR + offsetof(JitState, FPSCR_mode)]); + code->or_(ebx, dword[code.ABI_JIT_PTR + offsetof(JitState, CPSR_et)]); + code->or_(rbx, rcx); - code->mov(rax, u64(code->GetReturnFromRunCodeAddress())); - for (size_t i = 0; i < JitState::RSBSize; ++i) { - code->cmp(rbx, qword[code.ABI_JIT_PTR + offsetof(JitState, rsb_location_descriptors) + i * sizeof(u64)]); - code->cmove(rax, qword[code.ABI_JIT_PTR + offsetof(JitState, rsb_codeptrs) + i * sizeof(u64)]); - } - - code->jmp(rax); + code->mov(rax, u64(code->GetReturnFromRunCodeAddress())); + for (size_t i = 0; i < JitState::RSBSize; ++i) { + code->cmp(rbx, qword[code.ABI_JIT_PTR + offsetof(JitState, rsb_location_descriptors) + i * sizeof(u64)]); + code->cmove(rax, qword[code.ABI_JIT_PTR + offsetof(JitState, rsb_codeptrs) + i * sizeof(u64)]); } + code->jmp(rax); +} +``` + In pseudocode: - rbx := ComputeUniqueHash() - rax := ReturnToDispatch - for (i := 0 .. RSBSize-1) - if (rbx == rsb_location_descriptors[i]) - rax = rsb_codeptrs[i] - goto rax \ No newline at end of file +```c++ +rbx := ComputeUniqueHash() +rax := ReturnToDispatch +for (i := 0 .. 
RSBSize-1) + if (rbx == rsb_location_descriptors[i]) + rax = rsb_codeptrs[i] +goto rax +``` From ad472ad28878b49ba84fea91cb4b8d5ddfcf0115 Mon Sep 17 00:00:00 2001 From: lizzie Date: Sat, 20 Sep 2025 03:17:42 +0000 Subject: [PATCH 12/17] [vk, opengl] add lanczo filtering Signed-off-by: lizzie --- src/common/settings_enums.h | 2 +- src/qt_common/shared_translation.cpp | 1 + src/qt_common/shared_translation.h | 2 + src/video_core/host_shaders/CMakeLists.txt | 5 ++- .../host_shaders/present_lanczo.frag | 43 +++++++++++++++++++ .../renderer_opengl/gl_blit_screen.cpp | 3 ++ .../renderer_opengl/present/filters.cpp | 5 +++ .../renderer_opengl/present/filters.h | 1 + .../renderer_vulkan/present/filters.cpp | 6 +++ .../renderer_vulkan/present/filters.h | 1 + .../renderer_vulkan/vk_blit_screen.cpp | 3 ++ 11 files changed, 69 insertions(+), 3 deletions(-) create mode 100644 src/video_core/host_shaders/present_lanczo.frag diff --git a/src/common/settings_enums.h b/src/common/settings_enums.h index 41133a7819..8d93c61ec1 100644 --- a/src/common/settings_enums.h +++ b/src/common/settings_enums.h @@ -166,7 +166,7 @@ ENUM(ResolutionSetup, Res7X, Res8X); -ENUM(ScalingFilter, NearestNeighbor, Bilinear, Bicubic, Gaussian, ScaleForce, Fsr, Area, MaxEnum); +ENUM(ScalingFilter, NearestNeighbor, Bilinear, Bicubic, Gaussian, Lanczo, ScaleForce, Fsr, Area, MaxEnum); ENUM(AntiAliasing, None, Fxaa, Smaa, MaxEnum); diff --git a/src/qt_common/shared_translation.cpp b/src/qt_common/shared_translation.cpp index eb413f28e9..c549b6b67f 100644 --- a/src/qt_common/shared_translation.cpp +++ b/src/qt_common/shared_translation.cpp @@ -573,6 +573,7 @@ std::unique_ptr ComboboxEnumeration(QObject* parent) PAIR(ScalingFilter, Bilinear, tr("Bilinear")), PAIR(ScalingFilter, Bicubic, tr("Bicubic")), PAIR(ScalingFilter, Gaussian, tr("Gaussian")), + PAIR(ScalingFilter, Lanczo, tr("Lanczo")), PAIR(ScalingFilter, ScaleForce, tr("ScaleForce")), PAIR(ScalingFilter, Fsr, tr("AMD FidelityFX™️ Super Resolution")), 
PAIR(ScalingFilter, Area, tr("Area")), diff --git a/src/qt_common/shared_translation.h b/src/qt_common/shared_translation.h index 48a2cb5205..a894da290a 100644 --- a/src/qt_common/shared_translation.h +++ b/src/qt_common/shared_translation.h @@ -40,6 +40,8 @@ static const std::map scaling_filter_texts_map {Settings::ScalingFilter::Bicubic, QStringLiteral(QT_TRANSLATE_NOOP("GMainWindow", "Bicubic"))}, {Settings::ScalingFilter::Gaussian, QStringLiteral(QT_TRANSLATE_NOOP("GMainWindow", "Gaussian"))}, + {Settings::ScalingFilter::Lanczo, + QStringLiteral(QT_TRANSLATE_NOOP("GMainWindow", "Lanczo"))}, {Settings::ScalingFilter::ScaleForce, QStringLiteral(QT_TRANSLATE_NOOP("GMainWindow", "ScaleForce"))}, {Settings::ScalingFilter::Fsr, QStringLiteral(QT_TRANSLATE_NOOP("GMainWindow", "FSR"))}, diff --git a/src/video_core/host_shaders/CMakeLists.txt b/src/video_core/host_shaders/CMakeLists.txt index 688e10d2e4..e7dac21f98 100644 --- a/src/video_core/host_shaders/CMakeLists.txt +++ b/src/video_core/host_shaders/CMakeLists.txt @@ -1,5 +1,5 @@ -# SPDX-FileCopyrightText: 2018 yuzu Emulator Project -# SPDX-License-Identifier: GPL-2.0-or-later +# SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project +# SPDX-License-Identifier: GPL-3.0-or-later set(FIDELITYFX_INCLUDE_DIR ${CMAKE_SOURCE_DIR}/externals/FidelityFX-FSR/ffx-fsr) @@ -45,6 +45,7 @@ set(SHADER_FILES present_area.frag present_bicubic.frag present_gaussian.frag + present_lanczo.frag queries_prefix_scan_sum.comp queries_prefix_scan_sum_nosubgroups.comp resolve_conditional_render.comp diff --git a/src/video_core/host_shaders/present_lanczo.frag b/src/video_core/host_shaders/present_lanczo.frag new file mode 100644 index 0000000000..5afc985bc3 --- /dev/null +++ b/src/video_core/host_shaders/present_lanczo.frag @@ -0,0 +1,43 @@ +// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + +// https://en.wikipedia.org/wiki/Lanczos_resampling + +#version 460 core + +layout 
(location = 0) in vec2 frag_tex_coord; +layout (location = 0) out vec4 color; +layout (binding = 0) uniform sampler2D color_texture; + +#define PI 3.1415926535897932384626433 + +float sinc(float x) { + return x == 0.0f ? 1.0f : sin(PI * x) / (PI * x); +} + +float lanczos(vec2 v, float a) { + float d = sqrt(v.x * v.x + v.y * v.y); + return sinc(d) / sinc(d / a); +} + +vec4 textureLanczos(sampler2D textureSampler, vec2 p) { + const int r = 1; //radius (1 = 3 steps) + vec3 c_sum = vec3(0.0f); + float w_sum = 0.0f; + vec2 res = vec2(textureSize(textureSampler, 0)); + vec2 cc = floor(p * res) / res; + // kernel size = (r * 2 + 1) * (r * 2 + 1) + for (int x = -r; x <= r; x++) + for (int y = -r; y <= r; y++) { + vec2 kp = 0.5f * (vec2(x, y) / res); // 0.5 = half-pixel level resampling + vec2 uv = cc + kp; + float w = lanczos(kp, float(r)); + c_sum += w * texture(textureSampler, p + kp).rgb; + w_sum += w; + } + return vec4(c_sum / w_sum, 1.0f); +} + +void main() { + color = textureLanczos(color_texture, frag_tex_coord); +} diff --git a/src/video_core/renderer_opengl/gl_blit_screen.cpp b/src/video_core/renderer_opengl/gl_blit_screen.cpp index 2071fe8d15..9fff39143e 100644 --- a/src/video_core/renderer_opengl/gl_blit_screen.cpp +++ b/src/video_core/renderer_opengl/gl_blit_screen.cpp @@ -89,6 +89,9 @@ void BlitScreen::CreateWindowAdapt() { case Settings::ScalingFilter::Gaussian: window_adapt = MakeGaussian(device); break; + case Settings::ScalingFilter::Lanczo: + window_adapt = MakeLanczo(device); + break; case Settings::ScalingFilter::ScaleForce: window_adapt = MakeScaleForce(device); break; diff --git a/src/video_core/renderer_opengl/present/filters.cpp b/src/video_core/renderer_opengl/present/filters.cpp index c5ac8e7823..a9b3cdd0d9 100644 --- a/src/video_core/renderer_opengl/present/filters.cpp +++ b/src/video_core/renderer_opengl/present/filters.cpp @@ -37,6 +37,11 @@ std::unique_ptr MakeGaussian(const Device& device) { HostShaders::PRESENT_GAUSSIAN_FRAG); } 
+std::unique_ptr MakeLanczo(const Device& device) { + return std::make_unique(device, CreateBilinearSampler(), + HostShaders::PRESENT_LANCZO_FRAG); +} + std::unique_ptr MakeScaleForce(const Device& device) { return std::make_unique( device, CreateBilinearSampler(), diff --git a/src/video_core/renderer_opengl/present/filters.h b/src/video_core/renderer_opengl/present/filters.h index be2ce24842..c098d0da2e 100644 --- a/src/video_core/renderer_opengl/present/filters.h +++ b/src/video_core/renderer_opengl/present/filters.h @@ -18,6 +18,7 @@ std::unique_ptr MakeNearestNeighbor(const Device& device); std::unique_ptr MakeBilinear(const Device& device); std::unique_ptr MakeBicubic(const Device& device); std::unique_ptr MakeGaussian(const Device& device); +std::unique_ptr MakeLanczo(const Device& device); std::unique_ptr MakeScaleForce(const Device& device); std::unique_ptr MakeArea(const Device& device); diff --git a/src/video_core/renderer_vulkan/present/filters.cpp b/src/video_core/renderer_vulkan/present/filters.cpp index 7843f38d2c..a3a6bfc2f6 100644 --- a/src/video_core/renderer_vulkan/present/filters.cpp +++ b/src/video_core/renderer_vulkan/present/filters.cpp @@ -12,6 +12,7 @@ #include "video_core/host_shaders/present_area_frag_spv.h" #include "video_core/host_shaders/present_bicubic_frag_spv.h" #include "video_core/host_shaders/present_gaussian_frag_spv.h" +#include "video_core/host_shaders/present_lanczso_frag_spv.h" #include "video_core/host_shaders/vulkan_present_frag_spv.h" #include "video_core/host_shaders/vulkan_present_scaleforce_fp16_frag_spv.h" #include "video_core/host_shaders/vulkan_present_scaleforce_fp32_frag_spv.h" @@ -59,6 +60,11 @@ std::unique_ptr MakeGaussian(const Device& device, VkFormat fra BuildShader(device, PRESENT_GAUSSIAN_FRAG_SPV)); } +std::unique_ptr MakeLanczo(const Device& device, VkFormat frame_format) { + return std::make_unique(device, frame_format, CreateBilinearSampler(device), + BuildShader(device, PRESENT_LANCZO_FRAG_SPV)); +} + 
std::unique_ptr MakeScaleForce(const Device& device, VkFormat frame_format) { return std::make_unique(device, frame_format, CreateBilinearSampler(device), SelectScaleForceShader(device)); diff --git a/src/video_core/renderer_vulkan/present/filters.h b/src/video_core/renderer_vulkan/present/filters.h index c8259487f8..c51938db24 100644 --- a/src/video_core/renderer_vulkan/present/filters.h +++ b/src/video_core/renderer_vulkan/present/filters.h @@ -19,6 +19,7 @@ std::unique_ptr MakeNearestNeighbor(const Device& device, VkFor std::unique_ptr MakeBilinear(const Device& device, VkFormat frame_format); std::unique_ptr MakeBicubic(const Device& device, VkFormat frame_format); std::unique_ptr MakeGaussian(const Device& device, VkFormat frame_format); +std::unique_ptr MakeLanczo(const Device& device, VkFormat frame_format); std::unique_ptr MakeScaleForce(const Device& device, VkFormat frame_format); std::unique_ptr MakeArea(const Device& device, VkFormat frame_format); diff --git a/src/video_core/renderer_vulkan/vk_blit_screen.cpp b/src/video_core/renderer_vulkan/vk_blit_screen.cpp index 39f07b966d..b398062dae 100644 --- a/src/video_core/renderer_vulkan/vk_blit_screen.cpp +++ b/src/video_core/renderer_vulkan/vk_blit_screen.cpp @@ -46,6 +46,9 @@ void BlitScreen::SetWindowAdaptPass() { case Settings::ScalingFilter::Gaussian: window_adapt = MakeGaussian(device, swapchain_view_format); break; + case Settings::ScalingFilter::Lanczo: + window_adapt = MakeLanczo(device, swapchain_view_format); + break; case Settings::ScalingFilter::ScaleForce: window_adapt = MakeScaleForce(device, swapchain_view_format); break; From 2a472487555a9865e9ca2c429281680e4fc1bc5a Mon Sep 17 00:00:00 2001 From: lizzie Date: Sat, 20 Sep 2025 03:26:07 +0000 Subject: [PATCH 13/17] fix mispell Signed-off-by: lizzie --- src/common/settings_enums.h | 2 +- src/qt_common/shared_translation.cpp | 2 +- src/qt_common/shared_translation.h | 4 ++-- src/video_core/host_shaders/CMakeLists.txt | 2 +- 
.../{present_lanczo.frag => present_lanczos.frag} | 0 src/video_core/renderer_opengl/gl_blit_screen.cpp | 4 ++-- src/video_core/renderer_opengl/present/filters.cpp | 3 ++- src/video_core/renderer_opengl/present/filters.h | 2 +- src/video_core/renderer_vulkan/present/filters.cpp | 4 ++-- src/video_core/renderer_vulkan/present/filters.h | 2 +- src/video_core/renderer_vulkan/vk_blit_screen.cpp | 4 ++-- 11 files changed, 15 insertions(+), 14 deletions(-) rename src/video_core/host_shaders/{present_lanczo.frag => present_lanczos.frag} (100%) diff --git a/src/common/settings_enums.h b/src/common/settings_enums.h index 8d93c61ec1..c768c23cda 100644 --- a/src/common/settings_enums.h +++ b/src/common/settings_enums.h @@ -166,7 +166,7 @@ ENUM(ResolutionSetup, Res7X, Res8X); -ENUM(ScalingFilter, NearestNeighbor, Bilinear, Bicubic, Gaussian, Lanczo, ScaleForce, Fsr, Area, MaxEnum); +ENUM(ScalingFilter, NearestNeighbor, Bilinear, Bicubic, Gaussian, Lanczos, ScaleForce, Fsr, Area, MaxEnum); ENUM(AntiAliasing, None, Fxaa, Smaa, MaxEnum); diff --git a/src/qt_common/shared_translation.cpp b/src/qt_common/shared_translation.cpp index c549b6b67f..91c16f3102 100644 --- a/src/qt_common/shared_translation.cpp +++ b/src/qt_common/shared_translation.cpp @@ -573,7 +573,7 @@ std::unique_ptr ComboboxEnumeration(QObject* parent) PAIR(ScalingFilter, Bilinear, tr("Bilinear")), PAIR(ScalingFilter, Bicubic, tr("Bicubic")), PAIR(ScalingFilter, Gaussian, tr("Gaussian")), - PAIR(ScalingFilter, Lanczo, tr("Lanczo")), + PAIR(ScalingFilter, Lanczos, tr("Lanczos")), PAIR(ScalingFilter, ScaleForce, tr("ScaleForce")), PAIR(ScalingFilter, Fsr, tr("AMD FidelityFX™️ Super Resolution")), PAIR(ScalingFilter, Area, tr("Area")), diff --git a/src/qt_common/shared_translation.h b/src/qt_common/shared_translation.h index a894da290a..ea8e7fe1bd 100644 --- a/src/qt_common/shared_translation.h +++ b/src/qt_common/shared_translation.h @@ -40,8 +40,8 @@ static const std::map scaling_filter_texts_map 
{Settings::ScalingFilter::Bicubic, QStringLiteral(QT_TRANSLATE_NOOP("GMainWindow", "Bicubic"))}, {Settings::ScalingFilter::Gaussian, QStringLiteral(QT_TRANSLATE_NOOP("GMainWindow", "Gaussian"))}, - {Settings::ScalingFilter::Lanczo, - QStringLiteral(QT_TRANSLATE_NOOP("GMainWindow", "Lanczo"))}, + {Settings::ScalingFilter::Lanczos, + QStringLiteral(QT_TRANSLATE_NOOP("GMainWindow", "Lanczos"))}, {Settings::ScalingFilter::ScaleForce, QStringLiteral(QT_TRANSLATE_NOOP("GMainWindow", "ScaleForce"))}, {Settings::ScalingFilter::Fsr, QStringLiteral(QT_TRANSLATE_NOOP("GMainWindow", "FSR"))}, diff --git a/src/video_core/host_shaders/CMakeLists.txt b/src/video_core/host_shaders/CMakeLists.txt index e7dac21f98..d8ea826498 100644 --- a/src/video_core/host_shaders/CMakeLists.txt +++ b/src/video_core/host_shaders/CMakeLists.txt @@ -45,7 +45,7 @@ set(SHADER_FILES present_area.frag present_bicubic.frag present_gaussian.frag - present_lanczo.frag + present_lanczos.frag queries_prefix_scan_sum.comp queries_prefix_scan_sum_nosubgroups.comp resolve_conditional_render.comp diff --git a/src/video_core/host_shaders/present_lanczo.frag b/src/video_core/host_shaders/present_lanczos.frag similarity index 100% rename from src/video_core/host_shaders/present_lanczo.frag rename to src/video_core/host_shaders/present_lanczos.frag diff --git a/src/video_core/renderer_opengl/gl_blit_screen.cpp b/src/video_core/renderer_opengl/gl_blit_screen.cpp index 9fff39143e..5d2246ada1 100644 --- a/src/video_core/renderer_opengl/gl_blit_screen.cpp +++ b/src/video_core/renderer_opengl/gl_blit_screen.cpp @@ -89,8 +89,8 @@ void BlitScreen::CreateWindowAdapt() { case Settings::ScalingFilter::Gaussian: window_adapt = MakeGaussian(device); break; - case Settings::ScalingFilter::Lanczo: - window_adapt = MakeLanczo(device); + case Settings::ScalingFilter::Lanczos: + window_adapt = MakeLanczos(device); break; case Settings::ScalingFilter::ScaleForce: window_adapt = MakeScaleForce(device); diff --git 
a/src/video_core/renderer_opengl/present/filters.cpp b/src/video_core/renderer_opengl/present/filters.cpp index a9b3cdd0d9..8464123be0 100644 --- a/src/video_core/renderer_opengl/present/filters.cpp +++ b/src/video_core/renderer_opengl/present/filters.cpp @@ -12,6 +12,7 @@ #include "video_core/host_shaders/present_area_frag.h" #include "video_core/host_shaders/present_bicubic_frag.h" #include "video_core/host_shaders/present_gaussian_frag.h" +#include "video_core/host_shaders/present_lanczos_frag.h" #include "video_core/renderer_opengl/present/filters.h" #include "video_core/renderer_opengl/present/util.h" @@ -37,7 +38,7 @@ std::unique_ptr MakeGaussian(const Device& device) { HostShaders::PRESENT_GAUSSIAN_FRAG); } -std::unique_ptr MakeLanczo(const Device& device) { +std::unique_ptr MakeLanczos(const Device& device) { return std::make_unique(device, CreateBilinearSampler(), HostShaders::PRESENT_LANCZO_FRAG); } diff --git a/src/video_core/renderer_opengl/present/filters.h b/src/video_core/renderer_opengl/present/filters.h index c098d0da2e..f71b5f93d3 100644 --- a/src/video_core/renderer_opengl/present/filters.h +++ b/src/video_core/renderer_opengl/present/filters.h @@ -18,7 +18,7 @@ std::unique_ptr MakeNearestNeighbor(const Device& device); std::unique_ptr MakeBilinear(const Device& device); std::unique_ptr MakeBicubic(const Device& device); std::unique_ptr MakeGaussian(const Device& device); -std::unique_ptr MakeLanczo(const Device& device); +std::unique_ptr MakeLanczos(const Device& device); std::unique_ptr MakeScaleForce(const Device& device); std::unique_ptr MakeArea(const Device& device); diff --git a/src/video_core/renderer_vulkan/present/filters.cpp b/src/video_core/renderer_vulkan/present/filters.cpp index a3a6bfc2f6..5ab3ac3114 100644 --- a/src/video_core/renderer_vulkan/present/filters.cpp +++ b/src/video_core/renderer_vulkan/present/filters.cpp @@ -12,7 +12,7 @@ #include "video_core/host_shaders/present_area_frag_spv.h" #include 
"video_core/host_shaders/present_bicubic_frag_spv.h" #include "video_core/host_shaders/present_gaussian_frag_spv.h" -#include "video_core/host_shaders/present_lanczso_frag_spv.h" +#include "video_core/host_shaders/present_lanczos_frag_spv.h" #include "video_core/host_shaders/vulkan_present_frag_spv.h" #include "video_core/host_shaders/vulkan_present_scaleforce_fp16_frag_spv.h" #include "video_core/host_shaders/vulkan_present_scaleforce_fp32_frag_spv.h" @@ -60,7 +60,7 @@ std::unique_ptr MakeGaussian(const Device& device, VkFormat fra BuildShader(device, PRESENT_GAUSSIAN_FRAG_SPV)); } -std::unique_ptr MakeLanczo(const Device& device, VkFormat frame_format) { +std::unique_ptr MakeLanczos(const Device& device, VkFormat frame_format) { return std::make_unique(device, frame_format, CreateBilinearSampler(device), BuildShader(device, PRESENT_LANCZO_FRAG_SPV)); } diff --git a/src/video_core/renderer_vulkan/present/filters.h b/src/video_core/renderer_vulkan/present/filters.h index c51938db24..8b0630e748 100644 --- a/src/video_core/renderer_vulkan/present/filters.h +++ b/src/video_core/renderer_vulkan/present/filters.h @@ -19,7 +19,7 @@ std::unique_ptr MakeNearestNeighbor(const Device& device, VkFor std::unique_ptr MakeBilinear(const Device& device, VkFormat frame_format); std::unique_ptr MakeBicubic(const Device& device, VkFormat frame_format); std::unique_ptr MakeGaussian(const Device& device, VkFormat frame_format); -std::unique_ptr MakeLanczo(const Device& device, VkFormat frame_format); +std::unique_ptr MakeLanczos(const Device& device, VkFormat frame_format); std::unique_ptr MakeScaleForce(const Device& device, VkFormat frame_format); std::unique_ptr MakeArea(const Device& device, VkFormat frame_format); diff --git a/src/video_core/renderer_vulkan/vk_blit_screen.cpp b/src/video_core/renderer_vulkan/vk_blit_screen.cpp index b398062dae..3a003a871e 100644 --- a/src/video_core/renderer_vulkan/vk_blit_screen.cpp +++ b/src/video_core/renderer_vulkan/vk_blit_screen.cpp @@ 
-46,8 +46,8 @@ void BlitScreen::SetWindowAdaptPass() { case Settings::ScalingFilter::Gaussian: window_adapt = MakeGaussian(device, swapchain_view_format); break; - case Settings::ScalingFilter::Lanczo: - window_adapt = MakeLanczo(device, swapchain_view_format); + case Settings::ScalingFilter::Lanczos: + window_adapt = MakeLanczos(device, swapchain_view_format); break; case Settings::ScalingFilter::ScaleForce: window_adapt = MakeScaleForce(device, swapchain_view_format); From 5c00af4a02079d2deb01356841a272b7fedc0c09 Mon Sep 17 00:00:00 2001 From: lizzie Date: Sat, 20 Sep 2025 03:39:37 +0000 Subject: [PATCH 14/17] fix Signed-off-by: lizzie --- src/video_core/renderer_opengl/present/filters.cpp | 2 +- src/video_core/renderer_vulkan/present/filters.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/video_core/renderer_opengl/present/filters.cpp b/src/video_core/renderer_opengl/present/filters.cpp index 8464123be0..3424a52d80 100644 --- a/src/video_core/renderer_opengl/present/filters.cpp +++ b/src/video_core/renderer_opengl/present/filters.cpp @@ -40,7 +40,7 @@ std::unique_ptr MakeGaussian(const Device& device) { std::unique_ptr MakeLanczos(const Device& device) { return std::make_unique(device, CreateBilinearSampler(), - HostShaders::PRESENT_LANCZO_FRAG); + HostShaders::PRESENT_LANCZOS_FRAG); } std::unique_ptr MakeScaleForce(const Device& device) { diff --git a/src/video_core/renderer_vulkan/present/filters.cpp b/src/video_core/renderer_vulkan/present/filters.cpp index 5ab3ac3114..8fed222504 100644 --- a/src/video_core/renderer_vulkan/present/filters.cpp +++ b/src/video_core/renderer_vulkan/present/filters.cpp @@ -62,7 +62,7 @@ std::unique_ptr MakeGaussian(const Device& device, VkFormat fra std::unique_ptr MakeLanczos(const Device& device, VkFormat frame_format) { return std::make_unique(device, frame_format, CreateBilinearSampler(device), - BuildShader(device, PRESENT_LANCZO_FRAG_SPV)); + BuildShader(device, PRESENT_LANCZOS_FRAG_SPV)); } 
std::unique_ptr MakeScaleForce(const Device& device, VkFormat frame_format) { From 21b0964df6f7e987fda92c0db4592fc68a4586a3 Mon Sep 17 00:00:00 2001 From: lizzie Date: Sat, 20 Sep 2025 13:00:46 +0000 Subject: [PATCH 15/17] optimize with precomputed kernel Signed-off-by: lizzie --- .../host_shaders/present_lanczos.frag | 35 +++++--------- tools/lanczos_gen.c | 48 +++++++++++++++++++ 2 files changed, 61 insertions(+), 22 deletions(-) create mode 100644 tools/lanczos_gen.c diff --git a/src/video_core/host_shaders/present_lanczos.frag b/src/video_core/host_shaders/present_lanczos.frag index 5afc985bc3..9501b7ca33 100644 --- a/src/video_core/host_shaders/present_lanczos.frag +++ b/src/video_core/host_shaders/present_lanczos.frag @@ -9,32 +9,23 @@ layout (location = 0) in vec2 frag_tex_coord; layout (location = 0) out vec4 color; layout (binding = 0) uniform sampler2D color_texture; -#define PI 3.1415926535897932384626433 - -float sinc(float x) { - return x == 0.0f ? 1.0f : sin(PI * x) / (PI * x); -} - -float lanczos(vec2 v, float a) { - float d = sqrt(v.x * v.x + v.y * v.y); - return sinc(d) / sinc(d / a); -} - +// precomputed kernel +const float w_kernel[49] = float[] ( + -0.238811f, 0.531959f, 0.961865f, 1.000000f, 0.961865f, 0.531959f, -0.238811f, 0.531959f, 0.957419f, 0.313883f, -0.000000f, 0.313883f, 0.957419f, 0.531959f, 0.961865f, 0.313883f, -0.322602f, 0.000000f, -0.322602f, 0.313883f, 0.961865f, 1.000000f, -0.000000f, 0.000000f, 1.000000f, 0.000000f, -0.000000f, 1.000000f, 0.961865f, 0.313883f, -0.322602f, 0.000000f, -0.322602f, 0.313883f, 0.961865f, 0.531959f, 0.957419f, 0.313883f, -0.000000f, 0.313883f, 0.957419f, 0.531959f, -0.238811f, 0.531959f, 0.961865f, 1.000000f, 0.961865f, 0.531959f, -0.238811f +); +const vec2 w_pos[49] = vec2[] ( + vec2(-0.750000f, -0.750000f), vec2(-0.750000f, -0.500000f), vec2(-0.750000f, -0.250000f), vec2(-0.750000f, 0.000000f), vec2(-0.750000f, 0.250000f), vec2(-0.750000f, 0.500000f), vec2(-0.750000f, 0.750000f), vec2(-0.500000f, 
-0.750000f), vec2(-0.500000f, -0.500000f), vec2(-0.500000f, -0.250000f), vec2(-0.500000f, 0.000000f), vec2(-0.500000f, 0.250000f), vec2(-0.500000f, 0.500000f), vec2(-0.500000f, 0.750000f), vec2(-0.250000f, -0.750000f), vec2(-0.250000f, -0.500000f), vec2(-0.250000f, -0.250000f), vec2(-0.250000f, 0.000000f), vec2(-0.250000f, 0.250000f), vec2(-0.250000f, 0.500000f), vec2(-0.250000f, 0.750000f), vec2(0.000000f, -0.750000f), vec2(0.000000f, -0.500000f), vec2(0.000000f, -0.250000f), vec2(0.000000f, 0.000000f), vec2(0.000000f, 0.250000f), vec2(0.000000f, 0.500000f), vec2(0.000000f, 0.750000f), vec2(0.250000f, -0.750000f), vec2(0.250000f, -0.500000f), vec2(0.250000f, -0.250000f), vec2(0.250000f, 0.000000f), vec2(0.250000f, 0.250000f), vec2(0.250000f, 0.500000f), vec2(0.250000f, 0.750000f), vec2(0.500000f, -0.750000f), vec2(0.500000f, -0.500000f), vec2(0.500000f, -0.250000f), vec2(0.500000f, 0.000000f), vec2(0.500000f, 0.250000f), vec2(0.500000f, 0.500000f), vec2(0.500000f, 0.750000f), vec2(0.750000f, -0.750000f), vec2(0.750000f, -0.500000f), vec2(0.750000f, -0.250000f), vec2(0.750000f, 0.000000f), vec2(0.750000f, 0.250000f), vec2(0.750000f, 0.500000f), vec2(0.750000f, 0.750000f) +); +const float w_sum = 21.045683f; vec4 textureLanczos(sampler2D textureSampler, vec2 p) { - const int r = 1; //radius (1 = 3 steps) vec3 c_sum = vec3(0.0f); - float w_sum = 0.0f; vec2 res = vec2(textureSize(textureSampler, 0)); vec2 cc = floor(p * res) / res; - // kernel size = (r * 2 + 1) * (r * 2 + 1) - for (int x = -r; x <= r; x++) - for (int y = -r; y <= r; y++) { - vec2 kp = 0.5f * (vec2(x, y) / res); // 0.5 = half-pixel level resampling - vec2 uv = cc + kp; - float w = lanczos(kp, float(r)); - c_sum += w * texture(textureSampler, p + kp).rgb; - w_sum += w; - } + for (int i = 0; i < 49; i++) { // kernel size = (r * 2 + 1) ^ 2 + vec2 kp = w_pos[i] / res; + vec2 uv = cc + kp; + c_sum += w_kernel[i] * texture(textureSampler, p + kp).rgb; + } return vec4(c_sum / w_sum, 1.0f); } diff --git 
a/tools/lanczos_gen.c b/tools/lanczos_gen.c
new file mode 100644
index 0000000000..6d7be3cb0e
--- /dev/null
+++ b/tools/lanczos_gen.c
@@ -0,0 +1,59 @@
+// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+// Prints GLSL constant tables (w_kernel, w_pos, w_sum) for a radial Lanczos kernel to stdout.
+// clang tools/lanczos_gen.c -lm -o tools/lanczos_gen && ./tools/lanczos_gen
+#include <math.h>
+#include <stdio.h>
+
+// M_PI is a POSIX extension rather than ISO C, so supply it when <math.h> does not
+#ifndef M_PI
+#define M_PI 3.14159265358979323846
+#endif
+
+// Normalised sinc, sin(pi * x) / (pi * x), with the removable singularity at 0 filled in
+double sinc(double x) {
+    return x == 0.0 ? 1.0 : sin(M_PI * x) / (M_PI * x);
+}
+
+typedef struct vec2 {
+    double x;
+    double y;
+} vec2;
+
+// Lanczos window of support a, evaluated at the distance d = |v|:
+// L(d) = sinc(d) * sinc(d / a) for d < a and 0 outside of the window
+double lanczos(vec2 v, double a) {
+    const double d = sqrt(v.x * v.x + v.y * v.y);
+    return d < a ? sinc(d) * sinc(d / a) : 0.0;
+}
+
+int main(void) {
+    const int r = 3; // tap radius: 2r + 1 = 7 taps per axis
+    const int k_size = (r * 2 + 1) * (r * 2 + 1); // kernel size = (r * 2 + 1) ^ 2
+    const double factor = 1.0 / ((double)r + 1.0); // spacing between neighbouring taps
+    double w_sum = 0.0; // total weight, printed last as the normalisation divisor
+    const char* sep = ""; // empty before the first element so no trailing comma is emitted
+    // NOTE(review): weights are taken at the integer lattice (x, y) while the taps are placed
+    // at (x, y) * factor - confirm which of the two the shader is meant to use
+    printf("const float w_kernel[%i] = float[] (\n ", k_size);
+    for (int x = -r; x <= r; x++)
+        for (int y = -r; y <= r; y++) {
+            const double w = lanczos((vec2){ .x = x, .y = y }, (double)r);
+            printf("%s%lff", sep, w);
+            sep = ", ";
+            w_sum += w;
+        }
+    printf("\n);\n");
+    printf("const vec2 w_pos[%i] = vec2[] (\n ", k_size);
+    sep = "";
+    for (int x = -r; x <= r; x++)
+        for (int y = -r; y <= r; y++) {
+            const vec2 kp = { .x = x * factor, .y = y * factor };
+            printf("%svec2(%lff, %lff)", sep, kp.x, kp.y);
+            sep = ", ";
+        }
+    printf("\n);\n");
+    printf("const float w_sum = %lff;\n", w_sum);
+    return 0;
+}
From de8dc44ab1a359ba5b7f16c9b864d4ad4946ea2a Mon Sep 17 00:00:00 2001 From: lizzie Date: Sat, 20 Sep 2025 14:26:53 +0000 Subject: [PATCH 16/17] actually memory is bad Signed-off-by: lizzie --- .../host_shaders/present_lanczos.frag | 32 +++++++++++-------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/src/video_core/host_shaders/present_lanczos.frag b/src/video_core/host_shaders/present_lanczos.frag index 9501b7ca33..b69b329c1b 100644 --- a/src/video_core/host_shaders/present_lanczos.frag +++ b/src/video_core/host_shaders/present_lanczos.frag @@ -9,23 +9,29 @@
layout (location = 0) in vec2 frag_tex_coord; layout (location = 0) out vec4 color; layout (binding = 0) uniform sampler2D color_texture; -// precomputed kernel -const float w_kernel[49] = float[] ( - -0.238811f, 0.531959f, 0.961865f, 1.000000f, 0.961865f, 0.531959f, -0.238811f, 0.531959f, 0.957419f, 0.313883f, -0.000000f, 0.313883f, 0.957419f, 0.531959f, 0.961865f, 0.313883f, -0.322602f, 0.000000f, -0.322602f, 0.313883f, 0.961865f, 1.000000f, -0.000000f, 0.000000f, 1.000000f, 0.000000f, -0.000000f, 1.000000f, 0.961865f, 0.313883f, -0.322602f, 0.000000f, -0.322602f, 0.313883f, 0.961865f, 0.531959f, 0.957419f, 0.313883f, -0.000000f, 0.313883f, 0.957419f, 0.531959f, -0.238811f, 0.531959f, 0.961865f, 1.000000f, 0.961865f, 0.531959f, -0.238811f -); -const vec2 w_pos[49] = vec2[] ( - vec2(-0.750000f, -0.750000f), vec2(-0.750000f, -0.500000f), vec2(-0.750000f, -0.250000f), vec2(-0.750000f, 0.000000f), vec2(-0.750000f, 0.250000f), vec2(-0.750000f, 0.500000f), vec2(-0.750000f, 0.750000f), vec2(-0.500000f, -0.750000f), vec2(-0.500000f, -0.500000f), vec2(-0.500000f, -0.250000f), vec2(-0.500000f, 0.000000f), vec2(-0.500000f, 0.250000f), vec2(-0.500000f, 0.500000f), vec2(-0.500000f, 0.750000f), vec2(-0.250000f, -0.750000f), vec2(-0.250000f, -0.500000f), vec2(-0.250000f, -0.250000f), vec2(-0.250000f, 0.000000f), vec2(-0.250000f, 0.250000f), vec2(-0.250000f, 0.500000f), vec2(-0.250000f, 0.750000f), vec2(0.000000f, -0.750000f), vec2(0.000000f, -0.500000f), vec2(0.000000f, -0.250000f), vec2(0.000000f, 0.000000f), vec2(0.000000f, 0.250000f), vec2(0.000000f, 0.500000f), vec2(0.000000f, 0.750000f), vec2(0.250000f, -0.750000f), vec2(0.250000f, -0.500000f), vec2(0.250000f, -0.250000f), vec2(0.250000f, 0.000000f), vec2(0.250000f, 0.250000f), vec2(0.250000f, 0.500000f), vec2(0.250000f, 0.750000f), vec2(0.500000f, -0.750000f), vec2(0.500000f, -0.500000f), vec2(0.500000f, -0.250000f), vec2(0.500000f, 0.000000f), vec2(0.500000f, 0.250000f), vec2(0.500000f, 0.500000f), vec2(0.500000f, 
0.750000f), vec2(0.750000f, -0.750000f), vec2(0.750000f, -0.500000f), vec2(0.750000f, -0.250000f), vec2(0.750000f, 0.000000f), vec2(0.750000f, 0.250000f), vec2(0.750000f, 0.500000f), vec2(0.750000f, 0.750000f) -); -const float w_sum = 21.045683f; +#define PI 3.1415926535897932384626433 +float sinc(float x) { // normalised sinc: sin(pi x) / (pi x), with sinc(0) = 1 + return x == 0.0f ? 1.0f : sin(PI * x) / (PI * x); +} +float lanczos(vec2 v, float a) { // radial Lanczos window of support a, v given in texels + float d = length(v); + return d < a ? sinc(d) * sinc(d / a) : 0.0f; // L(d) = sinc(d) * sinc(d / a) for d < a, else 0 +} vec4 textureLanczos(sampler2D textureSampler, vec2 p) { vec3 c_sum = vec3(0.0f); + float w_sum = 0.0f; vec2 res = vec2(textureSize(textureSampler, 0)); vec2 cc = floor(p * res) / res; - for (int i = 0; i < 49; i++) { // kernel size = (r * 2 + 1) ^ 2 - vec2 kp = w_pos[i] / res; - vec2 uv = cc + kp; - c_sum += w_kernel[i] * texture(textureSampler, p + kp).rgb; - } + // kernel size = (2r + 1)^2 taps on a half-texel lattice around p, normalised by w_sum + const int r = 3; // tap radius: r = 3 gives 7 taps per axis + for (int x = -r; x <= r; x++) + for (int y = -r; y <= r; y++) { + vec2 t = 0.5f * vec2(x, y); // tap offset in texels (0.5 = half-pixel level resampling) + vec2 kp = t / res; // the same offset in normalised texture coordinates + float w = lanczos(t, float(r)); // weight from the texel-space distance, not the UV-space one + c_sum += w * texture(textureSampler, p + kp).rgb; + w_sum += w; + } return vec4(c_sum / w_sum, 1.0f); } From 97933fe7e8bea72c06be9136fd10a4864b8c7ae9 Mon Sep 17 00:00:00 2001 From: lizzie Date: Sat, 20 Sep 2025 20:12:34 +0000 Subject: [PATCH 17/17] add spline-1 filter Signed-off-by: lizzie --- src/common/settings_enums.h | 2 +- src/qt_common/shared_translation.cpp | 1 + src/qt_common/shared_translation.h | 2 ++ src/video_core/host_shaders/CMakeLists.txt | 1 + .../host_shaders/present_spline1.frag | 24 +++++++++++++++++++ .../renderer_opengl/gl_blit_screen.cpp | 3 +++ .../renderer_opengl/present/filters.cpp | 5 ++++ .../renderer_opengl/present/filters.h | 1 + .../renderer_vulkan/present/filters.cpp | 5 ++++ .../renderer_vulkan/present/filters.h | 1 + .../renderer_vulkan/vk_blit_screen.cpp | 3 +++ 11 files changed, 47 insertions(+), 1 deletion(-) create mode 100644
src/video_core/host_shaders/present_spline1.frag diff --git a/src/common/settings_enums.h b/src/common/settings_enums.h index c768c23cda..ebfa4ceb9e 100644 --- a/src/common/settings_enums.h +++ b/src/common/settings_enums.h @@ -166,7 +166,7 @@ ENUM(ResolutionSetup, Res7X, Res8X); -ENUM(ScalingFilter, NearestNeighbor, Bilinear, Bicubic, Gaussian, Lanczos, ScaleForce, Fsr, Area, MaxEnum); +ENUM(ScalingFilter, NearestNeighbor, Bilinear, Bicubic, Spline1, Gaussian, Lanczos, ScaleForce, Fsr, Area, MaxEnum); ENUM(AntiAliasing, None, Fxaa, Smaa, MaxEnum); diff --git a/src/qt_common/shared_translation.cpp b/src/qt_common/shared_translation.cpp index 91c16f3102..4254253c2f 100644 --- a/src/qt_common/shared_translation.cpp +++ b/src/qt_common/shared_translation.cpp @@ -572,6 +572,7 @@ std::unique_ptr ComboboxEnumeration(QObject* parent) PAIR(ScalingFilter, NearestNeighbor, tr("Nearest Neighbor")), PAIR(ScalingFilter, Bilinear, tr("Bilinear")), PAIR(ScalingFilter, Bicubic, tr("Bicubic")), + PAIR(ScalingFilter, Spline1, tr("Spline-1")), PAIR(ScalingFilter, Gaussian, tr("Gaussian")), PAIR(ScalingFilter, Lanczos, tr("Lanczos")), PAIR(ScalingFilter, ScaleForce, tr("ScaleForce")), diff --git a/src/qt_common/shared_translation.h b/src/qt_common/shared_translation.h index ea8e7fe1bd..c9216c2daa 100644 --- a/src/qt_common/shared_translation.h +++ b/src/qt_common/shared_translation.h @@ -38,6 +38,8 @@ static const std::map scaling_filter_texts_map {Settings::ScalingFilter::Bilinear, QStringLiteral(QT_TRANSLATE_NOOP("GMainWindow", "Bilinear"))}, {Settings::ScalingFilter::Bicubic, QStringLiteral(QT_TRANSLATE_NOOP("GMainWindow", "Bicubic"))}, + {Settings::ScalingFilter::Spline1, + QStringLiteral(QT_TRANSLATE_NOOP("GMainWindow", "Spline-1"))}, {Settings::ScalingFilter::Gaussian, QStringLiteral(QT_TRANSLATE_NOOP("GMainWindow", "Gaussian"))}, {Settings::ScalingFilter::Lanczos, diff --git a/src/video_core/host_shaders/CMakeLists.txt b/src/video_core/host_shaders/CMakeLists.txt index 
d8ea826498..c14b44a45a 100644 --- a/src/video_core/host_shaders/CMakeLists.txt +++ b/src/video_core/host_shaders/CMakeLists.txt @@ -46,6 +46,7 @@ set(SHADER_FILES present_bicubic.frag present_gaussian.frag present_lanczos.frag + present_spline1.frag queries_prefix_scan_sum.comp queries_prefix_scan_sum_nosubgroups.comp resolve_conditional_render.comp diff --git a/src/video_core/host_shaders/present_spline1.frag b/src/video_core/host_shaders/present_spline1.frag new file mode 100644 index 0000000000..871b47586b --- /dev/null +++ b/src/video_core/host_shaders/present_spline1.frag @@ -0,0 +1,24 @@ +// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + +// Spline (smooth linear interpolation) with 1 texel fetch (needs bilinear to work) +// Emulates bicubic without actually doing bicubic +// See https://iquilezles.org/articles/texture, unfortunately there are issues with the original +// where smoothstep "expansion" actually results in worse codegen (SPIRV-Opt does a direct conv to smoothstep) +// TODO: Numerical analysis - fract is sawtooth func and floor, reuse params?
Perhaps - no need for precision + +#version 460 core + +layout (location = 0) in vec2 frag_tex_coord; +layout (location = 0) out vec4 color; +layout (binding = 0) uniform sampler2D color_texture; + +vec4 textureSpline1(sampler2D sam, vec2 uv) { // smooth interpolation of sam at uv with one bilinear fetch + vec2 r = vec2(textureSize(sam, 0)); // texel resolution per axis; the image is generally not square + vec2 x = fract(uv * r + 0.5); // fractional position between neighbouring texel centres + return texture(sam, (floor(uv * r + 0.5) + smoothstep(0.0, 1.0, x) - 0.5) / r); // fetch at the smoothstep-warped coordinate +} + +void main() { + color = textureSpline1(color_texture, frag_tex_coord); +} diff --git a/src/video_core/renderer_opengl/gl_blit_screen.cpp b/src/video_core/renderer_opengl/gl_blit_screen.cpp index 5d2246ada1..65670fcad8 100644 --- a/src/video_core/renderer_opengl/gl_blit_screen.cpp +++ b/src/video_core/renderer_opengl/gl_blit_screen.cpp @@ -89,6 +89,9 @@ void BlitScreen::CreateWindowAdapt() { case Settings::ScalingFilter::Gaussian: window_adapt = MakeGaussian(device); break; + case Settings::ScalingFilter::Spline1: + window_adapt = MakeSpline1(device); + break; case Settings::ScalingFilter::Lanczos: window_adapt = MakeLanczos(device); break; diff --git a/src/video_core/renderer_opengl/present/filters.cpp b/src/video_core/renderer_opengl/present/filters.cpp index 3424a52d80..a840de304e 100644 --- a/src/video_core/renderer_opengl/present/filters.cpp +++ b/src/video_core/renderer_opengl/present/filters.cpp @@ -28,6 +28,11 @@ std::unique_ptr MakeBilinear(const Device& device) { HostShaders::OPENGL_PRESENT_FRAG); } +std::unique_ptr MakeSpline1(const Device& device) { + return std::make_unique(device, CreateBilinearSampler(), + HostShaders::PRESENT_SPLINE1_FRAG); +} + std::unique_ptr MakeBicubic(const Device& device) { return std::make_unique(device, CreateBilinearSampler(), HostShaders::PRESENT_BICUBIC_FRAG); diff --git a/src/video_core/renderer_opengl/present/filters.h b/src/video_core/renderer_opengl/present/filters.h index f71b5f93d3..7b38ac56bc 100644 --- a/src/video_core/renderer_opengl/present/filters.h +++ b/src/video_core/renderer_opengl/present/filters.h @@ -18,6
+18,7 @@ std::unique_ptr MakeNearestNeighbor(const Device& device); std::unique_ptr MakeBilinear(const Device& device); std::unique_ptr MakeBicubic(const Device& device); std::unique_ptr MakeGaussian(const Device& device); +std::unique_ptr MakeSpline1(const Device& device); std::unique_ptr MakeLanczos(const Device& device); std::unique_ptr MakeScaleForce(const Device& device); std::unique_ptr MakeArea(const Device& device); diff --git a/src/video_core/renderer_vulkan/present/filters.cpp b/src/video_core/renderer_vulkan/present/filters.cpp index 8fed222504..6622b8daea 100644 --- a/src/video_core/renderer_vulkan/present/filters.cpp +++ b/src/video_core/renderer_vulkan/present/filters.cpp @@ -46,6 +46,11 @@ std::unique_ptr MakeBilinear(const Device& device, VkFormat fra BuildShader(device, VULKAN_PRESENT_FRAG_SPV)); } +std::unique_ptr MakeSpline1(const Device& device, VkFormat frame_format) { + return std::make_unique(device, frame_format, CreateBilinearSampler(device), + BuildShader(device, PRESENT_SPLINE1_FRAG_SPV)); +} + std::unique_ptr MakeBicubic(const Device& device, VkFormat frame_format) { // No need for handrolled shader -- if the VK impl can do it for us ;) if (device.IsExtFilterCubicSupported()) diff --git a/src/video_core/renderer_vulkan/present/filters.h b/src/video_core/renderer_vulkan/present/filters.h index 8b0630e748..015bffc8a5 100644 --- a/src/video_core/renderer_vulkan/present/filters.h +++ b/src/video_core/renderer_vulkan/present/filters.h @@ -18,6 +18,7 @@ class MemoryAllocator; std::unique_ptr MakeNearestNeighbor(const Device& device, VkFormat frame_format); std::unique_ptr MakeBilinear(const Device& device, VkFormat frame_format); std::unique_ptr MakeBicubic(const Device& device, VkFormat frame_format); +std::unique_ptr MakeSpline1(const Device& device, VkFormat frame_format); std::unique_ptr MakeGaussian(const Device& device, VkFormat frame_format); std::unique_ptr MakeLanczos(const Device& device, VkFormat frame_format); std::unique_ptr 
MakeScaleForce(const Device& device, VkFormat frame_format); diff --git a/src/video_core/renderer_vulkan/vk_blit_screen.cpp b/src/video_core/renderer_vulkan/vk_blit_screen.cpp index 3a003a871e..b720bcded3 100644 --- a/src/video_core/renderer_vulkan/vk_blit_screen.cpp +++ b/src/video_core/renderer_vulkan/vk_blit_screen.cpp @@ -43,6 +43,9 @@ void BlitScreen::SetWindowAdaptPass() { case Settings::ScalingFilter::Bicubic: window_adapt = MakeBicubic(device, swapchain_view_format); break; + case Settings::ScalingFilter::Spline1: + window_adapt = MakeSpline1(device, swapchain_view_format); + break; case Settings::ScalingFilter::Gaussian: window_adapt = MakeGaussian(device, swapchain_view_format); break;