diff --git a/src/common/settings_enums.h b/src/common/settings_enums.h index c768c23cda..ebfa4ceb9e 100644 --- a/src/common/settings_enums.h +++ b/src/common/settings_enums.h @@ -166,7 +166,7 @@ ENUM(ResolutionSetup, Res7X, Res8X); -ENUM(ScalingFilter, NearestNeighbor, Bilinear, Bicubic, Gaussian, Lanczos, ScaleForce, Fsr, Area, MaxEnum); +ENUM(ScalingFilter, NearestNeighbor, Bilinear, Bicubic, Spline1, Gaussian, Lanczos, ScaleForce, Fsr, Area, MaxEnum); ENUM(AntiAliasing, None, Fxaa, Smaa, MaxEnum); diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt index 4d9566a60f..6b64ab7820 100644 --- a/src/core/CMakeLists.txt +++ b/src/core/CMakeLists.txt @@ -107,8 +107,6 @@ add_library(core STATIC file_sys/fssystem/fssystem_nca_header.cpp file_sys/fssystem/fssystem_nca_header.h file_sys/fssystem/fssystem_nca_reader.cpp - file_sys/fssystem/fssystem_pooled_buffer.cpp - file_sys/fssystem/fssystem_pooled_buffer.h file_sys/fssystem/fssystem_sparse_storage.cpp file_sys/fssystem/fssystem_sparse_storage.h file_sys/fssystem/fssystem_switch_storage.h diff --git a/src/core/file_sys/fssystem/fssystem_aes_ctr_storage.cpp b/src/core/file_sys/fssystem/fssystem_aes_ctr_storage.cpp index c18fde18f4..aaf7788801 100644 --- a/src/core/file_sys/fssystem/fssystem_aes_ctr_storage.cpp +++ b/src/core/file_sys/fssystem/fssystem_aes_ctr_storage.cpp @@ -1,10 +1,12 @@ +// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + // SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later #include "common/alignment.h" #include "common/swap.h" #include "core/file_sys/fssystem/fssystem_aes_ctr_storage.h" -#include "core/file_sys/fssystem/fssystem_pooled_buffer.h" #include "core/file_sys/fssystem/fssystem_utility.h" namespace FileSys { @@ -76,13 +78,6 @@ size_t AesCtrStorage::Write(const u8* buffer, size_t size, size_t offset) { ASSERT(Common::IsAligned(offset, BlockSize)); ASSERT(Common::IsAligned(size, BlockSize)); - // Get a pooled buffer. - PooledBuffer pooled_buffer; - const bool use_work_buffer = true; - if (use_work_buffer) { - pooled_buffer.Allocate(size, BlockSize); - } - // Setup the counter. std::array ctr; std::memcpy(ctr.data(), m_iv.data(), IvSize); @@ -91,25 +86,20 @@ size_t AesCtrStorage::Write(const u8* buffer, size_t size, size_t offset) { // Loop until all data is written. size_t remaining = size; s64 cur_offset = 0; + + // Get a pooled buffer. + std::vector pooled_buffer(BlockSize); while (remaining > 0) { // Determine data we're writing and where. - const size_t write_size = - use_work_buffer ? (std::min)(pooled_buffer.GetSize(), remaining) : remaining; - - void* write_buf; - if (use_work_buffer) { - write_buf = pooled_buffer.GetBuffer(); - } else { - write_buf = const_cast(buffer); - } + const size_t write_size = std::min(pooled_buffer.size(), remaining); + u8* write_buf = reinterpret_cast(pooled_buffer.data()); // Encrypt the data. m_cipher->SetIV(ctr); - m_cipher->Transcode(buffer, write_size, reinterpret_cast(write_buf), - Core::Crypto::Op::Encrypt); + m_cipher->Transcode(buffer, write_size, write_buf, Core::Crypto::Op::Encrypt); // Write the encrypted data. - m_base_storage->Write(reinterpret_cast(write_buf), write_size, offset + cur_offset); + m_base_storage->Write(write_buf, write_size, offset + cur_offset); // Advance. 
cur_offset += write_size; diff --git a/src/core/file_sys/fssystem/fssystem_aes_xts_storage.cpp b/src/core/file_sys/fssystem/fssystem_aes_xts_storage.cpp index 5ef2544dfb..9e7a104c89 100644 --- a/src/core/file_sys/fssystem/fssystem_aes_xts_storage.cpp +++ b/src/core/file_sys/fssystem/fssystem_aes_xts_storage.cpp @@ -1,11 +1,12 @@ +// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + // SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later #include "common/alignment.h" #include "common/swap.h" -#include "core/file_sys/errors.h" #include "core/file_sys/fssystem/fssystem_aes_xts_storage.h" -#include "core/file_sys/fssystem/fssystem_pooled_buffer.h" #include "core/file_sys/fssystem/fssystem_utility.h" namespace FileSys { @@ -69,17 +70,14 @@ size_t AesXtsStorage::Read(u8* buffer, size_t size, size_t offset) const { // Decrypt into a pooled buffer. { - PooledBuffer tmp_buf(m_block_size, m_block_size); - ASSERT(tmp_buf.GetSize() >= m_block_size); - - std::memset(tmp_buf.GetBuffer(), 0, skip_size); - std::memcpy(tmp_buf.GetBuffer() + skip_size, buffer, data_size); + std::vector tmp_buf(m_block_size, 0); + std::memcpy(tmp_buf.data() + skip_size, buffer, data_size); m_cipher->SetIV(ctr); - m_cipher->Transcode(tmp_buf.GetBuffer(), m_block_size, tmp_buf.GetBuffer(), + m_cipher->Transcode(tmp_buf.data(), m_block_size, tmp_buf.data(), Core::Crypto::Op::Decrypt); - std::memcpy(buffer, tmp_buf.GetBuffer() + skip_size, data_size); + std::memcpy(buffer, tmp_buf.data() + skip_size, data_size); } AddCounter(ctr.data(), IvSize, 1); diff --git a/src/core/file_sys/fssystem/fssystem_alignment_matching_storage.h b/src/core/file_sys/fssystem/fssystem_alignment_matching_storage.h index f96691d03d..60a6d24435 100644 --- a/src/core/file_sys/fssystem/fssystem_alignment_matching_storage.h +++ b/src/core/file_sys/fssystem/fssystem_alignment_matching_storage.h @@ -1,13 +1,14 @@ +// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + // SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later #pragma once #include "common/alignment.h" -#include "core/file_sys/errors.h" #include "core/file_sys/fssystem/fs_i_storage.h" #include "core/file_sys/fssystem/fssystem_alignment_matching_storage_impl.h" -#include "core/file_sys/fssystem/fssystem_pooled_buffer.h" namespace FileSys { @@ -89,10 +90,11 @@ private: VirtualFile m_base_storage; s64 m_base_storage_size; size_t m_data_align; + mutable std::vector work_buffer; public: explicit AlignmentMatchingStoragePooledBuffer(VirtualFile bs, size_t da) - : m_base_storage(std::move(bs)), m_data_align(da) { + : m_base_storage(std::move(bs)), m_data_align(da), work_buffer(da) { ASSERT(Common::IsPowerOfTwo(da)); } @@ -104,16 +106,10 @@ public: // Validate arguments. ASSERT(buffer != nullptr); - s64 bs_size = this->GetSize(); ASSERT(R_SUCCEEDED(IStorage::CheckAccessRange(offset, size, bs_size))); - - // Allocate a pooled buffer. - PooledBuffer pooled_buffer; - pooled_buffer.AllocateParticularlyLarge(m_data_align, m_data_align); - - return AlignmentMatchingStorageImpl::Read(m_base_storage, pooled_buffer.GetBuffer(), - pooled_buffer.GetSize(), m_data_align, + return AlignmentMatchingStorageImpl::Read(m_base_storage, work_buffer.data(), + work_buffer.size(), m_data_align, BufferAlign, offset, buffer, size); } @@ -125,16 +121,10 @@ public: // Validate arguments. 
ASSERT(buffer != nullptr); - s64 bs_size = this->GetSize(); ASSERT(R_SUCCEEDED(IStorage::CheckAccessRange(offset, size, bs_size))); - - // Allocate a pooled buffer. - PooledBuffer pooled_buffer; - pooled_buffer.AllocateParticularlyLarge(m_data_align, m_data_align); - - return AlignmentMatchingStorageImpl::Write(m_base_storage, pooled_buffer.GetBuffer(), - pooled_buffer.GetSize(), m_data_align, + return AlignmentMatchingStorageImpl::Write(m_base_storage, work_buffer.data(), + work_buffer.size(), m_data_align, BufferAlign, offset, buffer, size); } diff --git a/src/core/file_sys/fssystem/fssystem_bucket_tree.cpp b/src/core/file_sys/fssystem/fssystem_bucket_tree.cpp index f58b154968..ce3b62f26d 100644 --- a/src/core/file_sys/fssystem/fssystem_bucket_tree.cpp +++ b/src/core/file_sys/fssystem/fssystem_bucket_tree.cpp @@ -7,7 +7,6 @@ #include "core/file_sys/errors.h" #include "core/file_sys/fssystem/fssystem_bucket_tree.h" #include "core/file_sys/fssystem/fssystem_bucket_tree_utils.h" -#include "core/file_sys/fssystem/fssystem_pooled_buffer.h" namespace FileSys { @@ -465,16 +464,8 @@ Result BucketTree::Visitor::Find(s64 virtual_address) { } Result BucketTree::Visitor::FindEntrySet(s32* out_index, s64 virtual_address, s32 node_index) { - const auto node_size = m_tree->m_node_size; - - PooledBuffer pool(node_size, 1); - if (node_size <= pool.GetSize()) { - R_RETURN( - this->FindEntrySetWithBuffer(out_index, virtual_address, node_index, pool.GetBuffer())); - } else { - pool.Deallocate(); - R_RETURN(this->FindEntrySetWithoutBuffer(out_index, virtual_address, node_index)); - } + std::vector pool(m_tree->m_node_size); + R_RETURN(FindEntrySetWithBuffer(out_index, virtual_address, node_index, pool.data())); } Result BucketTree::Visitor::FindEntrySetWithBuffer(s32* out_index, s64 virtual_address, @@ -525,15 +516,8 @@ Result BucketTree::Visitor::FindEntrySetWithoutBuffer(s32* out_index, s64 virtua } Result BucketTree::Visitor::FindEntry(s64 virtual_address, s32 entry_set_index) { - const auto entry_set_size = m_tree->m_node_size; - - PooledBuffer pool(entry_set_size, 1); - if (entry_set_size <= pool.GetSize()) { - R_RETURN(this->FindEntryWithBuffer(virtual_address, entry_set_index, pool.GetBuffer())); - } else { - pool.Deallocate(); - R_RETURN(this->FindEntryWithoutBuffer(virtual_address, entry_set_index)); - } + std::vector pool(m_tree->m_node_size); + R_RETURN(FindEntryWithBuffer(virtual_address, entry_set_index, pool.data())); } Result BucketTree::Visitor::FindEntryWithBuffer(s64 virtual_address, s32 entry_set_index, diff --git a/src/core/file_sys/fssystem/fssystem_bucket_tree_template_impl.h b/src/core/file_sys/fssystem/fssystem_bucket_tree_template_impl.h index 030b2916b0..fac6c37214 100644 --- a/src/core/file_sys/fssystem/fssystem_bucket_tree_template_impl.h +++ b/src/core/file_sys/fssystem/fssystem_bucket_tree_template_impl.h @@ -1,3 +1,6 @@ +// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + // SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later @@ -6,7 +9,6 @@ #include "core/file_sys/errors.h" #include "core/file_sys/fssystem/fssystem_bucket_tree.h" #include "core/file_sys/fssystem/fssystem_bucket_tree_utils.h" -#include "core/file_sys/fssystem/fssystem_pooled_buffer.h" namespace FileSys { @@ -35,23 +37,19 @@ Result BucketTree::ScanContinuousReading(ContinuousReadingInfo* out_info, R_UNLESS(entry.GetVirtualOffset() <= cur_offset, ResultOutOfRange); // Create a pooled buffer for our 
scan. - PooledBuffer pool(m_node_size, 1); - char* buffer = nullptr; - + std::vector pool(m_node_size); s64 entry_storage_size = m_entry_storage->GetSize(); // Read the node. - if (m_node_size <= pool.GetSize()) { - buffer = pool.GetBuffer(); - const auto ofs = param.entry_set.index * static_cast(m_node_size); - R_UNLESS(m_node_size + ofs <= static_cast(entry_storage_size), - ResultInvalidBucketTreeNodeEntryCount); + u8* buffer = reinterpret_cast(pool.data()); + const auto ofs = param.entry_set.index * s64(m_node_size); + R_UNLESS(m_node_size + ofs <= size_t(entry_storage_size), + ResultInvalidBucketTreeNodeEntryCount); - m_entry_storage->Read(reinterpret_cast(buffer), m_node_size, ofs); - } + m_entry_storage->Read(buffer, m_node_size, ofs); // Calculate extents. - const auto end_offset = cur_offset + static_cast(param.size); + const auto end_offset = cur_offset + s64(param.size); s64 phys_offset = entry.GetPhysicalOffset(); // Start merge tracking. @@ -76,14 +74,8 @@ Result BucketTree::ScanContinuousReading(ContinuousReadingInfo* out_info, s64 next_entry_offset; if (entry_index + 1 < entry_count) { - if (buffer != nullptr) { - const auto ofs = impl::GetBucketTreeEntryOffset(0, m_entry_size, entry_index + 1); - std::memcpy(std::addressof(next_entry), buffer + ofs, m_entry_size); - } else { - const auto ofs = impl::GetBucketTreeEntryOffset(param.entry_set.index, m_node_size, - m_entry_size, entry_index + 1); - m_entry_storage->ReadObject(std::addressof(next_entry), ofs); - } + const auto offset = impl::GetBucketTreeEntryOffset(0, m_entry_size, entry_index + 1); + std::memcpy(std::addressof(next_entry), buffer + offset, m_entry_size); next_entry_offset = next_entry.GetVirtualOffset(); R_UNLESS(param.offsets.IsInclude(next_entry_offset), ResultInvalidIndirectEntryOffset); diff --git a/src/core/file_sys/fssystem/fssystem_compressed_storage.h b/src/core/file_sys/fssystem/fssystem_compressed_storage.h index 74c98630ec..223d51647e 100644 --- a/src/core/file_sys/fssystem/fssystem_compressed_storage.h +++ b/src/core/file_sys/fssystem/fssystem_compressed_storage.h @@ -1,3 +1,6 @@ +// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + // SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later @@ -9,8 +12,6 @@ #include "core/file_sys/fssystem/fs_i_storage.h" #include "core/file_sys/fssystem/fssystem_bucket_tree.h" #include "core/file_sys/fssystem/fssystem_compression_common.h" -#include "core/file_sys/fssystem/fssystem_pooled_buffer.h" -#include "core/file_sys/vfs/vfs.h" namespace FileSys { @@ -317,23 +318,11 @@ private: R_SUCCEED_IF(entry_count == 0); // Get the remaining size in a convenient form. - const size_t total_required_size = - static_cast(required_access_physical_size); + const size_t total_required_size = size_t(required_access_physical_size); // Perform the read based on whether we need to allocate a buffer. if (will_allocate_pooled_buffer) { - // Allocate a pooled buffer. - PooledBuffer pooled_buffer; - if (pooled_buffer.GetAllocatableSizeMax() >= total_required_size) { - pooled_buffer.Allocate(total_required_size, m_block_size_max); - } else { - pooled_buffer.AllocateParticularlyLarge( - std::min( - total_required_size, - PooledBuffer::GetAllocatableParticularlyLargeSizeMax()), - m_block_size_max); - } - + std::vector pooled_buffer(std::max(m_block_size_max, total_required_size)); // Read each of the entries. 
for (s32 entry_idx = 0; entry_idx < entry_count; ++entry_idx) { // Determine the current read size. @@ -342,13 +331,13 @@ private: if (const size_t target_entry_size = static_cast(entries[entry_idx].physical_size) + static_cast(entries[entry_idx].gap_from_prev); - target_entry_size <= pooled_buffer.GetSize()) { + target_entry_size <= pooled_buffer.size()) { // We'll be using the pooled buffer. will_use_pooled_buffer = true; // Determine how much we can read. const size_t max_size = std::min( - required_access_physical_size, pooled_buffer.GetSize()); + required_access_physical_size, pooled_buffer.size()); size_t read_size = 0; for (auto n = entry_idx; n < entry_count; ++n) { @@ -376,7 +365,7 @@ private: // Perform the read based on whether or not we'll use the pooled buffer. if (will_use_pooled_buffer) { // Read the compressed data into the pooled buffer. - auto* const buffer = pooled_buffer.GetBuffer(); + auto* const buffer = pooled_buffer.data(); m_data_storage->Read(reinterpret_cast(buffer), cur_read_size, required_access_physical_offset); @@ -863,11 +852,9 @@ private: static_cast(unaligned_range->virtual_size)); // Get a pooled buffer for our read. - PooledBuffer pooled_buffer; - pooled_buffer.Allocate(size_buffer_required, size_buffer_required); - + std::vector pooled_buffer(size_buffer_required); // Perform read. - Result rc = read_impl(pooled_buffer.GetBuffer(), size_buffer_required); + Result rc = read_impl(pooled_buffer.data(), size_buffer_required); if (R_FAILED(rc)) { R_THROW(rc); } @@ -876,8 +863,7 @@ private: const size_t skip_size = cur_offset - unaligned_range->virtual_offset; const size_t copy_size = std::min( cur_size, unaligned_range->GetEndVirtualOffset() - cur_offset); - - std::memcpy(cur_dst, pooled_buffer.GetBuffer() + skip_size, copy_size); + std::memcpy(cur_dst, pooled_buffer.data() + skip_size, copy_size); // Advance. cur_dst += copy_size; diff --git a/src/core/file_sys/fssystem/fssystem_pooled_buffer.cpp b/src/core/file_sys/fssystem/fssystem_pooled_buffer.cpp deleted file mode 100644 index dcd08dac3e..0000000000 --- a/src/core/file_sys/fssystem/fssystem_pooled_buffer.cpp +++ /dev/null @@ -1,61 +0,0 @@ -// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project -// SPDX-License-Identifier: GPL-2.0-or-later - -#include "common/alignment.h" -#include "core/file_sys/fssystem/fssystem_pooled_buffer.h" - -namespace FileSys { - -namespace { - -constexpr size_t HeapBlockSize = BufferPoolAlignment; -static_assert(HeapBlockSize == 4_KiB); - -// A heap block is 4KiB. An order is a power of two. -// This gives blocks of the order 32KiB, 512KiB, 4MiB. -constexpr s32 HeapOrderMax = 7; -constexpr s32 HeapOrderMaxForLarge = HeapOrderMax + 3; - -constexpr size_t HeapAllocatableSizeMax = HeapBlockSize * (static_cast(1) << HeapOrderMax); -constexpr size_t HeapAllocatableSizeMaxForLarge = - HeapBlockSize * (static_cast(1) << HeapOrderMaxForLarge); - -} // namespace - -size_t PooledBuffer::GetAllocatableSizeMaxCore(bool large) { - return large ? HeapAllocatableSizeMaxForLarge : HeapAllocatableSizeMax; -} - -void PooledBuffer::AllocateCore(size_t ideal_size, size_t required_size, bool large) { - // Ensure preconditions. - ASSERT(m_buffer == nullptr); - - // Check that we can allocate this size. - ASSERT(required_size <= GetAllocatableSizeMaxCore(large)); - - const size_t target_size = - (std::min)((std::max)(ideal_size, required_size), GetAllocatableSizeMaxCore(large)); - - // Dummy implementation for allocate. 
- if (target_size > 0) { - m_buffer = - reinterpret_cast(::operator new(target_size, std::align_val_t{HeapBlockSize})); - m_size = target_size; - - // Ensure postconditions. - ASSERT(m_buffer != nullptr); - } -} - -void PooledBuffer::Shrink(size_t ideal_size) { - ASSERT(ideal_size <= GetAllocatableSizeMaxCore(true)); - - // Shrinking to zero means that we have no buffer. - if (ideal_size == 0) { - ::operator delete(m_buffer, std::align_val_t{HeapBlockSize}); - m_buffer = nullptr; - m_size = ideal_size; - } -} - -} // namespace FileSys diff --git a/src/core/file_sys/fssystem/fssystem_pooled_buffer.h b/src/core/file_sys/fssystem/fssystem_pooled_buffer.h deleted file mode 100644 index 9a6adbcb5a..0000000000 --- a/src/core/file_sys/fssystem/fssystem_pooled_buffer.h +++ /dev/null @@ -1,95 +0,0 @@ -// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project -// SPDX-License-Identifier: GPL-2.0-or-later - -#pragma once - -#include "common/common_funcs.h" -#include "common/common_types.h" -#include "common/literals.h" -#include "core/hle/result.h" - -namespace FileSys { - -using namespace Common::Literals; - -constexpr inline size_t BufferPoolAlignment = 4_KiB; -constexpr inline size_t BufferPoolWorkSize = 320; - -class PooledBuffer { - YUZU_NON_COPYABLE(PooledBuffer); - -public: - // Constructor/Destructor. - constexpr PooledBuffer() : m_buffer(), m_size() {} - - PooledBuffer(size_t ideal_size, size_t required_size) : m_buffer(), m_size() { - this->Allocate(ideal_size, required_size); - } - - ~PooledBuffer() { - this->Deallocate(); - } - - // Move and assignment. - explicit PooledBuffer(PooledBuffer&& rhs) : m_buffer(rhs.m_buffer), m_size(rhs.m_size) { - rhs.m_buffer = nullptr; - rhs.m_size = 0; - } - - PooledBuffer& operator=(PooledBuffer&& rhs) { - PooledBuffer(std::move(rhs)).Swap(*this); - return *this; - } - - // Allocation API. - void Allocate(size_t ideal_size, size_t required_size) { - return this->AllocateCore(ideal_size, required_size, false); - } - - void AllocateParticularlyLarge(size_t ideal_size, size_t required_size) { - return this->AllocateCore(ideal_size, required_size, true); - } - - void Shrink(size_t ideal_size); - - void Deallocate() { - // Shrink the buffer to empty. - this->Shrink(0); - ASSERT(m_buffer == nullptr); - } - - char* GetBuffer() const { - ASSERT(m_buffer != nullptr); - return m_buffer; - } - - size_t GetSize() const { - ASSERT(m_buffer != nullptr); - return m_size; - } - -public: - static size_t GetAllocatableSizeMax() { - return GetAllocatableSizeMaxCore(false); - } - static size_t GetAllocatableParticularlyLargeSizeMax() { - return GetAllocatableSizeMaxCore(true); - } - -private: - static size_t GetAllocatableSizeMaxCore(bool large); - -private: - void Swap(PooledBuffer& rhs) { - std::swap(m_buffer, rhs.m_buffer); - std::swap(m_size, rhs.m_size); - } - - void AllocateCore(size_t ideal_size, size_t required_size, bool large); - -private: - char* m_buffer; - size_t m_size; -}; - -} // namespace FileSys diff --git a/src/core/hle/service/nvnflinger/hardware_composer.cpp b/src/core/hle/service/nvnflinger/hardware_composer.cpp index a262a3dcd5..5c0515d473 100644 --- a/src/core/hle/service/nvnflinger/hardware_composer.cpp +++ b/src/core/hle/service/nvnflinger/hardware_composer.cpp @@ -53,6 +53,19 @@ u32 HardwareComposer::ComposeLocked(f32* out_speed_scale, Display& display, // Set default speed limit to 100%. *out_speed_scale = 1.0f; + // If no layers are available, skip the logic. 
+ bool any_visible = false; + for (auto& layer : display.stack.layers) { + if (layer->visible) { + any_visible = true; + break; + } + } + if (!any_visible) { + *out_speed_scale = 1.0f; + return 1; + } + // Determine the number of vsync periods to wait before composing again. std::optional swap_interval{}; bool has_acquired_buffer{}; @@ -110,7 +123,7 @@ } // If any new buffers were acquired, we can present. - if (has_acquired_buffer) { + if (has_acquired_buffer && !composition_stack.empty()) { // Sort by Z-index. std::stable_sort(composition_stack.begin(), composition_stack.end(), [&](auto& l, auto& r) { return l.z_index < r.z_index; }); @@ -119,6 +132,19 @@ nvdisp.Composite(composition_stack); } + // Batch framebuffer releases, instead of one-by-one. + std::vector> to_release; + for (auto& [layer_id, framebuffer] : m_framebuffers) { + if (framebuffer.release_frame_number > m_frame_number || !framebuffer.is_acquired) + continue; + if (auto layer = display.stack.FindLayer(layer_id); layer) + to_release.emplace_back(layer.get(), &framebuffer); + } + for (auto& [layer, framebuffer] : to_release) { + layer->buffer_item_consumer->ReleaseBuffer(framebuffer->item, android::Fence::NoFence()); + framebuffer->is_acquired = false; + } + // Advance by at least one frame. const u32 frame_advance = swap_interval.value_or(1); m_frame_number += frame_advance; diff --git a/src/dynarmic/docs/Design.md b/src/dynarmic/docs/Design.md index 3c0deb5972..ffa8ccecdb 100644 --- a/src/dynarmic/docs/Design.md +++ b/src/dynarmic/docs/Design.md @@ -273,52 +273,73 @@ Exclusive OR (i.e.: XOR) ### Callback: {Read,Write}Memory{8,16,32,64} - ReadMemory8( vaddr) - ReadMemory16( vaddr) - ReadMemory32( vaddr) - ReadMemory64( vaddr) - WriteMemory8( vaddr, value_to_store) - WriteMemory16( vaddr, value_to_store) - WriteMemory32( vaddr, value_to_store) - WriteMemory64( vaddr, value_to_store) +```c++ + ReadMemory8( vaddr) + ReadMemory16( vaddr) + ReadMemory32( vaddr) + ReadMemory64( vaddr) + WriteMemory8( vaddr, value_to_store) + WriteMemory16( vaddr, value_to_store) + WriteMemory32( vaddr, value_to_store) + WriteMemory64( vaddr, value_to_store) +``` Memory access. ### Terminal: Interpret - SetTerm(IR::Term::Interpret{next}) +```c++ +SetTerm(IR::Term::Interpret{next}) +``` This terminal instruction calls the interpreter, starting at `next`. The interpreter must interpret exactly one instruction. ### Terminal: ReturnToDispatch - SetTerm(IR::Term::ReturnToDispatch{}) +```c++ +SetTerm(IR::Term::ReturnToDispatch{}) +``` This terminal instruction returns control to the dispatcher. The dispatcher will use the value in R15 to determine what comes next. ### Terminal: LinkBlock - SetTerm(IR::Term::LinkBlock{next}) +```c++ +SetTerm(IR::Term::LinkBlock{next}) +``` This terminal instruction jumps to the basic block described by `next` if we have enough cycles remaining. If we do not have enough cycles remaining, we return to the dispatcher, which will return control to the host. +### Terminal: LinkBlockFast + +```c++ +SetTerm(IR::Term::LinkBlockFast{next}) +``` + +This terminal instruction jumps to the basic block described by `next` unconditionally.
+ +This relies on guarantees that must be upheld at runtime - i.e. that the program won't hang, since the jump is taken without the cycle-count check that `LinkBlock` performs. + ### Terminal: PopRSBHint - SetTerm(IR::Term::PopRSBHint{}) +```c++ +SetTerm(IR::Term::PopRSBHint{}) +``` This terminal instruction checks the top of the Return Stack Buffer against R15. If RSB lookup fails, control is returned to the dispatcher. This is an optimization for faster function calls. A backend that doesn't support this optimization or doesn't have a RSB may choose to implement this exactly as -ReturnToDispatch. +`ReturnToDispatch`. ### Terminal: If - SetTerm(IR::Term::If{cond, term_then, term_else}) +```c++ +SetTerm(IR::Term::If{cond, term_then, term_else}) +``` This terminal instruction conditionally executes one terminal or another depending on the run-time state of the ARM flags. diff --git a/src/dynarmic/docs/FastMemory.md b/src/dynarmic/docs/FastMemory.md new file mode 100644 index 0000000000..c4f57996ba --- /dev/null +++ b/src/dynarmic/docs/FastMemory.md @@ -0,0 +1,19 @@ +# Fast memory (Fastmem) + +The main way of accessing memory in JITed programs is via an invoked function, say "Read()" and "Write()". On our translator, such functions usually take a sizable amount of code space (push + call + pop), trash the i-cache (due to an indirect call), and overall make code emission more bloated. + +The solution? Delegate invalid accesses to a dedicated arena, similar to a swap. The main idea behind such a mechanism is to allow the OS to transmit page faults from invalid accesses into the JIT translator directly, bypassing address space calls. While this sacrifices i-cache coherency, it allows for smaller code size and "faster" throughput. + +Many kernels, however, do not support fast signal dispatching (Solaris, OpenBSD, FreeBSD). Only Linux and Windows support relatively "fast" signal dispatching. Hence this feature is best suited to those platforms only. + +![Host to guest translation](./HostToGuest.svg) + +![Fastmem translation](./Fastmem.svg) + +On x86_64, for example, when a page fault occurs, the CPU will transmit via control registers and the stack (see `IRETQ`) the appropriate arguments for a page fault handler; the OS will then transform that into something that can be sent into userspace. + +Most modern OSes implement kernel-page-table-isolation, which means a set of system calls will invoke a context switch (not often used syscalls), whereas others are handled by the same process address space (the smaller kernel portion, often used syscalls) without needing a context switch. This effect can be negated on systems with PCID (up to 4096 unique IDs). + +Signal dispatching takes a performance hit from reloading `%cr3` - but Linux does something more clever to avoid reloads: the VDSO will take care of the entire thing in the same address space, making dispatching about as costly as an indirect call - without the hazards of increased code size. + +The main downside of this is the constant i-cache thrashing and pipeline hazards introduced by the VDSO signal handlers. However, on most benchmarks fastmem does perform faster than going without it (Linux only). This also exploits the continuous address space emulation by using an arena - which can then potentially be transparently mapped into a hugepage, reducing TLB walk times. diff --git a/src/dynarmic/docs/Fastmem.svg b/src/dynarmic/docs/Fastmem.svg new file mode 100644 index 0000000000..a3ed0bb68b --- /dev/null +++ b/src/dynarmic/docs/Fastmem.svg @@ -0,0 +1,4 @@ + + + +
Emulator
Address Space
Guest Address Space
SIGSEGV Trap
Fastmem
Only needs to linearly offset from fastmem arena
Less codegen (SIGSEGV traps)
Is fast only if SIGSEGV handlers are sufficiently fast
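As a minimal sketch of the trap-and-redirect mechanism FastMemory.md describes - assuming a POSIX host with `sigaction`, and with `arena_base`, `arena_size`, and the back-patching step as illustrative placeholders rather than dynarmic's actual implementation:

```c++
#include <signal.h> // POSIX sigaction; assumed available (Linux)
#include <cstddef>
#include <cstdint>
#include <cstdlib>

// Illustrative arena bounds; a real JIT would mmap a guest-sized region.
static std::uint8_t* arena_base = nullptr;
static constexpr std::size_t arena_size = std::size_t{1} << 32;

static void FastmemHandler(int, siginfo_t* info, void*) {
    auto* fault = static_cast<std::uint8_t*>(info->si_addr);
    if (fault >= arena_base && fault < arena_base + arena_size) {
        // The guest address is simply (fault - arena_base). A real handler
        // would back-patch the faulting instruction to call the slow path,
        // then return so the patched code re-executes.
        return;
    }
    std::abort(); // Fault outside the arena: a genuine crash.
}

static void InstallFastmemHandler() {
    struct sigaction sa{};
    sa.sa_sigaction = FastmemHandler;
    sa.sa_flags = SA_SIGINFO;
    sigemptyset(&sa.sa_mask);
    sigaction(SIGSEGV, &sa, nullptr);
}
```

This is why handler dispatch latency (the "fast only if SIGSEGV handlers are sufficiently fast" caveat above) dominates the cost model.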
\ No newline at end of file diff --git a/src/dynarmic/docs/HostToGuest.svg b/src/dynarmic/docs/HostToGuest.svg new file mode 100644 index 0000000000..6a15a44b46 --- /dev/null +++ b/src/dynarmic/docs/HostToGuest.svg @@ -0,0 +1,4 @@ + + + +
Emulator
Address Space
Guest Address Space
Resolver
Host to Guest translation
Looks up correct PTE
Translates each address 
Is slow
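For contrast with the figure above, a hedged sketch of the resolver path it describes - every access pays a table lookup plus an indirect call; the flat `page_table` layout here is illustrative, not dynarmic's actual structure:

```c++
#include <cstddef>
#include <cstdint>

// Illustrative software translation: 4 KiB pages, one host pointer per guest page.
static constexpr std::uint32_t PAGE_BITS = 12;
static std::uint8_t** page_table = nullptr; // assumed populated by the emulator

std::uint8_t ReadMemory8(std::uint32_t vaddr) {
    // One dependent load for the page entry, then the data load itself -- and
    // the JIT reaches this function through an indirect call. This per-access
    // overhead is what fastmem eliminates.
    std::uint8_t* page = page_table[vaddr >> PAGE_BITS];
    return page[vaddr & ((std::uint32_t{1} << PAGE_BITS) - 1)];
}
```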
\ No newline at end of file diff --git a/src/dynarmic/docs/RegisterAllocator.md b/src/dynarmic/docs/RegisterAllocator.md index fea6f19e6a..f5bbaaf168 100644 --- a/src/dynarmic/docs/RegisterAllocator.md +++ b/src/dynarmic/docs/RegisterAllocator.md @@ -16,19 +16,34 @@ Note that `Use`ing a value decrements its `use_count` by one. When the `use_coun The member functions on `RegAlloc` are just a combination of the above concepts. +The following registers are reserved for internal use and should NOT participate in register allocation: +- `%xmm0`, `%xmm1`, `%xmm2`: Used as scratch in exclusive memory access. +- `%rsp`: Stack pointer. +- `%r15`: JIT state pointer. +- `%r14`: Page table pointer. +- `%r13`: Fastmem pointer. + +The layout designates `%r15` as the JIT state pointer - while it may be tempting to turn it into a synthetic pointer, keeping an entire register (out of 12 available) is preferable to inlining a directly computed immediate. + +NEVER modify `%r15`; treat this register as "immutable" for the entire duration of a JIT block. + ### `Scratch` - Xbyak::Reg64 ScratchGpr(HostLocList desired_locations = any_gpr) - Xbyak::Xmm ScratchXmm(HostLocList desired_locations = any_xmm) +```c++ +Xbyak::Reg64 ScratchGpr(HostLocList desired_locations = any_gpr); +Xbyak::Xmm ScratchXmm(HostLocList desired_locations = any_xmm); +``` At runtime, allocate one of the registers in `desired_locations`. You are free to modify the register. The register is discarded at the end of the allocation scope. ### Pure `Use` - Xbyak::Reg64 UseGpr(Argument& arg); - Xbyak::Xmm UseXmm(Argument& arg); - OpArg UseOpArg(Argument& arg); - void Use(Argument& arg, HostLoc host_loc); +```c++ +Xbyak::Reg64 UseGpr(Argument& arg); +Xbyak::Xmm UseXmm(Argument& arg); +OpArg UseOpArg(Argument& arg); +void Use(Argument& arg, HostLoc host_loc); +``` At runtime, the value corresponding to `arg` will be placed in a register. The actual register is determined by which one of the above functions is called. `UseGpr` places it in an unused GPR, `UseXmm` places it @@ -39,9 +54,11 @@ This register **must not** have its value changed. ### `UseScratch` - Xbyak::Reg64 UseScratchGpr(Argument& arg); - Xbyak::Xmm UseScratchXmm(Argument& arg); - void UseScratch(Argument& arg, HostLoc host_loc); +```c++ +Xbyak::Reg64 UseScratchGpr(Argument& arg); +Xbyak::Xmm UseScratchXmm(Argument& arg); +void UseScratch(Argument& arg, HostLoc host_loc); +``` At runtime, the value corresponding to `arg` will be placed in a register. The actual register is determined by which one of the above functions is called. `UseScratchGpr` places it in an unused GPR, `UseScratchXmm` places it @@ -55,7 +72,9 @@ You are free to modify the value in the register. The register is discarded at t A `Define` is the definition of a value. This is the only time when a value may be set. - void DefineValue(IR::Inst* inst, const Xbyak::Reg& reg); +```c++ +void DefineValue(IR::Inst* inst, const Xbyak::Reg& reg); +``` By calling `DefineValue`, you are stating that you wish to define the value for `inst`, and you have written the value to the specified register `reg`. @@ -64,7 +83,9 @@ value to the specified register `reg`. Adding a `Define` to an existing value. - void DefineValue(IR::Inst* inst, Argument& arg); +```c++ +void DefineValue(IR::Inst* inst, Argument& arg); +``` You are declaring that the value for `inst` is the same as the value for `arg`. No host machine instructions are emitted.
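As a usage sketch of the API documented above - the opcode and the `GetArgument` accessor are illustrative placeholders, not dynarmic's real emitter code:

```c++
// Sketch: emitting a 32-bit add with the documented RegAlloc primitives.
void EmitX64::EmitAdd32(IR::Block&, IR::Inst* inst) {
    Argument& lhs = reg_alloc.GetArgument(inst, 0); // hypothetical accessor
    Argument& rhs = reg_alloc.GetArgument(inst, 1);

    // The result overwrites the register holding lhs, so UseScratch it;
    // rhs is only read, so a pure Use suffices (its value must not change).
    Xbyak::Reg32 result = reg_alloc.UseScratchGpr(lhs).cvt32();
    Xbyak::Reg32 operand = reg_alloc.UseGpr(rhs).cvt32();

    code->add(result, operand);

    // Declare that inst's value now lives in result.
    reg_alloc.DefineValue(inst, result);
}
```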
diff --git a/src/dynarmic/docs/ReturnStackBufferOptimization.md b/src/dynarmic/docs/ReturnStackBufferOptimization.md index 6ffe41bcc6..0e72c3bce8 100644 --- a/src/dynarmic/docs/ReturnStackBufferOptimization.md +++ b/src/dynarmic/docs/ReturnStackBufferOptimization.md @@ -23,15 +23,17 @@ One complication dynarmic has is that a compiled block is not uniquely identifia the PC alone, but bits in the FPSCR and CPSR are also relevant. We resolve this by computing a 64-bit `UniqueHash` that is guaranteed to uniquely identify a block. - u64 LocationDescriptor::UniqueHash() const { - // This value MUST BE UNIQUE. - // This calculation has to match up with EmitX64::EmitTerminalPopRSBHint - u64 pc_u64 = u64(arm_pc) << 32; - u64 fpscr_u64 = u64(fpscr.Value()); - u64 t_u64 = cpsr.T() ? 1 : 0; - u64 e_u64 = cpsr.E() ? 2 : 0; - return pc_u64 | fpscr_u64 | t_u64 | e_u64; - } +```c++ +u64 LocationDescriptor::UniqueHash() const { + // This value MUST BE UNIQUE. + // This calculation has to match up with EmitX64::EmitTerminalPopRSBHint + u64 pc_u64 = u64(arm_pc) << 32; + u64 fpscr_u64 = u64(fpscr.Value()); + u64 t_u64 = cpsr.T() ? 1 : 0; + u64 e_u64 = cpsr.E() ? 2 : 0; + return pc_u64 | fpscr_u64 | t_u64 | e_u64; +} +``` ## Our implementation isn't actually a stack @@ -49,97 +51,107 @@ host addresses for the corresponding the compiled blocks. size of the real RSB in hardware (which has 3 entries). Larger RSBs than 8 showed degraded performance. - struct JitState { - // ... +```c++ +struct JitState { + // ... - static constexpr size_t RSBSize = 8; // MUST be a power of 2. - u32 rsb_ptr = 0; - std::array rsb_location_descriptors; - std::array rsb_codeptrs; - void ResetRSB(); + static constexpr size_t RSBSize = 8; // MUST be a power of 2. + u32 rsb_ptr = 0; + std::array rsb_location_descriptors; + std::array rsb_codeptrs; + void ResetRSB(); - // ... - }; + // ... +}; +``` ### RSB Push We insert our prediction at the insertion point iff the RSB doesn't already contain a prediction with the same `UniqueHash`. - void EmitX64::EmitPushRSB(IR::Block&, IR::Inst* inst) { - using namespace Xbyak::util; +```c++ +void EmitX64::EmitPushRSB(IR::Block&, IR::Inst* inst) { + using namespace Xbyak::util; - ASSERT(inst->GetArg(0).IsImmediate()); - u64 imm64 = inst->GetArg(0).GetU64(); + ASSERT(inst->GetArg(0).IsImmediate()); + u64 imm64 = inst->GetArg(0).GetU64(); - Xbyak::Reg64 code_ptr_reg = reg_alloc.ScratchGpr({HostLoc::RCX}); - Xbyak::Reg64 loc_desc_reg = reg_alloc.ScratchGpr(); - Xbyak::Reg32 index_reg = reg_alloc.ScratchGpr().cvt32(); - u64 code_ptr = unique_hash_to_code_ptr.find(imm64) != unique_hash_to_code_ptr.end() - ? u64(unique_hash_to_code_ptr[imm64]) - : u64(code->GetReturnFromRunCodeAddress()); + Xbyak::Reg64 code_ptr_reg = reg_alloc.ScratchGpr({HostLoc::RCX}); + Xbyak::Reg64 loc_desc_reg = reg_alloc.ScratchGpr(); + Xbyak::Reg32 index_reg = reg_alloc.ScratchGpr().cvt32(); + u64 code_ptr = unique_hash_to_code_ptr.find(imm64) != unique_hash_to_code_ptr.end() + ? 
u64(unique_hash_to_code_ptr[imm64]) + : u64(code->GetReturnFromRunCodeAddress()); - code->mov(index_reg, dword[code.ABI_JIT_PTR + offsetof(JitState, rsb_ptr)]); - code->add(index_reg, 1); - code->and_(index_reg, u32(JitState::RSBSize - 1)); + code->mov(index_reg, dword[code.ABI_JIT_PTR + offsetof(JitState, rsb_ptr)]); + code->add(index_reg, 1); + code->and_(index_reg, u32(JitState::RSBSize - 1)); - code->mov(loc_desc_reg, u64(imm64)); - CodePtr patch_location = code->getCurr(); - patch_unique_hash_locations[imm64].emplace_back(patch_location); - code->mov(code_ptr_reg, u64(code_ptr)); // This line has to match up with EmitX64::Patch. - code->EnsurePatchLocationSize(patch_location, 10); + code->mov(loc_desc_reg, u64(imm64)); + CodePtr patch_location = code->getCurr(); + patch_unique_hash_locations[imm64].emplace_back(patch_location); + code->mov(code_ptr_reg, u64(code_ptr)); // This line has to match up with EmitX64::Patch. + code->EnsurePatchLocationSize(patch_location, 10); - Xbyak::Label label; - for (size_t i = 0; i < JitState::RSBSize; ++i) { - code->cmp(loc_desc_reg, qword[code.ABI_JIT_PTR + offsetof(JitState, rsb_location_descriptors) + i * sizeof(u64)]); - code->je(label, code->T_SHORT); - } - - code->mov(dword[code.ABI_JIT_PTR + offsetof(JitState, rsb_ptr)], index_reg); - code->mov(qword[code.ABI_JIT_PTR + index_reg.cvt64() * 8 + offsetof(JitState, rsb_location_descriptors)], loc_desc_reg); - code->mov(qword[code.ABI_JIT_PTR + index_reg.cvt64() * 8 + offsetof(JitState, rsb_codeptrs)], code_ptr_reg); - code->L(label); + Xbyak::Label label; + for (size_t i = 0; i < JitState::RSBSize; ++i) { + code->cmp(loc_desc_reg, qword[code.ABI_JIT_PTR + offsetof(JitState, rsb_location_descriptors) + i * sizeof(u64)]); + code->je(label, code->T_SHORT); } + code->mov(dword[code.ABI_JIT_PTR + offsetof(JitState, rsb_ptr)], index_reg); + code->mov(qword[code.ABI_JIT_PTR + index_reg.cvt64() * 8 + offsetof(JitState, rsb_location_descriptors)], loc_desc_reg); + code->mov(qword[code.ABI_JIT_PTR + index_reg.cvt64() * 8 + offsetof(JitState, rsb_codeptrs)], code_ptr_reg); + code->L(label); +} +``` + In pseudocode: - for (i := 0 .. RSBSize-1) - if (rsb_location_descriptors[i] == imm64) - goto label; - rsb_ptr++; - rsb_ptr %= RSBSize; - rsb_location_desciptors[rsb_ptr] = imm64; //< The UniqueHash - rsb_codeptr[rsb_ptr] = /* codeptr corresponding to the UniqueHash */; - label: +```c++ + for (i := 0 .. RSBSize-1) + if (rsb_location_descriptors[i] == imm64) + goto label; + rsb_ptr++; + rsb_ptr %= RSBSize; + rsb_location_descriptors[rsb_ptr] = imm64; //< The UniqueHash + rsb_codeptr[rsb_ptr] = /* codeptr corresponding to the UniqueHash */; +label: +``` ## RSB Pop To check if a prediction is in the RSB, we linearly scan the RSB.
- void EmitX64::EmitTerminalPopRSBHint(IR::Term::PopRSBHint, IR::LocationDescriptor initial_location) { - using namespace Xbyak::util; +```c++ +void EmitX64::EmitTerminalPopRSBHint(IR::Term::PopRSBHint, IR::LocationDescriptor initial_location) { + using namespace Xbyak::util; - // This calculation has to match up with IREmitter::PushRSB - code->mov(ecx, MJitStateReg(Arm::Reg::PC)); - code->shl(rcx, 32); - code->mov(ebx, dword[code.ABI_JIT_PTR + offsetof(JitState, FPSCR_mode)]); - code->or_(ebx, dword[code.ABI_JIT_PTR + offsetof(JitState, CPSR_et)]); - code->or_(rbx, rcx); + // This calculation has to match up with IREmitter::PushRSB + code->mov(ecx, MJitStateReg(Arm::Reg::PC)); + code->shl(rcx, 32); + code->mov(ebx, dword[code.ABI_JIT_PTR + offsetof(JitState, FPSCR_mode)]); + code->or_(ebx, dword[code.ABI_JIT_PTR + offsetof(JitState, CPSR_et)]); + code->or_(rbx, rcx); - code->mov(rax, u64(code->GetReturnFromRunCodeAddress())); - for (size_t i = 0; i < JitState::RSBSize; ++i) { - code->cmp(rbx, qword[code.ABI_JIT_PTR + offsetof(JitState, rsb_location_descriptors) + i * sizeof(u64)]); - code->cmove(rax, qword[code.ABI_JIT_PTR + offsetof(JitState, rsb_codeptrs) + i * sizeof(u64)]); - } - - code->jmp(rax); + code->mov(rax, u64(code->GetReturnFromRunCodeAddress())); + for (size_t i = 0; i < JitState::RSBSize; ++i) { + code->cmp(rbx, qword[code.ABI_JIT_PTR + offsetof(JitState, rsb_location_descriptors) + i * sizeof(u64)]); + code->cmove(rax, qword[code.ABI_JIT_PTR + offsetof(JitState, rsb_codeptrs) + i * sizeof(u64)]); } + code->jmp(rax); +} +``` + In pseudocode: - rbx := ComputeUniqueHash() - rax := ReturnToDispatch - for (i := 0 .. RSBSize-1) - if (rbx == rsb_location_descriptors[i]) - rax = rsb_codeptrs[i] - goto rax \ No newline at end of file +```c++ +rbx := ComputeUniqueHash() +rax := ReturnToDispatch +for (i := 0 .. RSBSize-1) + if (rbx == rsb_location_descriptors[i]) + rax = rsb_codeptrs[i] +goto rax +``` diff --git a/src/qt_common/shared_translation.cpp b/src/qt_common/shared_translation.cpp index dfda88ba74..4254253c2f 100644 --- a/src/qt_common/shared_translation.cpp +++ b/src/qt_common/shared_translation.cpp @@ -246,10 +246,7 @@ std::unique_ptr InitializeTranslations(QObject* parent) INSERT(Settings, vram_usage_mode, tr("VRAM Usage Mode:"), - tr("Selects whether the emulator should prefer to conserve memory or make maximum usage " - "of available video memory for performance.\nHas no effect on integrated graphics. 
" - "Aggressive mode may severely impact the performance of other applications such as " - "recording software.")); + tr("Selects whether the emulator should prefer to conserve memory or make maximum usage of available video memory for performance.\nAggressive mode may severely impact the performance of other applications such as recording software.")); INSERT(Settings, skip_cpu_inner_invalidation, tr("Skip CPU Inner Invalidation"), @@ -575,6 +572,7 @@ std::unique_ptr ComboboxEnumeration(QObject* parent) PAIR(ScalingFilter, NearestNeighbor, tr("Nearest Neighbor")), PAIR(ScalingFilter, Bilinear, tr("Bilinear")), PAIR(ScalingFilter, Bicubic, tr("Bicubic")), + PAIR(ScalingFilter, Spline1, tr("Spline-1")), PAIR(ScalingFilter, Gaussian, tr("Gaussian")), PAIR(ScalingFilter, Lanczos, tr("Lanczos")), PAIR(ScalingFilter, ScaleForce, tr("ScaleForce")), diff --git a/src/qt_common/shared_translation.h b/src/qt_common/shared_translation.h index ea8e7fe1bd..c9216c2daa 100644 --- a/src/qt_common/shared_translation.h +++ b/src/qt_common/shared_translation.h @@ -38,6 +38,8 @@ static const std::map scaling_filter_texts_map {Settings::ScalingFilter::Bilinear, QStringLiteral(QT_TRANSLATE_NOOP("GMainWindow", "Bilinear"))}, {Settings::ScalingFilter::Bicubic, QStringLiteral(QT_TRANSLATE_NOOP("GMainWindow", "Bicubic"))}, + {Settings::ScalingFilter::Spline1, + QStringLiteral(QT_TRANSLATE_NOOP("GMainWindow", "Spline-1"))}, {Settings::ScalingFilter::Gaussian, QStringLiteral(QT_TRANSLATE_NOOP("GMainWindow", "Gaussian"))}, {Settings::ScalingFilter::Lanczos, diff --git a/src/video_core/host_shaders/CMakeLists.txt b/src/video_core/host_shaders/CMakeLists.txt index d8ea826498..c14b44a45a 100644 --- a/src/video_core/host_shaders/CMakeLists.txt +++ b/src/video_core/host_shaders/CMakeLists.txt @@ -46,6 +46,7 @@ set(SHADER_FILES present_bicubic.frag present_gaussian.frag present_lanczos.frag + present_spline1.frag queries_prefix_scan_sum.comp queries_prefix_scan_sum_nosubgroups.comp resolve_conditional_render.comp diff --git a/src/video_core/host_shaders/present_spline1.frag b/src/video_core/host_shaders/present_spline1.frag new file mode 100644 index 0000000000..871b47586b --- /dev/null +++ b/src/video_core/host_shaders/present_spline1.frag @@ -0,0 +1,24 @@ +// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + +// Spline (smooth linear inerpolation) with 1 texel fetch (needs bilinear to work) +// Emulates bicubic without actually doing bicubic +// See https://iquilezles.org/articles/texture, unfortunely there are issues with the original +// where smoothstep "expansion" actually results in worse codegen (SPIRV-Opt does a direct conv to smoothstep) +// TODO: Numerical analysis - fract is sawtooth func and floor, reuse params? 
Perhaps - no need for precision + +#version 460 core + +layout (location = 0) in vec2 frag_tex_coord; +layout (location = 0) out vec4 color; +layout (binding = 0) uniform sampler2D color_texture; + +vec4 textureSpline1(sampler2D sam, vec2 uv) { + float r = float(textureSize(sam, 0).x); + vec2 x = fract(uv * r + 0.5); + return texture(sam, (floor(uv * r + 0.5) + smoothstep(0.0, 1.0, x) - 0.5) / r); +} + +void main() { + color = textureSpline1(color_texture, frag_tex_coord); +} diff --git a/src/video_core/renderer_opengl/gl_blit_screen.cpp b/src/video_core/renderer_opengl/gl_blit_screen.cpp index 5d2246ada1..65670fcad8 100644 --- a/src/video_core/renderer_opengl/gl_blit_screen.cpp +++ b/src/video_core/renderer_opengl/gl_blit_screen.cpp @@ -89,6 +89,9 @@ void BlitScreen::CreateWindowAdapt() { case Settings::ScalingFilter::Gaussian: window_adapt = MakeGaussian(device); break; + case Settings::ScalingFilter::Spline1: + window_adapt = MakeSpline1(device); + break; case Settings::ScalingFilter::Lanczos: window_adapt = MakeLanczos(device); break; diff --git a/src/video_core/renderer_opengl/present/filters.cpp b/src/video_core/renderer_opengl/present/filters.cpp index 3424a52d80..a840de304e 100644 --- a/src/video_core/renderer_opengl/present/filters.cpp +++ b/src/video_core/renderer_opengl/present/filters.cpp @@ -28,6 +28,11 @@ std::unique_ptr MakeBilinear(const Device& device) { HostShaders::OPENGL_PRESENT_FRAG); } +std::unique_ptr MakeSpline1(const Device& device) { + return std::make_unique(device, CreateBilinearSampler(), + HostShaders::PRESENT_SPLINE1_FRAG); +} + std::unique_ptr MakeBicubic(const Device& device) { return std::make_unique(device, CreateBilinearSampler(), HostShaders::PRESENT_BICUBIC_FRAG); diff --git a/src/video_core/renderer_opengl/present/filters.h b/src/video_core/renderer_opengl/present/filters.h index f71b5f93d3..7b38ac56bc 100644 --- a/src/video_core/renderer_opengl/present/filters.h +++ b/src/video_core/renderer_opengl/present/filters.h @@ -18,6 +18,7 @@ std::unique_ptr MakeNearestNeighbor(const Device& device); std::unique_ptr MakeBilinear(const Device& device); std::unique_ptr MakeBicubic(const Device& device); std::unique_ptr MakeGaussian(const Device& device); +std::unique_ptr MakeSpline1(const Device& device); std::unique_ptr MakeLanczos(const Device& device); std::unique_ptr MakeScaleForce(const Device& device); std::unique_ptr MakeArea(const Device& device); diff --git a/src/video_core/renderer_vulkan/present/filters.cpp b/src/video_core/renderer_vulkan/present/filters.cpp index 8fed222504..6622b8daea 100644 --- a/src/video_core/renderer_vulkan/present/filters.cpp +++ b/src/video_core/renderer_vulkan/present/filters.cpp @@ -46,6 +46,11 @@ std::unique_ptr MakeBilinear(const Device& device, VkFormat fra BuildShader(device, VULKAN_PRESENT_FRAG_SPV)); } +std::unique_ptr MakeSpline1(const Device& device, VkFormat frame_format) { + return std::make_unique(device, frame_format, CreateBilinearSampler(device), + BuildShader(device, PRESENT_SPLINE1_FRAG_SPV)); +} + std::unique_ptr MakeBicubic(const Device& device, VkFormat frame_format) { // No need for handrolled shader -- if the VK impl can do it for us ;) if (device.IsExtFilterCubicSupported()) diff --git a/src/video_core/renderer_vulkan/present/filters.h b/src/video_core/renderer_vulkan/present/filters.h index 8b0630e748..015bffc8a5 100644 --- a/src/video_core/renderer_vulkan/present/filters.h +++ b/src/video_core/renderer_vulkan/present/filters.h @@ -18,6 +18,7 @@ class MemoryAllocator; std::unique_ptr 
MakeNearestNeighbor(const Device& device, VkFormat frame_format); std::unique_ptr MakeBilinear(const Device& device, VkFormat frame_format); std::unique_ptr MakeBicubic(const Device& device, VkFormat frame_format); +std::unique_ptr MakeSpline1(const Device& device, VkFormat frame_format); std::unique_ptr MakeGaussian(const Device& device, VkFormat frame_format); std::unique_ptr MakeLanczos(const Device& device, VkFormat frame_format); std::unique_ptr MakeScaleForce(const Device& device, VkFormat frame_format); diff --git a/src/video_core/renderer_vulkan/vk_blit_screen.cpp b/src/video_core/renderer_vulkan/vk_blit_screen.cpp index 3a003a871e..b720bcded3 100644 --- a/src/video_core/renderer_vulkan/vk_blit_screen.cpp +++ b/src/video_core/renderer_vulkan/vk_blit_screen.cpp @@ -43,6 +43,9 @@ void BlitScreen::SetWindowAdaptPass() { case Settings::ScalingFilter::Bicubic: window_adapt = MakeBicubic(device, swapchain_view_format); break; + case Settings::ScalingFilter::Spline1: + window_adapt = MakeSpline1(device, swapchain_view_format); + break; case Settings::ScalingFilter::Gaussian: window_adapt = MakeGaussian(device, swapchain_view_format); break; diff --git a/src/video_core/renderer_vulkan/vk_present_manager.cpp b/src/video_core/renderer_vulkan/vk_present_manager.cpp index 2c76584c72..23279e49b9 100644 --- a/src/video_core/renderer_vulkan/vk_present_manager.cpp +++ b/src/video_core/renderer_vulkan/vk_present_manager.cpp @@ -470,8 +470,8 @@ void PresentManager::CopyToSwapchainImpl(Frame* frame) { const std::array wait_semaphores = {present_semaphore, *frame->render_ready}; static constexpr std::array wait_stage_masks{ - VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, - VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, + VK_PIPELINE_STAGE_TRANSFER_BIT, + VK_PIPELINE_STAGE_TRANSFER_BIT, }; const VkSubmitInfo submit_info{ diff --git a/src/video_core/vulkan_common/vulkan_device.cpp b/src/video_core/vulkan_common/vulkan_device.cpp index 6d7c33099b..41917a1b90 100644 --- a/src/video_core/vulkan_common/vulkan_device.cpp +++ b/src/video_core/vulkan_common/vulkan_device.cpp @@ -1395,23 +1395,20 @@ void Device::CollectPhysicalMemoryInfo() { } device_access_memory += mem_properties.memoryHeaps[element].size; } - if (!is_integrated) { + if (is_integrated) { + const s64 available_memory = static_cast(device_access_memory - device_initial_usage); + const u64 memory_size = Settings::values.vram_usage_mode.GetValue() == Settings::VramUsageMode::Aggressive ? 6_GiB : 4_GiB; + device_access_memory = static_cast(std::max(std::min(available_memory - 8_GiB, memory_size), std::min(local_memory, memory_size))); + } else { const u64 reserve_memory = std::min(device_access_memory / 8, 1_GiB); device_access_memory -= reserve_memory; - if (Settings::values.vram_usage_mode.GetValue() != Settings::VramUsageMode::Aggressive) { // Account for resolution scaling in memory limits const size_t normal_memory = 6_GiB; const size_t scaler_memory = 1_GiB * Settings::values.resolution_info.ScaleUp(1); - device_access_memory = - std::min(device_access_memory, normal_memory + scaler_memory); + device_access_memory = std::min(device_access_memory, normal_memory + scaler_memory); } - - return; } - const s64 available_memory = static_cast(device_access_memory - device_initial_usage); - device_access_memory = static_cast(std::max( - std::min(available_memory - 8_GiB, 6_GiB), std::min(local_memory, 6_GiB))); } void Device::CollectToolingInfo() {
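A worked example of the integrated-GPU clamp introduced above, with assumed numbers; `ClampIntegratedBudget` is a standalone restatement of the arithmetic, not code from this patch:

```c++
#include <algorithm>
#include <cstdint>

constexpr std::int64_t GiB = std::int64_t{1} << 30;

// Signed arithmetic matters: (available - 8 GiB) can go negative on small
// heaps, in which case the min(local_memory, budget) term wins.
std::int64_t ClampIntegratedBudget(std::int64_t heap_size, std::int64_t initial_usage,
                                   std::int64_t local_memory, bool aggressive) {
    const std::int64_t available = heap_size - initial_usage;
    const std::int64_t budget = aggressive ? 6 * GiB : 4 * GiB;
    return std::max(std::min(available - 8 * GiB, budget),
                    std::min(local_memory, budget));
}

// E.g. a 16 GiB shared heap with 2 GiB in use and local_memory = 16 GiB:
// default mode:    max(min(14 - 8, 4), min(16, 4)) = 4 GiB
// aggressive mode: max(min(14 - 8, 6), min(16, 6)) = 6 GiB
```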