From 57c4859da4c4b40f1144509b300c2ec290661409 Mon Sep 17 00:00:00 2001
From: weakboson
Date: Sat, 26 Jul 2025 23:09:40 +0800
Subject: [PATCH] [core/memory] Remove deferred heap allocation on Linux.

Map separate-heap pages eagerly in HeapTracker::Map instead of faulting
them in from a SIGSEGV handler, drop the ScopedJitExecution signal-chain
hook that supported it, and replace the FastMemcpy/FastMemset helpers in
core/memory with plain std::memcpy/std::memset.

---
 src/common/heap_tracker.cpp               |  71 +++-----
 src/core/arm/dynarmic/arm_dynarmic.cpp    |  49 ------
 src/core/arm/dynarmic/arm_dynarmic.h      |  20 ---
 src/core/arm/dynarmic/arm_dynarmic_32.cpp |   5 -
 src/core/arm/dynarmic/arm_dynarmic_64.cpp |   5 -
 src/core/hle/kernel/k_process.cpp         |   4 -
 src/core/memory.cpp                       | 190 ++++------------------
 7 files changed, 55 insertions(+), 289 deletions(-)
 delete mode 100644 src/core/arm/dynarmic/arm_dynarmic.cpp

diff --git a/src/common/heap_tracker.cpp b/src/common/heap_tracker.cpp
index 6832087959..abdd200c45 100644
--- a/src/common/heap_tracker.cpp
+++ b/src/common/heap_tracker.cpp
@@ -34,6 +34,8 @@ HeapTracker::~HeapTracker() = default;
 
 void HeapTracker::Map(size_t virtual_offset, size_t host_offset, size_t length,
                       MemoryPermission perm, bool is_separate_heap) {
+    bool rebuild_required = false;
+
     // When mapping other memory, map pages immediately.
     if (!is_separate_heap) {
         m_buffer.Map(virtual_offset, host_offset, length, perm, false);
@@ -55,11 +57,29 @@ void HeapTracker::Map(size_t virtual_offset, size_t host_offset, size_t length,
 
         // Insert into mappings.
         m_map_count++;
-        m_mappings.insert(*map);
+        const auto it = m_mappings.insert(*map);
+
+        // Update tick before possible rebuild.
+        it->tick = m_tick++;
+
+        // Check if we need to rebuild.
+        if (m_resident_map_count >= m_max_resident_map_count) {
+            rebuild_required = true;
+        }
+
+        // Map the area.
+        m_buffer.Map(it->vaddr, it->paddr, it->size, it->perm, false);
+
+        // This map is now resident.
+        it->is_resident = true;
+        m_resident_map_count++;
+        m_resident_mappings.insert(*it);
     }
 
-    // Finally, map.
-    this->DeferredMapSeparateHeap(virtual_offset);
+    if (rebuild_required) {
+        // A rebuild was required, so perform it now.
+        this->RebuildSeparateHeapAddressSpace();
+    }
 }
 
 void HeapTracker::Unmap(size_t virtual_offset, size_t size, bool is_separate_heap) {
@@ -157,51 +177,6 @@ void HeapTracker::Protect(size_t virtual_offset, size_t size, MemoryPermission p
     }
 }
 
-bool HeapTracker::DeferredMapSeparateHeap(u8* fault_address) {
-    if (m_buffer.IsInVirtualRange(fault_address)) {
-        return this->DeferredMapSeparateHeap(fault_address - m_buffer.VirtualBasePointer());
-    }
-
-    return false;
-}
-
-bool HeapTracker::DeferredMapSeparateHeap(size_t virtual_offset) {
-    bool rebuild_required = false;
-
-    {
-        std::scoped_lock lk{m_lock};
-
-        // Check to ensure this was a non-resident separate heap mapping.
-        const auto it = this->GetNearestHeapMapLocked(virtual_offset);
-        if (it == m_mappings.end() || it->is_resident) {
-            return false;
-        }
-
-        // Update tick before possible rebuild.
-        it->tick = m_tick++;
-
-        // Check if we need to rebuild.
-        if (m_resident_map_count > m_max_resident_map_count) {
-            rebuild_required = true;
-        }
-
-        // Map the area.
-        m_buffer.Map(it->vaddr, it->paddr, it->size, it->perm, false);
-
-        // This map is now resident.
-        it->is_resident = true;
-        m_resident_map_count++;
-        m_resident_mappings.insert(*it);
-    }
-
-    if (rebuild_required) {
-        // A rebuild was required, so perform it now.
-        this->RebuildSeparateHeapAddressSpace();
-    }
-
-    return true;
-}
-
 void HeapTracker::RebuildSeparateHeapAddressSpace() {
     std::scoped_lock lk{m_rebuild_lock, m_lock};
 
diff --git a/src/core/arm/dynarmic/arm_dynarmic.cpp b/src/core/arm/dynarmic/arm_dynarmic.cpp
deleted file mode 100644
index 2a9f19a98b..0000000000
--- a/src/core/arm/dynarmic/arm_dynarmic.cpp
+++ /dev/null
@@ -1,49 +0,0 @@
-// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
-// SPDX-License-Identifier: GPL-2.0-or-later
-
-#ifdef __linux__
-
-#include "common/signal_chain.h"
-
-#include "core/arm/dynarmic/arm_dynarmic.h"
-#include "core/hle/kernel/k_process.h"
-#include "core/memory.h"
-
-namespace Core {
-
-namespace {
-
-thread_local Core::Memory::Memory* g_current_memory{};
-std::once_flag g_registered{};
-struct sigaction g_old_segv{};
-
-void HandleSigSegv(int sig, siginfo_t* info, void* ctx) {
-    if (g_current_memory && g_current_memory->InvalidateSeparateHeap(info->si_addr)) {
-        return;
-    }
-
-    return g_old_segv.sa_sigaction(sig, info, ctx);
-}
-
-} // namespace
-
-ScopedJitExecution::ScopedJitExecution(Kernel::KProcess* process) {
-    g_current_memory = std::addressof(process->GetMemory());
-}
-
-ScopedJitExecution::~ScopedJitExecution() {
-    g_current_memory = nullptr;
-}
-
-void ScopedJitExecution::RegisterHandler() {
-    std::call_once(g_registered, [] {
-        struct sigaction sa{};
-        sa.sa_sigaction = &HandleSigSegv;
-        sa.sa_flags = SA_SIGINFO | SA_ONSTACK;
-        Common::SigAction(SIGSEGV, std::addressof(sa), std::addressof(g_old_segv));
-    });
-}
-
-} // namespace Core
-
-#endif
diff --git a/src/core/arm/dynarmic/arm_dynarmic.h b/src/core/arm/dynarmic/arm_dynarmic.h
index 53dd188151..eef7c31160 100644
--- a/src/core/arm/dynarmic/arm_dynarmic.h
+++ b/src/core/arm/dynarmic/arm_dynarmic.h
@@ -26,24 +26,4 @@ constexpr HaltReason TranslateHaltReason(Dynarmic::HaltReason hr) {
     return static_cast<HaltReason>(hr);
 }
 
-#ifdef __linux__
-
-class ScopedJitExecution {
-public:
-    explicit ScopedJitExecution(Kernel::KProcess* process);
-    ~ScopedJitExecution();
-    static void RegisterHandler();
-};
-
-#else
-
-class ScopedJitExecution {
-public:
-    explicit ScopedJitExecution(Kernel::KProcess* process) {}
-    ~ScopedJitExecution() {}
-    static void RegisterHandler() {}
-};
-
-#endif
-
 } // namespace Core
diff --git a/src/core/arm/dynarmic/arm_dynarmic_32.cpp b/src/core/arm/dynarmic/arm_dynarmic_32.cpp
index e8c62ea2d9..01ac659ab8 100644
--- a/src/core/arm/dynarmic/arm_dynarmic_32.cpp
+++ b/src/core/arm/dynarmic/arm_dynarmic_32.cpp
@@ -337,15 +337,11 @@ bool ArmDynarmic32::IsInThumbMode() const {
 }
 
 HaltReason ArmDynarmic32::RunThread(Kernel::KThread* thread) {
-    ScopedJitExecution sj(thread->GetOwnerProcess());
-
     m_jit->ClearExclusiveState();
     return TranslateHaltReason(m_jit->Run());
 }
 
 HaltReason ArmDynarmic32::StepThread(Kernel::KThread* thread) {
-    ScopedJitExecution sj(thread->GetOwnerProcess());
-
     m_jit->ClearExclusiveState();
     return TranslateHaltReason(m_jit->Step());
 }
@@ -387,7 +383,6 @@ ArmDynarmic32::ArmDynarmic32(System& system, bool uses_wall_clock, Kernel::KProc
       m_cp15(std::make_shared<DynarmicCP15>(*this)), m_core_index{core_index} {
     auto& page_table_impl = process->GetPageTable().GetBasePageTable().GetImpl();
     m_jit = MakeJit(&page_table_impl);
-    ScopedJitExecution::RegisterHandler();
 }
 
 ArmDynarmic32::~ArmDynarmic32() = default;
diff --git a/src/core/arm/dynarmic/arm_dynarmic_64.cpp b/src/core/arm/dynarmic/arm_dynarmic_64.cpp
index f0b180bccd..e62d3c566a 100644
--- a/src/core/arm/dynarmic/arm_dynarmic_64.cpp
+++ b/src/core/arm/dynarmic/arm_dynarmic_64.cpp
@@ -368,15 +368,11 @@ std::shared_ptr<Dynarmic::A64::Jit> ArmDynarmic64::MakeJit(Common::PageTable* pa
 }
 
 HaltReason ArmDynarmic64::RunThread(Kernel::KThread* thread) {
-    ScopedJitExecution sj(thread->GetOwnerProcess());
-
     m_jit->ClearExclusiveState();
     return TranslateHaltReason(m_jit->Run());
 }
 
 HaltReason ArmDynarmic64::StepThread(Kernel::KThread* thread) {
-    ScopedJitExecution sj(thread->GetOwnerProcess());
-
     m_jit->ClearExclusiveState();
     return TranslateHaltReason(m_jit->Step());
 }
@@ -416,7 +412,6 @@ ArmDynarmic64::ArmDynarmic64(System& system, bool uses_wall_clock, Kernel::KProc
     auto& page_table = process->GetPageTable().GetBasePageTable();
     auto& page_table_impl = page_table.GetImpl();
     m_jit = MakeJit(&page_table_impl, page_table.GetAddressSpaceWidth());
-    ScopedJitExecution::RegisterHandler();
 }
 
 ArmDynarmic64::~ArmDynarmic64() = default;
diff --git a/src/core/hle/kernel/k_process.cpp b/src/core/hle/kernel/k_process.cpp
index 80566b7e77..cf03353f84 100644
--- a/src/core/hle/kernel/k_process.cpp
+++ b/src/core/hle/kernel/k_process.cpp
@@ -1266,10 +1266,6 @@ void KProcess::InitializeInterfaces() {
 
 #ifdef HAS_NCE
     if (this->IsApplication() && Settings::IsNceEnabled()) {
-        // Register the scoped JIT handler before creating any NCE instances
-        // so that its signal handler will appear first in the signal chain.
-        Core::ScopedJitExecution::RegisterHandler();
-
        for (size_t i = 0; i < Core::Hardware::NUM_CPU_CORES; i++) {
             m_arm_interfaces[i] = std::make_unique<Core::ArmNce>(m_kernel.System(), true, i);
         }
diff --git a/src/core/memory.cpp b/src/core/memory.cpp
index 34539cc650..83bacc5aef 100644
--- a/src/core/memory.cpp
+++ b/src/core/memory.cpp
@@ -37,105 +37,6 @@ namespace Core::Memory {
 
 namespace {
 
-inline void FastMemcpy(void* dst, const void* src, std::size_t size) {
-    // Fast path for small copies
-    switch (size) {
-    case 1:
-        *static_cast<u8*>(dst) = *static_cast<const u8*>(src);
-        break;
-    case 2:
-        *static_cast<u16*>(dst) = *static_cast<const u16*>(src);
-        break;
-    case 4:
-        *static_cast<u32*>(dst) = *static_cast<const u32*>(src);
-        break;
-    case 8:
-        *static_cast<u64*>(dst) = *static_cast<const u64*>(src);
-        break;
-    case 16: {
-        // Optimize for 16-byte copy (common case for SIMD registers)
-        const u64* src_64 = static_cast<const u64*>(src);
-        u64* dst_64 = static_cast<u64*>(dst);
-        dst_64[0] = src_64[0];
-        dst_64[1] = src_64[1];
-        break;
-    }
-    case 32: {
-        // Optimize for 32-byte copy
-        const u64* src_64 = static_cast<const u64*>(src);
-        u64* dst_64 = static_cast<u64*>(dst);
-        dst_64[0] = src_64[0];
-        dst_64[1] = src_64[1];
-        dst_64[2] = src_64[2];
-        dst_64[3] = src_64[3];
-        break;
-    }
-    case 64: {
-        // Optimize for 64-byte copy
-        const u64* src_64 = static_cast<const u64*>(src);
-        u64* dst_64 = static_cast<u64*>(dst);
-        dst_64[0] = src_64[0];
-        dst_64[1] = src_64[1];
-        dst_64[2] = src_64[2];
-        dst_64[3] = src_64[3];
-        dst_64[4] = src_64[4];
-        dst_64[5] = src_64[5];
-        dst_64[6] = src_64[6];
-        dst_64[7] = src_64[7];
-        break;
-    }
-    default:
-        // For larger sizes, use standard memcpy which is usually optimized by the compiler
-        std::memcpy(dst, src, size);
-        break;
-    }
-}
-
-inline void FastMemset(void* dst, int value, std::size_t size) {
-    // Fast path for small fills
-    switch (size) {
-    case 1:
-        *static_cast<u8*>(dst) = static_cast<u8>(value);
-        break;
-    case 2:
-        *static_cast<u16*>(dst) = static_cast<u16>(value);
-        break;
-    case 4:
-        *static_cast<u32*>(dst) = static_cast<u32>(value);
-        break;
-    case 8:
-        *static_cast<u64*>(dst) = static_cast<u64>(value);
-        break;
-    case 16: {
-        // Optimize for 16-byte fill (common case for SIMD registers)
-        u64* dst_64 = static_cast<u64*>(dst);
-        const u64 val64 = static_cast<u8>(value) * 0x0101010101010101ULL;
-        dst_64[0] = val64;
-        dst_64[1] = val64;
-        break;
-    }
-    default:
-        if (size <= 128 && value == 0) {
-            // Fast path for small zero-fills
-            u8* dst_bytes = static_cast<u8*>(dst);
-            for (std::size_t i = 0; i < size; i += 8) {
-                if (i + 8 <= size) {
-                    *reinterpret_cast<u64*>(dst_bytes + i) = 0;
-                } else {
-                    // Handle remaining bytes (less than 8)
-                    for (std::size_t j = i; j < size; j++) {
-                        dst_bytes[j] = 0;
-                    }
-                }
-            }
-        } else {
-            // For larger sizes, use standard memset which is usually optimized by the compiler
-            std::memset(dst, value, size);
-        }
-        break;
-    }
-}
-
 bool AddressSpaceContains(const Common::PageTable& table, const Common::ProcessAddress addr,
                           const std::size_t size) {
     const Common::ProcessAddress max_addr = 1ULL << table.GetAddressSpaceBits();
@@ -416,17 +317,17 @@ struct Memory::Impl {
                 LOG_ERROR(HW_Memory,
                           "Unmapped ReadBlock @ 0x{:016X} (start address = 0x{:016X}, size = {})",
                           GetInteger(current_vaddr), GetInteger(src_addr), size);
-                FastMemset(dest_buffer, 0, copy_amount);
+                std::memset(dest_buffer, 0, copy_amount);
             },
             [&](const std::size_t copy_amount, const u8* const src_ptr) {
-                FastMemcpy(dest_buffer, src_ptr, copy_amount);
+                std::memcpy(dest_buffer, src_ptr, copy_amount);
             },
             [&](const Common::ProcessAddress current_vaddr, const std::size_t copy_amount,
                 const u8* const host_ptr) {
                 if constexpr (!UNSAFE) {
                     HandleRasterizerDownload(GetInteger(current_vaddr), copy_amount);
                 }
-                FastMemcpy(dest_buffer, host_ptr, copy_amount);
+                std::memcpy(dest_buffer, host_ptr, copy_amount);
             },
             [&](const std::size_t copy_amount) {
                 dest_buffer = static_cast<u8*>(dest_buffer) + copy_amount;
@@ -434,7 +335,7 @@ struct Memory::Impl {
     }
 
     bool ReadBlockParallel(const Common::ProcessAddress src_addr, void* dest_buffer,
-                          const std::size_t size) {
+                           const std::size_t size) {
         // Calculate chunk size based on thread count
         const size_t chunk_size = (size + thread_count - 1) / thread_count;
 
@@ -457,7 +358,8 @@ struct Memory::Impl {
             void* current_dest = static_cast<u8*>(dest_buffer) + offset;
 
             // Launch thread
-            threads.emplace_back([this, i, current_addr, current_dest, current_chunk_size, &results] {
+            threads.emplace_back([this, i, current_addr, current_dest, current_chunk_size,
+                                  &results] {
                 results[i] = ReadBlockImpl(current_addr, current_dest, current_chunk_size);
             });
         }
@@ -515,14 +417,14 @@ struct Memory::Impl {
                           GetInteger(current_vaddr), GetInteger(dest_addr), size);
             },
             [&](const std::size_t copy_amount, u8* const dest_ptr) {
-                FastMemcpy(dest_ptr, src_buffer, copy_amount);
+                std::memcpy(dest_ptr, src_buffer, copy_amount);
             },
             [&](const Common::ProcessAddress current_vaddr, const std::size_t copy_amount,
                 u8* const host_ptr) {
                 if constexpr (!UNSAFE) {
                     HandleRasterizerWrite(GetInteger(current_vaddr), copy_amount);
                 }
-                FastMemcpy(host_ptr, src_buffer, copy_amount);
+                std::memcpy(host_ptr, src_buffer, copy_amount);
             },
             [&](const std::size_t copy_amount) {
                 src_buffer = static_cast<const u8*>(src_buffer) + copy_amount;
@@ -530,7 +432,7 @@ struct Memory::Impl {
     }
 
     bool WriteBlockParallel(const Common::ProcessAddress dest_addr, const void* src_buffer,
-                           const std::size_t size) {
+                            const std::size_t size) {
         // Calculate chunk size based on thread count
         const size_t chunk_size = (size + thread_count - 1) / thread_count;
 
@@ -553,7 +455,8 @@ struct Memory::Impl {
            const void* current_src = static_cast<const u8*>(src_buffer) + offset;
 
             // Launch thread
-            threads.emplace_back([this, i, current_addr, current_src, current_chunk_size, &results] {
+            threads.emplace_back([this, i, current_addr, current_src, current_chunk_size,
+                                  &results] {
                results[i] = WriteBlockImpl(current_addr, current_src, current_chunk_size);
             });
         }
@@ -593,12 +496,12 @@ struct Memory::Impl {
                           GetInteger(current_vaddr), GetInteger(dest_addr), size);
             },
             [](const std::size_t copy_amount, u8* const dest_ptr) {
-                FastMemset(dest_ptr, 0, copy_amount);
+                std::memset(dest_ptr, 0, copy_amount);
             },
             [&](const Common::ProcessAddress current_vaddr, const std::size_t copy_amount,
                 u8* const host_ptr) {
                 HandleRasterizerWrite(GetInteger(current_vaddr), copy_amount);
-                FastMemset(host_ptr, 0, copy_amount);
+                std::memset(host_ptr, 0, copy_amount);
             },
             [](const std::size_t copy_amount) {});
     }
@@ -876,8 +779,10 @@ struct Memory::Impl {
             return nullptr;
         } else {
             // Avoid adding any extra logic to this fast-path block
-            const uintptr_t raw_pointer = current_page_table->pointers[vaddr >> YUZU_PAGEBITS].Raw();
-            if (const uintptr_t pointer = Common::PageTable::PageInfo::ExtractPointer(raw_pointer)) {
+            const uintptr_t raw_pointer =
+                current_page_table->pointers[vaddr >> YUZU_PAGEBITS].Raw();
+            if (const uintptr_t pointer =
+                    Common::PageTable::PageInfo::ExtractPointer(raw_pointer)) {
                 return reinterpret_cast<u8*>(pointer + vaddr);
             } else {
                 switch (Common::PageTable::PageInfo::ExtractType(raw_pointer)) {
@@ -912,8 +817,7 @@ struct Memory::Impl {
     }
 
     [[nodiscard]] u8* GetPointerSilent(const Common::ProcessAddress vaddr) const {
-        return GetPointerImpl(
-            GetInteger(vaddr), []() {}, []() {});
+        return GetPointerImpl(GetInteger(vaddr), []() {}, []() {});
     }
 
     /**
@@ -934,10 +838,7 @@ struct Memory::Impl {
         if constexpr (std::is_same_v<T, u8> || std::is_same_v<T, s8>) {
            // 8-bit reads are always aligned
             const u8* const ptr = GetPointerImpl(
-                addr,
-                [addr]() {
-                    LOG_ERROR(HW_Memory, "Unmapped Read8 @ 0x{:016X}", addr);
-                },
+                addr, [addr]() { LOG_ERROR(HW_Memory, "Unmapped Read8 @ 0x{:016X}", addr); },
                 [&]() { HandleRasterizerDownload(addr, sizeof(T)); });
             if (ptr) {
                 return static_cast<T>(*ptr);
@@ -947,10 +848,7 @@ struct Memory::Impl {
             // Check alignment for 16-bit reads
             if ((addr & 1) == 0) {
                 const u8* const ptr = GetPointerImpl(
-                    addr,
-                    [addr]() {
-                        LOG_ERROR(HW_Memory, "Unmapped Read16 @ 0x{:016X}", addr);
-                    },
+                    addr, [addr]() { LOG_ERROR(HW_Memory, "Unmapped Read16 @ 0x{:016X}", addr); },
                     [&]() { HandleRasterizerDownload(addr, sizeof(T)); });
                 if (ptr) {
                     return static_cast<T>(*reinterpret_cast<const u16*>(ptr));
@@ -960,10 +858,7 @@ struct Memory::Impl {
             // Check alignment for 32-bit reads
             if ((addr & 3) == 0) {
                 const u8* const ptr = GetPointerImpl(
-                    addr,
-                    [addr]() {
-                        LOG_ERROR(HW_Memory, "Unmapped Read32 @ 0x{:016X}", addr);
-                    },
+                    addr, [addr]() { LOG_ERROR(HW_Memory, "Unmapped Read32 @ 0x{:016X}", addr); },
                     [&]() { HandleRasterizerDownload(addr, sizeof(T)); });
                 if (ptr) {
                     return static_cast<T>(*reinterpret_cast<const u32*>(ptr));
@@ -973,10 +868,7 @@ struct Memory::Impl {
             // Check alignment for 64-bit reads
             if ((addr & 7) == 0) {
                 const u8* const ptr = GetPointerImpl(
-                    addr,
-                    [addr]() {
-                        LOG_ERROR(HW_Memory, "Unmapped Read64 @ 0x{:016X}", addr);
-                    },
+                    addr, [addr]() { LOG_ERROR(HW_Memory, "Unmapped Read64 @ 0x{:016X}", addr); },
                     [&]() { HandleRasterizerDownload(addr, sizeof(T)); });
                 if (ptr) {
                     return static_cast<T>(*reinterpret_cast<const u64*>(ptr));
@@ -988,12 +880,10 @@ struct Memory::Impl {
         T result = 0;
         const u8* const ptr = GetPointerImpl(
             addr,
-            [addr]() {
-                LOG_ERROR(HW_Memory, "Unmapped Read{} @ 0x{:016X}", sizeof(T) * 8, addr);
-            },
+            [addr]() { LOG_ERROR(HW_Memory, "Unmapped Read{} @ 0x{:016X}", sizeof(T) * 8, addr); },
             [&]() { HandleRasterizerDownload(addr, sizeof(T)); });
         if (ptr) {
-            FastMemcpy(&result, ptr, sizeof(T));
+            std::memcpy(&result, ptr, sizeof(T));
         }
         return result;
     }
@@ -1080,7 +970,7 @@ struct Memory::Impl {
             },
             [&]() { HandleRasterizerWrite(addr, sizeof(T)); });
         if (ptr) {
-            FastMemcpy(ptr, &data, sizeof(T));
+            std::memcpy(ptr, &data, sizeof(T));
         }
     }
 
@@ -1114,8 +1004,7 @@ struct Memory::Impl {
     }
 
     void HandleRasterizerDownload(VAddr v_address, size_t size) {
-        const auto* p = GetPointerImpl(
-            v_address, []() {}, []() {});
+        const auto* p = GetPointerImpl(v_address, []() {}, []() {});
         if (!gpu_device_memory) [[unlikely]] {
             gpu_device_memory = &system.Host1x().MemoryManager();
         }
@@ -1132,8 +1021,7 @@ struct Memory::Impl {
     }
 
     void HandleRasterizerWrite(VAddr v_address, size_t size) {
-        const auto* p = GetPointerImpl(
-            v_address, []() {}, []() {});
+        const auto* p = GetPointerImpl(v_address, []() {}, []() {});
         constexpr size_t sys_core = Core::Hardware::NUM_CPU_CORES - 1;
         const size_t core = std::min(system.GetCurrentHostThreadID(), sys_core);
         // any other calls threads go to syscore.
@@ -1153,13 +1041,13 @@ struct Memory::Impl {
         auto& current_area = rasterizer_write_areas[core];
         PAddr subaddress = address >> YUZU_PAGEBITS;
         // Performance note:
-        // It may not be a good idea to assume accesses are within the same subaddress (i.e same page)
-        // It is often the case the games like to access wildly different addresses. Hence why I propose
-        // we should let the compiler just do it's thing...
+        // It may not be a good idea to assume accesses are within the same subaddress (i.e same
+        // page) It is often the case the games like to access wildly different addresses. Hence
+        // why I propose we should let the compiler just do it's thing...
         if (current_area.last_address != subaddress) {
             // Short circuit the need to check for address/size
-            auto const do_collection = (address != 0 && size != 0)
-                && system.GPU().OnCPUWrite(address, size);
+            auto const do_collection =
+                (address != 0 && size != 0) && system.GPU().OnCPUWrite(address, size);
             if (do_collection) {
                 current_area.last_address = subaddress;
             } else {
@@ -1418,21 +1306,7 @@ bool Memory::InvalidateNCE(Common::ProcessAddress vaddr, size_t size) {
         impl->InvalidateGPUMemory(ptr, size);
     }
 
-#ifdef __linux__
-    if (!rasterizer && mapped) {
-        impl->buffer->DeferredMapSeparateHeap(GetInteger(vaddr));
-    }
-#endif
-
     return mapped && ptr != nullptr;
 }
 
-bool Memory::InvalidateSeparateHeap(void* fault_address) {
-#ifdef __linux__
-    return impl->buffer->DeferredMapSeparateHeap(static_cast<u8*>(fault_address));
-#else
-    return false;
-#endif
-}
-
 } // namespace Core::Memory