From b7cdf41bc6fae93fbfa677ab241e80ed8dc7f4ee Mon Sep 17 00:00:00 2001 From: lizzie Date: Sun, 3 Aug 2025 03:30:27 +0100 Subject: [PATCH] [memory] remove "fast" memcpy and memset (not fast at all), remove slow parallel impl and just use serial --- src/core/memory.cpp | 221 ++++---------------------------------------- 1 file changed, 17 insertions(+), 204 deletions(-) diff --git a/src/core/memory.cpp b/src/core/memory.cpp index 34539cc650..0035c626e2 100644 --- a/src/core/memory.cpp +++ b/src/core/memory.cpp @@ -35,115 +35,12 @@ namespace Core::Memory { -namespace { - -inline void FastMemcpy(void* dst, const void* src, std::size_t size) { - // Fast path for small copies - switch (size) { - case 1: - *static_cast(dst) = *static_cast(src); - break; - case 2: - *static_cast(dst) = *static_cast(src); - break; - case 4: - *static_cast(dst) = *static_cast(src); - break; - case 8: - *static_cast(dst) = *static_cast(src); - break; - case 16: { - // Optimize for 16-byte copy (common case for SIMD registers) - const u64* src_64 = static_cast(src); - u64* dst_64 = static_cast(dst); - dst_64[0] = src_64[0]; - dst_64[1] = src_64[1]; - break; - } - case 32: { - // Optimize for 32-byte copy - const u64* src_64 = static_cast(src); - u64* dst_64 = static_cast(dst); - dst_64[0] = src_64[0]; - dst_64[1] = src_64[1]; - dst_64[2] = src_64[2]; - dst_64[3] = src_64[3]; - break; - } - case 64: { - // Optimize for 64-byte copy - const u64* src_64 = static_cast(src); - u64* dst_64 = static_cast(dst); - dst_64[0] = src_64[0]; - dst_64[1] = src_64[1]; - dst_64[2] = src_64[2]; - dst_64[3] = src_64[3]; - dst_64[4] = src_64[4]; - dst_64[5] = src_64[5]; - dst_64[6] = src_64[6]; - dst_64[7] = src_64[7]; - break; - } - default: - // For larger sizes, use standard memcpy which is usually optimized by the compiler - std::memcpy(dst, src, size); - break; - } -} - -inline void FastMemset(void* dst, int value, std::size_t size) { - // Fast path for small fills - switch (size) { - case 1: - *static_cast(dst) = static_cast(value); - break; - case 2: - *static_cast(dst) = static_cast(value); - break; - case 4: - *static_cast(dst) = static_cast(value); - break; - case 8: - *static_cast(dst) = static_cast(value); - break; - case 16: { - // Optimize for 16-byte fill (common case for SIMD registers) - u64* dst_64 = static_cast(dst); - const u64 val64 = static_cast(value) * 0x0101010101010101ULL; - dst_64[0] = val64; - dst_64[1] = val64; - break; - } - default: - if (size <= 128 && value == 0) { - // Fast path for small zero-fills - u8* dst_bytes = static_cast(dst); - for (std::size_t i = 0; i < size; i += 8) { - if (i + 8 <= size) { - *reinterpret_cast(dst_bytes + i) = 0; - } else { - // Handle remaining bytes (less than 8) - for (std::size_t j = i; j < size; j++) { - dst_bytes[j] = 0; - } - } - } - } else { - // For larger sizes, use standard memset which is usually optimized by the compiler - std::memset(dst, value, size); - } - break; - } -} - -bool AddressSpaceContains(const Common::PageTable& table, const Common::ProcessAddress addr, +static inline bool AddressSpaceContains(const Common::PageTable& table, const Common::ProcessAddress addr, const std::size_t size) { const Common::ProcessAddress max_addr = 1ULL << table.GetAddressSpaceBits(); return addr + size >= addr && addr + size <= max_addr; } -} // namespace - // Implementation class used to keep the specifics of the memory subsystem hidden // from outside classes. This also allows modification to the internals of the memory // subsystem without needing to rebuild all files that make use of the memory interface. @@ -416,70 +313,28 @@ struct Memory::Impl { LOG_ERROR(HW_Memory, "Unmapped ReadBlock @ 0x{:016X} (start address = 0x{:016X}, size = {})", GetInteger(current_vaddr), GetInteger(src_addr), size); - FastMemset(dest_buffer, 0, copy_amount); + std::memset(dest_buffer, 0, copy_amount); }, [&](const std::size_t copy_amount, const u8* const src_ptr) { - FastMemcpy(dest_buffer, src_ptr, copy_amount); + std::memcpy(dest_buffer, src_ptr, copy_amount); }, [&](const Common::ProcessAddress current_vaddr, const std::size_t copy_amount, const u8* const host_ptr) { if constexpr (!UNSAFE) { HandleRasterizerDownload(GetInteger(current_vaddr), copy_amount); } - FastMemcpy(dest_buffer, host_ptr, copy_amount); + std::memcpy(dest_buffer, host_ptr, copy_amount); }, [&](const std::size_t copy_amount) { dest_buffer = static_cast(dest_buffer) + copy_amount; }); } - bool ReadBlockParallel(const Common::ProcessAddress src_addr, void* dest_buffer, - const std::size_t size) { - // Calculate chunk size based on thread count - const size_t chunk_size = (size + thread_count - 1) / thread_count; - - // Create threads for parallel processing - std::vector threads; - threads.reserve(thread_count); - - // Create a vector to store the results of each thread - std::vector results(thread_count, true); - - // Split the work among threads - for (unsigned int i = 0; i < thread_count; ++i) { - const size_t offset = i * chunk_size; - if (offset >= size) { - break; - } - - const size_t current_chunk_size = std::min(chunk_size, size - offset); - const Common::ProcessAddress current_addr = src_addr + offset; - void* current_dest = static_cast(dest_buffer) + offset; - - // Launch thread - threads.emplace_back([this, i, current_addr, current_dest, current_chunk_size, &results] { - results[i] = ReadBlockImpl(current_addr, current_dest, current_chunk_size); - }); - } - - // Wait for all threads to complete - for (auto& thread : threads) { - thread.join(); - } - - // Check if all operations succeeded - return std::all_of(results.begin(), results.end(), [](bool result) { return result; }); - } - bool ReadBlock(const Common::ProcessAddress src_addr, void* dest_buffer, const std::size_t size) { - // For small reads, use the regular implementation - if (size < PARALLEL_THRESHOLD) { - return ReadBlockImpl(src_addr, dest_buffer, size); - } - - // For large reads, use parallel implementation - return ReadBlockParallel(src_addr, dest_buffer, size); + // TODO: If you want a proper multithreaded implementation (w/o cache coherency fights) + // use TBB or something that splits the job properly + return ReadBlockImpl(src_addr, dest_buffer, size); } bool ReadBlockUnsafe(const Common::ProcessAddress src_addr, void* dest_buffer, @@ -515,67 +370,25 @@ struct Memory::Impl { GetInteger(current_vaddr), GetInteger(dest_addr), size); }, [&](const std::size_t copy_amount, u8* const dest_ptr) { - FastMemcpy(dest_ptr, src_buffer, copy_amount); + std::memcpy(dest_ptr, src_buffer, copy_amount); }, [&](const Common::ProcessAddress current_vaddr, const std::size_t copy_amount, u8* const host_ptr) { if constexpr (!UNSAFE) { HandleRasterizerWrite(GetInteger(current_vaddr), copy_amount); } - FastMemcpy(host_ptr, src_buffer, copy_amount); + std::memcpy(host_ptr, src_buffer, copy_amount); }, [&](const std::size_t copy_amount) { src_buffer = static_cast(src_buffer) + copy_amount; }); } - bool WriteBlockParallel(const Common::ProcessAddress dest_addr, const void* src_buffer, - const std::size_t size) { - // Calculate chunk size based on thread count - const size_t chunk_size = (size + thread_count - 1) / thread_count; - - // Create threads for parallel processing - std::vector threads; - threads.reserve(thread_count); - - // Create a vector to store the results of each thread - std::vector results(thread_count, true); - - // Split the work among threads - for (unsigned int i = 0; i < thread_count; ++i) { - const size_t offset = i * chunk_size; - if (offset >= size) { - break; - } - - const size_t current_chunk_size = std::min(chunk_size, size - offset); - const Common::ProcessAddress current_addr = dest_addr + offset; - const void* current_src = static_cast(src_buffer) + offset; - - // Launch thread - threads.emplace_back([this, i, current_addr, current_src, current_chunk_size, &results] { - results[i] = WriteBlockImpl(current_addr, current_src, current_chunk_size); - }); - } - - // Wait for all threads to complete - for (auto& thread : threads) { - thread.join(); - } - - // Check if all operations succeeded - return std::all_of(results.begin(), results.end(), [](bool result) { return result; }); - } - bool WriteBlock(const Common::ProcessAddress dest_addr, const void* src_buffer, const std::size_t size) { - // For small writes, use the regular implementation - if (size < PARALLEL_THRESHOLD) { - return WriteBlockImpl(dest_addr, src_buffer, size); - } - - // For large writes, use parallel implementation - return WriteBlockParallel(dest_addr, src_buffer, size); + // TODO: If you want a proper multithreaded implementation (w/o cache coherency fights) + // use TBB or something that splits the job properly + return WriteBlockImpl(dest_addr, src_buffer, size); } bool WriteBlockUnsafe(const Common::ProcessAddress dest_addr, const void* src_buffer, @@ -593,12 +406,12 @@ struct Memory::Impl { GetInteger(current_vaddr), GetInteger(dest_addr), size); }, [](const std::size_t copy_amount, u8* const dest_ptr) { - FastMemset(dest_ptr, 0, copy_amount); + std::memset(dest_ptr, 0, copy_amount); }, [&](const Common::ProcessAddress current_vaddr, const std::size_t copy_amount, u8* const host_ptr) { HandleRasterizerWrite(GetInteger(current_vaddr), copy_amount); - FastMemset(host_ptr, 0, copy_amount); + std::memset(host_ptr, 0, copy_amount); }, [](const std::size_t copy_amount) {}); } @@ -993,7 +806,7 @@ struct Memory::Impl { }, [&]() { HandleRasterizerDownload(addr, sizeof(T)); }); if (ptr) { - FastMemcpy(&result, ptr, sizeof(T)); + std::memcpy(&result, ptr, sizeof(T)); } return result; } @@ -1080,7 +893,7 @@ struct Memory::Impl { }, [&]() { HandleRasterizerWrite(addr, sizeof(T)); }); if (ptr) { - FastMemcpy(ptr, &data, sizeof(T)); + std::memcpy(ptr, &data, sizeof(T)); } } @@ -1203,7 +1016,7 @@ struct Memory::Impl { unsigned int thread_count = 2; // Minimum size in bytes for which parallel processing is beneficial - static constexpr size_t PARALLEL_THRESHOLD = 64 * 1024; // 64 KB + //size_t PARALLEL_THRESHOLD = (L3 CACHE * NUM PHYSICAL CORES); // 64 KB std::array rasterizer_read_areas{}; std::array rasterizer_write_areas{}; -- 2.39.5