From b7cdf41bc6fae93fbfa677ab241e80ed8dc7f4ee Mon Sep 17 00:00:00 2001
From: lizzie <lizzie@eden-emu.dev>
Date: Sun, 3 Aug 2025 03:30:27 +0100
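Subject: [PATCH] [memory] remove "fast" memcpy and memset (not fast at all),
 remove slow parallel impl and just use serial

FastMemcpy/FastMemset special-cased small sizes with type-punned u16/u32/u64
loads and stores; std::memcpy/std::memset cover the same sizes without the
pointer casts. ReadBlockParallel/WriteBlockParallel spawned new std::threads
on every call at or above PARALLEL_THRESHOLD; ReadBlock/WriteBlock now always
take the serial ReadBlockImpl/WriteBlockImpl path.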

---
 src/core/memory.cpp | 221 ++++----------------------------------------
 1 file changed, 17 insertions(+), 204 deletions(-)
diff --git a/src/core/memory.cpp b/src/core/memory.cpp
index 34539cc650..0035c626e2 100644
--- a/src/core/memory.cpp
+++ b/src/core/memory.cpp
@@ -35,115 +35,12 @@
 
 namespace Core::Memory {
 
-namespace {
-
-inline void FastMemcpy(void* dst, const void* src, std::size_t size) {
-    // Fast path for small copies
-    switch (size) {
-    case 1:
-        *static_cast<u8*>(dst) = *static_cast<const u8*>(src);
-        break;
-    case 2:
-        *static_cast<u16*>(dst) = *static_cast<const u16*>(src);
-        break;
-    case 4:
-        *static_cast<u32*>(dst) = *static_cast<const u32*>(src);
-        break;
-    case 8:
-        *static_cast<u64*>(dst) = *static_cast<const u64*>(src);
-        break;
-    case 16: {
-        // Optimize for 16-byte copy (common case for SIMD registers)
-        const u64* src_64 = static_cast<const u64*>(src);
-        u64* dst_64 = static_cast<u64*>(dst);
-        dst_64[0] = src_64[0];
-        dst_64[1] = src_64[1];
-        break;
-    }
-    case 32: {
-        // Optimize for 32-byte copy
-        const u64* src_64 = static_cast<const u64*>(src);
-        u64* dst_64 = static_cast<u64*>(dst);
-        dst_64[0] = src_64[0];
-        dst_64[1] = src_64[1];
-        dst_64[2] = src_64[2];
-        dst_64[3] = src_64[3];
-        break;
-    }
-    case 64: {
-        // Optimize for 64-byte copy
-        const u64* src_64 = static_cast<const u64*>(src);
-        u64* dst_64 = static_cast<u64*>(dst);
-        dst_64[0] = src_64[0];
-        dst_64[1] = src_64[1];
-        dst_64[2] = src_64[2];
-        dst_64[3] = src_64[3];
-        dst_64[4] = src_64[4];
-        dst_64[5] = src_64[5];
-        dst_64[6] = src_64[6];
-        dst_64[7] = src_64[7];
-        break;
-    }
-    default:
-        // For larger sizes, use standard memcpy which is usually optimized by the compiler
-        std::memcpy(dst, src, size);
-        break;
-    }
-}
-
-inline void FastMemset(void* dst, int value, std::size_t size) {
-    // Fast path for small fills
-    switch (size) {
-    case 1:
-        *static_cast<u8*>(dst) = static_cast<u8>(value);
-        break;
-    case 2:
-        *static_cast<u16*>(dst) = static_cast<u16>(value);
-        break;
-    case 4:
-        *static_cast<u32*>(dst) = static_cast<u32>(value);
-        break;
-    case 8:
-        *static_cast<u64*>(dst) = static_cast<u64>(value);
-        break;
-    case 16: {
-        // Optimize for 16-byte fill (common case for SIMD registers)
-        u64* dst_64 = static_cast<u64*>(dst);
-        const u64 val64 = static_cast<u8>(value) * 0x0101010101010101ULL;
-        dst_64[0] = val64;
-        dst_64[1] = val64;
-        break;
-    }
-    default:
-        if (size <= 128 && value == 0) {
-            // Fast path for small zero-fills
-            u8* dst_bytes = static_cast<u8*>(dst);
-            for (std::size_t i = 0; i < size; i += 8) {
-                if (i + 8 <= size) {
-                    *reinterpret_cast<u64*>(dst_bytes + i) = 0;
-                } else {
-                    // Handle remaining bytes (less than 8)
-                    for (std::size_t j = i; j < size; j++) {
-                        dst_bytes[j] = 0;
-                    }
-                }
-            }
-        } else {
-            // For larger sizes, use standard memset which is usually optimized by the compiler
-            std::memset(dst, value, size);
-        }
-        break;
-    }
-}
-
-bool AddressSpaceContains(const Common::PageTable& table, const Common::ProcessAddress addr,
+static inline bool AddressSpaceContains(const Common::PageTable& table, const Common::ProcessAddress addr,
                           const std::size_t size) {
     const Common::ProcessAddress max_addr = 1ULL << table.GetAddressSpaceBits();
     return addr + size >= addr && addr + size <= max_addr;
 }
 
-} // namespace
-
 // Implementation class used to keep the specifics of the memory subsystem hidden
 // from outside classes. This also allows modification to the internals of the memory
 // subsystem without needing to rebuild all files that make use of the memory interface.
@@ -416,70 +313,28 @@ struct Memory::Impl {
                 LOG_ERROR(HW_Memory,
                           "Unmapped ReadBlock @ 0x{:016X} (start address = 0x{:016X}, size = {})",
                           GetInteger(current_vaddr), GetInteger(src_addr), size);
-               FastMemset(dest_buffer, 0, copy_amount);
+               std::memset(dest_buffer, 0, copy_amount);
             },
             [&](const std::size_t copy_amount, const u8* const src_ptr) {
-                FastMemcpy(dest_buffer, src_ptr, copy_amount);
+                std::memcpy(dest_buffer, src_ptr, copy_amount);
             },
             [&](const Common::ProcessAddress current_vaddr, const std::size_t copy_amount,
                 const u8* const host_ptr) {
                 if constexpr (!UNSAFE) {
                     HandleRasterizerDownload(GetInteger(current_vaddr), copy_amount);
                 }
-                FastMemcpy(dest_buffer, host_ptr, copy_amount);
+                std::memcpy(dest_buffer, host_ptr, copy_amount);
             },
             [&](const std::size_t copy_amount) {
                 dest_buffer = static_cast<u8*>(dest_buffer) + copy_amount;
             });
     }
 
-    bool ReadBlockParallel(const Common::ProcessAddress src_addr, void* dest_buffer,
-                          const std::size_t size) {
-        // Calculate chunk size based on thread count
-        const size_t chunk_size = (size + thread_count - 1) / thread_count;
-
-        // Create threads for parallel processing
-        std::vector<std::thread> threads;
-        threads.reserve(thread_count);
-
-        // Create a vector to store the results of each thread
-        std::vector<bool> results(thread_count, true);
-
-        // Split the work among threads
-        for (unsigned int i = 0; i < thread_count; ++i) {
-            const size_t offset = i * chunk_size;
-            if (offset >= size) {
-                break;
-            }
-
-            const size_t current_chunk_size = std::min(chunk_size, size - offset);
-            const Common::ProcessAddress current_addr = src_addr + offset;
-            void* current_dest = static_cast<u8*>(dest_buffer) + offset;
-
-            // Launch thread
-            threads.emplace_back([this, i, current_addr, current_dest, current_chunk_size, &results] {
-                results[i] = ReadBlockImpl<false>(current_addr, current_dest, current_chunk_size);
-            });
-        }
-
-        // Wait for all threads to complete
-        for (auto& thread : threads) {
-            thread.join();
-        }
-
-        // Check if all operations succeeded
-        return std::all_of(results.begin(), results.end(), [](bool result) { return result; });
-    }
-
     bool ReadBlock(const Common::ProcessAddress src_addr, void* dest_buffer,
                    const std::size_t size) {
-        // For small reads, use the regular implementation
-        if (size < PARALLEL_THRESHOLD) {
-            return ReadBlockImpl<false>(src_addr, dest_buffer, size);
-        }
-
-        // For large reads, use parallel implementation
-        return ReadBlockParallel(src_addr, dest_buffer, size);
+        // TODO: A proper multithreaded implementation (without cache-coherency fights) should
+        // use TBB or a similar task scheduler that partitions the work properly.
+        return ReadBlockImpl<false>(src_addr, dest_buffer, size);
     }
 
     bool ReadBlockUnsafe(const Common::ProcessAddress src_addr, void* dest_buffer,
@@ -515,67 +370,25 @@ struct Memory::Impl {
                           GetInteger(current_vaddr), GetInteger(dest_addr), size);
             },
             [&](const std::size_t copy_amount, u8* const dest_ptr) {
-                FastMemcpy(dest_ptr, src_buffer, copy_amount);
+                std::memcpy(dest_ptr, src_buffer, copy_amount);
             },
             [&](const Common::ProcessAddress current_vaddr, const std::size_t copy_amount,
                 u8* const host_ptr) {
                 if constexpr (!UNSAFE) {
                     HandleRasterizerWrite(GetInteger(current_vaddr), copy_amount);
                 }
-                FastMemcpy(host_ptr, src_buffer, copy_amount);
+                std::memcpy(host_ptr, src_buffer, copy_amount);
             },
             [&](const std::size_t copy_amount) {
                 src_buffer = static_cast<const u8*>(src_buffer) + copy_amount;
             });
     }
 
-    bool WriteBlockParallel(const Common::ProcessAddress dest_addr, const void* src_buffer,
-                          const std::size_t size) {
-        // Calculate chunk size based on thread count
-        const size_t chunk_size = (size + thread_count - 1) / thread_count;
-
-        // Create threads for parallel processing
-        std::vector<std::thread> threads;
-        threads.reserve(thread_count);
-
-        // Create a vector to store the results of each thread
-        std::vector<bool> results(thread_count, true);
-
-        // Split the work among threads
-        for (unsigned int i = 0; i < thread_count; ++i) {
-            const size_t offset = i * chunk_size;
-            if (offset >= size) {
-                break;
-            }
-
-            const size_t current_chunk_size = std::min(chunk_size, size - offset);
-            const Common::ProcessAddress current_addr = dest_addr + offset;
-            const void* current_src = static_cast<const u8*>(src_buffer) + offset;
-
-            // Launch thread
-            threads.emplace_back([this, i, current_addr, current_src, current_chunk_size, &results] {
-                results[i] = WriteBlockImpl<false>(current_addr, current_src, current_chunk_size);
-            });
-        }
-
-        // Wait for all threads to complete
-        for (auto& thread : threads) {
-            thread.join();
-        }
-
-        // Check if all operations succeeded
-        return std::all_of(results.begin(), results.end(), [](bool result) { return result; });
-    }
-
     bool WriteBlock(const Common::ProcessAddress dest_addr, const void* src_buffer,
                     const std::size_t size) {
-        // For small writes, use the regular implementation
-        if (size < PARALLEL_THRESHOLD) {
-            return WriteBlockImpl<false>(dest_addr, src_buffer, size);
-        }
-
-        // For large writes, use parallel implementation
-        return WriteBlockParallel(dest_addr, src_buffer, size);
+        // TODO: A proper multithreaded implementation (without cache-coherency fights) should
+        // use TBB or a similar task scheduler that partitions the work properly.
+        return WriteBlockImpl<false>(dest_addr, src_buffer, size);
     }
 
     bool WriteBlockUnsafe(const Common::ProcessAddress dest_addr, const void* src_buffer,
@@ -593,12 +406,12 @@ struct Memory::Impl {
                           GetInteger(current_vaddr), GetInteger(dest_addr), size);
             },
             [](const std::size_t copy_amount, u8* const dest_ptr) {
-               FastMemset(dest_ptr, 0, copy_amount);
+               std::memset(dest_ptr, 0, copy_amount);
             },
             [&](const Common::ProcessAddress current_vaddr, const std::size_t copy_amount,
                 u8* const host_ptr) {
                 HandleRasterizerWrite(GetInteger(current_vaddr), copy_amount);
-               FastMemset(host_ptr, 0, copy_amount);
+               std::memset(host_ptr, 0, copy_amount);
             },
             [](const std::size_t copy_amount) {});
     }
@@ -993,7 +806,7 @@ struct Memory::Impl {
             },
             [&]() { HandleRasterizerDownload(addr, sizeof(T)); });
         if (ptr) {
-            FastMemcpy(&result, ptr, sizeof(T));
+            std::memcpy(&result, ptr, sizeof(T));
         }
         return result;
     }
@@ -1080,7 +893,7 @@ struct Memory::Impl {
             },
             [&]() { HandleRasterizerWrite(addr, sizeof(T)); });
         if (ptr) {
-            FastMemcpy(ptr, &data, sizeof(T));
+            std::memcpy(ptr, &data, sizeof(T));
         }
     }
 
@@ -1203,7 +1016,7 @@ struct Memory::Impl {
     unsigned int thread_count = 2;
 
     // Minimum size in bytes for which parallel processing is beneficial
-    static constexpr size_t PARALLEL_THRESHOLD = 64 * 1024; // 64 KB
+    // size_t PARALLEL_THRESHOLD = (L3 cache size * number of physical cores); // not a fixed 64 KB
     std::array<VideoCore::RasterizerDownloadArea, Core::Hardware::NUM_CPU_CORES>
         rasterizer_read_areas{};
     std::array<GPUDirtyState, Core::Hardware::NUM_CPU_CORES> rasterizer_write_areas{};
-- 
2.39.5