From c58af7b556c87a605ba4429e519dc4c40265eeb6 Mon Sep 17 00:00:00 2001
From: unknown <sahyno1996@gmail.com>
Date: Wed, 1 Oct 2025 13:54:48 +0200
Subject: [PATCH] add syncpoint to codecs, parallelize large guest memory block copies

---
 src/core/memory.cpp                      | 221 +++++++++++++++++++++--
 src/video_core/host1x/codecs/decoder.cpp |   5 +-
 src/video_core/host1x/codecs/decoder.h   |   3 +-
 src/video_core/host1x/codecs/h264.cpp    |   3 +-
 src/video_core/host1x/codecs/h264.h      |   2 +-
 src/video_core/host1x/codecs/vp8.cpp     |   4 +-
 src/video_core/host1x/codecs/vp8.h       |   2 +-
 src/video_core/host1x/codecs/vp9.cpp     |   6 +-
 src/video_core/host1x/codecs/vp9.h       |   4 +-
 src/video_core/host1x/nvdec.cpp          |   6 +-
 10 files changed, 224 insertions(+), 32 deletions(-)
diff --git a/src/core/memory.cpp b/src/core/memory.cpp
index 2583aae867..629897cb35 100644
--- a/src/core/memory.cpp
+++ b/src/core/memory.cpp
@@ -35,12 +35,115 @@
 
 namespace Core::Memory {
 
-static inline bool AddressSpaceContains(const Common::PageTable& table, const Common::ProcessAddress addr,
+namespace {
+
+/// Copies exactly `N` bytes from `src` to `dst`.
+///
+/// `N` is a template argument so that every call site hands std::memcpy a compile-time
+/// constant length, which the compiler can lower to plain loads and stores.
+template <std::size_t N>
+inline void CopyFixed(void* dst, const void* src) {
+    std::memcpy(dst, src, N);
+}
+
+/// Copies `size` bytes from `src` to `dst`, with fast paths for small power-of-two sizes.
+///
+/// Sizes 1, 2, 4, 8, 16, 32 and 64 are dispatched to fixed-length copies that the
+/// compiler can turn into plain load/store pairs; every other size falls back to a
+/// regular std::memcpy call.
+///
+/// The copy deliberately never dereferences `dst`/`src` through a casted u16/u32/u64
+/// pointer: neither buffer is required to be aligned for the wider type, and accessing
+/// the bytes through an unrelated integer type breaks the strict-aliasing rule. Both
+/// are undefined behaviour, whereas std::memcpy is valid for any alignment and type.
+///
+/// @param dst  Destination buffer of at least `size` bytes; must not overlap `src`.
+/// @param src  Source buffer of at least `size` bytes.
+/// @param size Number of bytes to copy; zero copies nothing.
+inline void FastMemcpy(void* dst, const void* src, std::size_t size) {
+    switch (size) {
+    case 1:
+        CopyFixed<1>(dst, src);
+        break;
+    case 2:
+        CopyFixed<2>(dst, src);
+        break;
+    case 4:
+        CopyFixed<4>(dst, src);
+        break;
+    case 8:
+        CopyFixed<8>(dst, src);
+        break;
+    case 16:
+        // Common case for SIMD registers
+        CopyFixed<16>(dst, src);
+        break;
+    case 32:
+        CopyFixed<32>(dst, src);
+        break;
+    case 64:
+        CopyFixed<64>(dst, src);
+        break;
+    default:
+        // Every other size, including zero, goes through the general-purpose routine
+        std::memcpy(dst, src, size);
+        break;
+    }
+}
+
+/// Fills `size` bytes at `dst` with the byte `static_cast<unsigned char>(value)`.
+///
+/// Behaves like std::memset for every `value` and `size`. Sizes 1, 2, 4, 8, 16, 32 and
+/// 64 are dispatched to std::memset calls with a compile-time constant length, which
+/// the compiler can turn into plain stores.
+///
+/// Storing `value` through a casted u16/u32/u64 pointer is not equivalent: it writes
+/// the integer itself rather than its low byte repeated (0x01 would become 0x0001
+/// instead of 0x0101), sign-extends negative values, and is undefined behaviour when
+/// `dst` is not aligned for the wider type.
+///
+/// @param dst   Destination buffer of at least `size` bytes.
+/// @param value Fill value; only its low 8 bits are used.
+/// @param size  Number of bytes to fill; zero writes nothing.
+inline void FastMemset(void* dst, int value, std::size_t size) {
+    switch (size) {
+    case 1:
+        std::memset(dst, value, 1);
+        break;
+    case 2:
+        std::memset(dst, value, 2);
+        break;
+    case 4:
+        std::memset(dst, value, 4);
+        break;
+    case 8:
+        std::memset(dst, value, 8);
+        break;
+    case 16:
+        std::memset(dst, value, 16);
+        break;
+    case 32:
+        std::memset(dst, value, 32);
+        break;
+    case 64:
+        std::memset(dst, value, 64);
+        break;
+    default:
+        // Every other size uses the general-purpose routine; this also replaces the
+        // former zero-fill loop, which issued u64 stores to possibly unaligned addresses.
+        std::memset(dst, value, size);
+        break;
+    }
+}
+
+bool AddressSpaceContains(const Common::PageTable& table, const Common::ProcessAddress addr,
                           const std::size_t size) {
     const Common::ProcessAddress max_addr = 1ULL << table.GetAddressSpaceBits();
     return addr + size >= addr && addr + size <= max_addr;
 }
 
+} // namespace
+
 // Implementation class used to keep the specifics of the memory subsystem hidden
 // from outside classes. This also allows modification to the internals of the memory
 // subsystem without needing to rebuild all files that make use of the memory interface.
@@ -313,28 +416,70 @@ struct Memory::Impl {
                 LOG_ERROR(HW_Memory,
                           "Unmapped ReadBlock @ 0x{:016X} (start address = 0x{:016X}, size = {})",
                           GetInteger(current_vaddr), GetInteger(src_addr), size);
-               std::memset(dest_buffer, 0, copy_amount);
+               FastMemset(dest_buffer, 0, copy_amount);
             },
             [&](const std::size_t copy_amount, const u8* const src_ptr) {
-                std::memcpy(dest_buffer, src_ptr, copy_amount);
+                FastMemcpy(dest_buffer, src_ptr, copy_amount);
             },
             [&](const Common::ProcessAddress current_vaddr, const std::size_t copy_amount,
                 const u8* const host_ptr) {
                 if constexpr (!UNSAFE) {
                     HandleRasterizerDownload(GetInteger(current_vaddr), copy_amount);
                 }
-                std::memcpy(dest_buffer, host_ptr, copy_amount);
+                FastMemcpy(dest_buffer, host_ptr, copy_amount);
             },
             [&](const std::size_t copy_amount) {
                 dest_buffer = static_cast<u8*>(dest_buffer) + copy_amount;
             });
     }
 
+    /// Reads `size` bytes of guest memory starting at `src_addr` into `dest_buffer`,
+    /// splitting the range into up to `thread_count` contiguous chunks that are each
+    /// read by ReadBlockImpl<false> on its own thread.
+    /// NOTE(review): ReadBlockImpl<false> runs concurrently here; confirm that it is
+    /// safe to call from several host threads at once.
+    ///
+    /// @returns true only if ReadBlockImpl<false> returned true for every chunk.
+    bool ReadBlockParallel(const Common::ProcessAddress src_addr, void* dest_buffer,
+                          const std::size_t size) {
+        // At least one worker, so the chunk-size division below can never be by zero.
+        const std::size_t workers = std::max<std::size_t>(thread_count, 1);
+        const std::size_t chunk_size = (size + workers - 1) / workers;
+
+        // One byte per worker. std::vector<bool> packs its elements into shared words,
+        // so concurrent writes to different indices would be a data race.
+        std::vector<char> results(workers, 1);
+        std::vector<std::thread> threads;
+        threads.reserve(workers);
+
+        for (std::size_t i = 0; i < workers; ++i) {
+            const std::size_t offset = i * chunk_size;
+            if (offset >= size) {
+                break;
+            }
+            const std::size_t current_chunk_size = std::min(chunk_size, size - offset);
+            const Common::ProcessAddress current_addr = src_addr + offset;
+            void* current_dest = static_cast<u8*>(dest_buffer) + offset;
+            threads.emplace_back([this, i, current_addr, current_dest, current_chunk_size, &results] {
+                results[i] = ReadBlockImpl<false>(current_addr, current_dest, current_chunk_size);
+            });
+        }
+
+        for (auto& thread : threads) {
+            thread.join();
+        }
+        return std::all_of(results.begin(), results.end(), [](char ok) { return ok != 0; });
+    }
+
     bool ReadBlock(const Common::ProcessAddress src_addr, void* dest_buffer,
                    const std::size_t size) {
-        // TODO: If you want a proper multithreaded implementation (w/o cache coherency fights)
-        // use TBB or something that splits the job properly
-        return ReadBlockImpl<false>(src_addr, dest_buffer, size);
+        // For small reads, use the regular implementation
+        if (size < PARALLEL_THRESHOLD) {
+            return ReadBlockImpl<false>(src_addr, dest_buffer, size);
+        }
+
+        // For large reads, use parallel implementation
+        return ReadBlockParallel(src_addr, dest_buffer, size);
     }
 
     bool ReadBlockUnsafe(const Common::ProcessAddress src_addr, void* dest_buffer,
@@ -370,25 +515,67 @@ struct Memory::Impl {
                           GetInteger(current_vaddr), GetInteger(dest_addr), size);
             },
             [&](const std::size_t copy_amount, u8* const dest_ptr) {
-                std::memcpy(dest_ptr, src_buffer, copy_amount);
+                FastMemcpy(dest_ptr, src_buffer, copy_amount);
             },
             [&](const Common::ProcessAddress current_vaddr, const std::size_t copy_amount,
                 u8* const host_ptr) {
                 if constexpr (!UNSAFE) {
                     HandleRasterizerWrite(GetInteger(current_vaddr), copy_amount);
                 }
-                std::memcpy(host_ptr, src_buffer, copy_amount);
+                FastMemcpy(host_ptr, src_buffer, copy_amount);
             },
             [&](const std::size_t copy_amount) {
                 src_buffer = static_cast<const u8*>(src_buffer) + copy_amount;
             });
     }
 
+    /// Writes `size` bytes from `src_buffer` to guest memory starting at `dest_addr`,
+    /// splitting the range into up to `thread_count` contiguous chunks that are each
+    /// written by WriteBlockImpl<false> on its own thread.
+    /// NOTE(review): WriteBlockImpl<false> runs concurrently here; confirm that it is
+    /// safe to call from several host threads at once.
+    ///
+    /// @returns true only if WriteBlockImpl<false> returned true for every chunk.
+    bool WriteBlockParallel(const Common::ProcessAddress dest_addr, const void* src_buffer,
+                          const std::size_t size) {
+        // At least one worker, so the chunk-size division below can never be by zero.
+        const std::size_t workers = std::max<std::size_t>(thread_count, 1);
+        const std::size_t chunk_size = (size + workers - 1) / workers;
+
+        // One byte per worker. std::vector<bool> packs its elements into shared words,
+        // so concurrent writes to different indices would be a data race.
+        std::vector<char> results(workers, 1);
+        std::vector<std::thread> threads;
+        threads.reserve(workers);
+
+        for (std::size_t i = 0; i < workers; ++i) {
+            const std::size_t offset = i * chunk_size;
+            if (offset >= size) {
+                break;
+            }
+            const std::size_t current_chunk_size = std::min(chunk_size, size - offset);
+            const Common::ProcessAddress current_addr = dest_addr + offset;
+            const void* current_src = static_cast<const u8*>(src_buffer) + offset;
+            threads.emplace_back([this, i, current_addr, current_src, current_chunk_size, &results] {
+                results[i] = WriteBlockImpl<false>(current_addr, current_src, current_chunk_size);
+            });
+        }
+
+        for (auto& thread : threads) {
+            thread.join();
+        }
+        return std::all_of(results.begin(), results.end(), [](char ok) { return ok != 0; });
+    }
+
     bool WriteBlock(const Common::ProcessAddress dest_addr, const void* src_buffer,
                     const std::size_t size) {
-        // TODO: If you want a proper multithreaded implementation (w/o cache coherency fights)
-        // use TBB or something that splits the job properly
-        return WriteBlockImpl<false>(dest_addr, src_buffer, size);
+        // For small writes, use the regular implementation
+        if (size < PARALLEL_THRESHOLD) {
+            return WriteBlockImpl<false>(dest_addr, src_buffer, size);
+        }
+
+        // For large writes, use parallel implementation
+        return WriteBlockParallel(dest_addr, src_buffer, size);
     }
 
     bool WriteBlockUnsafe(const Common::ProcessAddress dest_addr, const void* src_buffer,
@@ -406,12 +593,12 @@ struct Memory::Impl {
                           GetInteger(current_vaddr), GetInteger(dest_addr), size);
             },
             [](const std::size_t copy_amount, u8* const dest_ptr) {
-               std::memset(dest_ptr, 0, copy_amount);
+               FastMemset(dest_ptr, 0, copy_amount);
             },
             [&](const Common::ProcessAddress current_vaddr, const std::size_t copy_amount,
                 u8* const host_ptr) {
                 HandleRasterizerWrite(GetInteger(current_vaddr), copy_amount);
-               std::memset(host_ptr, 0, copy_amount);
+               FastMemset(host_ptr, 0, copy_amount);
             },
             [](const std::size_t copy_amount) {});
     }
@@ -806,7 +993,7 @@ struct Memory::Impl {
             },
             [&]() { HandleRasterizerDownload(addr, sizeof(T)); });
         if (ptr) {
-            std::memcpy(&result, ptr, sizeof(T));
+            FastMemcpy(&result, ptr, sizeof(T));
         }
         return result;
     }
@@ -893,7 +1080,7 @@ struct Memory::Impl {
             },
             [&]() { HandleRasterizerWrite(addr, sizeof(T)); });
         if (ptr) {
-            std::memcpy(ptr, &data, sizeof(T));
+            FastMemcpy(ptr, &data, sizeof(T));
         }
     }
 
@@ -1016,7 +1203,7 @@ struct Memory::Impl {
     unsigned int thread_count = 2;
 
     // Minimum size in bytes for which parallel processing is beneficial
-    //size_t PARALLEL_THRESHOLD = (L3 CACHE * NUM PHYSICAL CORES); // 64 KB
+    static constexpr size_t PARALLEL_THRESHOLD = 64 * 1024; // 64 KB
     std::array<VideoCore::RasterizerDownloadArea, Core::Hardware::NUM_CPU_CORES>
         rasterizer_read_areas{};
     std::array<GPUDirtyState, Core::Hardware::NUM_CPU_CORES> rasterizer_write_areas{};
diff --git a/src/video_core/host1x/codecs/decoder.cpp b/src/video_core/host1x/codecs/decoder.cpp
index cb17784b19..27559da2a9 100755
--- a/src/video_core/host1x/codecs/decoder.cpp
+++ b/src/video_core/host1x/codecs/decoder.cpp
@@ -12,9 +12,10 @@
 
 namespace Tegra {
 
-Decoder::Decoder(Host1x::Host1x& host1x_, s32 id_, const Host1x::NvdecCommon::NvdecRegisters& regs_,
+Decoder::Decoder(Host1x::Host1x& host1x_, s32 id_, u32 syncpoint_,
+                 const Host1x::NvdecCommon::NvdecRegisters& regs_,
                  Host1x::FrameQueue& frame_queue_)
-    : host1x(host1x_), memory_manager{host1x.GMMU()}, regs{regs_}, id{id_}, frame_queue{
+    : host1x(host1x_), memory_manager{host1x.GMMU()}, regs{regs_}, syncpoint{syncpoint_},id{id_}, frame_queue{
                                                                                 frame_queue_} {}
 
 Decoder::~Decoder() = default;
diff --git a/src/video_core/host1x/codecs/decoder.h b/src/video_core/host1x/codecs/decoder.h
index 22e6db8151..24e7d15801 100755
--- a/src/video_core/host1x/codecs/decoder.h
+++ b/src/video_core/host1x/codecs/decoder.h
@@ -41,7 +41,7 @@ public:
     [[nodiscard]] virtual std::string_view GetCurrentCodecName() const = 0;
 
 protected:
-    explicit Decoder(Host1x::Host1x& host1x, s32 id,
+    explicit Decoder(Host1x::Host1x& host1x, s32 id, u32 syncpoint,
                      const Host1x::NvdecCommon::NvdecRegisters& regs,
                      Host1x::FrameQueue& frame_queue);
 
@@ -53,6 +53,7 @@ protected:
     Host1x::Host1x& host1x;
     Tegra::MemoryManager& memory_manager;
     const Host1x::NvdecCommon::NvdecRegisters& regs;
+    u32 syncpoint;
     s32 id;
     Host1x::FrameQueue& frame_queue;
     Host1x::NvdecCommon::VideoCodec codec;
diff --git a/src/video_core/host1x/codecs/h264.cpp b/src/video_core/host1x/codecs/h264.cpp
index 0896fa6001..0b7cf0637f 100644
--- a/src/video_core/host1x/codecs/h264.cpp
+++ b/src/video_core/host1x/codecs/h264.cpp
@@ -29,8 +29,9 @@ constexpr std::array<u8, 16> zig_zag_scan{
 } // Anonymous namespace
 
 H264::H264(Host1x::Host1x& host1x_, const Host1x::NvdecCommon::NvdecRegisters& regs_, s32 id_,
+           u32 syncpoint_,
            Host1x::FrameQueue& frame_queue_)
-    : Decoder{host1x_, id_, regs_, frame_queue_} {
+    : Decoder{host1x_, id_, syncpoint_, regs_, frame_queue_} {
     codec = Host1x::NvdecCommon::VideoCodec::H264;
     initialized = decode_api.Initialize(codec);
 }
diff --git a/src/video_core/host1x/codecs/h264.h b/src/video_core/host1x/codecs/h264.h
index d946c6937d..5c94a76fcd 100644
--- a/src/video_core/host1x/codecs/h264.h
+++ b/src/video_core/host1x/codecs/h264.h
@@ -242,7 +242,7 @@ ASSERT_POSITION(weight_scale_4x4, 0x1C0);
 class H264 final : public Decoder {
 public:
     explicit H264(Host1x::Host1x& host1x, const Host1x::NvdecCommon::NvdecRegisters& regs, s32 id,
-                  Host1x::FrameQueue& frame_queue);
+                  u32 syncpoint, Host1x::FrameQueue& frame_queue);
     ~H264() override;
 
     H264(const H264&) = delete;
diff --git a/src/video_core/host1x/codecs/vp8.cpp b/src/video_core/host1x/codecs/vp8.cpp
index 6094f16e0e..300b5381ec 100644
--- a/src/video_core/host1x/codecs/vp8.cpp
+++ b/src/video_core/host1x/codecs/vp8.cpp
@@ -9,8 +9,8 @@
 
 namespace Tegra::Decoders {
 VP8::VP8(Host1x::Host1x& host1x_, const Host1x::NvdecCommon::NvdecRegisters& regs_, s32 id_,
-         Host1x::FrameQueue& frame_queue_)
-    : Decoder{host1x_, id_, regs_, frame_queue_} {
+         u32 syncpoint_, Host1x::FrameQueue& frame_queue_)
+    : Decoder{host1x_, id_, syncpoint_, regs_, frame_queue_} {
     codec = Host1x::NvdecCommon::VideoCodec::VP8;
     initialized = decode_api.Initialize(codec);
 }
diff --git a/src/video_core/host1x/codecs/vp8.h b/src/video_core/host1x/codecs/vp8.h
index 74800281d8..b36ceea4fe 100644
--- a/src/video_core/host1x/codecs/vp8.h
+++ b/src/video_core/host1x/codecs/vp8.h
@@ -29,7 +29,7 @@ enum class Vp8SurfaceIndex : u32 {
 class VP8 final : public Decoder {
 public:
     explicit VP8(Host1x::Host1x& host1x, const Host1x::NvdecCommon::NvdecRegisters& regs, s32 id,
-                 Host1x::FrameQueue& frame_queue);
+                 u32 syncpoint, Host1x::FrameQueue& frame_queue);
     ~VP8() override;
 
     VP8(const VP8&) = delete;
diff --git a/src/video_core/host1x/codecs/vp9.cpp b/src/video_core/host1x/codecs/vp9.cpp
index f80709d785..fce16e9357 100644
--- a/src/video_core/host1x/codecs/vp9.cpp
+++ b/src/video_core/host1x/codecs/vp9.cpp
@@ -242,8 +242,8 @@ constexpr std::array<u8, 254> map_lut{
 } // Anonymous namespace
 
 VP9::VP9(Host1x::Host1x& host1x_, const Host1x::NvdecCommon::NvdecRegisters& regs_, s32 id_,
-         Host1x::FrameQueue& frame_queue_)
-    : Decoder{host1x_, id_, regs_, frame_queue_} {
+         u32 syncpoint_, Host1x::FrameQueue& frame_queue_)
+    : Decoder{host1x_, id_, syncpoint_, regs_, frame_queue_} {
     codec = Host1x::NvdecCommon::VideoCodec::VP9;
     initialized = decode_api.Initialize(codec);
 }
@@ -900,6 +900,8 @@ std::span<const u8> VP9::ComposeFrame() {
 
     vp9_hidden_frame = WasFrameHidden();
 
+    host1x.GetSyncpointManager().IncrementGuest(syncpoint);
+
     return GetFrameBytes();
 }
 
diff --git a/src/video_core/host1x/codecs/vp9.h b/src/video_core/host1x/codecs/vp9.h
index 9d42033cb3..93c7d82682 100644
--- a/src/video_core/host1x/codecs/vp9.h
+++ b/src/video_core/host1x/codecs/vp9.h
@@ -113,8 +113,8 @@ private:
 
 class VP9 final : public Decoder {
 public:
-    explicit VP9(Host1x::Host1x& host1x, const Host1x::NvdecCommon::NvdecRegisters& regs, s32 id,
-                 Host1x::FrameQueue& frame_queue);
+    explicit VP9(Host1x::Host1x& host1x, const Host1x::NvdecCommon::NvdecRegisters& regs, s32 id,
+                 u32 syncpoint, Host1x::FrameQueue& frame_queue);
     ~VP9() override;
 
     VP9(const VP9&) = delete;
diff --git a/src/video_core/host1x/nvdec.cpp b/src/video_core/host1x/nvdec.cpp
index 1882ccb100..67ba3b0f70 100644
--- a/src/video_core/host1x/nvdec.cpp
+++ b/src/video_core/host1x/nvdec.cpp
@@ -48,13 +48,13 @@ void Nvdec::CreateDecoder(NvdecCommon::VideoCodec codec) {
     }
     switch (codec) {
     case NvdecCommon::VideoCodec::H264:
-        decoder = std::make_unique<Decoders::H264>(host1x, regs, id, frame_queue);
+        decoder = std::make_unique<Decoders::H264>(host1x, regs, id, syncpoint, frame_queue);
         break;
     case NvdecCommon::VideoCodec::VP8:
-        decoder = std::make_unique<Decoders::VP8>(host1x, regs, id, frame_queue);
+        decoder = std::make_unique<Decoders::VP8>(host1x, regs, id, syncpoint, frame_queue);
         break;
     case NvdecCommon::VideoCodec::VP9:
-        decoder = std::make_unique<Decoders::VP9>(host1x, regs, id, frame_queue);
+        decoder = std::make_unique<Decoders::VP9>(host1x, regs, id, syncpoint, frame_queue);
         break;
     default:
         UNIMPLEMENTED_MSG("Codec {}", decoder->GetCurrentCodecName());