add syncpoint to codecs
Some checks failed
eden-license / license-header (pull_request) Failing after 28s

unknown 2025-10-01 13:54:48 +02:00
parent 020f1cdb1f
commit c58af7b556
10 changed files with 224 additions and 32 deletions

View file

@@ -35,12 +35,115 @@
namespace Core::Memory {
static inline bool AddressSpaceContains(const Common::PageTable& table, const Common::ProcessAddress addr,
namespace {
inline void FastMemcpy(void* dst, const void* src, std::size_t size) {
// Fast path for small copies
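// Note: the fixed-size cases below store through u16/u32/u64 pointers, which formally
// assumes suitably aligned pointers; in practice the supported hosts tolerate the
// unaligned accesses these copies can produce.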
switch (size) {
case 1:
*static_cast<u8*>(dst) = *static_cast<const u8*>(src);
break;
case 2:
*static_cast<u16*>(dst) = *static_cast<const u16*>(src);
break;
case 4:
*static_cast<u32*>(dst) = *static_cast<const u32*>(src);
break;
case 8:
*static_cast<u64*>(dst) = *static_cast<const u64*>(src);
break;
case 16: {
// Optimize for 16-byte copy (common case for SIMD registers)
const u64* src_64 = static_cast<const u64*>(src);
u64* dst_64 = static_cast<u64*>(dst);
dst_64[0] = src_64[0];
dst_64[1] = src_64[1];
break;
}
case 32: {
// Optimize for 32-byte copy
const u64* src_64 = static_cast<const u64*>(src);
u64* dst_64 = static_cast<u64*>(dst);
dst_64[0] = src_64[0];
dst_64[1] = src_64[1];
dst_64[2] = src_64[2];
dst_64[3] = src_64[3];
break;
}
case 64: {
// Optimize for 64-byte copy
const u64* src_64 = static_cast<const u64*>(src);
u64* dst_64 = static_cast<u64*>(dst);
dst_64[0] = src_64[0];
dst_64[1] = src_64[1];
dst_64[2] = src_64[2];
dst_64[3] = src_64[3];
dst_64[4] = src_64[4];
dst_64[5] = src_64[5];
dst_64[6] = src_64[6];
dst_64[7] = src_64[7];
break;
}
default:
// For larger sizes, use standard memcpy which is usually optimized by the compiler
std::memcpy(dst, src, size);
break;
}
}
inline void FastMemset(void* dst, int value, std::size_t size) {
// Fast path for small fills
switch (size) {
case 1:
*static_cast<u8*>(dst) = static_cast<u8>(value);
break;
case 2:
// Replicate the fill byte across the word so the result matches std::memset.
*static_cast<u16*>(dst) = static_cast<u16>(static_cast<u8>(value) * 0x0101u);
break;
case 4:
*static_cast<u32*>(dst) = static_cast<u32>(static_cast<u8>(value) * 0x01010101u);
break;
case 8:
*static_cast<u64*>(dst) = static_cast<u8>(value) * 0x0101010101010101ULL;
break;
case 16: {
// Optimize for 16-byte fill (common case for SIMD registers)
u64* dst_64 = static_cast<u64*>(dst);
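// Multiplying the byte by 0x0101010101010101 replicates it into every byte lane,
// e.g. a value of 0xAB becomes 0xABABABABABABABAB, matching std::memset.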
const u64 val64 = static_cast<u8>(value) * 0x0101010101010101ULL;
dst_64[0] = val64;
dst_64[1] = val64;
break;
}
default:
if (size <= 128 && value == 0) {
// Fast path for small zero-fills
u8* dst_bytes = static_cast<u8*>(dst);
for (std::size_t i = 0; i < size; i += 8) {
if (i + 8 <= size) {
*reinterpret_cast<u64*>(dst_bytes + i) = 0;
} else {
// Handle remaining bytes (less than 8)
for (std::size_t j = i; j < size; j++) {
dst_bytes[j] = 0;
}
}
}
} else {
// For larger sizes, use standard memset which is usually optimized by the compiler
std::memset(dst, value, size);
}
break;
}
}
bool AddressSpaceContains(const Common::PageTable& table, const Common::ProcessAddress addr,
const std::size_t size) {
const Common::ProcessAddress max_addr = 1ULL << table.GetAddressSpaceBits();
return addr + size >= addr && addr + size <= max_addr;
}
} // namespace
// Implementation class used to keep the specifics of the memory subsystem hidden
// from outside classes. This also allows modification to the internals of the memory
// subsystem without needing to rebuild all files that make use of the memory interface.
@@ -313,28 +416,70 @@ struct Memory::Impl {
LOG_ERROR(HW_Memory,
"Unmapped ReadBlock @ 0x{:016X} (start address = 0x{:016X}, size = {})",
GetInteger(current_vaddr), GetInteger(src_addr), size);
std::memset(dest_buffer, 0, copy_amount);
FastMemset(dest_buffer, 0, copy_amount);
},
[&](const std::size_t copy_amount, const u8* const src_ptr) {
std::memcpy(dest_buffer, src_ptr, copy_amount);
FastMemcpy(dest_buffer, src_ptr, copy_amount);
},
[&](const Common::ProcessAddress current_vaddr, const std::size_t copy_amount,
const u8* const host_ptr) {
if constexpr (!UNSAFE) {
HandleRasterizerDownload(GetInteger(current_vaddr), copy_amount);
}
std::memcpy(dest_buffer, host_ptr, copy_amount);
FastMemcpy(dest_buffer, host_ptr, copy_amount);
},
[&](const std::size_t copy_amount) {
dest_buffer = static_cast<u8*>(dest_buffer) + copy_amount;
});
}
bool ReadBlockParallel(const Common::ProcessAddress src_addr, void* dest_buffer,
const std::size_t size) {
// Calculate chunk size based on thread count
const size_t chunk_size = (size + thread_count - 1) / thread_count;
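// Ceiling division so every byte is covered: e.g. with thread_count == 2, a 96 KiB
// read splits into two 48 KiB chunks; any remainder lands in the final, shorter chunk.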
// Create threads for parallel processing
std::vector<std::thread> threads;
threads.reserve(thread_count);
// Create a vector to store the results of each thread
std::vector<bool> results(thread_count, true);
// Split the work among threads
for (unsigned int i = 0; i < thread_count; ++i) {
const size_t offset = i * chunk_size;
if (offset >= size) {
break;
}
const size_t current_chunk_size = std::min(chunk_size, size - offset);
const Common::ProcessAddress current_addr = src_addr + offset;
void* current_dest = static_cast<u8*>(dest_buffer) + offset;
// Launch thread
threads.emplace_back([this, i, current_addr, current_dest, current_chunk_size, &results] {
results[i] = ReadBlockImpl<false>(current_addr, current_dest, current_chunk_size);
});
}
// Wait for all threads to complete
for (auto& thread : threads) {
thread.join();
}
// Check if all operations succeeded
return std::all_of(results.begin(), results.end(), [](bool result) { return result; });
}
bool ReadBlock(const Common::ProcessAddress src_addr, void* dest_buffer,
const std::size_t size) {
// TODO: If you want a proper multithreaded implementation (w/o cache coherency fights)
// use TBB or something that splits the job properly
return ReadBlockImpl<false>(src_addr, dest_buffer, size);
// For small reads, use the regular implementation
if (size < PARALLEL_THRESHOLD) {
return ReadBlockImpl<false>(src_addr, dest_buffer, size);
}
// For large reads, use parallel implementation
return ReadBlockParallel(src_addr, dest_buffer, size);
}
bool ReadBlockUnsafe(const Common::ProcessAddress src_addr, void* dest_buffer,
@@ -370,25 +515,67 @@ struct Memory::Impl {
GetInteger(current_vaddr), GetInteger(dest_addr), size);
},
[&](const std::size_t copy_amount, u8* const dest_ptr) {
std::memcpy(dest_ptr, src_buffer, copy_amount);
FastMemcpy(dest_ptr, src_buffer, copy_amount);
},
[&](const Common::ProcessAddress current_vaddr, const std::size_t copy_amount,
u8* const host_ptr) {
if constexpr (!UNSAFE) {
HandleRasterizerWrite(GetInteger(current_vaddr), copy_amount);
}
std::memcpy(host_ptr, src_buffer, copy_amount);
FastMemcpy(host_ptr, src_buffer, copy_amount);
},
[&](const std::size_t copy_amount) {
src_buffer = static_cast<const u8*>(src_buffer) + copy_amount;
});
}
bool WriteBlockParallel(const Common::ProcessAddress dest_addr, const void* src_buffer,
const std::size_t size) {
// Calculate chunk size based on thread count
const size_t chunk_size = (size + thread_count - 1) / thread_count;
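// Same ceiling-division chunking as ReadBlockParallel; the final chunk is clamped by std::min below.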
// Create threads for parallel processing
std::vector<std::thread> threads;
threads.reserve(thread_count);
// Create a vector to store the results of each thread
std::vector<bool> results(thread_count, true);
// Split the work among threads
for (unsigned int i = 0; i < thread_count; ++i) {
const size_t offset = i * chunk_size;
if (offset >= size) {
break;
}
const size_t current_chunk_size = std::min(chunk_size, size - offset);
const Common::ProcessAddress current_addr = dest_addr + offset;
const void* current_src = static_cast<const u8*>(src_buffer) + offset;
// Launch thread
threads.emplace_back([this, i, current_addr, current_src, current_chunk_size, &results] {
results[i] = WriteBlockImpl<false>(current_addr, current_src, current_chunk_size);
});
}
// Wait for all threads to complete
for (auto& thread : threads) {
thread.join();
}
// Check if all operations succeeded
return std::all_of(results.begin(), results.end(), [](bool result) { return result; });
}
bool WriteBlock(const Common::ProcessAddress dest_addr, const void* src_buffer,
const std::size_t size) {
// TODO: If you want a proper multithreaded implementation (w/o cache coherency fights)
// use TBB or something that splits the job properly
return WriteBlockImpl<false>(dest_addr, src_buffer, size);
// For small writes, use the regular implementation
if (size < PARALLEL_THRESHOLD) {
return WriteBlockImpl<false>(dest_addr, src_buffer, size);
}
// For large writes, use parallel implementation
return WriteBlockParallel(dest_addr, src_buffer, size);
}
bool WriteBlockUnsafe(const Common::ProcessAddress dest_addr, const void* src_buffer,
@@ -406,12 +593,12 @@ struct Memory::Impl {
GetInteger(current_vaddr), GetInteger(dest_addr), size);
},
[](const std::size_t copy_amount, u8* const dest_ptr) {
std::memset(dest_ptr, 0, copy_amount);
FastMemset(dest_ptr, 0, copy_amount);
},
[&](const Common::ProcessAddress current_vaddr, const std::size_t copy_amount,
u8* const host_ptr) {
HandleRasterizerWrite(GetInteger(current_vaddr), copy_amount);
std::memset(host_ptr, 0, copy_amount);
FastMemset(host_ptr, 0, copy_amount);
},
[](const std::size_t copy_amount) {});
}
@@ -806,7 +993,7 @@ struct Memory::Impl {
},
[&]() { HandleRasterizerDownload(addr, sizeof(T)); });
if (ptr) {
std::memcpy(&result, ptr, sizeof(T));
FastMemcpy(&result, ptr, sizeof(T));
}
return result;
}
@@ -893,7 +1080,7 @@ struct Memory::Impl {
},
[&]() { HandleRasterizerWrite(addr, sizeof(T)); });
if (ptr) {
std::memcpy(ptr, &data, sizeof(T));
FastMemcpy(ptr, &data, sizeof(T));
}
}
@@ -1016,7 +1203,7 @@ struct Memory::Impl {
unsigned int thread_count = 2;
// Minimum size in bytes for which parallel processing is beneficial
//size_t PARALLEL_THRESHOLD = (L3 CACHE * NUM PHYSICAL CORES); // 64 KB
static constexpr size_t PARALLEL_THRESHOLD = 64 * 1024; // 64 KB
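// Example: with the default thread_count of 2, a 256 KiB ReadBlock exceeds this
// threshold and is split into two 128 KiB ReadBlockImpl calls.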
std::array<VideoCore::RasterizerDownloadArea, Core::Hardware::NUM_CPU_CORES>
rasterizer_read_areas{};
std::array<GPUDirtyState, Core::Hardware::NUM_CPU_CORES> rasterizer_write_areas{};
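For reference only, and not part of this commit: a minimal standalone sketch of how the FastMemcpy/FastMemset fast paths above could be sanity-checked against the standard library routines. The u8/u64 aliases and the main() harness are illustrative assumptions; the helpers themselves would be pasted in from the anonymous namespace above.

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>

// Stand-ins for the project's fixed-width aliases.
using u8 = std::uint8_t;
using u16 = std::uint16_t;
using u32 = std::uint32_t;
using u64 = std::uint64_t;

// ... paste FastMemcpy / FastMemset from the anonymous namespace above ...

int main() {
    // Aligned buffers so the fixed-size u16/u32/u64 stores are well defined.
    alignas(16) u8 src[64];
    alignas(16) u8 fast[64];
    alignas(16) u8 ref[64];
    for (std::size_t i = 0; i < sizeof(src); ++i) {
        src[i] = static_cast<u8>(i * 7 + 3);
    }
    constexpr std::size_t sizes[] = {1, 2, 4, 8, 16, 32, 64};
    for (const std::size_t size : sizes) {
        // Copies must be byte-identical to std::memcpy for every special-cased size.
        FastMemcpy(fast, src, size);
        std::memcpy(ref, src, size);
        assert(std::memcmp(fast, ref, size) == 0);
        // Zero fills must match std::memset (value 0 is what the call sites above use).
        FastMemset(fast, 0, size);
        std::memset(ref, 0, size);
        assert(std::memcmp(fast, ref, size) == 0);
    }
    return 0;
}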

View file

@@ -12,9 +12,10 @@
namespace Tegra {
Decoder::Decoder(Host1x::Host1x& host1x_, s32 id_, const Host1x::NvdecCommon::NvdecRegisters& regs_,
Decoder::Decoder(Host1x::Host1x& host1x_, s32 id_, u32 syncpoint_,
const Host1x::NvdecCommon::NvdecRegisters& regs_,
Host1x::FrameQueue& frame_queue_)
: host1x(host1x_), memory_manager{host1x.GMMU()}, regs{regs_}, id{id_}, frame_queue{
: host1x(host1x_), memory_manager{host1x.GMMU()}, regs{regs_}, syncpoint{syncpoint_}, id{id_}, frame_queue{
frame_queue_} {}
Decoder::~Decoder() = default;

View file

@@ -41,7 +41,7 @@ public:
[[nodiscard]] virtual std::string_view GetCurrentCodecName() const = 0;
protected:
explicit Decoder(Host1x::Host1x& host1x, s32 id,
explicit Decoder(Host1x::Host1x& host1x, s32 id, u32 syncpoint,
const Host1x::NvdecCommon::NvdecRegisters& regs,
Host1x::FrameQueue& frame_queue);
@@ -53,6 +53,7 @@ protected:
Host1x::Host1x& host1x;
Tegra::MemoryManager& memory_manager;
const Host1x::NvdecCommon::NvdecRegisters& regs;
u32 syncpoint;
s32 id;
Host1x::FrameQueue& frame_queue;
Host1x::NvdecCommon::VideoCodec codec;

View file

@@ -29,8 +29,9 @@ constexpr std::array<u8, 16> zig_zag_scan{
} // Anonymous namespace
H264::H264(Host1x::Host1x& host1x_, const Host1x::NvdecCommon::NvdecRegisters& regs_, s32 id_,
u32 syncpoint_,
Host1x::FrameQueue& frame_queue_)
: Decoder{host1x_, id_, regs_, frame_queue_} {
: Decoder{host1x_, id_, syncpoint_, regs_, frame_queue_} {
codec = Host1x::NvdecCommon::VideoCodec::H264;
initialized = decode_api.Initialize(codec);
}

View file

@@ -242,7 +242,7 @@ ASSERT_POSITION(weight_scale_4x4, 0x1C0);
class H264 final : public Decoder {
public:
explicit H264(Host1x::Host1x& host1x, const Host1x::NvdecCommon::NvdecRegisters& regs, s32 id,
Host1x::FrameQueue& frame_queue);
u32 syncpoint, Host1x::FrameQueue& frame_queue);
~H264() override;
H264(const H264&) = delete;

View file

@@ -9,8 +9,8 @@
namespace Tegra::Decoders {
VP8::VP8(Host1x::Host1x& host1x_, const Host1x::NvdecCommon::NvdecRegisters& regs_, s32 id_,
Host1x::FrameQueue& frame_queue_)
: Decoder{host1x_, id_, regs_, frame_queue_} {
u32 syncpoint_, Host1x::FrameQueue& frame_queue_)
: Decoder{host1x_, id_, syncpoint_, regs_, frame_queue_} {
codec = Host1x::NvdecCommon::VideoCodec::VP8;
initialized = decode_api.Initialize(codec);
}

View file

@@ -29,7 +29,7 @@ enum class Vp8SurfaceIndex : u32 {
class VP8 final : public Decoder {
public:
explicit VP8(Host1x::Host1x& host1x, const Host1x::NvdecCommon::NvdecRegisters& regs, s32 id,
Host1x::FrameQueue& frame_queue);
u32 syncpoint, Host1x::FrameQueue& frame_queue);
~VP8() override;
VP8(const VP8&) = delete;

View file

@@ -242,8 +242,8 @@ constexpr std::array<u8, 254> map_lut{
} // Anonymous namespace
VP9::VP9(Host1x::Host1x& host1x_, const Host1x::NvdecCommon::NvdecRegisters& regs_, s32 id_,
Host1x::FrameQueue& frame_queue_)
: Decoder{host1x_, id_, regs_, frame_queue_} {
u32 syncpoint_, Host1x::FrameQueue& frame_queue_)
: Decoder{host1x_, id_, syncpoint_, regs_, frame_queue_} {
codec = Host1x::NvdecCommon::VideoCodec::VP9;
initialized = decode_api.Initialize(codec);
}
@@ -900,6 +900,8 @@ std::span<const u8> VP9::ComposeFrame() {
vp9_hidden_frame = WasFrameHidden();
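// Advance the guest-side counter of this codec's syncpoint once the frame has been
// composed (exact Host1x fencing semantics assumed).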
host1x.GetSyncpointManager().IncrementGuest(syncpoint);
return GetFrameBytes();
}

View file

@@ -113,8 +113,8 @@ private:
class VP9 final : public Decoder {
public:
explicit VP9(Host1x::Host1x& host1x, const Host1x::NvdecCommon::NvdecRegisters& regs, s32 id,
Host1x::FrameQueue& frame_queue);
VP9(Host1x::Host1x& host1x_, const Host1x::NvdecCommon::NvdecRegisters& regs_, s32 id_,
u32 syncpoint_, Host1x::FrameQueue& frame_queue_);
~VP9() override;
VP9(const VP9&) = delete;

View file

@@ -48,13 +48,13 @@ void Nvdec::CreateDecoder(NvdecCommon::VideoCodec codec) {
}
switch (codec) {
case NvdecCommon::VideoCodec::H264:
decoder = std::make_unique<Decoders::H264>(host1x, regs, id, frame_queue);
decoder = std::make_unique<Decoders::H264>(host1x, regs, id, syncpoint, frame_queue);
break;
case NvdecCommon::VideoCodec::VP8:
decoder = std::make_unique<Decoders::VP8>(host1x, regs, id, frame_queue);
decoder = std::make_unique<Decoders::VP8>(host1x, regs, id, syncpoint, frame_queue);
break;
case NvdecCommon::VideoCodec::VP9:
decoder = std::make_unique<Decoders::VP9>(host1x, regs, id, frame_queue);
decoder = std::make_unique<Decoders::VP9>(host1x, regs, id, syncpoint, frame_queue);
break;
default:
UNIMPLEMENTED_MSG("Codec {}", decoder->GetCurrentCodecName());