From c58af7b556c87a605ba4429e519dc4c40265eeb6 Mon Sep 17 00:00:00 2001
From: unknown
Date: Wed, 1 Oct 2025 13:54:48 +0200
Subject: [PATCH] add syncpoint to codecs

Nvdec::CreateDecoder now hands its syncpoint to the H264, VP8 and VP9
decoders, and VP9::ComposeFrame() increments it through the host1x
syncpoint manager.

src/core/memory.cpp additionally gains FastMemcpy/FastMemset helpers and
splits ReadBlock/WriteBlock across worker threads once a transfer reaches
PARALLEL_THRESHOLD (64 KB).
---
 src/core/memory.cpp                      | 186 +++++++++++++++++++++--
 src/video_core/host1x/codecs/decoder.cpp |   5 +-
 src/video_core/host1x/codecs/decoder.h   |   3 +-
 src/video_core/host1x/codecs/h264.cpp    |   3 +-
 src/video_core/host1x/codecs/h264.h      |   2 +-
 src/video_core/host1x/codecs/vp8.cpp     |   4 +-
 src/video_core/host1x/codecs/vp8.h       |   2 +-
 src/video_core/host1x/codecs/vp9.cpp     |   6 +-
 src/video_core/host1x/codecs/vp9.h       |   4 +-
 src/video_core/host1x/nvdec.cpp          |   6 +-
 10 files changed, 189 insertions(+), 32 deletions(-)

diff --git a/src/core/memory.cpp b/src/core/memory.cpp
index 2583aae867..629897cb35 100644
--- a/src/core/memory.cpp
+++ b/src/core/memory.cpp
@@ -35,12 +35,80 @@
 
 namespace Core::Memory {
 
-static inline bool AddressSpaceContains(const Common::PageTable& table, const Common::ProcessAddress addr,
+namespace {
+
+// Copies `size` bytes from `src` to `dst`; the two ranges must not overlap.
+//
+// Every path goes through std::memcpy instead of dereferencing a casted pointer, because
+// callers pass plain byte pointers that need not be aligned for wider integer types. Calls
+// with a constant length are normally expanded inline by the compiler, so the common small
+// sizes still avoid a library call.
+inline void FastMemcpy(void* dst, const void* src, std::size_t size) {
+    switch (size) {
+    case 1:
+        std::memcpy(dst, src, 1);
+        break;
+    case 2:
+        std::memcpy(dst, src, 2);
+        break;
+    case 4:
+        std::memcpy(dst, src, 4);
+        break;
+    case 8:
+        std::memcpy(dst, src, 8);
+        break;
+    case 16:
+        std::memcpy(dst, src, 16);
+        break;
+    case 32:
+        std::memcpy(dst, src, 32);
+        break;
+    case 64:
+        std::memcpy(dst, src, 64);
+        break;
+    default:
+        // Any other length: let the library pick the strategy.
+        std::memcpy(dst, src, size);
+        break;
+    }
+}
+
+// Sets `size` bytes at `dst` to the low byte of `value`, with the same result as std::memset.
+//
+// The fixed-size paths call std::memset with a constant length rather than storing `value`
+// through a wider integer type: such a store would not replicate the byte across the range
+// (0xAB written as a u16 yields 0x00AB, not 0xABAB) and would need an aligned destination.
+inline void FastMemset(void* dst, int value, std::size_t size) {
+    switch (size) {
+    case 1:
+        std::memset(dst, value, 1);
+        break;
+    case 2:
+        std::memset(dst, value, 2);
+        break;
+    case 4:
+        std::memset(dst, value, 4);
+        break;
+    case 8:
+        std::memset(dst, value, 8);
+        break;
+    case 16:
+        std::memset(dst, value, 16);
+        break;
+    default:
+        std::memset(dst, value, size);
+        break;
+    }
+}
+
+bool AddressSpaceContains(const Common::PageTable& table, const Common::ProcessAddress addr,
                                         const std::size_t size) {
     const Common::ProcessAddress max_addr = 1ULL << table.GetAddressSpaceBits();
     return addr + size >= addr && addr + size <= max_addr;
 }
 
+} // namespace
+
 // Implementation class used to keep the specifics of the memory subsystem hidden
 // from outside classes. This also allows modification to the internals of the memory
 // subsystem without needing to rebuild all files that make use of the memory interface.
@@ -313,28 +416,70 @@ struct Memory::Impl {
                 LOG_ERROR(HW_Memory,
                           "Unmapped ReadBlock @ 0x{:016X} (start address = 0x{:016X}, size = {})",
                           GetInteger(current_vaddr), GetInteger(src_addr), size);
-                std::memset(dest_buffer, 0, copy_amount);
+                FastMemset(dest_buffer, 0, copy_amount);
             },
             [&](const std::size_t copy_amount, const u8* const src_ptr) {
-                std::memcpy(dest_buffer, src_ptr, copy_amount);
+                FastMemcpy(dest_buffer, src_ptr, copy_amount);
             },
             [&](const Common::ProcessAddress current_vaddr, const std::size_t copy_amount,
                 const u8* const host_ptr) {
                 if constexpr (!UNSAFE) {
                     HandleRasterizerDownload(GetInteger(current_vaddr), copy_amount);
                 }
-                std::memcpy(dest_buffer, host_ptr, copy_amount);
+                FastMemcpy(dest_buffer, host_ptr, copy_amount);
             },
             [&](const std::size_t copy_amount) {
                 dest_buffer = static_cast<u8*>(dest_buffer) + copy_amount;
             });
     }
 
+    bool ReadBlockParallel(const Common::ProcessAddress src_addr, void* dest_buffer,
+                           const std::size_t size) {
+        // Calculate chunk size based on thread count
+        const size_t chunk_size = (size + thread_count - 1) / thread_count;
+
+        // Create threads for parallel processing
+        std::vector<std::thread> threads;
+        threads.reserve(thread_count);
+
+        // One byte per worker: std::vector<bool> packs bits, so concurrent writes would race
+        std::vector<u8> results(thread_count, 1);
+
+        // Split the work among threads
+        for (unsigned int i = 0; i < thread_count; ++i) {
+            const size_t offset = i * chunk_size;
+            if (offset >= size) {
+                break;
+            }
+
+            const size_t current_chunk_size = std::min(chunk_size, size - offset);
+            const Common::ProcessAddress current_addr = src_addr + offset;
+            void* current_dest = static_cast<u8*>(dest_buffer) + offset;
+
+            // Launch thread
+            threads.emplace_back([this, i, current_addr, current_dest, current_chunk_size, &results] {
+                results[i] = ReadBlockImpl<false>(current_addr, current_dest, current_chunk_size);
+            });
+        }
+
+        // Wait for all threads to complete
+        for (auto& thread : threads) {
+            thread.join();
+        }
+
+        // Check if all operations succeeded
+        return std::all_of(results.begin(), results.end(), [](u8 ok) { return ok != 0; });
+    }
+
     bool ReadBlock(const Common::ProcessAddress src_addr, void* dest_buffer,
                    const std::size_t size) {
-        // TODO: If you want a proper multithreaded implementation (w/o cache coherency fights)
-        // use TBB or something that splits the job properly
-        return ReadBlockImpl<false>(src_addr, dest_buffer, size);
+        // For small reads, use the regular implementation
+        if (size < PARALLEL_THRESHOLD) {
+            return ReadBlockImpl<false>(src_addr, dest_buffer, size);
+        }
+
+        // For large reads, use parallel implementation
+        return ReadBlockParallel(src_addr, dest_buffer, size);
     }
 
     bool ReadBlockUnsafe(const Common::ProcessAddress src_addr, void* dest_buffer,
@@ -370,25 +515,67 @@ struct Memory::Impl {
                           GetInteger(current_vaddr), GetInteger(dest_addr), size);
             },
             [&](const std::size_t copy_amount, u8* const dest_ptr) {
-                std::memcpy(dest_ptr, src_buffer, copy_amount);
+                FastMemcpy(dest_ptr, src_buffer, copy_amount);
             },
             [&](const Common::ProcessAddress current_vaddr, const std::size_t copy_amount,
                 u8* const host_ptr) {
                 if constexpr (!UNSAFE) {
                     HandleRasterizerWrite(GetInteger(current_vaddr), copy_amount);
                 }
-                std::memcpy(host_ptr, src_buffer, copy_amount);
+                FastMemcpy(host_ptr, src_buffer, copy_amount);
             },
             [&](const std::size_t copy_amount) {
                 src_buffer = static_cast<const u8*>(src_buffer) + copy_amount;
             });
     }
 
+    bool WriteBlockParallel(const Common::ProcessAddress dest_addr, const void* src_buffer,
+                            const std::size_t size) {
+        // Calculate chunk size based on thread count
+        const size_t chunk_size = (size + thread_count - 1) / thread_count;
+
+        // Create threads for parallel processing
+        std::vector<std::thread> threads;
+        threads.reserve(thread_count);
+
+        // One byte per worker: std::vector<bool> packs bits, so concurrent writes would race
+        std::vector<u8> results(thread_count, 1);
+
+        // Split the work among threads
+        for (unsigned int i = 0; i < thread_count; ++i) {
+            const size_t offset = i * chunk_size;
+            if (offset >= size) {
+                break;
+            }
+
+            const size_t current_chunk_size = std::min(chunk_size, size - offset);
+            const Common::ProcessAddress current_addr = dest_addr + offset;
+            const void* current_src = static_cast<const u8*>(src_buffer) + offset;
+
+            // Launch thread
+            threads.emplace_back([this, i, current_addr, current_src, current_chunk_size, &results] {
+                results[i] = WriteBlockImpl<false>(current_addr, current_src, current_chunk_size);
+            });
+        }
+
+        // Wait for all threads to complete
+        for (auto& thread : threads) {
+            thread.join();
+        }
+
+        // Check if all operations succeeded
+        return std::all_of(results.begin(), results.end(), [](u8 ok) { return ok != 0; });
+    }
+
     bool WriteBlock(const Common::ProcessAddress dest_addr, const void* src_buffer,
                     const std::size_t size) {
-        // TODO: If you want a proper multithreaded implementation (w/o cache coherency fights)
-        // use TBB or something that splits the job properly
-        return WriteBlockImpl<false>(dest_addr, src_buffer, size);
+        // For small writes, use the regular implementation
+        if (size < PARALLEL_THRESHOLD) {
+            return WriteBlockImpl<false>(dest_addr, src_buffer, size);
+        }
+
+        // For large writes, use parallel implementation
+        return WriteBlockParallel(dest_addr, src_buffer, size);
     }
 
     bool WriteBlockUnsafe(const Common::ProcessAddress dest_addr, const void* src_buffer,
@@ -406,12 +593,12 @@ struct Memory::Impl {
                           GetInteger(current_vaddr), GetInteger(dest_addr), size);
             },
             [](const std::size_t copy_amount, u8* const dest_ptr) {
-                std::memset(dest_ptr, 0, copy_amount);
+                FastMemset(dest_ptr, 0, copy_amount);
             },
             [&](const Common::ProcessAddress current_vaddr, const std::size_t copy_amount,
                 u8* const host_ptr) {
                 HandleRasterizerWrite(GetInteger(current_vaddr), copy_amount);
-                std::memset(host_ptr, 0, copy_amount);
+                FastMemset(host_ptr, 0, copy_amount);
             },
             [](const std::size_t copy_amount) {});
     }
@@ -806,7 +993,7 @@ struct Memory::Impl {
             },
             [&]() { HandleRasterizerDownload(addr, sizeof(T)); });
         if (ptr) {
-            std::memcpy(&result, ptr, sizeof(T));
+            FastMemcpy(&result, ptr, sizeof(T));
         }
         return result;
     }
@@ -893,7 +1080,7 @@ struct Memory::Impl {
             },
             [&]() { HandleRasterizerWrite(addr, sizeof(T)); });
         if (ptr) {
-            std::memcpy(ptr, &data, sizeof(T));
+            FastMemcpy(ptr, &data, sizeof(T));
         }
     }
 
@@ -1016,5 +1203,5 @@ struct Memory::Impl {
     unsigned int thread_count = 2;
 
     // Minimum size in bytes for which parallel processing is beneficial
-    //size_t PARALLEL_THRESHOLD = (L3 CACHE * NUM PHYSICAL CORES); // 64 KB
+    static constexpr size_t PARALLEL_THRESHOLD = 64 * 1024; // 64 KB
 
diff --git a/src/video_core/host1x/codecs/decoder.cpp b/src/video_core/host1x/codecs/decoder.cpp
index cb17784b19..27559da2a9 100755
--- a/src/video_core/host1x/codecs/decoder.cpp
+++ b/src/video_core/host1x/codecs/decoder.cpp
@@ -12,9 +12,10 @@
 
 namespace Tegra {
 
-Decoder::Decoder(Host1x::Host1x& host1x_, s32 id_, const Host1x::NvdecCommon::NvdecRegisters& regs_,
+Decoder::Decoder(Host1x::Host1x& host1x_, s32 id_, u32 syncpoint_,
+                 const Host1x::NvdecCommon::NvdecRegisters& regs_,
                  Host1x::FrameQueue& frame_queue_)
-    : host1x(host1x_), memory_manager{host1x.GMMU()}, regs{regs_}, id{id_}, frame_queue{
+    : host1x(host1x_), memory_manager{host1x.GMMU()}, regs{regs_}, syncpoint{syncpoint_}, id{id_}, frame_queue{
                                                                                frame_queue_} {}
 
 Decoder::~Decoder() = default;
diff --git a/src/video_core/host1x/codecs/decoder.h b/src/video_core/host1x/codecs/decoder.h
index 22e6db8151..24e7d15801 100755
--- a/src/video_core/host1x/codecs/decoder.h
+++ b/src/video_core/host1x/codecs/decoder.h
@@ -41,7 +41,7 @@ public:
     [[nodiscard]] virtual std::string_view GetCurrentCodecName() const = 0;
 
 protected:
-    explicit Decoder(Host1x::Host1x& host1x, s32 id,
+    explicit Decoder(Host1x::Host1x& host1x, s32 id, u32 syncpoint,
                      const Host1x::NvdecCommon::NvdecRegisters& regs,
                      Host1x::FrameQueue& frame_queue);
 
@@ -53,6 +53,7 @@ protected:
     Host1x::Host1x& host1x;
     Tegra::MemoryManager& memory_manager;
     const Host1x::NvdecCommon::NvdecRegisters& regs;
+    u32 syncpoint;
     s32 id;
     Host1x::FrameQueue& frame_queue;
     Host1x::NvdecCommon::VideoCodec codec;
diff --git a/src/video_core/host1x/codecs/h264.cpp b/src/video_core/host1x/codecs/h264.cpp
index 0896fa6001..0b7cf0637f 100644
--- a/src/video_core/host1x/codecs/h264.cpp
+++ b/src/video_core/host1x/codecs/h264.cpp
@@ -29,8 +29,9 @@ constexpr std::array zig_zag_scan{
 } // Anonymous namespace
 
 H264::H264(Host1x::Host1x& host1x_, const Host1x::NvdecCommon::NvdecRegisters& regs_, s32 id_,
+           u32 syncpoint_,
            Host1x::FrameQueue& frame_queue_)
-    : Decoder{host1x_, id_, regs_, frame_queue_} {
+    : Decoder{host1x_, id_, syncpoint_, regs_, frame_queue_} {
     codec = Host1x::NvdecCommon::VideoCodec::H264;
     initialized = decode_api.Initialize(codec);
 }
diff --git a/src/video_core/host1x/codecs/h264.h b/src/video_core/host1x/codecs/h264.h
index d946c6937d..5c94a76fcd 100644
--- a/src/video_core/host1x/codecs/h264.h
+++ b/src/video_core/host1x/codecs/h264.h
@@ -242,7 +242,7 @@ ASSERT_POSITION(weight_scale_4x4, 0x1C0);
 class H264 final : public Decoder {
 public:
     explicit H264(Host1x::Host1x& host1x, const Host1x::NvdecCommon::NvdecRegisters& regs, s32 id,
-                  Host1x::FrameQueue& frame_queue);
+                  u32 syncpoint, Host1x::FrameQueue& frame_queue);
     ~H264() override;
 
     H264(const H264&) = delete;
diff --git a/src/video_core/host1x/codecs/vp8.cpp b/src/video_core/host1x/codecs/vp8.cpp
index 6094f16e0e..300b5381ec 100644
--- a/src/video_core/host1x/codecs/vp8.cpp
+++ b/src/video_core/host1x/codecs/vp8.cpp
@@ -9,8 +9,8 @@
 namespace Tegra::Decoders {
 
 VP8::VP8(Host1x::Host1x& host1x_, const Host1x::NvdecCommon::NvdecRegisters& regs_, s32 id_,
-         Host1x::FrameQueue& frame_queue_)
-    : Decoder{host1x_, id_, regs_, frame_queue_} {
+         u32 syncpoint_, Host1x::FrameQueue& frame_queue_)
+    : Decoder{host1x_, id_, syncpoint_, regs_, frame_queue_} {
     codec = Host1x::NvdecCommon::VideoCodec::VP8;
     initialized = decode_api.Initialize(codec);
 }
diff --git a/src/video_core/host1x/codecs/vp8.h b/src/video_core/host1x/codecs/vp8.h
index 74800281d8..b36ceea4fe 100644
--- a/src/video_core/host1x/codecs/vp8.h
+++ b/src/video_core/host1x/codecs/vp8.h
@@ -29,7 +29,7 @@ enum class Vp8SurfaceIndex : u32 {
 class VP8 final : public Decoder {
 public:
     explicit VP8(Host1x::Host1x& host1x, const Host1x::NvdecCommon::NvdecRegisters& regs, s32 id,
-                 Host1x::FrameQueue& frame_queue);
+                 u32 syncpoint, Host1x::FrameQueue& frame_queue);
     ~VP8() override;
 
     VP8(const VP8&) = delete;
diff --git a/src/video_core/host1x/codecs/vp9.cpp b/src/video_core/host1x/codecs/vp9.cpp
index f80709d785..fce16e9357 100644
--- a/src/video_core/host1x/codecs/vp9.cpp
+++ b/src/video_core/host1x/codecs/vp9.cpp
@@ -242,8 +242,8 @@ constexpr std::array map_lut{
 } // Anonymous namespace
 
 VP9::VP9(Host1x::Host1x& host1x_, const Host1x::NvdecCommon::NvdecRegisters& regs_, s32 id_,
-         Host1x::FrameQueue& frame_queue_)
-    : Decoder{host1x_, id_, regs_, frame_queue_} {
+         u32 syncpoint_, Host1x::FrameQueue& frame_queue_)
+    : Decoder{host1x_, id_, syncpoint_, regs_, frame_queue_} {
     codec = Host1x::NvdecCommon::VideoCodec::VP9;
     initialized = decode_api.Initialize(codec);
 }
@@ -900,6 +900,8 @@ std::span VP9::ComposeFrame() {
 
     vp9_hidden_frame = WasFrameHidden();
 
+    host1x.GetSyncpointManager().IncrementGuest(syncpoint);
+
     return GetFrameBytes();
 }
 
diff --git a/src/video_core/host1x/codecs/vp9.h b/src/video_core/host1x/codecs/vp9.h
index 9d42033cb3..93c7d82682 100644
--- a/src/video_core/host1x/codecs/vp9.h
+++ b/src/video_core/host1x/codecs/vp9.h
@@ -113,8 +113,8 @@ private:
 
 class VP9 final : public Decoder {
 public:
-    explicit VP9(Host1x::Host1x& host1x, const Host1x::NvdecCommon::NvdecRegisters& regs, s32 id,
-                 Host1x::FrameQueue& frame_queue);
+    explicit VP9(Host1x::Host1x& host1x, const Host1x::NvdecCommon::NvdecRegisters& regs, s32 id,
+                 u32 syncpoint, Host1x::FrameQueue& frame_queue);
     ~VP9() override;
 
     VP9(const VP9&) = delete;
diff --git a/src/video_core/host1x/nvdec.cpp b/src/video_core/host1x/nvdec.cpp
index 1882ccb100..67ba3b0f70 100644
--- a/src/video_core/host1x/nvdec.cpp
+++ b/src/video_core/host1x/nvdec.cpp
@@ -48,13 +48,13 @@ void Nvdec::CreateDecoder(NvdecCommon::VideoCodec codec) {
     }
     switch (codec) {
     case NvdecCommon::VideoCodec::H264:
-        decoder = std::make_unique<Decoders::H264>(host1x, regs, id, frame_queue);
+        decoder = std::make_unique<Decoders::H264>(host1x, regs, id, syncpoint, frame_queue);
         break;
     case NvdecCommon::VideoCodec::VP8:
-        decoder = std::make_unique<Decoders::VP8>(host1x, regs, id, frame_queue);
+        decoder = std::make_unique<Decoders::VP8>(host1x, regs, id, syncpoint, frame_queue);
         break;
     case NvdecCommon::VideoCodec::VP9:
-        decoder = std::make_unique<Decoders::VP9>(host1x, regs, id, frame_queue);
+        decoder = std::make_unique<Decoders::VP9>(host1x, regs, id, syncpoint, frame_queue);
         break;
     default:
         UNIMPLEMENTED_MSG("Codec {}", decoder->GetCurrentCodecName());