From c897d998b787a9ca66a98a10f29e99b1dff99501 Mon Sep 17 00:00:00 2001
From: Gamer64 <76565986+Gamer64ytb@users.noreply.github.com>
Date: Fri, 1 Aug 2025 20:07:05 +0200
Subject: [PATCH] [vector_math, Dot]: Rewrite ARM Neon optimization

---
 src/common/vector_math.h | 32 +++++++++++++++++---------------
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/src/common/vector_math.h b/src/common/vector_math.h
index a72d033f63..e16e39cbc6 100644
--- a/src/common/vector_math.h
+++ b/src/common/vector_math.h
@@ -648,22 +648,24 @@ template <typename T>
     return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
 }
 
-template <>
-[[nodiscard]] inline float Dot(const Vec4<float>& a, const Vec4<float>& b) {
-#ifdef __ARM_NEON
-    float32x4_t va = vld1q_f32(a.AsArray());
-    float32x4_t vb = vld1q_f32(b.AsArray());
-    float32x4_t result = vmulq_f32(va, vb);
-#if defined(__aarch64__) // Use vaddvq_f32 in ARMv8 architectures
-    return vaddvq_f32(result);
-#else // Use manual addition for older architectures
-    float32x2_t sum2 = vadd_f32(vget_high_f32(result), vget_low_f32(result));
-    return vget_lane_f32(vpadd_f32(sum2, sum2), 0);
-#endif
-#else
-    return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
-#endif
+#ifdef __ARM_NEON
+// NEON overload of the 4-component float dot product; other targets fall back to the generic template.
+[[nodiscard]] inline float Dot(const Vec4<float>& a, const Vec4<float>& b) {
+    // Load four consecutive floats starting at x (assumes x, y, z, w are stored contiguously)
+    float32x4_t va = vld1q_f32(&a.x);
+    float32x4_t vb = vld1q_f32(&b.x);
+    // Lane-wise products: {a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w}
+    float32x4_t prod = vmulq_f32(va, vb);
+
+    // Sum the four lanes: one across-vector add on AArch64, fold halves then pairs on 32-bit ARM
+    #if defined(__aarch64__)
+    return vaddvq_f32(prod);
+    #else
+    float32x2_t sum2 = vadd_f32(vget_high_f32(prod), vget_low_f32(prod));
+    return vget_lane_f32(vpadd_f32(sum2, sum2), 0);
+    #endif
 }
+#endif
 
 template <typename T>
 [[nodiscard]] constexpr Vec3<T> Cross(const Vec3<T>& a,