[vector_math, Dot]: Rewrite ARM Neon optimization

Gamer64 2025-08-01 20:07:05 +02:00 committed by crueter
parent 1bbb14c76d
commit c897d998b7


@@ -648,22 +648,24 @@ template <typename T>
     return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
 }
-template <>
-[[nodiscard]] inline float Dot(const Vec4<float>& a, const Vec4<float>& b) {
-#ifdef __ARM_NEON
-    float32x4_t va = vld1q_f32(a.AsArray());
-    float32x4_t vb = vld1q_f32(b.AsArray());
-    float32x4_t result = vmulq_f32(va, vb);
-#if defined(__aarch64__) // Use vaddvq_f32 in ARMv8 architectures
-    return vaddvq_f32(result);
-#else // Use manual addition for older architectures
-    float32x2_t sum2 = vadd_f32(vget_high_f32(result), vget_low_f32(result));
-    return vget_lane_f32(vpadd_f32(sum2, sum2), 0);
-#endif
-#else
-    return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
-#endif
-}
+#ifdef __ARM_NEON__
+// NEON-accelerated overload for float Vec4 dot product
+inline float Dot(const Vec4<float>& a, const Vec4<float>& b) {
+    // Load 4 floats into NEON registers
+    float32x4_t va = vld1q_f32(&a.x);
+    float32x4_t vb = vld1q_f32(&b.x);
+    // Element-wise multiply
+    float32x4_t prod = vmulq_f32(va, vb);
+    // Horizontal add across the vector
+#if defined(__aarch64__)
+    return vaddvq_f32(prod);
+#else
+    float32x2_t sum2 = vadd_f32(vget_high_f32(prod), vget_low_f32(prod));
+    return vget_lane_f32(vpadd_f32(sum2, sum2), 0);
+#endif
+}
+#endif
 template <typename T>
 [[nodiscard]] constexpr Vec3<decltype(T{} * T{} - T{} * T{})> Cross(const Vec3<T>& a,
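For reference, here is a minimal standalone sketch of the same technique outside the codebase. It assumes a hypothetical Vec4f aggregate with contiguous x/y/z/w float members (a stand-in for Vec4<float>, whose contiguous layout the vld1q_f32(&a.x) load relies on) and falls back to scalar math on non-ARM targets:

#include <cstdio>
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#include <arm_neon.h>
#endif

// Hypothetical stand-in for Vec4<float>: four contiguous floats.
struct Vec4f {
    float x, y, z, w;
};

inline float DotScalar(const Vec4f& a, const Vec4f& b) {
    return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
}

inline float DotNeon(const Vec4f& a, const Vec4f& b) {
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
    // Contiguous member layout lets us load straight from &a.x
    float32x4_t va = vld1q_f32(&a.x);
    float32x4_t vb = vld1q_f32(&b.x);
    // Element-wise multiply
    float32x4_t prod = vmulq_f32(va, vb);
#if defined(__aarch64__)
    // Single-instruction horizontal add on ARMv8
    return vaddvq_f32(prod);
#else
    // Pairwise reduction for 32-bit NEON, which lacks vaddvq_f32
    float32x2_t sum2 = vadd_f32(vget_high_f32(prod), vget_low_f32(prod));
    return vget_lane_f32(vpadd_f32(sum2, sum2), 0);
#endif
#else
    // Portable fallback off ARM
    return DotScalar(a, b);
#endif
}

int main() {
    Vec4f a{1.0f, 2.0f, 3.0f, 4.0f};
    Vec4f b{5.0f, 6.0f, 7.0f, 8.0f};
    // Both paths should agree: 5 + 12 + 21 + 32 = 70
    std::printf("scalar=%f neon=%f\n", DotScalar(a, b), DotNeon(a, b));
    return 0;
}

On AArch64, vaddvq_f32 reduces the four products in one instruction; 32-bit NEON reaches the same result with a pairwise add, which is why the commit keeps the #else branch.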