Minor updates
All checks were successful
eden-license / license-header (pull_request) Successful in 13s
All checks were successful
eden-license / license-header (pull_request) Successful in 13s
This commit is contained in:
parent
c897d998b7
commit
37a410558e
1 changed file with 13 additions and 15 deletions
|
@@ -648,24 +648,22 @@ template <typename T>
|
||||||
return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
|
return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef __ARM_NEON__
|
template <>
|
||||||
// NEON-accelerated overload for float Vec4 dot product
|
[[nodiscard]] inline float Dot(const Vec4<float>& a, const Vec4<float>& b) {
|
||||||
inline float Dot(const Vec4<float>& a, const Vec4<float>& b) {
|
#ifdef __ARM_NEON
|
||||||
// Load 4 floats into NEON registers
|
|
||||||
float32x4_t va = vld1q_f32(&a.x);
|
float32x4_t va = vld1q_f32(&a.x);
|
||||||
float32x4_t vb = vld1q_f32(&b.x);
|
float32x4_t vb = vld1q_f32(&b.x);
|
||||||
// Element-wise multiply
|
float32x4_t result = vmulq_f32(va, vb);
|
||||||
float32x4_t prod = vmulq_f32(va, vb);
|
#if defined(__aarch64__) // Use vaddvq_f32 in ARMv8 architectures
|
||||||
|
return vaddvq_f32(result);
|
||||||
// Horizontal add across the vector
|
#else // Use manual addition for older architectures
|
||||||
#if defined(__aarch64__)
|
float32x2_t sum2 = vadd_f32(vget_high_f32(result), vget_low_f32(result));
|
||||||
return vaddvq_f32(prod);
|
return vget_lane_f32(vpadd_f32(sum2, sum2), 0);
|
||||||
#else
|
|
||||||
float32x2_t sum2 = vadd_f32(vget_high_f32(prod), vget_low_f32(prod));
|
|
||||||
return vget_lane_f32(vpadd_f32(sum2, sum2), 0);
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
#endif
|
#endif
|
||||||
|
#else
|
||||||
|
return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
[[nodiscard]] constexpr Vec3<decltype(T{} * T{} - T{} * T{})> Cross(const Vec3<T>& a,
|
[[nodiscard]] constexpr Vec3<decltype(T{} * T{} - T{} * T{})> Cross(const Vec3<T>& a,
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue