diff --git a/src/common/vector_math.h b/src/common/vector_math.h
index b4885835df..a72d033f63 100644
--- a/src/common/vector_math.h
+++ b/src/common/vector_math.h
@@ -1,9 +1,16 @@
+// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
+
 // SPDX-FileCopyrightText: 2014 Tony Wasserka
 // SPDX-FileCopyrightText: 2014 Dolphin Emulator Project
 // SPDX-License-Identifier: BSD-3-Clause AND GPL-2.0-or-later
 
 #pragma once
 
+#ifdef __ARM_NEON
+#include <arm_neon.h>
+#endif
+
 #include <cmath>
 #include <type_traits>
 
@@ -641,6 +648,27 @@ template <typename T>
     return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
 }
 
+// Specialization of Dot for Vec4<float>. With __ARM_NEON defined, the four lane products
+// come from one NEON multiply and are summed horizontally; otherwise the scalar sum is used.
+// NOTE(review): this overload is inline rather than constexpr, and the NEON reduction may
+// add lanes in a different order than the scalar expression - confirm callers tolerate both.
+template <>
+[[nodiscard]] inline float Dot(const Vec4<float>& a, const Vec4<float>& b) {
+#ifdef __ARM_NEON
+    float32x4_t va = vld1q_f32(a.AsArray());
+    float32x4_t vb = vld1q_f32(b.AsArray());
+    float32x4_t result = vmulq_f32(va, vb);
+#if defined(__aarch64__) // Use vaddvq_f32 in ARMv8 architectures
+    return vaddvq_f32(result);
+#else // Use manual addition for older architectures
+    float32x2_t sum2 = vadd_f32(vget_high_f32(result), vget_low_f32(result));
+    return vget_lane_f32(vpadd_f32(sum2, sum2), 0);
+#endif
+#else
+    return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
+#endif
+}
+
 template <typename T>
 [[nodiscard]] constexpr Vec3<decltype(T{} * T{} - T{} * T{})> Cross(const Vec3<T>& a,
                                                                     const Vec3<T>& b) {