revert 623baac26b
All checks were successful
eden-license / license-header (pull_request) Successful in 24s

revert Use NEON intrinsics in Vec4 dot operation

PabloMK7: Changes the Vec4 dot operation to use NEON intrinsics on ARM devices.
This function is used every time a triangle is added to the renderer, so it can be considered hot code. The other vector operations are not used as much, so there is no gain in providing NEON operations for them.

The improvements from this change are most likely minimal.

Co-authored-by: PabloMK7 <hackyglitch2@gmail.com>
This commit is contained in:
Shinmegumi 2025-08-01 19:35:44 +02:00
parent e24b83adb9
commit 9d3e1998b1

View file

@ -1,16 +1,9 @@
// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
// SPDX-License-Identifier: GPL-3.0-or-later
// SPDX-FileCopyrightText: 2014 Tony Wasserka // SPDX-FileCopyrightText: 2014 Tony Wasserka
// SPDX-FileCopyrightText: 2014 Dolphin Emulator Project // SPDX-FileCopyrightText: 2014 Dolphin Emulator Project
// SPDX-License-Identifier: BSD-3-Clause AND GPL-2.0-or-later // SPDX-License-Identifier: BSD-3-Clause AND GPL-2.0-or-later
#pragma once #pragma once
#ifdef __ARM_NEON
#include <arm_neon.h>
#endif
#include <cmath> #include <cmath>
#include <type_traits> #include <type_traits>
@ -648,23 +641,6 @@ template <typename T>
return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w; return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
} }
// Dot product specialised for Vec4<float>.
//
// When the compiler targets ARM NEON, both operands are loaded into 128-bit
// registers, multiplied lane-wise, and the four products are summed
// horizontally. On every other target the plain scalar expression is used.
//
// NOTE(review): assumes Vec4<float>::AsArray() yields a pointer to four
// contiguous floats ordered x, y, z, w - confirm against the Vec4 definition.
template <>
[[nodiscard]] inline float Dot(const Vec4<float>& a, const Vec4<float>& b) {
#ifndef __ARM_NEON
    // Portable scalar path.
    return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
#else
    // Lane-wise products of the two vectors.
    const float32x4_t products = vmulq_f32(vld1q_f32(a.AsArray()), vld1q_f32(b.AsArray()));
#if defined(__aarch64__)
    // AArch64 offers a single across-vector add.
    return vaddvq_f32(products);
#else
    // Older ARM: add the high and low halves, then pairwise-add what is left.
    const float32x2_t half_sums = vadd_f32(vget_high_f32(products), vget_low_f32(products));
    return vget_lane_f32(vpadd_f32(half_sums, half_sums), 0);
#endif
#endif
}
template <typename T> template <typename T>
[[nodiscard]] constexpr Vec3<decltype(T{} * T{} - T{} * T{})> Cross(const Vec3<T>& a, [[nodiscard]] constexpr Vec3<decltype(T{} * T{} - T{} * T{})> Cross(const Vec3<T>& a,
const Vec3<T>& b) { const Vec3<T>& b) {