[microprofile]: Disable microprofile

Testing this out to see how it impacts performance.
Use NEON intrinsics in Vec4 dot operation
2025-08-01 19:22:15 +02:00 · 2025-08-01 18:30:45 +02:00
2 changed files with 25 additions and 1 deletions
--- a/src/common/microprofile.h
+++ b/src/common/microprofile.h
@ -5,7 +5,7 @@

 // Uncomment this to disable microprofile. This will get you cleaner profiles when using
 // external sampling profilers like "Very Sleepy", and will improve performance somewhat.
-// #define MICROPROFILE_ENABLED 0
+#define MICROPROFILE_ENABLED 0

 // Customized Citra settings.
 // This file wraps the MicroProfile header so that these are consistent everywhere.
--- a/src/common/vector_math.h
+++ b/src/common/vector_math.h
@ -1,9 +1,16 @@
+// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
+
 // SPDX-FileCopyrightText: 2014 Tony Wasserka
 // SPDX-FileCopyrightText: 2014 Dolphin Emulator Project
 // SPDX-License-Identifier: BSD-3-Clause AND GPL-2.0-or-later

 #pragma once

+#ifdef __ARM_NEON
+#include <arm_neon.h>
+#endif
+
 #include <cmath>
 #include <type_traits>

@ -641,6 +648,23 @@ template <typename T>
    return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
 }

+#ifdef __ARM_NEON
+// NEON dot product for Vec4<float>. Targets without NEON do not get this specialization and
+// use the generic Dot template, which evaluates the same scalar expression.
+// The lane products are not summed in x, y, z, w order, so rounding may differ from scalar.
+template <>
+[[nodiscard]] inline float Dot(const Vec4<float>& a, const Vec4<float>& b) {
+    const float32x4_t products = vmulq_f32(vld1q_f32(a.AsArray()), vld1q_f32(b.AsArray()));
+#if defined(__aarch64__)
+    return vaddvq_f32(products); // AArch64 provides an across-vector add
+#else
+    // 32-bit ARM: add the high half onto the low half, then add the remaining pair.
+    const float32x2_t halves = vadd_f32(vget_high_f32(products), vget_low_f32(products));
+    return vget_lane_f32(vpadd_f32(halves, halves), 0);
+#endif
+}
+#endif // __ARM_NEON
+
 template <typename T>
 [[nodiscard]] constexpr Vec3<decltype(T{} * T{} - T{} * T{})> Cross(const Vec3<T>& a,
                                                                    const Vec3<T>& b) {