From 1c927e659bf511f8f446653fcaaafe3c7bb9288d Mon Sep 17 00:00:00 2001
From: lizzie <lizzie@eden-emu.dev>
Date: Wed, 23 Jul 2025 22:33:15 +0100
Subject: [PATCH 01/38] [sse2neon] Update to v1.8.0

---
 externals/sse2neon/sse2neon.h | 1475 ++++++++++++++++++---------------
 1 file changed, 820 insertions(+), 655 deletions(-)

diff --git a/externals/sse2neon/sse2neon.h b/externals/sse2neon/sse2neon.h
index 66b93c1c74..4626e923fd 100755
--- a/externals/sse2neon/sse2neon.h
+++ b/externals/sse2neon/sse2neon.h
@@ -54,6 +54,7 @@
 //   Cuda Chen <clh960524@gmail.com>
 //   Aymen Qader <aymen.qader@arm.com>
 //   Anthony Roberts <anthony.roberts@linaro.org>
+//   Sean Luchen <seanluchen@google.com>
 
 /* Tunable configurations */
 
@@ -65,7 +66,7 @@
 #ifndef SSE2NEON_PRECISE_MINMAX
 #define SSE2NEON_PRECISE_MINMAX (0)
 #endif
-/* _mm_rcp_ps and _mm_div_ps */
+/* _mm_rcp_ps */
 #ifndef SSE2NEON_PRECISE_DIV
 #define SSE2NEON_PRECISE_DIV (0)
 #endif
@@ -113,6 +114,11 @@
 #warning "GCC versions earlier than 10 are not supported."
 #endif
 
+#if defined(__OPTIMIZE__) && !defined(SSE2NEON_SUPPRESS_WARNINGS)
+#warning \
+    "Report any potential compiler optimization issues when using SSE2NEON. See the 'Optimization' section at https://github.com/DLTcollab/sse2neon."
+#endif
+
 /* C language does not allow initializing a variable with a function call. */
 #ifdef __cplusplus
 #define _sse2neon_const static const
@@ -120,18 +126,34 @@
 #define _sse2neon_const const
 #endif
 
+#include <fenv.h>
 #include <stdint.h>
 #include <stdlib.h>
+#include <string.h>
 
-#if defined(_WIN32)
-/* Definitions for _mm_{malloc,free} are provided by <malloc.h>
- * from both MinGW-w64 and MSVC.
- */
+FORCE_INLINE double sse2neon_recast_u64_f64(uint64_t val) /* u64 bits -> f64 */
+{
+    double tmp; /* receives the raw bytes of val */
+    memcpy(&tmp, &val, sizeof(uint64_t)); /* byte copy; needs 64-bit double */
+    return tmp;
+}
+FORCE_INLINE int64_t sse2neon_recast_f64_s64(double val) /* f64 bits -> s64 */
+{
+    int64_t tmp; /* receives the raw bytes of val */
+    memcpy(&tmp, &val, sizeof(uint64_t)); /* same size as int64_t tmp */
+    return tmp;
+}
+
+#if defined(_WIN32) && !defined(__MINGW32__)
+/* Definitions for _mm_{malloc,free} are provided by <malloc.h> from MSVC. */
 #define SSE2NEON_ALLOC_DEFINED
 #endif
 
 /* If using MSVC */
 #ifdef _MSC_VER
+#if defined(_M_ARM64EC)
+#define _DISABLE_SOFTINTRIN_ 1
+#endif
 #include <intrin.h>
 #if SSE2NEON_INCLUDE_WINDOWS_H
 #include <processthreadsapi.h>
@@ -147,7 +169,7 @@
 #endif
 
 #if (defined(_M_AMD64) || defined(__x86_64__)) || \
-    (defined(_M_ARM64) || defined(__arm64__))
+    (defined(_M_ARM64) || defined(_M_ARM64EC) || defined(__arm64__))
 #define SSE2NEON_HAS_BITSCAN64
 #endif
 #endif
@@ -183,7 +205,7 @@
     }
 
 /* Compiler barrier */
-#if defined(_MSC_VER)
+#if defined(_MSC_VER) && !defined(__clang__)
 #define SSE2NEON_BARRIER() _ReadWriteBarrier()
 #else
 #define SSE2NEON_BARRIER()                     \
@@ -230,7 +252,7 @@ FORCE_INLINE void _sse2neon_smp_mb(void)
 #pragma GCC push_options
 #pragma GCC target("fpu=neon")
 #endif
-#elif defined(__aarch64__) || defined(_M_ARM64)
+#elif defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
 #if !defined(__clang__) && !defined(_MSC_VER)
 #pragma GCC push_options
 #pragma GCC target("+simd")
@@ -244,12 +266,15 @@ FORCE_INLINE void _sse2neon_smp_mb(void)
 #pragma GCC push_options
 #endif
 #else
-#error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A."
+#error \
+    "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A \
+(you could try setting target explicitly with -march or -mcpu)"
 #endif
 #endif
 
 #include <arm_neon.h>
-#if (!defined(__aarch64__) && !defined(_M_ARM64)) && (__ARM_ARCH == 8)
+#if (!defined(__aarch64__) && !defined(_M_ARM64) && !defined(_M_ARM64EC)) && \
+    (__ARM_ARCH == 8)
 #if defined __has_include && __has_include(<arm_acle.h>)
 #include <arm_acle.h>
 #endif
@@ -267,7 +292,7 @@ FORCE_INLINE void _sse2neon_smp_mb(void)
 #endif
 
 /* Rounding functions require either Aarch64 instructions or libm fallback */
-#if !defined(__aarch64__) && !defined(_M_ARM64)
+#if !defined(__aarch64__) && !defined(_M_ARM64) && !defined(_M_ARM64EC)
 #include <math.h>
 #endif
 
@@ -276,7 +301,7 @@ FORCE_INLINE void _sse2neon_smp_mb(void)
  * To write or access to these registers in user mode,
  * we have to perform syscall instead.
  */
-#if (!defined(__aarch64__) && !defined(_M_ARM64))
+#if !defined(__aarch64__) && !defined(_M_ARM64) && !defined(_M_ARM64EC)
 #include <sys/time.h>
 #endif
 
@@ -315,6 +340,15 @@ FORCE_INLINE void _sse2neon_smp_mb(void)
 #define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
     (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
 
+/**
+ * Macro that builds the shuffle immediate for _mm_shuffle_pd().
+ * Argument fp1 is a digit [01] selecting the element of argument "b" of
+ * _mm_shuffle_pd() that will be placed in fp1 of the result.
+ * fp0 is a digit [01] selecting the element of argument "a" of
+ * _mm_shuffle_pd() that will be placed in fp0 of the result.
+ */
+#define _MM_SHUFFLE2(fp1, fp0) (((fp1) << 1) | (fp0))
+
 #if __has_builtin(__builtin_shufflevector)
 #define _sse2neon_shuffle(type, a, b, ...) \
     __builtin_shufflevector(a, b, __VA_ARGS__)
@@ -376,13 +410,18 @@ typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */
 // On ARM 32-bit architecture, the float64x2_t is not supported.
 // The data type __m128d should be represented in a different way for related
 // intrinsic conversion.
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
 typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */
 #else
 typedef float32x4_t __m128d;
 #endif
 typedef int64x2_t __m128i; /* 128-bit vector containing integers */
 
+// Some intrinsics operate on unaligned data types.
+typedef int16_t ALIGN_STRUCT(1) unaligned_int16_t;
+typedef int32_t ALIGN_STRUCT(1) unaligned_int32_t;
+typedef int64_t ALIGN_STRUCT(1) unaligned_int64_t;
+
 // __int64 is defined in the Intrinsics Guide which maps to different datatype
 // in different data model
 #if !(defined(_WIN32) || defined(_WIN64) || defined(__int64))
@@ -472,7 +511,7 @@ typedef int64x2_t __m128i; /* 128-bit vector containing integers */
 
 #define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x)
 
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
 #define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x)
 #define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x)
 
@@ -604,7 +643,7 @@ FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
 }
 #endif
 
-#if !defined(__aarch64__) && !defined(_M_ARM64)
+#if !defined(__aarch64__) && !defined(_M_ARM64) && !defined(_M_ARM64EC)
 /* emulate vaddv u8 variant */
 FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8)
 {
@@ -619,7 +658,7 @@ FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8)
 }
 #endif
 
-#if !defined(__aarch64__) && !defined(_M_ARM64)
+#if !defined(__aarch64__) && !defined(_M_ARM64) && !defined(_M_ARM64EC)
 /* emulate vaddvq u8 variant */
 FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a)
 {
@@ -637,7 +676,7 @@ FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a)
 }
 #endif
 
-#if !defined(__aarch64__) && !defined(_M_ARM64)
+#if !defined(__aarch64__) && !defined(_M_ARM64) && !defined(_M_ARM64EC)
 /* emulate vaddvq u16 variant */
 FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a)
 {
@@ -692,6 +731,13 @@ FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a)
  */
 
 /* Constants for use with _mm_prefetch. */
+#if defined(_M_ARM64EC)
+/* winnt.h already defines these constants as macros, so undefine them first. */
+#undef _MM_HINT_NTA
+#undef _MM_HINT_T0
+#undef _MM_HINT_T1
+#undef _MM_HINT_T2
+#endif
 enum _mm_hint {
     _MM_HINT_NTA = 0, /* load data to L1 and L2 cache, mark it as NTA */
     _MM_HINT_T0 = 1,  /* load data to L1 and L2 cache */
@@ -707,7 +753,7 @@ typedef struct {
     uint8_t bit23 : 1;
     uint8_t bit24 : 1;
     uint8_t res2 : 7;
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     uint32_t res3;
 #endif
 } fpcr_bitfield;
@@ -851,15 +897,15 @@ FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b)
 // supported by WoA has crypto extensions. If this changes in the future,
 // this can be verified via the runtime-only method of:
 // IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE)
-#if (defined(_M_ARM64) && !defined(__clang__)) || \
-    (defined(__ARM_FEATURE_CRYPTO) &&             \
+#if ((defined(_M_ARM64) || defined(_M_ARM64EC)) && !defined(__clang__)) || \
+    (defined(__ARM_FEATURE_CRYPTO) &&                                      \
      (defined(__aarch64__) || __has_builtin(__builtin_arm_crypto_vmullp64)))
 // Wraps vmull_p64
 FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
 {
     poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0);
     poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0);
-#if defined(_MSC_VER)
+#if defined(_MSC_VER) && !defined(__clang__)
     __n64 a1 = {a}, b1 = {b};
     return vreinterpretq_u64_p128(vmull_p64(a1, b1));
 #else
@@ -977,8 +1023,8 @@ static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
 //   __m128i _mm_shuffle_epi32_default(__m128i a,
 //                                     __constrange(0, 255) int imm) {
 //       __m128i ret;
-//       ret[0] = a[imm        & 0x3];   ret[1] = a[(imm >> 2) & 0x3];
-//       ret[2] = a[(imm >> 4) & 0x03];  ret[3] = a[(imm >> 6) & 0x03];
+//       ret[0] = a[(imm)        & 0x3];   ret[1] = a[((imm) >> 2) & 0x3];
+//       ret[2] = a[((imm) >> 4) & 0x03];  ret[3] = a[((imm) >> 6) & 0x03];
 //       return ret;
 //   }
 #define _mm_shuffle_epi32_default(a, imm)                                   \
@@ -1076,7 +1122,7 @@ FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
     return vreinterpretq_m128i_s32(vcombine_s32(a32, a33));
 }
 
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
 #define _mm_shuffle_epi32_splat(a, imm) \
     vreinterpretq_m128i_s32(vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm)))
 #else
@@ -1093,8 +1139,8 @@ FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
 //   __m128 _mm_shuffle_ps_default(__m128 a, __m128 b,
 //                                 __constrange(0, 255) int imm) {
 //       __m128 ret;
-//       ret[0] = a[imm        & 0x3];   ret[1] = a[(imm >> 2) & 0x3];
-//       ret[2] = b[(imm >> 4) & 0x03];  ret[3] = b[(imm >> 6) & 0x03];
+//       ret[0] = a[(imm)        & 0x3];   ret[1] = a[((imm) >> 2) & 0x3];
+//       ret[2] = b[((imm) >> 4) & 0x03];  ret[3] = b[((imm) >> 6) & 0x03];
 //       return ret;
 //   }
 //
@@ -1516,7 +1562,7 @@ FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ps2pi
 FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a)
 {
-#if (defined(__aarch64__) || defined(_M_ARM64)) || \
+#if (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) || \
     defined(__ARM_FEATURE_DIRECTED_ROUNDING)
     return vreinterpret_m64_s32(
         vget_low_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a)))));
@@ -1541,7 +1587,7 @@ FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ss2si
 FORCE_INLINE int _mm_cvt_ss2si(__m128 a)
 {
-#if (defined(__aarch64__) || defined(_M_ARM64)) || \
+#if (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) || \
     defined(__ARM_FEATURE_DIRECTED_ROUNDING)
     return vgetq_lane_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a))),
                           0);
@@ -1672,7 +1718,7 @@ FORCE_INLINE float _mm_cvtss_f32(__m128 a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si64
 FORCE_INLINE int64_t _mm_cvtss_si64(__m128 a)
 {
-#if (defined(__aarch64__) || defined(_M_ARM64)) || \
+#if (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) || \
     defined(__ARM_FEATURE_DIRECTED_ROUNDING)
     return (int64_t) vgetq_lane_f32(vrndiq_f32(vreinterpretq_f32_m128(a)), 0);
 #else
@@ -1725,7 +1771,7 @@ FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ps
 FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
 {
-#if (defined(__aarch64__) || defined(_M_ARM64)) && !SSE2NEON_PRECISE_DIV
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128_f32(
         vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
 #else
@@ -1763,14 +1809,18 @@ FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
 #if !defined(SSE2NEON_ALLOC_DEFINED)
 FORCE_INLINE void _mm_free(void *addr)
 {
+#if defined(_WIN32) /* only reached when SSE2NEON_ALLOC_DEFINED is unset */
+    _aligned_free(addr); /* pairs with _aligned_malloc() in _mm_malloc() */
+#else
     free(addr);
+#endif
 }
 #endif
 
 FORCE_INLINE uint64_t _sse2neon_get_fpcr(void)
 {
     uint64_t value;
-#if defined(_MSC_VER)
+#if defined(_MSC_VER) && !defined(__clang__)
     value = _ReadStatusReg(ARM64_FPCR);
 #else
     __asm__ __volatile__("mrs %0, FPCR" : "=r"(value)); /* read */
@@ -1780,10 +1830,10 @@ FORCE_INLINE uint64_t _sse2neon_get_fpcr(void)
 
 FORCE_INLINE void _sse2neon_set_fpcr(uint64_t value)
 {
-#if defined(_MSC_VER)
+#if defined(_MSC_VER) && !defined(__clang__) /* __clang__ => asm path below */
     _WriteStatusReg(ARM64_FPCR, value);
 #else
-    __asm__ __volatile__("msr FPCR, %0" ::"r"(value));  /* write */
+    __asm__ __volatile__("msr FPCR, %0" ::"r"(value)); /* write value to FPCR */
 #endif
 }
 
@@ -1795,14 +1845,14 @@ FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode(void)
 {
     union {
         fpcr_bitfield field;
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
         uint64_t value;
 #else
         uint32_t value;
 #endif
     } r;
 
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     r.value = _sse2neon_get_fpcr();
 #else
     __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
@@ -1817,25 +1867,20 @@ FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode(void)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_ROUNDING_MODE
 FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void)
 {
-    union {
-        fpcr_bitfield field;
-#if defined(__aarch64__) || defined(_M_ARM64)
-        uint64_t value;
-#else
-        uint32_t value;
-#endif
-    } r;
-
-#if defined(__aarch64__) || defined(_M_ARM64)
-    r.value = _sse2neon_get_fpcr();
-#else
-    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
-#endif
-
-    if (r.field.bit22) {
-        return r.field.bit23 ? _MM_ROUND_TOWARD_ZERO : _MM_ROUND_UP;
-    } else {
-        return r.field.bit23 ? _MM_ROUND_DOWN : _MM_ROUND_NEAREST;
+    switch (fegetround()) { /* map the <fenv.h> mode to the _MM_ROUND_* value */
+    case FE_TONEAREST:
+        return _MM_ROUND_NEAREST;
+    case FE_DOWNWARD:
+        return _MM_ROUND_DOWN;
+    case FE_UPWARD:
+        return _MM_ROUND_UP;
+    case FE_TOWARDZERO:
+        return _MM_ROUND_TOWARD_ZERO;
+    default:
+        // fegetround() returns FE_TONEAREST, FE_DOWNWARD, FE_UPWARD or
+        // FE_TOWARDZERO on success. Any other value (including an error
+        // result) is reported as _MM_ROUND_TOWARD_ZERO (truncate).
+        return _MM_ROUND_TOWARD_ZERO;
     }
 }
 
@@ -1928,7 +1973,7 @@ FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
 FORCE_INLINE __m128i _mm_loadu_si16(const void *p)
 {
     return vreinterpretq_m128i_s16(
-        vsetq_lane_s16(*(const int16_t *) p, vdupq_n_s16(0), 0));
+        vsetq_lane_s16(*(const unaligned_int16_t *) p, vdupq_n_s16(0), 0)); /* lanes 1..7 stay 0 */
 }
 
 // Load unaligned 64-bit integer from memory into the first element of dst.
@@ -1936,7 +1981,7 @@ FORCE_INLINE __m128i _mm_loadu_si16(const void *p)
 FORCE_INLINE __m128i _mm_loadu_si64(const void *p)
 {
     return vreinterpretq_m128i_s64(
-        vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0)));
+        vsetq_lane_s64(*(const unaligned_int64_t *) p, vdupq_n_s64(0), 0)); /* lane 1 stays 0 */
 }
 
 // Allocate size bytes of memory, aligned to the alignment specified in align,
@@ -1946,6 +1991,9 @@ FORCE_INLINE __m128i _mm_loadu_si64(const void *p)
 #if !defined(SSE2NEON_ALLOC_DEFINED)
 FORCE_INLINE void *_mm_malloc(size_t size, size_t align)
 {
+#if defined(_WIN32)
+    return _aligned_malloc(size, align);
+#else
     void *ptr;
     if (align == 1)
         return malloc(size);
@@ -1954,6 +2002,7 @@ FORCE_INLINE void *_mm_malloc(size_t size, size_t align)
     if (!posix_memalign(&ptr, align, size))
         return ptr;
     return NULL;
+#endif
 }
 #endif
 
@@ -2117,7 +2166,7 @@ FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)
 FORCE_INLINE int _mm_movemask_pi8(__m64 a)
 {
     uint8x8_t input = vreinterpret_u8_m64(a);
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     static const int8_t shift[8] = {0, 1, 2, 3, 4, 5, 6, 7};
     uint8x8_t tmp = vshr_n_u8(input, 7);
     return vaddv_u8(vshl_u8(tmp, vld1_s8(shift)));
@@ -2138,7 +2187,7 @@ FORCE_INLINE int _mm_movemask_pi8(__m64 a)
 FORCE_INLINE int _mm_movemask_ps(__m128 a)
 {
     uint32x4_t input = vreinterpretq_u32_m128(a);
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     static const int32_t shift[4] = {0, 1, 2, 3};
     uint32x4_t tmp = vshrq_n_u32(input, 31);
     return vaddvq_u32(vshlq_u32(tmp, vld1q_s32(shift)));
@@ -2249,7 +2298,7 @@ FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)
 FORCE_INLINE void _mm_prefetch(char const *p, int i)
 {
     (void) i;
-#if defined(_MSC_VER)
+#if defined(_MSC_VER) && !defined(__clang__)
     switch (i) {
     case _MM_HINT_NTA:
         __prefetch2(p, 1);
@@ -2372,7 +2421,7 @@ FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
     uint64x1_t t = vpaddl_u32(vpaddl_u16(
         vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)))));
     return vreinterpret_m64_u16(
-        vset_lane_u16((int) vget_lane_u64(t, 0), vdup_n_u16(0), 0));
+        vset_lane_u16((uint16_t) vget_lane_u64(t, 0), vdup_n_u16(0), 0));
 }
 
 // Macro: Set the flush zero bits of the MXCSR control and status register to
@@ -2385,14 +2434,14 @@ FORCE_INLINE void _sse2neon_mm_set_flush_zero_mode(unsigned int flag)
     // regardless of the value of the FZ bit.
     union {
         fpcr_bitfield field;
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
         uint64_t value;
 #else
         uint32_t value;
 #endif
     } r;
 
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     r.value = _sse2neon_get_fpcr();
 #else
     __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
@@ -2400,10 +2449,10 @@ FORCE_INLINE void _sse2neon_mm_set_flush_zero_mode(unsigned int flag)
 
     r.field.bit24 = (flag & _MM_FLUSH_ZERO_MASK) == _MM_FLUSH_ZERO_ON;
 
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     _sse2neon_set_fpcr(r.value);
 #else
-    __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r));        /* write */
+    __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
 #endif
 }
 
@@ -2431,44 +2480,26 @@ FORCE_INLINE __m128 _mm_set_ps1(float _w)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE
 FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
 {
-    union {
-        fpcr_bitfield field;
-#if defined(__aarch64__) || defined(_M_ARM64)
-        uint64_t value;
-#else
-        uint32_t value;
-#endif
-    } r;
-
-#if defined(__aarch64__) || defined(_M_ARM64)
-    r.value = _sse2neon_get_fpcr();
-#else
-    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
-#endif
-
     switch (rounding) {
-    case _MM_ROUND_TOWARD_ZERO:
-        r.field.bit22 = 1;
-        r.field.bit23 = 1;
+    case _MM_ROUND_NEAREST:
+        rounding = FE_TONEAREST; /* parameter is reused to hold the FE_* value */
         break;
     case _MM_ROUND_DOWN:
-        r.field.bit22 = 0;
-        r.field.bit23 = 1;
+        rounding = FE_DOWNWARD;
         break;
     case _MM_ROUND_UP:
-        r.field.bit22 = 1;
-        r.field.bit23 = 0;
+        rounding = FE_UPWARD;
         break;
-    default:  //_MM_ROUND_NEAREST
-        r.field.bit22 = 0;
-        r.field.bit23 = 0;
+    case _MM_ROUND_TOWARD_ZERO:
+        rounding = FE_TOWARDZERO;
+        break;
+    default:
+        // rounding must be _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP or
+        // _MM_ROUND_TOWARD_ZERO. Any other (invalid) value is treated as
+        // FE_TOWARDZERO (truncate).
+        rounding = FE_TOWARDZERO;
     }
-
-#if defined(__aarch64__) || defined(_M_ARM64)
-    _sse2neon_set_fpcr(r.value);
-#else
-    __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r));        /* write */
-#endif
+    fesetround(rounding); /* return value is not checked */
 }
 
 // Copy single-precision (32-bit) floating-point element a to the lower element
@@ -2524,10 +2555,10 @@ FORCE_INLINE __m128 _mm_setzero_ps(void)
 // in dst.
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi16
 #ifdef _sse2neon_shuffle
-#define _mm_shuffle_pi16(a, imm)                                       \
-    vreinterpret_m64_s16(vshuffle_s16(                                 \
-        vreinterpret_s16_m64(a), vreinterpret_s16_m64(a), (imm & 0x3), \
-        ((imm >> 2) & 0x3), ((imm >> 4) & 0x3), ((imm >> 6) & 0x3)))
+#define _mm_shuffle_pi16(a, imm)                                         \
+    vreinterpret_m64_s16(vshuffle_s16(                                   \
+        vreinterpret_s16_m64(a), vreinterpret_s16_m64(a), ((imm) & 0x3), \
+        (((imm) >> 2) & 0x3), (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3)))
 #else
 #define _mm_shuffle_pi16(a, imm)                                              \
     _sse2neon_define1(                                                        \
@@ -2658,7 +2689,8 @@ FORCE_INLINE void _mm_lfence(void)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ps
 FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
 {
-#if (defined(__aarch64__) || defined(_M_ARM64)) && !SSE2NEON_PRECISE_SQRT
+#if (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) && \
+    !SSE2NEON_PRECISE_SQRT
     return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in)));
 #else
     float32x4_t recip = vrsqrteq_f32(vreinterpretq_f32_m128(in));
@@ -2887,7 +2919,7 @@ FORCE_INLINE __m128 _mm_undefined_ps(void)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_ps
 FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128_f32(
         vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
 #else
@@ -2903,7 +2935,7 @@ FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_ps
 FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128_f32(
         vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
 #else
@@ -2962,15 +2994,21 @@ FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_pd
 FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_f64(
         vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
-    double *da = (double *) &a;
-    double *db = (double *) &b;
+    double a0 = /* 64-bit lane 0 of a, bit-copied into a double */
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 = /* 64-bit lane 1 of a */
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 = /* 64-bit lane 0 of b */
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 = /* 64-bit lane 1 of b */
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
     double c[2];
-    c[0] = da[0] + db[0];
-    c[1] = da[1] + db[1];
+    c[0] = a0 + b0; /* sum of the lane-0 values */
+    c[1] = a1 + b1; /* sum of the lane-1 values */
     return vld1q_f32((float32_t *) c);
 #endif
 }
@@ -2981,14 +3019,16 @@ FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_sd
 FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return _mm_move_sd(a, _mm_add_pd(a, b));
 #else
-    double *da = (double *) &a;
-    double *db = (double *) &b;
+    double a0, a1, b0; /* lanes are read by value, not through a double * cast */
+    a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
     double c[2];
-    c[0] = da[0] + db[0];
-    c[1] = da[1];
+    c[0] = a0 + b0; /* only lane 0 is summed */
+    c[1] = a1; /* lane 1 is copied from a unchanged */
     return vld1q_f32((float32_t *) c);
 #endif
 }
@@ -3140,7 +3180,7 @@ FORCE_INLINE __m128i _mm_castps_si128(__m128 a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_pd
 FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_f64(vreinterpretq_f64_m128i(a));
 #else
     return vreinterpretq_m128d_f32(vreinterpretq_f32_m128i(a));
@@ -3212,7 +3252,7 @@ FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_pd
 FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_u64(
         vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
@@ -3238,17 +3278,21 @@ FORCE_INLINE __m128d _mm_cmpeq_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_pd
 FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_u64(
         vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    double a0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
     uint64_t d[2];
-    d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = (*(double *) &a1) >= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
+    d[0] = a0 >= b0 ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = a1 >= b1 ? ~UINT64_C(0) : UINT64_C(0);
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
@@ -3260,15 +3304,16 @@ FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_sd
 FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return _mm_move_sd(a, _mm_cmpge_pd(a, b));
 #else
     // expand "_mm_cmpge_pd()" to reduce unnecessary operations
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    double a0, b0;
+    a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
+    b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
     uint64_t d[2];
-    d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[0] = a0 >= b0 ? ~UINT64_C(0) : UINT64_C(0);
     d[1] = a1;
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
@@ -3307,17 +3352,21 @@ FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_pd
 FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_u64(
         vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    double a0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
     uint64_t d[2];
-    d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = (*(double *) &a1) > (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
+    d[0] = a0 > b0 ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = a1 > b1 ? ~UINT64_C(0) : UINT64_C(0);
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
@@ -3329,15 +3378,16 @@ FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_sd
 FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return _mm_move_sd(a, _mm_cmpgt_pd(a, b));
 #else
     // expand "_mm_cmpge_pd()" to reduce unnecessary operations
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    double a0, b0;
+    a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
+    b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
     uint64_t d[2];
-    d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[0] = a0 > b0 ? ~UINT64_C(0) : UINT64_C(0);
     d[1] = a1;
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
@@ -3349,17 +3399,21 @@ FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_pd
 FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_u64(
         vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    double a0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
     uint64_t d[2];
-    d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = (*(double *) &a1) <= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
+    d[0] = a0 <= b0 ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = a1 <= b1 ? ~UINT64_C(0) : UINT64_C(0);
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
@@ -3371,15 +3425,16 @@ FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_sd
 FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return _mm_move_sd(a, _mm_cmple_pd(a, b));
 #else
-    // expand "_mm_cmpge_pd()" to reduce unnecessary operations
+    // expand "_mm_cmple_pd()" to reduce unnecessary operations
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    double a0, b0;
+    a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
+    b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
     uint64_t d[2];
-    d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[0] = a0 <= b0 ? ~UINT64_C(0) : UINT64_C(0);
     d[1] = a1;
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
@@ -3421,17 +3476,21 @@ FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_pd
 FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_u64(
         vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    double a0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
     uint64_t d[2];
-    d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = (*(double *) &a1) < (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
+    d[0] = a0 < b0 ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = a1 < b1 ? ~UINT64_C(0) : UINT64_C(0);
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
@@ -3443,14 +3502,15 @@ FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_sd
 FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return _mm_move_sd(a, _mm_cmplt_pd(a, b));
 #else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    double a0, b0;
+    a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
+    b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
     uint64_t d[2];
-    d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[0] = a0 < b0 ? ~UINT64_C(0) : UINT64_C(0);
     d[1] = a1;
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
@@ -3462,7 +3522,7 @@ FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_pd
 FORCE_INLINE __m128d _mm_cmpneq_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_s32(vmvnq_s32(vreinterpretq_s32_u64(
         vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)))));
 #else
@@ -3488,20 +3548,22 @@ FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_pd
 FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_u64(veorq_u64(
         vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
         vdupq_n_u64(UINT64_MAX)));
 #else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    double a0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
     uint64_t d[2];
-    d[0] =
-        !((*(double *) &a0) >= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] =
-        !((*(double *) &a1) >= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
+    d[0] = !(a0 >= b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = !(a1 >= b1) ? ~UINT64_C(0) : UINT64_C(0);
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
@@ -3521,20 +3583,22 @@ FORCE_INLINE __m128d _mm_cmpnge_sd(__m128d a, __m128d b)
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_cmpngt_pd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_pd
 FORCE_INLINE __m128d _mm_cmpngt_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_u64(veorq_u64(
         vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
         vdupq_n_u64(UINT64_MAX)));
 #else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    double a0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
     uint64_t d[2];
-    d[0] =
-        !((*(double *) &a0) > (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] =
-        !((*(double *) &a1) > (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
+    d[0] = !(a0 > b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = !(a1 > b1) ? ~UINT64_C(0) : UINT64_C(0);
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
@@ -3554,20 +3618,22 @@ FORCE_INLINE __m128d _mm_cmpngt_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_pd
 FORCE_INLINE __m128d _mm_cmpnle_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_u64(veorq_u64(
         vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
         vdupq_n_u64(UINT64_MAX)));
 #else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    double a0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
     uint64_t d[2];
-    d[0] =
-        !((*(double *) &a0) <= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] =
-        !((*(double *) &a1) <= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
+    d[0] = !(a0 <= b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = !(a1 <= b1) ? ~UINT64_C(0) : UINT64_C(0);
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
@@ -3587,20 +3653,22 @@ FORCE_INLINE __m128d _mm_cmpnle_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_pd
 FORCE_INLINE __m128d _mm_cmpnlt_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_u64(veorq_u64(
         vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
         vdupq_n_u64(UINT64_MAX)));
 #else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    double a0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
     uint64_t d[2];
-    d[0] =
-        !((*(double *) &a0) < (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] =
-        !((*(double *) &a1) < (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
+    d[0] = !(a0 < b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = !(a1 < b1) ? ~UINT64_C(0) : UINT64_C(0);
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
@@ -3620,7 +3688,7 @@ FORCE_INLINE __m128d _mm_cmpnlt_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_pd
 FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     // Excluding NaNs, any two floating point numbers can be compared.
     uint64x2_t not_nan_a =
         vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
@@ -3628,19 +3696,17 @@ FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b)
         vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));
     return vreinterpretq_m128d_u64(vandq_u64(not_nan_a, not_nan_b));
 #else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    double a0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
     uint64_t d[2];
-    d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
-            (*(double *) &b0) == (*(double *) &b0))
-               ? ~UINT64_C(0)
-               : UINT64_C(0);
-    d[1] = ((*(double *) &a1) == (*(double *) &a1) &&
-            (*(double *) &b1) == (*(double *) &b1))
-               ? ~UINT64_C(0)
-               : UINT64_C(0);
+    d[0] = (a0 == a0 && b0 == b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = (a1 == a1 && b1 == b1) ? ~UINT64_C(0) : UINT64_C(0);
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
@@ -3652,17 +3718,15 @@ FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_sd
 FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return _mm_move_sd(a, _mm_cmpord_pd(a, b));
 #else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    double a0, b0;
+    a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
+    b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
     uint64_t d[2];
-    d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
-            (*(double *) &b0) == (*(double *) &b0))
-               ? ~UINT64_C(0)
-               : UINT64_C(0);
+    d[0] = (a0 == a0 && b0 == b0) ? ~UINT64_C(0) : UINT64_C(0);
     d[1] = a1;
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
@@ -3674,7 +3738,7 @@ FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_pd
 FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     // Two NaNs are not equal in comparison operation.
     uint64x2_t not_nan_a =
         vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
@@ -3683,19 +3747,17 @@ FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b)
     return vreinterpretq_m128d_s32(
         vmvnq_s32(vreinterpretq_s32_u64(vandq_u64(not_nan_a, not_nan_b))));
 #else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    double a0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
     uint64_t d[2];
-    d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
-            (*(double *) &b0) == (*(double *) &b0))
-               ? UINT64_C(0)
-               : ~UINT64_C(0);
-    d[1] = ((*(double *) &a1) == (*(double *) &a1) &&
-            (*(double *) &b1) == (*(double *) &b1))
-               ? UINT64_C(0)
-               : ~UINT64_C(0);
+    d[0] = (a0 == a0 && b0 == b0) ? UINT64_C(0) : ~UINT64_C(0);
+    d[1] = (a1 == a1 && b1 == b1) ? UINT64_C(0) : ~UINT64_C(0);
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
@@ -3707,17 +3769,15 @@ FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_sd
 FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return _mm_move_sd(a, _mm_cmpunord_pd(a, b));
 #else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    double a0, b0;
+    a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
+    b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
     uint64_t d[2];
-    d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
-            (*(double *) &b0) == (*(double *) &b0))
-               ? UINT64_C(0)
-               : ~UINT64_C(0);
+    d[0] = (a0 == a0 && b0 == b0) ? UINT64_C(0) : ~UINT64_C(0);
     d[1] = a1;
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
@@ -3729,13 +3789,13 @@ FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_sd
 FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vgetq_lane_u64(vcgeq_f64(a, b), 0) & 0x1;
 #else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-
-    return (*(double *) &a0 >= *(double *) &b0);
+    double a0, b0;
+    a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    return a0 >= b0;
 #endif
 }
 
@@ -3744,13 +3804,14 @@ FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_sd
 FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vgetq_lane_u64(vcgtq_f64(a, b), 0) & 0x1;
 #else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    double a0, b0;
+    a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
 
-    return (*(double *) &a0 > *(double *) &b0);
+    return a0 > b0;
 #endif
 }
 
@@ -3759,13 +3820,14 @@ FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_sd
 FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vgetq_lane_u64(vcleq_f64(a, b), 0) & 0x1;
 #else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    double a0, b0;
+    a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
 
-    return (*(double *) &a0 <= *(double *) &b0);
+    return a0 <= b0;
 #endif
 }
 
@@ -3774,13 +3836,14 @@ FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_sd
 FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vgetq_lane_u64(vcltq_f64(a, b), 0) & 0x1;
 #else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    double a0, b0;
+    a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
 
-    return (*(double *) &a0 < *(double *) &b0);
+    return a0 < b0;
 #endif
 }
 
@@ -3789,7 +3852,7 @@ FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_sd
 FORCE_INLINE int _mm_comieq_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vgetq_lane_u64(vceqq_f64(a, b), 0) & 0x1;
 #else
     uint32x4_t a_not_nan =
@@ -3818,7 +3881,7 @@ FORCE_INLINE int _mm_comineq_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_pd
 FORCE_INLINE __m128d _mm_cvtepi32_pd(__m128i a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_f64(
         vcvtq_f64_s64(vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a)))));
 #else
@@ -3849,8 +3912,11 @@ FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a)
         vcombine_s32(vmovn_s64(integers), vdup_n_s32(0)));
 #else
     __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
-    double d0 = ((double *) &rnd)[0];
-    double d1 = ((double *) &rnd)[1];
+    double d0, d1;
+    d0 = sse2neon_recast_u64_f64(
+        vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0));
+    d1 = sse2neon_recast_u64_f64(
+        vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 1));
     return _mm_set_epi32(0, 0, (int32_t) d1, (int32_t) d0);
 #endif
 }
@@ -3861,8 +3927,11 @@ FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a)
 FORCE_INLINE __m64 _mm_cvtpd_pi32(__m128d a)
 {
     __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
-    double d0 = ((double *) &rnd)[0];
-    double d1 = ((double *) &rnd)[1];
+    double d0, d1;
+    d0 = sse2neon_recast_u64_f64(
+        vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0));
+    d1 = sse2neon_recast_u64_f64(
+        vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 1));
     int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) d0, (int32_t) d1};
     return vreinterpret_m64_s32(vld1_s32(data));
 }
@@ -3873,13 +3942,14 @@ FORCE_INLINE __m64 _mm_cvtpd_pi32(__m128d a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_ps
 FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a));
     return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0)));
 #else
-    float a0 = (float) ((double *) &a)[0];
-    float a1 = (float) ((double *) &a)[1];
-    return _mm_set_ps(0, 0, a1, a0);
+    double a0, a1;
+    a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    return _mm_set_ps(0, 0, (float) a1, (float) a0);
 #endif
 }
 
@@ -3888,7 +3958,7 @@ FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_pd
 FORCE_INLINE __m128d _mm_cvtpi32_pd(__m64 a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_f64(
         vcvtq_f64_s64(vmovl_s32(vreinterpret_s32_m64(a))));
 #else
@@ -3907,7 +3977,7 @@ FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)
 {
 #if defined(__ARM_FEATURE_FRINT)
     return vreinterpretq_m128i_s32(vcvtq_s32_f32(vrnd32xq_f32(a)));
-#elif (defined(__aarch64__) || defined(_M_ARM64)) || \
+#elif (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) || \
     defined(__ARM_FEATURE_DIRECTED_ROUNDING)
     switch (_MM_GET_ROUNDING_MODE()) {
     case _MM_ROUND_NEAREST:
@@ -3961,7 +4031,7 @@ FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pd
 FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_f64(
         vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a))));
 #else
@@ -3975,10 +4045,12 @@ FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_f64
 FORCE_INLINE double _mm_cvtsd_f64(__m128d a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0);
 #else
-    return ((double *) &a)[0];
+    double _a =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    return _a;
 #endif
 }
 
@@ -3987,11 +4059,12 @@ FORCE_INLINE double _mm_cvtsd_f64(__m128d a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si32
 FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return (int32_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
 #else
     __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
-    double ret = ((double *) &rnd)[0];
+    double ret = sse2neon_recast_u64_f64(
+        vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0));
     return (int32_t) ret;
 #endif
 }
@@ -4001,11 +4074,12 @@ FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64
 FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return (int64_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
 #else
     __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
-    double ret = ((double *) &rnd)[0];
+    double ret = sse2neon_recast_u64_f64(
+        vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0));
     return (int64_t) ret;
 #endif
 }
@@ -4022,13 +4096,15 @@ FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_ss
 FORCE_INLINE __m128 _mm_cvtsd_ss(__m128 a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128_f32(vsetq_lane_f32(
         vget_lane_f32(vcvt_f32_f64(vreinterpretq_f64_m128d(b)), 0),
         vreinterpretq_f32_m128(a), 0));
 #else
-    return vreinterpretq_m128_f32(vsetq_lane_f32((float) ((double *) &b)[0],
-                                                 vreinterpretq_f32_m128(a), 0));
+    double b0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    return vreinterpretq_m128_f32(
+        vsetq_lane_f32((float) b0, vreinterpretq_f32_m128(a), 0));
 #endif
 }
 
@@ -4056,13 +4132,13 @@ FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_sd
 FORCE_INLINE __m128d _mm_cvtsi32_sd(__m128d a, int32_t b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_f64(
         vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
 #else
-    double bf = (double) b;
+    int64_t _b = sse2neon_recast_f64_s64((double) b);
     return vreinterpretq_m128d_s64(
-        vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0));
+        vsetq_lane_s64(_b, vreinterpretq_s64_m128d(a), 0));
 #endif
 }
 
@@ -4084,13 +4160,13 @@ FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_sd
 FORCE_INLINE __m128d _mm_cvtsi64_sd(__m128d a, int64_t b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_f64(
         vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
 #else
-    double bf = (double) b;
+    int64_t _b = sse2neon_recast_f64_s64((double) b);
     return vreinterpretq_m128d_s64(
-        vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0));
+        vsetq_lane_s64(_b, vreinterpretq_s64_m128d(a), 0));
 #endif
 }
 
@@ -4121,12 +4197,12 @@ FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)
 FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b)
 {
     double d = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_f64(
         vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0));
 #else
-    return vreinterpretq_m128d_s64(
-        vsetq_lane_s64(*(int64_t *) &d, vreinterpretq_s64_m128d(a), 0));
+    return vreinterpretq_m128d_s64(vsetq_lane_s64(
+        sse2neon_recast_f64_s64(d), vreinterpretq_s64_m128d(a), 0));
 #endif
 }
 
@@ -4135,8 +4211,9 @@ FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_epi32
 FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a)
 {
-    double a0 = ((double *) &a)[0];
-    double a1 = ((double *) &a)[1];
+    double a0, a1;
+    a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
     return _mm_set_epi32(0, 0, (int32_t) a1, (int32_t) a0);
 }
 
@@ -4145,8 +4222,9 @@ FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_pi32
 FORCE_INLINE __m64 _mm_cvttpd_pi32(__m128d a)
 {
-    double a0 = ((double *) &a)[0];
-    double a1 = ((double *) &a)[1];
+    double a0, a1;
+    a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
     int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) a0, (int32_t) a1};
     return vreinterpret_m64_s32(vld1_s32(data));
 }
@@ -4164,8 +4242,9 @@ FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si32
 FORCE_INLINE int32_t _mm_cvttsd_si32(__m128d a)
 {
-    double ret = *((double *) &a);
-    return (int32_t) ret;
+    double _a =  // lane 0 of a, bit-copied from its u64 view
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    return (int32_t) _a;  // truncates toward zero; NOTE(review): NaN/out-of-range is UB in C (x86 yields 0x80000000) - confirm callers
 }
 
 // Convert the lower double-precision (64-bit) floating-point element in a to a
@@ -4173,11 +4252,12 @@ FORCE_INLINE int32_t _mm_cvttsd_si32(__m128d a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64
 FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0);
 #else
-    double ret = *((double *) &a);
-    return (int64_t) ret;
+    double _a =  // fallback path: lane 0 of a as a scalar double
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    return (int64_t) _a;  // plain C cast, truncates toward zero
 #endif
 }
 
@@ -4191,15 +4271,21 @@ FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_pd
 FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_f64(
         vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
-    double *da = (double *) &a;
-    double *db = (double *) &b;
+    double a0 =  // fallback path: each lane of a and b becomes a scalar double
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
     double c[2];
-    c[0] = da[0] / db[0];
-    c[1] = da[1] / db[1];
+    c[0] = a0 / b0;  // per-lane quotient; c[] is reloaded as a vector below
+    c[1] = a1 / b1;
     return vld1q_f32((float32_t *) c);
 #endif
 }
@@ -4211,7 +4297,7 @@ FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_sd
 FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     float64x2_t tmp =
         vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b));
     return vreinterpretq_m128d_f64(
@@ -4243,7 +4329,7 @@ FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd
 FORCE_INLINE __m128d _mm_load_pd(const double *p)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_f64(vld1q_f64(p));
 #else
     const float *fp = (const float *) p;
@@ -4263,7 +4349,7 @@ FORCE_INLINE __m128d _mm_load_pd(const double *p)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_sd
 FORCE_INLINE __m128d _mm_load_sd(const double *p)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0));
 #else
     const float *fp = (const float *) p;
@@ -4285,7 +4371,7 @@ FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_pd
 FORCE_INLINE __m128d _mm_load1_pd(const double *p)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_f64(vld1q_dup_f64(p));
 #else
     return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *) p));
@@ -4298,7 +4384,7 @@ FORCE_INLINE __m128d _mm_load1_pd(const double *p)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pd
 FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_f64(
         vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p)));
 #else
@@ -4324,7 +4410,7 @@ FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pd
 FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_f64(
         vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a))));
 #else
@@ -4340,7 +4426,7 @@ FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_pd
 FORCE_INLINE __m128d _mm_loadr_pd(const double *p)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     float64x2_t v = vld1q_f64(p);
     return vreinterpretq_m128d_f64(vextq_f64(v, v, 1));
 #else
@@ -4361,7 +4447,7 @@ FORCE_INLINE __m128d _mm_loadu_pd(const double *p)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si128
 FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
 {
-    return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
+    return vreinterpretq_m128i_s32(vld1q_s32((const unaligned_int32_t *) p));  // unaligned_int32_t: declared elsewhere; presumably alignment-1 - confirm
 }
 
 // Load unaligned 32-bit integer from memory into the first element of dst.
@@ -4369,7 +4455,7 @@ FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
 FORCE_INLINE __m128i _mm_loadu_si32(const void *p)
 {
     return vreinterpretq_m128i_s32(
-        vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0));
+        vsetq_lane_s32(*(const unaligned_int32_t *) p, vdupq_n_s32(0), 0));  // only lane 0 is loaded; lanes 1..3 stay zero
 }
 
 // Multiply packed signed 16-bit integers in a and b, producing intermediate
@@ -4380,7 +4466,7 @@ FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
 {
     int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
                               vget_low_s16(vreinterpretq_s16_m128i(b)));
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     int32x4_t high =
         vmull_high_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b));
 
@@ -4434,7 +4520,7 @@ FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pd
 FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
 #if SSE2NEON_PRECISE_MINMAX
     float64x2_t _a = vreinterpretq_f64_m128d(a);
     float64x2_t _b = vreinterpretq_f64_m128d(b);
@@ -4444,15 +4530,19 @@ FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b)
         vmaxq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #endif
 #else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
-    uint64_t d[2];
-    d[0] = (*(double *) &a0) > (*(double *) &b0) ? a0 : b0;
-    d[1] = (*(double *) &a1) > (*(double *) &b1) ? a1 : b1;
+    double a0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
+    int64_t d[2];
+    d[0] = a0 > b0 ? sse2neon_recast_f64_s64(a0) : sse2neon_recast_f64_s64(b0);
+    d[1] = a1 > b1 ? sse2neon_recast_f64_s64(a1) : sse2neon_recast_f64_s64(b1);
 
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
+    return vreinterpretq_m128d_s64(vld1q_s64(d));
 #endif
 }
 
@@ -4462,12 +4552,14 @@ FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_sd
 FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return _mm_move_sd(a, _mm_max_pd(a, b));
 #else
-    double *da = (double *) &a;
-    double *db = (double *) &b;
-    double c[2] = {da[0] > db[0] ? da[0] : db[0], da[1]};
+    double a0, a1, b0;  // b's upper lane is never read on this path
+    a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double c[2] = {a0 > b0 ? a0 : b0, a1};  // '>' is false on NaN, so b0 wins
     return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c));
 #endif
 }
@@ -4495,7 +4587,7 @@ FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pd
 FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
 #if SSE2NEON_PRECISE_MINMAX
     float64x2_t _a = vreinterpretq_f64_m128d(a);
     float64x2_t _b = vreinterpretq_f64_m128d(b);
@@ -4505,14 +4597,18 @@ FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b)
         vminq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #endif
 #else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
-    uint64_t d[2];
-    d[0] = (*(double *) &a0) < (*(double *) &b0) ? a0 : b0;
-    d[1] = (*(double *) &a1) < (*(double *) &b1) ? a1 : b1;
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
+    double a0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
+    int64_t d[2];
+    d[0] = a0 < b0 ? sse2neon_recast_f64_s64(a0) : sse2neon_recast_f64_s64(b0);
+    d[1] = a1 < b1 ? sse2neon_recast_f64_s64(a1) : sse2neon_recast_f64_s64(b1);
+    return vreinterpretq_m128d_s64(vld1q_s64(d));
 #endif
 }
 
@@ -4522,12 +4618,14 @@ FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_sd
 FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return _mm_move_sd(a, _mm_min_pd(a, b));
 #else
-    double *da = (double *) &a;
-    double *db = (double *) &b;
-    double c[2] = {da[0] < db[0] ? da[0] : db[0], da[1]};
+    double a0, a1, b0;  // only lane 0 of b takes part in the comparison
+    a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double c[2] = {a0 < b0 ? a0 : b0, a1};  // '<' is false on NaN, so b0 wins
     return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c));
 #endif
 }
@@ -4678,15 +4776,21 @@ FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_pd
 FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_f64(
         vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
-    double *da = (double *) &a;
-    double *db = (double *) &b;
+    double a0 =  // fallback path: lanes are extracted as u64 and memcpy'd to double
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
     double c[2];
-    c[0] = da[0] * db[0];
-    c[1] = da[1] * db[1];
+    c[0] = a0 * b0;  // per-lane product; c[] is reloaded as a vector below
+    c[1] = a1 * b1;
     return vld1q_f32((float32_t *) c);
 #endif
 }
@@ -4739,7 +4843,7 @@ FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b)
     uint16x4_t a3210 = vget_low_u16(vreinterpretq_u16_m128i(a));
     uint16x4_t b3210 = vget_low_u16(vreinterpretq_u16_m128i(b));
     uint32x4_t ab3210 = vmull_u16(a3210, b3210);
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     uint32x4_t ab7654 =
         vmull_high_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b));
     uint16x8_t r = vuzp2q_u16(vreinterpretq_u16_u32(ab3210),
@@ -4820,7 +4924,7 @@ FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_pause
 FORCE_INLINE void _mm_pause(void)
 {
-#if defined(_MSC_VER)
+#if defined(_MSC_VER) && !defined(__clang__)
     __isb(_ARM64_BARRIER_SY);
 #else
     __asm__ __volatile__("isb\n");
@@ -4895,11 +4999,11 @@ FORCE_INLINE __m128i _mm_set_epi8(signed char b15,
                                   signed char b1,
                                   signed char b0)
 {
-    int8_t ALIGN_STRUCT(16)
-        data[16] = {(int8_t) b0,  (int8_t) b1,  (int8_t) b2,  (int8_t) b3,
-                    (int8_t) b4,  (int8_t) b5,  (int8_t) b6,  (int8_t) b7,
-                    (int8_t) b8,  (int8_t) b9,  (int8_t) b10, (int8_t) b11,
-                    (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
+    int8_t ALIGN_STRUCT(16) data[16] = {
+        (int8_t) b0,  (int8_t) b1,  (int8_t) b2,  (int8_t) b3,
+        (int8_t) b4,  (int8_t) b5,  (int8_t) b6,  (int8_t) b7,
+        (int8_t) b8,  (int8_t) b9,  (int8_t) b10, (int8_t) b11,
+        (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
     return (__m128i) vld1q_s8(data);
 }
 
@@ -4909,7 +5013,7 @@ FORCE_INLINE __m128i _mm_set_epi8(signed char b15,
 FORCE_INLINE __m128d _mm_set_pd(double e1, double e0)
 {
     double ALIGN_STRUCT(16) data[2] = {e0, e1};
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data));
 #else
     return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) data));
@@ -4926,7 +5030,7 @@ FORCE_INLINE __m128d _mm_set_pd(double e1, double e0)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_sd
 FORCE_INLINE __m128d _mm_set_sd(double a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_f64(vsetq_lane_f64(a, vdupq_n_f64(0), 0));
 #else
     return _mm_set_pd(0, a);
@@ -4973,10 +5077,11 @@ FORCE_INLINE __m128i _mm_set1_epi8(signed char w)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_pd
 FORCE_INLINE __m128d _mm_set1_pd(double d)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_f64(vdupq_n_f64(d));
 #else
-    return vreinterpretq_m128d_s64(vdupq_n_s64(*(int64_t *) &d));
+    int64_t _d = sse2neon_recast_f64_s64(d);  // raw bits of d via memcpy helper
+    return vreinterpretq_m128d_s64(vdupq_n_s64(_d));  // same bits in both lanes
 #endif
 }
 
@@ -5029,11 +5134,11 @@ FORCE_INLINE __m128i _mm_setr_epi8(signed char b0,
                                    signed char b14,
                                    signed char b15)
 {
-    int8_t ALIGN_STRUCT(16)
-        data[16] = {(int8_t) b0,  (int8_t) b1,  (int8_t) b2,  (int8_t) b3,
-                    (int8_t) b4,  (int8_t) b5,  (int8_t) b6,  (int8_t) b7,
-                    (int8_t) b8,  (int8_t) b9,  (int8_t) b10, (int8_t) b11,
-                    (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
+    int8_t ALIGN_STRUCT(16) data[16] = {
+        (int8_t) b0,  (int8_t) b1,  (int8_t) b2,  (int8_t) b3,
+        (int8_t) b4,  (int8_t) b5,  (int8_t) b6,  (int8_t) b7,
+        (int8_t) b8,  (int8_t) b9,  (int8_t) b10, (int8_t) b11,
+        (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
     return (__m128i) vld1q_s8(data);
 }
 
@@ -5049,7 +5154,7 @@ FORCE_INLINE __m128d _mm_setr_pd(double e1, double e0)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_pd
 FORCE_INLINE __m128d _mm_setzero_pd(void)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_f64(vdupq_n_f64(0));
 #else
     return vreinterpretq_m128d_f32(vdupq_n_f32(0));
@@ -5136,12 +5241,12 @@ FORCE_INLINE __m128i _mm_setzero_si128(void)
 #define _mm_shuffle_pd(a, b, imm8)                                            \
     vreinterpretq_m128d_s64(                                                  \
         vshuffleq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b), \
-                      imm8 & 0x1, ((imm8 & 0x2) >> 1) + 2))
+                      (imm8) & 0x1, (((imm8) & 0x2) >> 1) + 2))
 #else
-#define _mm_shuffle_pd(a, b, imm8)                                     \
-    _mm_castsi128_pd(_mm_set_epi64x(                                   \
-        vgetq_lane_s64(vreinterpretq_s64_m128d(b), (imm8 & 0x2) >> 1), \
-        vgetq_lane_s64(vreinterpretq_s64_m128d(a), imm8 & 0x1)))
+#define _mm_shuffle_pd(a, b, imm8)                                       \
+    _mm_castsi128_pd(_mm_set_epi64x(                                     \
+        vgetq_lane_s64(vreinterpretq_s64_m128d(b), ((imm8) & 0x2) >> 1), \
+        vgetq_lane_s64(vreinterpretq_s64_m128d(a), (imm8) & 0x1)))
 #endif
 
 // FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a,
@@ -5222,7 +5327,7 @@ FORCE_INLINE __m128i _mm_slli_epi16(__m128i a, int imm)
     if (_sse2neon_unlikely(imm & ~15))
         return _mm_setzero_si128();
     return vreinterpretq_m128i_s16(
-        vshlq_s16(vreinterpretq_s16_m128i(a), vdupq_n_s16(imm)));
+        vshlq_s16(vreinterpretq_s16_m128i(a), vdupq_n_s16((int16_t) imm)));
 }
 
 // Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and
@@ -5250,13 +5355,13 @@ FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm)
 // Shift a left by imm8 bytes while shifting in zeros, and store the results in
 // dst.
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_si128
-#define _mm_slli_si128(a, imm)                                              \
-    _sse2neon_define1(                                                      \
-        __m128i, a, int8x16_t ret;                                          \
-        if (_sse2neon_unlikely(imm == 0)) ret = vreinterpretq_s8_m128i(_a); \
-        else if (_sse2neon_unlikely((imm) & ~15)) ret = vdupq_n_s8(0);      \
-        else ret = vextq_s8(vdupq_n_s8(0), vreinterpretq_s8_m128i(_a),      \
-                            ((imm <= 0 || imm > 15) ? 0 : (16 - imm)));     \
+#define _mm_slli_si128(a, imm) /* imm outside 0..15 yields all zeros */       \
+    _sse2neon_define1(                                                        \
+        __m128i, a, int8x16_t ret;                                            \
+        if (_sse2neon_unlikely((imm) == 0)) ret = vreinterpretq_s8_m128i(_a); \
+        else if (_sse2neon_unlikely((imm) & ~15)) ret = vdupq_n_s8(0);        \
+        else ret = vextq_s8(vdupq_n_s8(0), vreinterpretq_s8_m128i(_a),        \
+                            (((imm) <= 0 || (imm) > 15) ? 0 : (16 - (imm)))); \
         _sse2neon_return(vreinterpretq_m128i_s8(ret));)
 
 // Compute the square root of packed double-precision (64-bit) floating-point
@@ -5264,12 +5369,15 @@ FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_pd
 FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_f64(vsqrtq_f64(vreinterpretq_f64_m128d(a)));
 #else
-    double a0 = sqrt(((double *) &a)[0]);
-    double a1 = sqrt(((double *) &a)[1]);
-    return _mm_set_pd(a1, a0);
+    double a0, a1;  // lanes 0 and 1 of a as scalar doubles
+    a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double _a0 = sqrt(a0);
+    double _a1 = sqrt(a1);
+    return _mm_set_pd(_a1, _a0);  // _mm_set_pd takes the upper lane first
 #endif
 }
 
@@ -5279,10 +5387,13 @@ FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_sd
 FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return _mm_move_sd(a, _mm_sqrt_pd(b));
 #else
-    return _mm_set_pd(((double *) &a)[1], sqrt(((double *) &b)[0]));
+    double _a, _b;  // _a: lane 1 of a; _b: lane 0 of b
+    _a = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    _b = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    return _mm_set_pd(_a, sqrt(_b));  // upper lane = _a unchanged, lower lane = sqrt(_b)
 #endif
 }
 
@@ -5295,7 +5406,7 @@ FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count)
     if (_sse2neon_unlikely(c & ~15))
         return _mm_cmplt_epi16(a, _mm_setzero_si128());
     return vreinterpretq_m128i_s16(
-        vshlq_s16((int16x8_t) a, vdupq_n_s16((int) -c)));
+        vshlq_s16((int16x8_t) a, vdupq_n_s16((int16_t) -c)));
 }
 
 // Shift packed 32-bit integers in a right by count while shifting in sign bits,
@@ -5315,7 +5426,7 @@ FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi16
 FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm)
 {
-    const int count = (imm & ~15) ? 15 : imm;
+    const int16_t count = (imm & ~15) ? 15 : (int16_t) imm;  // imm outside 0..15 clamps the shift to 15
     return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count));
 }
 
@@ -5377,13 +5488,13 @@ FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
 // Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and
 // store the results in dst.
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi16
-#define _mm_srli_epi16(a, imm)                                                \
-    _sse2neon_define0(                                                        \
-        __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) & ~15)) {       \
-            ret = _mm_setzero_si128();                                        \
-        } else {                                                              \
-            ret = vreinterpretq_m128i_u16(                                    \
-                vshlq_u16(vreinterpretq_u16_m128i(_a), vdupq_n_s16(-(imm)))); \
+#define _mm_srli_epi16(a, imm) /* imm outside 0..15 yields all zeros */        \
+    _sse2neon_define0(                                                         \
+        __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) & ~15)) {        \
+            ret = _mm_setzero_si128();                                         \
+        } else {                                                               \
+            ret = vreinterpretq_m128i_u16(vshlq_u16(                           \
+                vreinterpretq_u16_m128i(_a), vdupq_n_s16((int16_t) - (imm)))); \
         } _sse2neon_return(ret);)
 
 // Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and
@@ -5419,7 +5530,7 @@ FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
         __m128i, a, int8x16_t ret;                                     \
         if (_sse2neon_unlikely((imm) & ~15)) ret = vdupq_n_s8(0);      \
         else ret = vextq_s8(vreinterpretq_s8_m128i(_a), vdupq_n_s8(0), \
-                            (imm > 15 ? 0 : imm));                     \
+                            ((imm) > 15 ? 0 : (imm)));                 \
         _sse2neon_return(vreinterpretq_m128i_s8(ret));)
 
 // Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
@@ -5428,7 +5539,7 @@ FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd
 FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a));
 #else
     vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(a));
@@ -5441,7 +5552,7 @@ FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd1
 FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     float64x1_t a_low = vget_low_f64(vreinterpretq_f64_m128d(a));
     vst1q_f64((float64_t *) mem_addr,
               vreinterpretq_f64_m128d(vcombine_f64(a_low, a_low)));
@@ -5457,7 +5568,7 @@ FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_store_sd
 FORCE_INLINE void _mm_store_sd(double *mem_addr, __m128d a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
 #else
     vst1_u64((uint64_t *) mem_addr, vget_low_u64(vreinterpretq_u64_m128d(a)));
@@ -5483,7 +5594,7 @@ FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pd
 FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     vst1_f64((float64_t *) mem_addr, vget_high_f64(vreinterpretq_f64_m128d(a)));
 #else
     vst1_f32((float32_t *) mem_addr, vget_high_f32(vreinterpretq_f32_m128d(a)));
@@ -5502,7 +5613,7 @@ FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pd
 FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
 #else
     vst1_f32((float32_t *) mem_addr, vget_low_f32(vreinterpretq_f32_m128d(a)));
@@ -5553,7 +5664,7 @@ FORCE_INLINE void _mm_stream_pd(double *p, __m128d a)
 {
 #if __has_builtin(__builtin_nontemporal_store)
     __builtin_nontemporal_store(a, (__m128d *) p);
-#elif defined(__aarch64__) || defined(_M_ARM64)
+#elif defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     vst1q_f64(p, vreinterpretq_f64_m128d(a));
 #else
     vst1q_s64((int64_t *) p, vreinterpretq_s64_m128d(a));
@@ -5633,15 +5744,21 @@ FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b)
 //  https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_sub_pd
 FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_f64(
         vsubq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
-    double *da = (double *) &a;
-    double *db = (double *) &b;
+    double a0 =  // fallback path: scalar copies of both lanes of a and b
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
     double c[2];
-    c[0] = da[0] - db[0];
-    c[1] = da[1] - db[1];
+    c[0] = a0 - b0;  // per-lane difference; c[] is reloaded as a vector below
+    c[1] = a1 - b1;
     return vld1q_f32((float32_t *) c);
 #endif
 }
@@ -5716,7 +5833,7 @@ FORCE_INLINE __m128d _mm_undefined_pd(void)
 #pragma GCC diagnostic ignored "-Wuninitialized"
 #endif
     __m128d a;
-#if defined(_MSC_VER)
+#if defined(_MSC_VER) && !defined(__clang__)
     a = _mm_setzero_pd();
 #endif
     return a;
@@ -5730,7 +5847,7 @@ FORCE_INLINE __m128d _mm_undefined_pd(void)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi16
 FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128i_s16(
         vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
 #else
@@ -5746,7 +5863,7 @@ FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi32
 FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128i_s32(
         vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
 #else
@@ -5762,7 +5879,7 @@ FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi64
 FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128i_s64(
         vzip2q_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
 #else
@@ -5777,7 +5894,7 @@ FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi8
 FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128i_s8(
         vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
 #else
@@ -5795,7 +5912,7 @@ FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_pd
 FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_f64(
         vzip2q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
@@ -5810,7 +5927,7 @@ FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi16
 FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128i_s16(
         vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
 #else
@@ -5826,7 +5943,7 @@ FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi32
 FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128i_s32(
         vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
 #else
@@ -5842,7 +5959,7 @@ FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi64
 FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128i_s64(
         vzip1q_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
 #else
@@ -5857,7 +5974,7 @@ FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi8
 FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128i_s8(
         vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
 #else
@@ -5873,7 +5990,7 @@ FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_pd
 FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_f64(
         vzip1q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
@@ -5910,7 +6027,7 @@ FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
 FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b)
 {
     _sse2neon_const __m128d mask = _mm_set_pd(1.0f, -1.0f);
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_f64(vfmaq_f64(vreinterpretq_f64_m128d(a),
                                              vreinterpretq_f64_m128d(b),
                                              vreinterpretq_f64_m128d(mask)));
@@ -5926,7 +6043,7 @@ FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b)
 FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b)
 {
     _sse2neon_const __m128 mask = _mm_setr_ps(-1.0f, 1.0f, -1.0f, 1.0f);
-#if (defined(__aarch64__) || defined(_M_ARM64)) || \
+#if (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) || \
     defined(__ARM_FEATURE_FMA) /* VFPv4+ */
     return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(a),
                                             vreinterpretq_f32_m128(mask),
@@ -5941,13 +6058,19 @@ FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pd
 FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_f64(
         vpaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
-    double *da = (double *) &a;
-    double *db = (double *) &b;
-    double c[] = {da[0] + da[1], db[0] + db[1]};
+    double a0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
+    double c[] = {a0 + a1, b0 + b1};
     return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
 #endif
 }
@@ -5957,7 +6080,7 @@ FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_ps
 FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128_f32(
         vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
 #else
@@ -5973,17 +6096,23 @@ FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
 // Horizontally subtract adjacent pairs of double-precision (64-bit)
 // floating-point elements in a and b, and pack the results in dst.
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pd
-FORCE_INLINE __m128d _mm_hsub_pd(__m128d _a, __m128d _b)
+FORCE_INLINE __m128d _mm_hsub_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
-    float64x2_t a = vreinterpretq_f64_m128d(_a);
-    float64x2_t b = vreinterpretq_f64_m128d(_b);
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+    float64x2_t _a = vreinterpretq_f64_m128d(a);
+    float64x2_t _b = vreinterpretq_f64_m128d(b);
     return vreinterpretq_m128d_f64(
-        vsubq_f64(vuzp1q_f64(a, b), vuzp2q_f64(a, b)));
+        vsubq_f64(vuzp1q_f64(_a, _b), vuzp2q_f64(_a, _b)));
 #else
-    double *da = (double *) &_a;
-    double *db = (double *) &_b;
-    double c[] = {da[0] - da[1], db[0] - db[1]};
+    double a0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
+    double c[] = {a0 - a1, b0 - b1};
     return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
 #endif
 }
@@ -5995,7 +6124,7 @@ FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)
 {
     float32x4_t a = vreinterpretq_f32_m128(_a);
     float32x4_t b = vreinterpretq_f32_m128(_b);
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128_f32(
         vsubq_f32(vuzp1q_f32(a, b), vuzp2q_f32(a, b)));
 #else
@@ -6020,7 +6149,7 @@ FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movedup_pd
 FORCE_INLINE __m128d _mm_movedup_pd(__m128d a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_f64(
         vdupq_laneq_f64(vreinterpretq_f64_m128d(a), 0));
 #else
@@ -6034,7 +6163,7 @@ FORCE_INLINE __m128d _mm_movedup_pd(__m128d a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehdup_ps
 FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128_f32(
         vtrn2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)));
 #elif defined(_sse2neon_shuffle)
@@ -6053,7 +6182,7 @@ FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_moveldup_ps
 FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128_f32(
         vtrn1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)));
 #elif defined(_sse2neon_shuffle)
@@ -6121,32 +6250,32 @@ FORCE_INLINE __m64 _mm_abs_pi8(__m64 a)
 // the result right by imm8 bytes, and store the low 16 bytes in dst.
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_epi8
 #if defined(__GNUC__) && !defined(__clang__)
-#define _mm_alignr_epi8(a, b, imm)                                            \
-    __extension__({                                                           \
-        uint8x16_t _a = vreinterpretq_u8_m128i(a);                            \
-        uint8x16_t _b = vreinterpretq_u8_m128i(b);                            \
-        __m128i ret;                                                          \
-        if (_sse2neon_unlikely((imm) & ~31))                                  \
-            ret = vreinterpretq_m128i_u8(vdupq_n_u8(0));                      \
-        else if (imm >= 16)                                                   \
-            ret = _mm_srli_si128(a, imm >= 16 ? imm - 16 : 0);                \
-        else                                                                  \
-            ret =                                                             \
-                vreinterpretq_m128i_u8(vextq_u8(_b, _a, imm < 16 ? imm : 0)); \
-        ret;                                                                  \
+#define _mm_alignr_epi8(a, b, imm)                                 \
+    __extension__({                                                \
+        uint8x16_t _a = vreinterpretq_u8_m128i(a);                 \
+        uint8x16_t _b = vreinterpretq_u8_m128i(b);                 \
+        __m128i ret;                                               \
+        if (_sse2neon_unlikely((imm) & ~31))                       \
+            ret = vreinterpretq_m128i_u8(vdupq_n_u8(0));           \
+        else if ((imm) >= 16)                                      \
+            ret = _mm_srli_si128(a, (imm) >= 16 ? (imm) - 16 : 0); \
+        else                                                       \
+            ret = vreinterpretq_m128i_u8(                          \
+                vextq_u8(_b, _a, (imm) < 16 ? (imm) : 0));         \
+        ret;                                                       \
     })
 
 #else
-#define _mm_alignr_epi8(a, b, imm)                                          \
-    _sse2neon_define2(                                                      \
-        __m128i, a, b, uint8x16_t __a = vreinterpretq_u8_m128i(_a);         \
-        uint8x16_t __b = vreinterpretq_u8_m128i(_b); __m128i ret;           \
-        if (_sse2neon_unlikely((imm) & ~31)) ret =                          \
-            vreinterpretq_m128i_u8(vdupq_n_u8(0));                          \
-        else if (imm >= 16) ret =                                           \
-            _mm_srli_si128(_a, imm >= 16 ? imm - 16 : 0);                   \
-        else ret =                                                          \
-            vreinterpretq_m128i_u8(vextq_u8(__b, __a, imm < 16 ? imm : 0)); \
+#define _mm_alignr_epi8(a, b, imm)                                  \
+    _sse2neon_define2(                                              \
+        __m128i, a, b, uint8x16_t __a = vreinterpretq_u8_m128i(_a); \
+        uint8x16_t __b = vreinterpretq_u8_m128i(_b); __m128i ret;   \
+        if (_sse2neon_unlikely((imm) & ~31)) ret =                  \
+            vreinterpretq_m128i_u8(vdupq_n_u8(0));                  \
+        else if ((imm) >= 16) ret =                                 \
+            _mm_srli_si128(_a, (imm) >= 16 ? (imm) - 16 : 0);       \
+        else ret = vreinterpretq_m128i_u8(                          \
+            vextq_u8(__b, __a, (imm) < 16 ? (imm) : 0));            \
         _sse2neon_return(ret);)
 
 #endif
@@ -6162,7 +6291,7 @@ FORCE_INLINE __m64 _mm_abs_pi8(__m64 a)
             uint8x8_t tmp_low;                                              \
             uint8x8_t tmp_high;                                             \
             if ((imm) >= 8) {                                               \
-                const int idx = (imm) -8;                                   \
+                const int idx = (imm) - 8;                                  \
                 tmp_low = vreinterpret_u8_m64(_a);                          \
                 tmp_high = vdup_n_u8(0);                                    \
                 ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
@@ -6181,7 +6310,7 @@ FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b)
 {
     int16x8_t a = vreinterpretq_s16_m128i(_a);
     int16x8_t b = vreinterpretq_s16_m128i(_b);
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128i_s16(vpaddq_s16(a, b));
 #else
     return vreinterpretq_m128i_s16(
@@ -6197,7 +6326,7 @@ FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b)
 {
     int32x4_t a = vreinterpretq_s32_m128i(_a);
     int32x4_t b = vreinterpretq_s32_m128i(_b);
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128i_s32(vpaddq_s32(a, b));
 #else
     return vreinterpretq_m128i_s32(
@@ -6229,7 +6358,7 @@ FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_epi16
 FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     int16x8_t a = vreinterpretq_s16_m128i(_a);
     int16x8_t b = vreinterpretq_s16_m128i(_b);
     return vreinterpretq_s64_s16(
@@ -6254,7 +6383,7 @@ FORCE_INLINE __m64 _mm_hadds_pi16(__m64 _a, __m64 _b)
 {
     int16x4_t a = vreinterpret_s16_m64(_a);
     int16x4_t b = vreinterpret_s16_m64(_b);
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpret_s64_s16(vqadd_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
 #else
     int16x4x2_t res = vuzp_s16(a, b);
@@ -6269,7 +6398,7 @@ FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)
 {
     int16x8_t a = vreinterpretq_s16_m128i(_a);
     int16x8_t b = vreinterpretq_s16_m128i(_b);
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128i_s16(
         vsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
 #else
@@ -6285,7 +6414,7 @@ FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b)
 {
     int32x4_t a = vreinterpretq_s32_m128i(_a);
     int32x4_t b = vreinterpretq_s32_m128i(_b);
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128i_s32(
         vsubq_s32(vuzp1q_s32(a, b), vuzp2q_s32(a, b)));
 #else
@@ -6301,7 +6430,7 @@ FORCE_INLINE __m64 _mm_hsub_pi16(__m64 _a, __m64 _b)
 {
     int16x4_t a = vreinterpret_s16_m64(_a);
     int16x4_t b = vreinterpret_s16_m64(_b);
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpret_m64_s16(vsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
 #else
     int16x4x2_t c = vuzp_s16(a, b);
@@ -6316,7 +6445,7 @@ FORCE_INLINE __m64 _mm_hsub_pi32(__m64 _a, __m64 _b)
 {
     int32x2_t a = vreinterpret_s32_m64(_a);
     int32x2_t b = vreinterpret_s32_m64(_b);
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpret_m64_s32(vsub_s32(vuzp1_s32(a, b), vuzp2_s32(a, b)));
 #else
     int32x2x2_t c = vuzp_s32(a, b);
@@ -6331,7 +6460,7 @@ FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b)
 {
     int16x8_t a = vreinterpretq_s16_m128i(_a);
     int16x8_t b = vreinterpretq_s16_m128i(_b);
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128i_s16(
         vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
 #else
@@ -6347,7 +6476,7 @@ FORCE_INLINE __m64 _mm_hsubs_pi16(__m64 _a, __m64 _b)
 {
     int16x4_t a = vreinterpret_s16_m64(_a);
     int16x4_t b = vreinterpret_s16_m64(_b);
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpret_m64_s16(vqsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
 #else
     int16x4x2_t c = vuzp_s16(a, b);
@@ -6362,7 +6491,7 @@ FORCE_INLINE __m64 _mm_hsubs_pi16(__m64 _a, __m64 _b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_epi16
 FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     uint8x16_t a = vreinterpretq_u8_m128i(_a);
     int8x16_t b = vreinterpretq_s8_m128i(_b);
     int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
@@ -6466,7 +6595,7 @@ FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
     uint8x16_t idx = vreinterpretq_u8_m128i(b);  // input b
     uint8x16_t idx_masked =
         vandq_u8(idx, vdupq_n_u8(0x8F));  // avoid using meaningless bits
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked));
 #elif defined(__GNUC__)
     int8x16_t ret;
@@ -6512,7 +6641,7 @@ FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)
     // (b < 0) ? 0xFFFF : 0
     uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15));
     // (b == 0) ? 0xFFFF : 0
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b));
 #else
     int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0)));
@@ -6541,7 +6670,7 @@ FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b)
     uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31));
 
     // (b == 0) ? 0xFFFFFFFF : 0
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b));
 #else
     int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0)));
@@ -6570,7 +6699,7 @@ FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b)
     uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7));
 
     // (b == 0) ? 0xFF : 0
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b));
 #else
     int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0)));
@@ -6599,7 +6728,7 @@ FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b)
     uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15));
 
     // (b == 0) ? 0xFFFF : 0
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b));
 #else
     int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0)));
@@ -6628,7 +6757,7 @@ FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b)
     uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31));
 
     // (b == 0) ? 0xFFFFFFFF : 0
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b));
 #else
     int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0)));
@@ -6657,7 +6786,7 @@ FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
     uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7));
 
     // (b == 0) ? 0xFF : 0
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b));
 #else
     int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0)));
@@ -6683,14 +6812,14 @@ FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
     _sse2neon_define2(                                                  \
         __m128i, a, b,                                                  \
         const uint16_t _mask[8] =                                       \
-            _sse2neon_init(((imm) & (1 << 0)) ? (uint16_t) -1 : 0x0,    \
-                           ((imm) & (1 << 1)) ? (uint16_t) -1 : 0x0,    \
-                           ((imm) & (1 << 2)) ? (uint16_t) -1 : 0x0,    \
-                           ((imm) & (1 << 3)) ? (uint16_t) -1 : 0x0,    \
-                           ((imm) & (1 << 4)) ? (uint16_t) -1 : 0x0,    \
-                           ((imm) & (1 << 5)) ? (uint16_t) -1 : 0x0,    \
-                           ((imm) & (1 << 6)) ? (uint16_t) -1 : 0x0,    \
-                           ((imm) & (1 << 7)) ? (uint16_t) -1 : 0x0);   \
+            _sse2neon_init(((imm) & (1 << 0)) ? (uint16_t) - 1 : 0x0,   \
+                           ((imm) & (1 << 1)) ? (uint16_t) - 1 : 0x0,   \
+                           ((imm) & (1 << 2)) ? (uint16_t) - 1 : 0x0,   \
+                           ((imm) & (1 << 3)) ? (uint16_t) - 1 : 0x0,   \
+                           ((imm) & (1 << 4)) ? (uint16_t) - 1 : 0x0,   \
+                           ((imm) & (1 << 5)) ? (uint16_t) - 1 : 0x0,   \
+                           ((imm) & (1 << 6)) ? (uint16_t) - 1 : 0x0,   \
+                           ((imm) & (1 << 7)) ? (uint16_t) - 1 : 0x0);  \
         uint16x8_t _mask_vec = vld1q_u16(_mask);                        \
         uint16x8_t __a = vreinterpretq_u16_m128i(_a);                   \
         uint16x8_t __b = vreinterpretq_u16_m128i(_b); _sse2neon_return( \
@@ -6715,11 +6844,9 @@ FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_ps
 FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8)
 {
-    const uint32_t ALIGN_STRUCT(16)
-        data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0,
-                   ((imm8) & (1 << 1)) ? UINT32_MAX : 0,
-                   ((imm8) & (1 << 2)) ? UINT32_MAX : 0,
-                   ((imm8) & (1 << 3)) ? UINT32_MAX : 0};
+    const uint32_t ALIGN_STRUCT(16) data[4] = {
+        (imm8 & (1 << 0)) ? UINT32_MAX : 0, (imm8 & (1 << 1)) ? UINT32_MAX : 0,
+        (imm8 & (1 << 2)) ? UINT32_MAX : 0, (imm8 & (1 << 3)) ? UINT32_MAX : 0};
     uint32x4_t mask = vld1q_u32(data);
     float32x4_t a = vreinterpretq_f32_m128(_a);
     float32x4_t b = vreinterpretq_f32_m128(_b);
@@ -6746,7 +6873,7 @@ FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask)
 {
     uint64x2_t mask =
         vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_m128d(_mask), 63));
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     float64x2_t a = vreinterpretq_f64_m128d(_a);
     float64x2_t b = vreinterpretq_f64_m128d(_b);
     return vreinterpretq_m128d_f64(vbslq_f64(mask, b, a));
@@ -6776,11 +6903,13 @@ FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_pd
 FORCE_INLINE __m128d _mm_ceil_pd(__m128d a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_f64(vrndpq_f64(vreinterpretq_f64_m128d(a)));
 #else
-    double *f = (double *) &a;
-    return _mm_set_pd(ceil(f[1]), ceil(f[0]));
+    double a0, a1;
+    a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    return _mm_set_pd(ceil(a1), ceil(a0));
 #endif
 }
 
@@ -6790,7 +6919,7 @@ FORCE_INLINE __m128d _mm_ceil_pd(__m128d a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ps
 FORCE_INLINE __m128 _mm_ceil_ps(__m128 a)
 {
-#if (defined(__aarch64__) || defined(_M_ARM64)) || \
+#if (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) || \
     defined(__ARM_FEATURE_DIRECTED_ROUNDING)
     return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a)));
 #else
@@ -6823,7 +6952,7 @@ FORCE_INLINE __m128 _mm_ceil_ss(__m128 a, __m128 b)
 // in dst
 FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128i_u64(
         vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b)));
 #else
@@ -6980,7 +7109,7 @@ FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm)
         _mm_castsi128_pd(_mm_set_epi64x(bit5Mask, bit4Mask));
     __m128d tmp = _mm_and_pd(mul, mulMask);
 #else
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     double d0 = (imm & 0x10) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0) *
                                    vgetq_lane_f64(vreinterpretq_f64_m128d(b), 0)
                              : 0;
@@ -6988,16 +7117,28 @@ FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm)
                                    vgetq_lane_f64(vreinterpretq_f64_m128d(b), 1)
                              : 0;
 #else
-    double d0 = (imm & 0x10) ? ((double *) &a)[0] * ((double *) &b)[0] : 0;
-    double d1 = (imm & 0x20) ? ((double *) &a)[1] * ((double *) &b)[1] : 0;
+    double a0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
+    double d0 = (imm & 0x10) ? a0 * b0 : 0;
+    double d1 = (imm & 0x20) ? a1 * b1 : 0;
 #endif
     __m128d tmp = _mm_set_pd(d1, d0);
 #endif
     // Sum the products
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     double sum = vpaddd_f64(vreinterpretq_f64_m128d(tmp));
 #else
-    double sum = *((double *) &tmp) + *(((double *) &tmp) + 1);
+    double _tmp0 = sse2neon_recast_u64_f64(
+        vgetq_lane_u64(vreinterpretq_u64_m128d(tmp), 0));
+    double _tmp1 = sse2neon_recast_u64_f64(
+        vgetq_lane_u64(vreinterpretq_u64_m128d(tmp), 1));
+    double sum = _tmp0 + _tmp1;
 #endif
     // Conditionally store the sum
     const __m128d sumMask =
@@ -7014,7 +7155,7 @@ FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
 {
     float32x4_t elementwise_prod = _mm_mul_ps(a, b);
 
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     /* shortcuts */
     if (imm == 0xFF) {
         return _mm_set1_ps(vaddvq_f32(elementwise_prod));
@@ -7084,11 +7225,13 @@ FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_pd
 FORCE_INLINE __m128d _mm_floor_pd(__m128d a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_f64(vrndmq_f64(vreinterpretq_f64_m128d(a)));
 #else
-    double *f = (double *) &a;
-    return _mm_set_pd(floor(f[1]), floor(f[0]));
+    double a0, a1;
+    a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    return _mm_set_pd(floor(a1), floor(a0));
 #endif
 }
 
@@ -7098,7 +7241,7 @@ FORCE_INLINE __m128d _mm_floor_pd(__m128d a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ps
 FORCE_INLINE __m128 _mm_floor_ps(__m128 a)
 {
-#if (defined(__aarch64__) || defined(_M_ARM64)) || \
+#if (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) || \
     defined(__ARM_FEATURE_DIRECTED_ROUNDING)
     return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a)));
 #else
@@ -7157,24 +7300,24 @@ FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b)
 // element from b into tmp using the control in imm8. Store tmp to dst using
 // the mask in imm8 (elements are zeroed out when the corresponding bit is set).
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=insert_ps
-#define _mm_insert_ps(a, b, imm8)                                            \
-    _sse2neon_define2(                                                       \
-        __m128, a, b,                                                        \
-        float32x4_t tmp1 =                                                   \
-            vsetq_lane_f32(vgetq_lane_f32(_b, (imm8 >> 6) & 0x3),            \
-                           vreinterpretq_f32_m128(_a), 0);                   \
-        float32x4_t tmp2 =                                                   \
-            vsetq_lane_f32(vgetq_lane_f32(tmp1, 0),                          \
-                           vreinterpretq_f32_m128(_a), ((imm8 >> 4) & 0x3)); \
-        const uint32_t data[4] =                                             \
-            _sse2neon_init(((imm8) & (1 << 0)) ? UINT32_MAX : 0,             \
-                           ((imm8) & (1 << 1)) ? UINT32_MAX : 0,             \
-                           ((imm8) & (1 << 2)) ? UINT32_MAX : 0,             \
-                           ((imm8) & (1 << 3)) ? UINT32_MAX : 0);            \
-        uint32x4_t mask = vld1q_u32(data);                                   \
-        float32x4_t all_zeros = vdupq_n_f32(0);                              \
-                                                                             \
-        _sse2neon_return(vreinterpretq_m128_f32(                             \
+#define _mm_insert_ps(a, b, imm8)                                              \
+    _sse2neon_define2(                                                         \
+        __m128, a, b,                                                          \
+        float32x4_t tmp1 =                                                     \
+            vsetq_lane_f32(vgetq_lane_f32(_b, ((imm8) >> 6) & 0x3),            \
+                           vreinterpretq_f32_m128(_a), 0);                     \
+        float32x4_t tmp2 =                                                     \
+            vsetq_lane_f32(vgetq_lane_f32(tmp1, 0),                            \
+                           vreinterpretq_f32_m128(_a), (((imm8) >> 4) & 0x3)); \
+        const uint32_t data[4] =                                               \
+            _sse2neon_init(((imm8) & (1 << 0)) ? UINT32_MAX : 0,               \
+                           ((imm8) & (1 << 1)) ? UINT32_MAX : 0,               \
+                           ((imm8) & (1 << 2)) ? UINT32_MAX : 0,               \
+                           ((imm8) & (1 << 3)) ? UINT32_MAX : 0);              \
+        uint32x4_t mask = vld1q_u32(data);                                     \
+        float32x4_t all_zeros = vdupq_n_f32(0);                                \
+                                                                               \
+        _sse2neon_return(vreinterpretq_m128_f32(                               \
             vbslq_f32(mask, all_zeros, vreinterpretq_f32_m128(tmp2))));)
 
 // Compare packed signed 32-bit integers in a and b, and store packed maximum
@@ -7256,7 +7399,7 @@ FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a)
 {
     __m128i dst;
     uint16_t min, idx = 0;
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     // Find the minimum value
     min = vminvq_u16(vreinterpretq_u16_m128i(a));
 
@@ -7359,7 +7502,7 @@ FORCE_INLINE __m128i _mm_mpsadbw_epu8(__m128i a, __m128i b, const int imm)
     c26 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_2), low_b));
     uint8x16_t _a_3 = vextq_u8(_a, _a, 3);
     c37 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_3), low_b));
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     // |0|4|2|6|
     c04 = vpaddq_s16(c04, c26);
     // |1|5|3|7|
@@ -7419,7 +7562,7 @@ FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_pd
 FORCE_INLINE __m128d _mm_round_pd(__m128d a, int rounding)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     switch (rounding) {
     case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
         return vreinterpretq_m128d_f64(vrndnq_f64(vreinterpretq_f64_m128d(a)));
@@ -7488,7 +7631,7 @@ FORCE_INLINE __m128d _mm_round_pd(__m128d a, int rounding)
 // software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps
 FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding)
 {
-#if (defined(__aarch64__) || defined(_M_ARM64)) || \
+#if (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) || \
     defined(__ARM_FEATURE_DIRECTED_ROUNDING)
     switch (rounding) {
     case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
@@ -7621,7 +7764,7 @@ FORCE_INLINE int _mm_test_mix_ones_zeros(__m128i a, __m128i mask)
     uint64x2_t zeros = vbicq_u64(m, v);
 
     // If both 128-bit variables are populated (non-zero) then return 1.
-    // For comparision purposes, first compact each var down to 32-bits.
+    // For comparison purposes, first compact each var down to 32-bits.
     uint32x2_t reduced = vpmax_u32(vqmovn_u64(ones), vqmovn_u64(zeros));
 
     // if folding minimum is non-zero then both vars must be non-zero
@@ -7635,9 +7778,9 @@ FORCE_INLINE int _mm_test_mix_ones_zeros(__m128i a, __m128i mask)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_si128
 FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b)
 {
-    int64x2_t s64 =
+    int64x2_t s64_vec =
         vbicq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a));
-    return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
+    return !(vgetq_lane_s64(s64_vec, 0) | vgetq_lane_s64(s64_vec, 1));
 }
 
 // Compute the bitwise AND of 128 bits (representing integer data) in a and b,
@@ -7655,9 +7798,9 @@ FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_si128
 FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b)
 {
-    int64x2_t s64 =
+    int64x2_t s64_vec =
         vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b));
-    return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
+    return !(vgetq_lane_s64(s64_vec, 0) | vgetq_lane_s64(s64_vec, 1));
 }
 
 /* SSE4.2 */
@@ -7825,40 +7968,40 @@ static const uint8_t ALIGN_STRUCT(16) _sse2neon_cmpestr_mask8b[16] = {
                                       SSE2NEON_CAT(u, size)))                \
     } while (0)
 
-#define SSE2NEON_CMP_EQUAL_ANY_IMPL(type)                                     \
-    static int _sse2neon_cmp_##type##_equal_any(__m128i a, int la, __m128i b, \
-                                                int lb)                       \
-    {                                                                         \
-        __m128i mtx[16];                                                      \
-        PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),          \
-                   SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type));            \
-        return SSE2NEON_CAT(                                                  \
-            _sse2neon_aggregate_equal_any_,                                   \
-            SSE2NEON_CAT(                                                     \
-                SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),                        \
-                SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_,       \
-                                             type))))(la, lb, mtx);           \
+#define SSE2NEON_CMP_EQUAL_ANY_IMPL(type)                               \
+    static uint16_t _sse2neon_cmp_##type##_equal_any(__m128i a, int la, \
+                                                     __m128i b, int lb) \
+    {                                                                   \
+        __m128i mtx[16];                                                \
+        PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),    \
+                   SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type));      \
+        return SSE2NEON_CAT(                                            \
+            _sse2neon_aggregate_equal_any_,                             \
+            SSE2NEON_CAT(                                               \
+                SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),                  \
+                SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, \
+                                             type))))(la, lb, mtx);     \
     }
 
-#define SSE2NEON_CMP_RANGES_IMPL(type, data_type, us, byte_or_word)            \
-    static int _sse2neon_cmp_##us##type##_ranges(__m128i a, int la, __m128i b, \
-                                                 int lb)                       \
-    {                                                                          \
-        __m128i mtx[16];                                                       \
-        PCMPSTR_RANGES(                                                        \
-            a, b, mtx, data_type, us, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),   \
-            SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), byte_or_word);      \
-        return SSE2NEON_CAT(                                                   \
-            _sse2neon_aggregate_ranges_,                                       \
-            SSE2NEON_CAT(                                                      \
-                SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),                         \
-                SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_,        \
-                                             type))))(la, lb, mtx);            \
+#define SSE2NEON_CMP_RANGES_IMPL(type, data_type, us, byte_or_word)          \
+    static uint16_t _sse2neon_cmp_##us##type##_ranges(__m128i a, int la,     \
+                                                      __m128i b, int lb)     \
+    {                                                                        \
+        __m128i mtx[16];                                                     \
+        PCMPSTR_RANGES(                                                      \
+            a, b, mtx, data_type, us, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \
+            SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), byte_or_word);    \
+        return SSE2NEON_CAT(                                                 \
+            _sse2neon_aggregate_ranges_,                                     \
+            SSE2NEON_CAT(                                                    \
+                SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),                       \
+                SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_,      \
+                                             type))))(la, lb, mtx);          \
     }
 
 #define SSE2NEON_CMP_EQUAL_ORDERED_IMPL(type)                                  \
-    static int _sse2neon_cmp_##type##_equal_ordered(__m128i a, int la,         \
-                                                    __m128i b, int lb)         \
+    static uint16_t _sse2neon_cmp_##type##_equal_ordered(__m128i a, int la,    \
+                                                         __m128i b, int lb)    \
     {                                                                          \
         __m128i mtx[16];                                                       \
         PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),           \
@@ -7872,29 +8015,34 @@ static const uint8_t ALIGN_STRUCT(16) _sse2neon_cmpestr_mask8b[16] = {
             SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), la, lb, mtx);       \
     }
 
-static int _sse2neon_aggregate_equal_any_8x16(int la, int lb, __m128i mtx[16])
+static uint16_t _sse2neon_aggregate_equal_any_8x16(int la,
+                                                   int lb,
+                                                   __m128i mtx[16])
 {
-    int res = 0;
+    uint16_t res = 0;
     int m = (1 << la) - 1;
     uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
-    uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask);
-    uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask);
+    uint8x8_t t_lo = vtst_u8(vdup_n_u8((uint8_t) (m & 0xff)), vec_mask);
+    uint8x8_t t_hi = vtst_u8(vdup_n_u8((uint8_t) (m >> 8)), vec_mask);
     uint8x16_t vec = vcombine_u8(t_lo, t_hi);
     for (int j = 0; j < lb; j++) {
         mtx[j] = vreinterpretq_m128i_u8(
             vandq_u8(vec, vreinterpretq_u8_m128i(mtx[j])));
         mtx[j] = vreinterpretq_m128i_u8(
             vshrq_n_u8(vreinterpretq_u8_m128i(mtx[j]), 7));
-        int tmp = _sse2neon_vaddvq_u8(vreinterpretq_u8_m128i(mtx[j])) ? 1 : 0;
+        uint16_t tmp =
+            _sse2neon_vaddvq_u8(vreinterpretq_u8_m128i(mtx[j])) ? 1 : 0;
         res |= (tmp << j);
     }
     return res;
 }
 
-static int _sse2neon_aggregate_equal_any_16x8(int la, int lb, __m128i mtx[16])
+static uint16_t _sse2neon_aggregate_equal_any_16x8(int la,
+                                                   int lb,
+                                                   __m128i mtx[16])
 {
-    int res = 0;
-    int m = (1 << la) - 1;
+    uint16_t res = 0;
+    uint16_t m = (uint16_t) ((1 << la) - 1);
     uint16x8_t vec =
         vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b));
     for (int j = 0; j < lb; j++) {
@@ -7902,7 +8050,8 @@ static int _sse2neon_aggregate_equal_any_16x8(int la, int lb, __m128i mtx[16])
             vandq_u16(vec, vreinterpretq_u16_m128i(mtx[j])));
         mtx[j] = vreinterpretq_m128i_u16(
             vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 15));
-        int tmp = _sse2neon_vaddvq_u16(vreinterpretq_u16_m128i(mtx[j])) ? 1 : 0;
+        uint16_t tmp =
+            _sse2neon_vaddvq_u16(vreinterpretq_u16_m128i(mtx[j])) ? 1 : 0;
         res |= (tmp << j);
     }
     return res;
@@ -7916,10 +8065,10 @@ static int _sse2neon_aggregate_equal_any_16x8(int la, int lb, __m128i mtx[16])
 
 SSE2NEON_GENERATE_CMP_EQUAL_ANY(SSE2NEON_CMP_EQUAL_ANY_)
 
-static int _sse2neon_aggregate_ranges_16x8(int la, int lb, __m128i mtx[16])
+static uint16_t _sse2neon_aggregate_ranges_16x8(int la, int lb, __m128i mtx[16])
 {
-    int res = 0;
-    int m = (1 << la) - 1;
+    uint16_t res = 0;
+    uint16_t m = (uint16_t) ((1 << la) - 1);
     uint16x8_t vec =
         vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b));
     for (int j = 0; j < lb; j++) {
@@ -7931,24 +8080,24 @@ static int _sse2neon_aggregate_ranges_16x8(int la, int lb, __m128i mtx[16])
             vshrq_n_u32(vreinterpretq_u32_m128i(mtx[j]), 16));
         uint32x4_t vec_res = vandq_u32(vreinterpretq_u32_m128i(mtx[j]),
                                        vreinterpretq_u32_m128i(tmp));
-#if defined(__aarch64__) || defined(_M_ARM64)
-        int t = vaddvq_u32(vec_res) ? 1 : 0;
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+        uint16_t t = vaddvq_u32(vec_res) ? 1 : 0;
 #else
         uint64x2_t sumh = vpaddlq_u32(vec_res);
-        int t = vgetq_lane_u64(sumh, 0) + vgetq_lane_u64(sumh, 1);
+        uint16_t t = (vgetq_lane_u64(sumh, 0) + vgetq_lane_u64(sumh, 1)) ? 1 : 0;
 #endif
         res |= (t << j);
     }
     return res;
 }
 
-static int _sse2neon_aggregate_ranges_8x16(int la, int lb, __m128i mtx[16])
+static uint16_t _sse2neon_aggregate_ranges_8x16(int la, int lb, __m128i mtx[16])
 {
-    int res = 0;
-    int m = (1 << la) - 1;
+    uint16_t res = 0;
+    uint16_t m = (uint16_t) ((1 << la) - 1);
     uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
-    uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask);
-    uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask);
+    uint8x8_t t_lo = vtst_u8(vdup_n_u8((uint8_t) (m & 0xff)), vec_mask);
+    uint8x8_t t_hi = vtst_u8(vdup_n_u8((uint8_t) (m >> 8)), vec_mask);
     uint8x16_t vec = vcombine_u8(t_lo, t_hi);
     for (int j = 0; j < lb; j++) {
         mtx[j] = vreinterpretq_m128i_u8(
@@ -7959,7 +8108,7 @@ static int _sse2neon_aggregate_ranges_8x16(int la, int lb, __m128i mtx[16])
             vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 8));
         uint16x8_t vec_res = vandq_u16(vreinterpretq_u16_m128i(mtx[j]),
                                        vreinterpretq_u16_m128i(tmp));
-        int t = _sse2neon_vaddvq_u16(vec_res) ? 1 : 0;
+        uint16_t t = _sse2neon_vaddvq_u16(vec_res) ? 1 : 0;
         res |= (t << j);
     }
     return res;
@@ -7981,22 +8130,25 @@ SSE2NEON_GENERATE_CMP_RANGES(SSE2NEON_CMP_RANGES_)
 #undef SSE2NEON_CMP_RANGES_IS_BYTE
 #undef SSE2NEON_CMP_RANGES_IS_WORD
 
-static int _sse2neon_cmp_byte_equal_each(__m128i a, int la, __m128i b, int lb)
+static uint16_t _sse2neon_cmp_byte_equal_each(__m128i a,
+                                              int la,
+                                              __m128i b,
+                                              int lb)
 {
     uint8x16_t mtx =
         vceqq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b));
-    int m0 = (la < lb) ? 0 : ((1 << la) - (1 << lb));
-    int m1 = 0x10000 - (1 << la);
-    int tb = 0x10000 - (1 << lb);
+    uint16_t m0 = (la < lb) ? 0 : (uint16_t) ((1 << la) - (1 << lb));
+    uint16_t m1 = (uint16_t) (0x10000 - (1 << la));
+    uint16_t tb = (uint16_t) (0x10000 - (1 << lb));
     uint8x8_t vec_mask, vec0_lo, vec0_hi, vec1_lo, vec1_hi;
     uint8x8_t tmp_lo, tmp_hi, res_lo, res_hi;
     vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
-    vec0_lo = vtst_u8(vdup_n_u8(m0), vec_mask);
-    vec0_hi = vtst_u8(vdup_n_u8(m0 >> 8), vec_mask);
-    vec1_lo = vtst_u8(vdup_n_u8(m1), vec_mask);
-    vec1_hi = vtst_u8(vdup_n_u8(m1 >> 8), vec_mask);
-    tmp_lo = vtst_u8(vdup_n_u8(tb), vec_mask);
-    tmp_hi = vtst_u8(vdup_n_u8(tb >> 8), vec_mask);
+    vec0_lo = vtst_u8(vdup_n_u8((uint8_t) m0), vec_mask);
+    vec0_hi = vtst_u8(vdup_n_u8((uint8_t) (m0 >> 8)), vec_mask);
+    vec1_lo = vtst_u8(vdup_n_u8((uint8_t) m1), vec_mask);
+    vec1_hi = vtst_u8(vdup_n_u8((uint8_t) (m1 >> 8)), vec_mask);
+    tmp_lo = vtst_u8(vdup_n_u8((uint8_t) tb), vec_mask);
+    tmp_hi = vtst_u8(vdup_n_u8((uint8_t) (tb >> 8)), vec_mask);
 
     res_lo = vbsl_u8(vec0_lo, vdup_n_u8(0), vget_low_u8(mtx));
     res_hi = vbsl_u8(vec0_hi, vdup_n_u8(0), vget_high_u8(mtx));
@@ -8005,17 +8157,20 @@ static int _sse2neon_cmp_byte_equal_each(__m128i a, int la, __m128i b, int lb)
     res_lo = vand_u8(res_lo, vec_mask);
     res_hi = vand_u8(res_hi, vec_mask);
 
-    int res = _sse2neon_vaddv_u8(res_lo) + (_sse2neon_vaddv_u8(res_hi) << 8);
-    return res;
+    return _sse2neon_vaddv_u8(res_lo) +
+           (uint16_t) (_sse2neon_vaddv_u8(res_hi) << 8);
 }
 
-static int _sse2neon_cmp_word_equal_each(__m128i a, int la, __m128i b, int lb)
+static uint16_t _sse2neon_cmp_word_equal_each(__m128i a,
+                                              int la,
+                                              __m128i b,
+                                              int lb)
 {
     uint16x8_t mtx =
         vceqq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b));
-    int m0 = (la < lb) ? 0 : ((1 << la) - (1 << lb));
-    int m1 = 0x100 - (1 << la);
-    int tb = 0x100 - (1 << lb);
+    uint16_t m0 = (uint16_t) ((la < lb) ? 0 : ((1 << la) - (1 << lb)));
+    uint16_t m1 = (uint16_t) (0x100 - (1 << la));
+    uint16_t tb = (uint16_t) (0x100 - (1 << lb));
     uint16x8_t vec_mask = vld1q_u16(_sse2neon_cmpestr_mask16b);
     uint16x8_t vec0 = vtstq_u16(vdupq_n_u16(m0), vec_mask);
     uint16x8_t vec1 = vtstq_u16(vdupq_n_u16(m1), vec_mask);
@@ -8030,18 +8185,22 @@ static int _sse2neon_cmp_word_equal_each(__m128i a, int la, __m128i b, int lb)
 #define SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UWORD 0
 
 #define SSE2NEON_AGGREGATE_EQUAL_ORDER_IMPL(size, number_of_lanes, data_type)  \
-    static int _sse2neon_aggregate_equal_ordered_##size##x##number_of_lanes(   \
-        int bound, int la, int lb, __m128i mtx[16])                            \
+    static uint16_t                                                            \
+        _sse2neon_aggregate_equal_ordered_##size##x##number_of_lanes(          \
+            int bound, int la, int lb, __m128i mtx[16])                        \
     {                                                                          \
-        int res = 0;                                                           \
-        int m1 = SSE2NEON_IIF(data_type)(0x10000, 0x100) - (1 << la);          \
+        uint16_t res = 0;                                                      \
+        uint16_t m1 =                                                          \
+            (uint16_t) (SSE2NEON_IIF(data_type)(0x10000, 0x100) - (1 << la));  \
         uint##size##x8_t vec_mask = SSE2NEON_IIF(data_type)(                   \
             vld1_u##size(_sse2neon_cmpestr_mask##size##b),                     \
             vld1q_u##size(_sse2neon_cmpestr_mask##size##b));                   \
         uint##size##x##number_of_lanes##_t vec1 = SSE2NEON_IIF(data_type)(     \
-            vcombine_u##size(vtst_u##size(vdup_n_u##size(m1), vec_mask),       \
-                             vtst_u##size(vdup_n_u##size(m1 >> 8), vec_mask)), \
-            vtstq_u##size(vdupq_n_u##size(m1), vec_mask));                     \
+            vcombine_u##size(                                                  \
+                vtst_u##size(vdup_n_u##size((uint##size##_t) m1), vec_mask),   \
+                vtst_u##size(vdup_n_u##size((uint##size##_t)(m1 >> 8)),        \
+                             vec_mask)),                                       \
+            vtstq_u##size(vdupq_n_u##size((uint##size##_t) m1), vec_mask));    \
         uint##size##x##number_of_lanes##_t vec_minusone = vdupq_n_u##size(-1); \
         uint##size##x##number_of_lanes##_t vec_zero = vdupq_n_u##size(0);      \
         for (int j = 0; j < lb; j++) {                                         \
@@ -8058,7 +8217,7 @@ static int _sse2neon_cmp_word_equal_each(__m128i a, int la, __m128i b, int lb)
             int val = 1;                                                       \
             for (int j = 0, k = i; j < bound - i && k < bound; j++, k++)       \
                 val &= ptr[k * bound + j];                                     \
-            res += val << i;                                                   \
+            res += (uint16_t) (val << i);                                      \
         }                                                                      \
         return res;                                                            \
     }
@@ -8105,14 +8264,17 @@ enum {
     SSE2NEON_CMPESTR_LIST
 #undef _
 };
-typedef int (*cmpestr_func_t)(__m128i a, int la, __m128i b, int lb);
+typedef uint16_t (*cmpestr_func_t)(__m128i a, int la, __m128i b, int lb);
 static cmpestr_func_t _sse2neon_cmpfunc_table[] = {
 #define _(name, func_suffix) _sse2neon_##func_suffix,
     SSE2NEON_CMPESTR_LIST
 #undef _
 };
 
-FORCE_INLINE int _sse2neon_sido_negative(int res, int lb, int imm8, int bound)
+FORCE_INLINE uint16_t _sse2neon_sido_negative(int res,
+                                              int lb,
+                                              int imm8,
+                                              int bound)
 {
     switch (imm8 & 0x30) {
     case _SIDD_NEGATIVE_POLARITY:
@@ -8125,12 +8287,12 @@ FORCE_INLINE int _sse2neon_sido_negative(int res, int lb, int imm8, int bound)
         break;
     }
 
-    return res & ((bound == 8) ? 0xFF : 0xFFFF);
+    return (uint16_t) (res & ((bound == 8) ? 0xFF : 0xFFFF));
 }
 
 FORCE_INLINE int _sse2neon_clz(unsigned int x)
 {
-#ifdef _MSC_VER
+#if defined(_MSC_VER) && !defined(__clang__)
     unsigned long cnt = 0;
     if (_BitScanReverse(&cnt, x))
         return 31 - cnt;
@@ -8142,7 +8304,7 @@ FORCE_INLINE int _sse2neon_clz(unsigned int x)
 
 FORCE_INLINE int _sse2neon_ctz(unsigned int x)
 {
-#ifdef _MSC_VER
+#if defined(_MSC_VER) && !defined(__clang__)
     unsigned long cnt = 0;
     if (_BitScanForward(&cnt, x))
         return cnt;
@@ -8174,7 +8336,7 @@ FORCE_INLINE int _sse2neon_ctzll(unsigned long long x)
 #define SSE2NEON_MIN(x, y) (x) < (y) ? (x) : (y)
 
 #define SSE2NEON_CMPSTR_SET_UPPER(var, imm) \
-    const int var = (imm & 0x01) ? 8 : 16
+    const int var = ((imm) & 0x01) ? 8 : 16
 
 #define SSE2NEON_CMPESTRX_LEN_PAIR(a, b, la, lb) \
     int tmp1 = la ^ (la >> 31);                  \
@@ -8189,28 +8351,28 @@ FORCE_INLINE int _sse2neon_ctzll(unsigned long long x)
 // As the only difference of PCMPESTR* and PCMPISTR* is the way to calculate the
 // length of string, we use SSE2NEON_CMP{I,E}STRX_GET_LEN to get the length of
 // string a and b.
-#define SSE2NEON_COMP_AGG(a, b, la, lb, imm8, IE)                  \
-    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);                        \
-    SSE2NEON_##IE##_LEN_PAIR(a, b, la, lb);                        \
-    int r2 = (_sse2neon_cmpfunc_table[imm8 & 0x0f])(a, la, b, lb); \
+#define SSE2NEON_COMP_AGG(a, b, la, lb, imm8, IE)                         \
+    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);                               \
+    SSE2NEON_##IE##_LEN_PAIR(a, b, la, lb);                               \
+    uint16_t r2 = (_sse2neon_cmpfunc_table[(imm8) & 0x0f])(a, la, b, lb); \
     r2 = _sse2neon_sido_negative(r2, lb, imm8, bound)
 
-#define SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8)          \
-    return (r2 == 0) ? bound                                     \
-                     : ((imm8 & 0x40) ? (31 - _sse2neon_clz(r2)) \
-                                      : _sse2neon_ctz(r2))
+#define SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8)            \
+    return (r2 == 0) ? bound                                       \
+                     : (((imm8) & 0x40) ? (31 - _sse2neon_clz(r2)) \
+                                        : _sse2neon_ctz(r2))
 
 #define SSE2NEON_CMPSTR_GENERATE_MASK(dst)                                     \
     __m128i dst = vreinterpretq_m128i_u8(vdupq_n_u8(0));                       \
-    if (imm8 & 0x40) {                                                         \
+    if ((imm8) & 0x40) {                                                       \
         if (bound == 8) {                                                      \
             uint16x8_t tmp = vtstq_u16(vdupq_n_u16(r2),                        \
                                        vld1q_u16(_sse2neon_cmpestr_mask16b));  \
             dst = vreinterpretq_m128i_u16(vbslq_u16(                           \
                 tmp, vdupq_n_u16(-1), vreinterpretq_u16_m128i(dst)));          \
         } else {                                                               \
-            uint8x16_t vec_r2 =                                                \
-                vcombine_u8(vdup_n_u8(r2), vdup_n_u8(r2 >> 8));                \
+            uint8x16_t vec_r2 = vcombine_u8(vdup_n_u8((uint8_t) r2),           \
+                                            vdup_n_u8((uint8_t) (r2 >> 8)));   \
             uint8x16_t tmp =                                                   \
                 vtstq_u8(vec_r2, vld1q_u8(_sse2neon_cmpestr_mask8b));          \
             dst = vreinterpretq_m128i_u8(                                      \
@@ -8221,8 +8383,8 @@ FORCE_INLINE int _sse2neon_ctzll(unsigned long long x)
             dst = vreinterpretq_m128i_u16(                                     \
                 vsetq_lane_u16(r2 & 0xffff, vreinterpretq_u16_m128i(dst), 0)); \
         } else {                                                               \
-            dst = vreinterpretq_m128i_u8(                                      \
-                vsetq_lane_u8(r2 & 0xff, vreinterpretq_u8_m128i(dst), 0));     \
+            dst = vreinterpretq_m128i_u8(vsetq_lane_u8(                        \
+                (uint8_t) (r2 & 0xff), vreinterpretq_u8_m128i(dst), 0));       \
         }                                                                      \
     }                                                                          \
     return dst
@@ -8325,7 +8487,7 @@ FORCE_INLINE int _mm_cmpestrz(__m128i a,
 
 #define SSE2NEON_CMPISTRX_LENGTH(str, len, imm8)                         \
     do {                                                                 \
-        if (imm8 & 0x01) {                                               \
+        if ((imm8) & 0x01) {                                             \
             uint16x8_t equal_mask_##str =                                \
                 vceqq_u16(vreinterpretq_u16_m128i(str), vdupq_n_u16(0)); \
             uint8x8_t res_##str = vshrn_n_u16(equal_mask_##str, 4);      \
@@ -8423,7 +8585,7 @@ FORCE_INLINE int _mm_cmpistrz(__m128i a, __m128i b, const int imm8)
 // in b for greater than.
 FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128i_u64(
         vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
 #else
@@ -8443,11 +8605,11 @@ FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v)
                          : [c] "+r"(crc)
                          : [v] "r"(v));
 #elif ((__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)) || \
-    (defined(_M_ARM64) && !defined(__clang__))
+    ((defined(_M_ARM64) || defined(_M_ARM64EC)) && !defined(__clang__))
     crc = __crc32ch(crc, v);
 #else
-    crc = _mm_crc32_u8(crc, v & 0xff);
-    crc = _mm_crc32_u8(crc, (v >> 8) & 0xff);
+    crc = _mm_crc32_u8(crc, (uint8_t) (v & 0xff));
+    crc = _mm_crc32_u8(crc, (uint8_t) ((v >> 8) & 0xff));
 #endif
     return crc;
 }
@@ -8462,11 +8624,11 @@ FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v)
                          : [c] "+r"(crc)
                          : [v] "r"(v));
 #elif ((__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)) || \
-    (defined(_M_ARM64) && !defined(__clang__))
+    ((defined(_M_ARM64) || defined(_M_ARM64EC)) && !defined(__clang__))
     crc = __crc32cw(crc, v);
 #else
-    crc = _mm_crc32_u16(crc, v & 0xffff);
-    crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff);
+    crc = _mm_crc32_u16(crc, (uint16_t) (v & 0xffff));
+    crc = _mm_crc32_u16(crc, (uint16_t) ((v >> 16) & 0xffff));
 #endif
     return crc;
 }
@@ -8480,11 +8642,11 @@ FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v)
     __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t"
                          : [c] "+r"(crc)
                          : [v] "r"(v));
-#elif (defined(_M_ARM64) && !defined(__clang__))
+#elif ((defined(_M_ARM64) || defined(_M_ARM64EC)) && !defined(__clang__))
     crc = __crc32cd((uint32_t) crc, v);
 #else
-    crc = _mm_crc32_u32((uint32_t) (crc), v & 0xffffffff);
-    crc = _mm_crc32_u32((uint32_t) (crc), (v >> 32) & 0xffffffff);
+    crc = _mm_crc32_u32((uint32_t) (crc), (uint32_t) (v & 0xffffffff));
+    crc = _mm_crc32_u32((uint32_t) (crc), (uint32_t) ((v >> 32) & 0xffffffff));
 #endif
     return crc;
 }
@@ -8499,7 +8661,7 @@ FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
                          : [c] "+r"(crc)
                          : [v] "r"(v));
 #elif ((__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)) || \
-    (defined(_M_ARM64) && !defined(__clang__))
+    ((defined(_M_ARM64) || defined(_M_ARM64EC)) && !defined(__clang__))
     crc = __crc32cb(crc, v);
 #else
     crc ^= v;
@@ -8530,7 +8692,7 @@ FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
     crc = vgetq_lane_u32(vreinterpretq_u32_u64(tmp), 1);
 #else  // Fall back to the generic table lookup approach
     // Adapted from: https://create.stephan-brumme.com/crc32/
-    // Apply half-byte comparision algorithm for the best ratio between
+    // Apply half-byte comparison algorithm for the best ratio between
     // performance and lookup table.
 
     // The lookup table just needs to store every 16th entry
@@ -8550,7 +8712,8 @@ FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
 
 /* AES */
 
-#if !defined(__ARM_FEATURE_CRYPTO) && (!defined(_M_ARM64) || defined(__clang__))
+#if !defined(__ARM_FEATURE_CRYPTO) && \
+    ((!defined(_M_ARM64) && !defined(_M_ARM64EC)) || defined(__clang__))
 /* clang-format off */
 #define SSE2NEON_AES_SBOX(w)                                           \
     {                                                                  \
@@ -8641,7 +8804,7 @@ static const uint8_t _sse2neon_rsbox[256] = SSE2NEON_AES_RSBOX(SSE2NEON_AES_H0);
 #undef SSE2NEON_AES_H0
 
 /* x_time function and matrix multiply function */
-#if !defined(__aarch64__) && !defined(_M_ARM64)
+#if !defined(__aarch64__)
 #define SSE2NEON_XT(x) (((x) << 1) ^ ((((x) >> 7) & 1) * 0x1b))
 #define SSE2NEON_MULTIPLY(x, y)                                  \
     (((y & 1) * x) ^ ((y >> 1 & 1) * SSE2NEON_XT(x)) ^           \
@@ -8657,7 +8820,7 @@ static const uint8_t _sse2neon_rsbox[256] = SSE2NEON_AES_RSBOX(SSE2NEON_AES_H0);
 // for more information.
 FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i RoundKey)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__)
     static const uint8_t shift_rows[] = {
         0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
         0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb,
@@ -8697,9 +8860,9 @@ FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i RoundKey)
 #define SSE2NEON_AES_B2W(b0, b1, b2, b3)                 \
     (((uint32_t) (b3) << 24) | ((uint32_t) (b2) << 16) | \
      ((uint32_t) (b1) << 8) | (uint32_t) (b0))
-// muliplying 'x' by 2 in GF(2^8)
+// multiplying 'x' by 2 in GF(2^8)
 #define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */))
-// muliplying 'x' by 3 in GF(2^8)
+// multiplying 'x' by 3 in GF(2^8)
 #define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x)
 #define SSE2NEON_AES_U0(p) \
     SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p))
@@ -8784,7 +8947,7 @@ FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey)
     v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w);
 
     w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) &
-                                 0x1b);  // muliplying 'v' by 2 in GF(2^8)
+                                 0x1b);  // multiplying 'v' by 2 in GF(2^8)
     w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
     w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
 
@@ -8816,7 +8979,8 @@ FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey)
                   SSE2NEON_MULTIPLY(g, 0x09) ^ SSE2NEON_MULTIPLY(h, 0x0e);
     }
 
-    return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)) ^ RoundKey;
+    return _mm_xor_si128(vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)),
+                         RoundKey);
 #endif
 }
 
@@ -8866,7 +9030,7 @@ FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
         _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 11)],
     };
 
-    return vreinterpretq_m128i_u8(vld1q_u8(v)) ^ RoundKey;
+    return _mm_xor_si128(vreinterpretq_m128i_u8(vld1q_u8(v)), RoundKey);
 #endif
 }
 
@@ -8904,7 +9068,8 @@ FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey)
         v[((i / 4) + (i % 4)) % 4][i % 4] = _sse2neon_rsbox[_a[i]];
     }
 
-    return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)) ^ RoundKey;
+    return _mm_xor_si128(vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)),
+                         RoundKey);
 #endif
 }
 
@@ -9058,7 +9223,7 @@ FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
     // AESE does ShiftRows and SubBytes on A
     uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0));
 
-#ifndef _MSC_VER
+#if !defined(_MSC_VER) || defined(__clang__)
     uint8x16_t dest = {
         // Undo ShiftRows step from AESE and extract X1 and X3
         u8[0x4], u8[0x1], u8[0xE], u8[0xB],  // SubBytes(X1)
@@ -9129,14 +9294,14 @@ FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode(void)
 {
     union {
         fpcr_bitfield field;
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
         uint64_t value;
 #else
         uint32_t value;
 #endif
     } r;
 
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     r.value = _sse2neon_get_fpcr();
 #else
     __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
@@ -9150,7 +9315,7 @@ FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode(void)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u32
 FORCE_INLINE int _mm_popcnt_u32(unsigned int a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
 #if __has_builtin(__builtin_popcount)
     return __builtin_popcount(a);
 #elif defined(_MSC_VER)
@@ -9179,7 +9344,7 @@ FORCE_INLINE int _mm_popcnt_u32(unsigned int a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u64
 FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
 #if __has_builtin(__builtin_popcountll)
     return __builtin_popcountll(a);
 #elif defined(_MSC_VER)
@@ -9210,14 +9375,14 @@ FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag)
     // regardless of the value of the FZ bit.
     union {
         fpcr_bitfield field;
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
         uint64_t value;
 #else
         uint32_t value;
 #endif
     } r;
 
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     r.value = _sse2neon_get_fpcr();
 #else
     __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
@@ -9225,10 +9390,10 @@ FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag)
 
     r.field.bit24 = (flag & _MM_DENORMALS_ZERO_MASK) == _MM_DENORMALS_ZERO_ON;
 
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     _sse2neon_set_fpcr(r.value);
 #else
-    __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r));        /* write */
+    __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
 #endif
 }
 
@@ -9236,7 +9401,7 @@ FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=rdtsc
 FORCE_INLINE uint64_t _rdtsc(void)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     uint64_t val;
 
     /* According to ARM DDI 0487F.c, from Armv8.0 to Armv8.5 inclusive, the
@@ -9245,7 +9410,7 @@ FORCE_INLINE uint64_t _rdtsc(void)
      * bits wide and it is attributed with the flag 'cap_user_time_short'
      * is true.
      */
-#if defined(_MSC_VER)
+#if defined(_MSC_VER) && !defined(__clang__)
     val = _ReadStatusReg(ARM64_SYSREG(3, 3, 14, 0, 2));
 #else
     __asm__ __volatile__("mrs %0, cntvct_el0" : "=r"(val));

From 96bdc60796bd686822a82a0c64e291253a77af74 Mon Sep 17 00:00:00 2001
From: lizzie <lizzie@eden-emu.dev>
Date: Wed, 23 Jul 2025 22:33:46 +0100
Subject: [PATCH 02/38] [sse2neon] update to stable

---
 externals/sse2neon/sse2neon.h | 749 ++++++++++++++++------------------
 1 file changed, 358 insertions(+), 391 deletions(-)

diff --git a/externals/sse2neon/sse2neon.h b/externals/sse2neon/sse2neon.h
index 4626e923fd..79b90fe864 100755
--- a/externals/sse2neon/sse2neon.h
+++ b/externals/sse2neon/sse2neon.h
@@ -1,6 +1,3 @@
-// SPDX-FileCopyrightText: Copyright 2015-2024 SSE2NEON Contributors
-// SPDX-License-Identifier: MIT
-
 #ifndef SSE2NEON_H
 #define SSE2NEON_H
 
@@ -131,17 +128,17 @@
 #include <stdlib.h>
 #include <string.h>
 
-FORCE_INLINE double sse2neon_recast_u64_f64(uint64_t val)
+FORCE_INLINE double sse2neon_recast_u64_f64(uint64_t u64)
 {
-    double tmp;
-    memcpy(&tmp, &val, sizeof(uint64_t));
-    return tmp;
+    double f64;
+    memcpy(&f64, &u64, sizeof(uint64_t));
+    return f64;
 }
-FORCE_INLINE int64_t sse2neon_recast_f64_s64(double val)
+FORCE_INLINE int64_t sse2neon_recast_f64_s64(double f64)
 {
-    int64_t tmp;
-    memcpy(&tmp, &val, sizeof(uint64_t));
-    return tmp;
+    int64_t i64;
+    memcpy(&i64, &f64, sizeof(uint64_t));
+    return i64;
 }
 
 #if defined(_WIN32) && !defined(__MINGW32__)
@@ -151,9 +148,6 @@ FORCE_INLINE int64_t sse2neon_recast_f64_s64(double val)
 
 /* If using MSVC */
 #ifdef _MSC_VER
-#if defined(_M_ARM64EC)
-#define _DISABLE_SOFTINTRIN_ 1
-#endif
 #include <intrin.h>
 #if SSE2NEON_INCLUDE_WINDOWS_H
 #include <processthreadsapi.h>
@@ -169,7 +163,7 @@ FORCE_INLINE int64_t sse2neon_recast_f64_s64(double val)
 #endif
 
 #if (defined(_M_AMD64) || defined(__x86_64__)) || \
-    (defined(_M_ARM64) || defined(_M_ARM64EC) || defined(__arm64__))
+    (defined(_M_ARM64) || defined(__arm64__))
 #define SSE2NEON_HAS_BITSCAN64
 #endif
 #endif
@@ -252,7 +246,7 @@ FORCE_INLINE void _sse2neon_smp_mb(void)
 #pragma GCC push_options
 #pragma GCC target("fpu=neon")
 #endif
-#elif defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#elif defined(__aarch64__) || defined(_M_ARM64)
 #if !defined(__clang__) && !defined(_MSC_VER)
 #pragma GCC push_options
 #pragma GCC target("+simd")
@@ -273,8 +267,7 @@ FORCE_INLINE void _sse2neon_smp_mb(void)
 #endif
 
 #include <arm_neon.h>
-#if (!defined(__aarch64__) && !defined(_M_ARM64) && !defined(_M_ARM64EC)) && \
-    (__ARM_ARCH == 8)
+#if (!defined(__aarch64__) && !defined(_M_ARM64)) && (__ARM_ARCH == 8)
 #if defined __has_include && __has_include(<arm_acle.h>)
 #include <arm_acle.h>
 #endif
@@ -292,7 +285,7 @@ FORCE_INLINE void _sse2neon_smp_mb(void)
 #endif
 
 /* Rounding functions require either Aarch64 instructions or libm fallback */
-#if !defined(__aarch64__) && !defined(_M_ARM64) && !defined(_M_ARM64EC)
+#if !defined(__aarch64__) && !defined(_M_ARM64)
 #include <math.h>
 #endif
 
@@ -301,7 +294,7 @@ FORCE_INLINE void _sse2neon_smp_mb(void)
  * To write or access to these registers in user mode,
  * we have to perform syscall instead.
  */
-#if !defined(__aarch64__) && !defined(_M_ARM64) && !defined(_M_ARM64EC)
+#if (!defined(__aarch64__) && !defined(_M_ARM64))
 #include <sys/time.h>
 #endif
 
@@ -410,7 +403,7 @@ typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */
 // On ARM 32-bit architecture, the float64x2_t is not supported.
 // The data type __m128d should be represented in a different way for related
 // intrinsic conversion.
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
 typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */
 #else
 typedef float32x4_t __m128d;
@@ -511,7 +504,7 @@ typedef int64_t ALIGN_STRUCT(1) unaligned_int64_t;
 
 #define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x)
 
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
 #define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x)
 #define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x)
 
@@ -643,7 +636,7 @@ FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
 }
 #endif
 
-#if !defined(__aarch64__) && !defined(_M_ARM64) && !defined(_M_ARM64EC)
+#if !defined(__aarch64__) && !defined(_M_ARM64)
 /* emulate vaddv u8 variant */
 FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8)
 {
@@ -658,7 +651,7 @@ FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8)
 }
 #endif
 
-#if !defined(__aarch64__) && !defined(_M_ARM64) && !defined(_M_ARM64EC)
+#if !defined(__aarch64__) && !defined(_M_ARM64)
 /* emulate vaddvq u8 variant */
 FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a)
 {
@@ -676,7 +669,7 @@ FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a)
 }
 #endif
 
-#if !defined(__aarch64__) && !defined(_M_ARM64) && !defined(_M_ARM64EC)
+#if !defined(__aarch64__) && !defined(_M_ARM64)
 /* emulate vaddvq u16 variant */
 FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a)
 {
@@ -731,13 +724,6 @@ FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a)
  */
 
 /* Constants for use with _mm_prefetch. */
-#if defined(_M_ARM64EC)
-/* winnt.h already defines these constants as macros, so undefine them first. */
-#undef _MM_HINT_NTA
-#undef _MM_HINT_T0
-#undef _MM_HINT_T1
-#undef _MM_HINT_T2
-#endif
 enum _mm_hint {
     _MM_HINT_NTA = 0, /* load data to L1 and L2 cache, mark it as NTA */
     _MM_HINT_T0 = 1,  /* load data to L1 and L2 cache */
@@ -753,7 +739,7 @@ typedef struct {
     uint8_t bit23 : 1;
     uint8_t bit24 : 1;
     uint8_t res2 : 7;
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     uint32_t res3;
 #endif
 } fpcr_bitfield;
@@ -897,8 +883,8 @@ FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b)
 // supported by WoA has crypto extensions. If this changes in the future,
 // this can be verified via the runtime-only method of:
 // IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE)
-#if ((defined(_M_ARM64) || defined(_M_ARM64EC)) && !defined(__clang__)) || \
-    (defined(__ARM_FEATURE_CRYPTO) &&                                      \
+#if (defined(_M_ARM64) && !defined(__clang__)) || \
+    (defined(__ARM_FEATURE_CRYPTO) &&             \
      (defined(__aarch64__) || __has_builtin(__builtin_arm_crypto_vmullp64)))
 // Wraps vmull_p64
 FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
@@ -1023,8 +1009,8 @@ static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
 //   __m128i _mm_shuffle_epi32_default(__m128i a,
 //                                     __constrange(0, 255) int imm) {
 //       __m128i ret;
-//       ret[0] = a[(imm)        & 0x3];   ret[1] = a[((imm) >> 2) & 0x3];
-//       ret[2] = a[((imm) >> 4) & 0x03];  ret[3] = a[((imm) >> 6) & 0x03];
+//       ret[0] = a[imm        & 0x3];   ret[1] = a[(imm >> 2) & 0x3];
+//       ret[2] = a[(imm >> 4) & 0x03];  ret[3] = a[(imm >> 6) & 0x03];
 //       return ret;
 //   }
 #define _mm_shuffle_epi32_default(a, imm)                                   \
@@ -1122,7 +1108,7 @@ FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
     return vreinterpretq_m128i_s32(vcombine_s32(a32, a33));
 }
 
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
 #define _mm_shuffle_epi32_splat(a, imm) \
     vreinterpretq_m128i_s32(vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm)))
 #else
@@ -1139,8 +1125,8 @@ FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
 //   __m128 _mm_shuffle_ps_default(__m128 a, __m128 b,
 //                                 __constrange(0, 255) int imm) {
 //       __m128 ret;
-//       ret[0] = a[(imm)        & 0x3];   ret[1] = a[((imm) >> 2) & 0x3];
-//       ret[2] = b[((imm) >> 4) & 0x03];  ret[3] = b[((imm) >> 6) & 0x03];
+//       ret[0] = a[imm        & 0x3];   ret[1] = a[(imm >> 2) & 0x3];
+//       ret[2] = b[(imm >> 4) & 0x03];  ret[3] = b[(imm >> 6) & 0x03];
 //       return ret;
 //   }
 //
@@ -1562,7 +1548,7 @@ FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ps2pi
 FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a)
 {
-#if (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) || \
+#if (defined(__aarch64__) || defined(_M_ARM64)) || \
     defined(__ARM_FEATURE_DIRECTED_ROUNDING)
     return vreinterpret_m64_s32(
         vget_low_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a)))));
@@ -1587,7 +1573,7 @@ FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ss2si
 FORCE_INLINE int _mm_cvt_ss2si(__m128 a)
 {
-#if (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) || \
+#if (defined(__aarch64__) || defined(_M_ARM64)) || \
     defined(__ARM_FEATURE_DIRECTED_ROUNDING)
     return vgetq_lane_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a))),
                           0);
@@ -1718,7 +1704,7 @@ FORCE_INLINE float _mm_cvtss_f32(__m128 a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si64
 FORCE_INLINE int64_t _mm_cvtss_si64(__m128 a)
 {
-#if (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) || \
+#if (defined(__aarch64__) || defined(_M_ARM64)) || \
     defined(__ARM_FEATURE_DIRECTED_ROUNDING)
     return (int64_t) vgetq_lane_f32(vrndiq_f32(vreinterpretq_f32_m128(a)), 0);
 #else
@@ -1771,7 +1757,7 @@ FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ps
 FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128_f32(
         vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
 #else
@@ -1845,14 +1831,14 @@ FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode(void)
 {
     union {
         fpcr_bitfield field;
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
         uint64_t value;
 #else
         uint32_t value;
 #endif
     } r;
 
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     r.value = _sse2neon_get_fpcr();
 #else
     __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
@@ -1991,18 +1977,20 @@ FORCE_INLINE __m128i _mm_loadu_si64(const void *p)
 #if !defined(SSE2NEON_ALLOC_DEFINED)
 FORCE_INLINE void *_mm_malloc(size_t size, size_t align)
 {
-#if defined(_WIN32)
-    return _aligned_malloc(size, align);
-#else
     void *ptr;
     if (align == 1)
         return malloc(size);
     if (align == 2 || (sizeof(void *) == 8 && align == 4))
         align = sizeof(void *);
+#if defined(_WIN32)
+    ptr = _aligned_malloc(size, align);
+    if (ptr)
+        return ptr;
+#else
     if (!posix_memalign(&ptr, align, size))
         return ptr;
-    return NULL;
 #endif
+    return NULL;
 }
 #endif
 
@@ -2166,7 +2154,7 @@ FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)
 FORCE_INLINE int _mm_movemask_pi8(__m64 a)
 {
     uint8x8_t input = vreinterpret_u8_m64(a);
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     static const int8_t shift[8] = {0, 1, 2, 3, 4, 5, 6, 7};
     uint8x8_t tmp = vshr_n_u8(input, 7);
     return vaddv_u8(vshl_u8(tmp, vld1_s8(shift)));
@@ -2187,7 +2175,7 @@ FORCE_INLINE int _mm_movemask_pi8(__m64 a)
 FORCE_INLINE int _mm_movemask_ps(__m128 a)
 {
     uint32x4_t input = vreinterpretq_u32_m128(a);
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     static const int32_t shift[4] = {0, 1, 2, 3};
     uint32x4_t tmp = vshrq_n_u32(input, 31);
     return vaddvq_u32(vshlq_u32(tmp, vld1q_s32(shift)));
@@ -2421,7 +2409,7 @@ FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
     uint64x1_t t = vpaddl_u32(vpaddl_u16(
         vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)))));
     return vreinterpret_m64_u16(
-        vset_lane_u16((uint16_t) vget_lane_u64(t, 0), vdup_n_u16(0), 0));
+        vset_lane_u16((int) vget_lane_u64(t, 0), vdup_n_u16(0), 0));
 }
 
 // Macro: Set the flush zero bits of the MXCSR control and status register to
@@ -2434,14 +2422,14 @@ FORCE_INLINE void _sse2neon_mm_set_flush_zero_mode(unsigned int flag)
     // regardless of the value of the FZ bit.
     union {
         fpcr_bitfield field;
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
         uint64_t value;
 #else
         uint32_t value;
 #endif
     } r;
 
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     r.value = _sse2neon_get_fpcr();
 #else
     __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
@@ -2449,7 +2437,7 @@ FORCE_INLINE void _sse2neon_mm_set_flush_zero_mode(unsigned int flag)
 
     r.field.bit24 = (flag & _MM_FLUSH_ZERO_MASK) == _MM_FLUSH_ZERO_ON;
 
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     _sse2neon_set_fpcr(r.value);
 #else
     __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
@@ -2555,10 +2543,10 @@ FORCE_INLINE __m128 _mm_setzero_ps(void)
 // in dst.
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi16
 #ifdef _sse2neon_shuffle
-#define _mm_shuffle_pi16(a, imm)                                         \
-    vreinterpret_m64_s16(vshuffle_s16(                                   \
-        vreinterpret_s16_m64(a), vreinterpret_s16_m64(a), ((imm) & 0x3), \
-        (((imm) >> 2) & 0x3), (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3)))
+#define _mm_shuffle_pi16(a, imm)                                         \
+    vreinterpret_m64_s16(vshuffle_s16(                                   \
+        vreinterpret_s16_m64(a), vreinterpret_s16_m64(a), ((imm) & 0x3), \
+        (((imm) >> 2) & 0x3), (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3)))
 #else
 #define _mm_shuffle_pi16(a, imm)                                              \
     _sse2neon_define1(                                                        \
@@ -2689,8 +2677,7 @@ FORCE_INLINE void _mm_lfence(void)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ps
 FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
 {
-#if (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) && \
-    !SSE2NEON_PRECISE_SQRT
+#if (defined(__aarch64__) || defined(_M_ARM64)) && !SSE2NEON_PRECISE_SQRT
     return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in)));
 #else
     float32x4_t recip = vrsqrteq_f32(vreinterpretq_f32_m128(in));
@@ -2919,7 +2906,7 @@ FORCE_INLINE __m128 _mm_undefined_ps(void)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_ps
 FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128_f32(
         vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
 #else
@@ -2935,7 +2922,7 @@ FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_ps
 FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128_f32(
         vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
 #else
@@ -2994,7 +2981,7 @@ FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_pd
 FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(
         vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
@@ -3019,7 +3006,7 @@ FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_sd
 FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return _mm_move_sd(a, _mm_add_pd(a, b));
 #else
     double a0, a1, b0;
@@ -3180,7 +3167,7 @@ FORCE_INLINE __m128i _mm_castps_si128(__m128 a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_pd
 FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(vreinterpretq_f64_m128i(a));
 #else
     return vreinterpretq_m128d_f32(vreinterpretq_f32_m128i(a));
@@ -3252,7 +3239,7 @@ FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_pd
 FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_u64(
         vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
@@ -3278,7 +3265,7 @@ FORCE_INLINE __m128d _mm_cmpeq_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_pd
 FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_u64(
         vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
@@ -3304,7 +3291,7 @@ FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_sd
 FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return _mm_move_sd(a, _mm_cmpge_pd(a, b));
 #else
     // expand "_mm_cmpge_pd()" to reduce unnecessary operations
@@ -3352,7 +3339,7 @@ FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_pd
 FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_u64(
         vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
@@ -3378,7 +3365,7 @@ FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_sd
 FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return _mm_move_sd(a, _mm_cmpgt_pd(a, b));
 #else
     // expand "_mm_cmpge_pd()" to reduce unnecessary operations
@@ -3399,7 +3386,7 @@ FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_pd
 FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_u64(
         vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
@@ -3425,7 +3412,7 @@ FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_sd
 FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return _mm_move_sd(a, _mm_cmple_pd(a, b));
 #else
     // expand "_mm_cmpge_pd()" to reduce unnecessary operations
@@ -3476,7 +3463,7 @@ FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_pd
 FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_u64(
         vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
@@ -3502,7 +3489,7 @@ FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_sd
 FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return _mm_move_sd(a, _mm_cmplt_pd(a, b));
 #else
     double a0, b0;
@@ -3522,7 +3509,7 @@ FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_pd
 FORCE_INLINE __m128d _mm_cmpneq_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_s32(vmvnq_s32(vreinterpretq_s32_u64(
         vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)))));
 #else
@@ -3548,7 +3535,7 @@ FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_pd
 FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_u64(veorq_u64(
         vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
         vdupq_n_u64(UINT64_MAX)));
@@ -3583,7 +3570,7 @@ FORCE_INLINE __m128d _mm_cmpnge_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_cmpngt_pd
 FORCE_INLINE __m128d _mm_cmpngt_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_u64(veorq_u64(
         vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
         vdupq_n_u64(UINT64_MAX)));
@@ -3618,7 +3605,7 @@ FORCE_INLINE __m128d _mm_cmpngt_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_pd
 FORCE_INLINE __m128d _mm_cmpnle_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_u64(veorq_u64(
         vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
         vdupq_n_u64(UINT64_MAX)));
@@ -3653,7 +3640,7 @@ FORCE_INLINE __m128d _mm_cmpnle_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_pd
 FORCE_INLINE __m128d _mm_cmpnlt_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_u64(veorq_u64(
         vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
         vdupq_n_u64(UINT64_MAX)));
@@ -3688,7 +3675,7 @@ FORCE_INLINE __m128d _mm_cmpnlt_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_pd
 FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     // Excluding NaNs, any two floating point numbers can be compared.
     uint64x2_t not_nan_a =
         vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
@@ -3718,7 +3705,7 @@ FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_sd
 FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return _mm_move_sd(a, _mm_cmpord_pd(a, b));
 #else
     double a0, b0;
@@ -3738,7 +3725,7 @@ FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_pd
 FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     // Two NaNs are not equal in comparison operation.
     uint64x2_t not_nan_a =
         vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
@@ -3769,7 +3756,7 @@ FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_sd
 FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return _mm_move_sd(a, _mm_cmpunord_pd(a, b));
 #else
     double a0, b0;
@@ -3789,7 +3776,7 @@ FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_sd
 FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vgetq_lane_u64(vcgeq_f64(a, b), 0) & 0x1;
 #else
     double a0, b0;
@@ -3804,7 +3791,7 @@ FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_sd
 FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vgetq_lane_u64(vcgtq_f64(a, b), 0) & 0x1;
 #else
     double a0, b0;
@@ -3820,7 +3807,7 @@ FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_sd
 FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vgetq_lane_u64(vcleq_f64(a, b), 0) & 0x1;
 #else
     double a0, b0;
@@ -3836,7 +3823,7 @@ FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_sd
 FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vgetq_lane_u64(vcltq_f64(a, b), 0) & 0x1;
 #else
     double a0, b0;
@@ -3852,7 +3839,7 @@ FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_sd
 FORCE_INLINE int _mm_comieq_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vgetq_lane_u64(vceqq_f64(a, b), 0) & 0x1;
 #else
     uint32x4_t a_not_nan =
@@ -3881,7 +3868,7 @@ FORCE_INLINE int _mm_comineq_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_pd
 FORCE_INLINE __m128d _mm_cvtepi32_pd(__m128i a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(
         vcvtq_f64_s64(vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a)))));
 #else
@@ -3942,7 +3929,7 @@ FORCE_INLINE __m64 _mm_cvtpd_pi32(__m128d a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_ps
 FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a));
     return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0)));
 #else
@@ -3958,7 +3945,7 @@ FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_pd
 FORCE_INLINE __m128d _mm_cvtpi32_pd(__m64 a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(
         vcvtq_f64_s64(vmovl_s32(vreinterpret_s32_m64(a))));
 #else
@@ -3977,7 +3964,7 @@ FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)
 {
 #if defined(__ARM_FEATURE_FRINT)
     return vreinterpretq_m128i_s32(vcvtq_s32_f32(vrnd32xq_f32(a)));
-#elif (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) || \
+#elif (defined(__aarch64__) || defined(_M_ARM64)) || \
     defined(__ARM_FEATURE_DIRECTED_ROUNDING)
     switch (_MM_GET_ROUNDING_MODE()) {
     case _MM_ROUND_NEAREST:
@@ -4031,7 +4018,7 @@ FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pd
 FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(
         vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a))));
 #else
@@ -4045,7 +4032,7 @@ FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_f64
 FORCE_INLINE double _mm_cvtsd_f64(__m128d a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0);
 #else
     double _a =
@@ -4059,7 +4046,7 @@ FORCE_INLINE double _mm_cvtsd_f64(__m128d a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si32
 FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return (int32_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
 #else
     __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
@@ -4074,7 +4061,7 @@ FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64
 FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return (int64_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
 #else
     __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
@@ -4096,7 +4083,7 @@ FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_ss
 FORCE_INLINE __m128 _mm_cvtsd_ss(__m128 a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128_f32(vsetq_lane_f32(
         vget_lane_f32(vcvt_f32_f64(vreinterpretq_f64_m128d(b)), 0),
         vreinterpretq_f32_m128(a), 0));
@@ -4132,7 +4119,7 @@ FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_sd
 FORCE_INLINE __m128d _mm_cvtsi32_sd(__m128d a, int32_t b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(
         vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
 #else
@@ -4160,7 +4147,7 @@ FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_sd
 FORCE_INLINE __m128d _mm_cvtsi64_sd(__m128d a, int64_t b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(
         vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
 #else
@@ -4197,7 +4184,7 @@ FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)
 FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b)
 {
     double d = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(
         vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0));
 #else
@@ -4252,7 +4239,7 @@ FORCE_INLINE int32_t _mm_cvttsd_si32(__m128d a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64
 FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0);
 #else
     double _a =
@@ -4271,7 +4258,7 @@ FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_pd
 FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(
         vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
@@ -4297,7 +4284,7 @@ FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_sd
 FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     float64x2_t tmp =
         vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b));
     return vreinterpretq_m128d_f64(
@@ -4329,7 +4316,7 @@ FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd
 FORCE_INLINE __m128d _mm_load_pd(const double *p)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(vld1q_f64(p));
 #else
     const float *fp = (const float *) p;
@@ -4349,7 +4336,7 @@ FORCE_INLINE __m128d _mm_load_pd(const double *p)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_sd
 FORCE_INLINE __m128d _mm_load_sd(const double *p)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0));
 #else
     const float *fp = (const float *) p;
@@ -4371,7 +4358,7 @@ FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_pd
 FORCE_INLINE __m128d _mm_load1_pd(const double *p)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(vld1q_dup_f64(p));
 #else
     return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *) p));
@@ -4384,7 +4371,7 @@ FORCE_INLINE __m128d _mm_load1_pd(const double *p)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pd
 FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(
         vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p)));
 #else
@@ -4410,7 +4397,7 @@ FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pd
 FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(
         vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a))));
 #else
@@ -4426,7 +4413,7 @@ FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_pd
 FORCE_INLINE __m128d _mm_loadr_pd(const double *p)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     float64x2_t v = vld1q_f64(p);
     return vreinterpretq_m128d_f64(vextq_f64(v, v, 1));
 #else
@@ -4466,7 +4453,7 @@ FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
 {
     int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
                               vget_low_s16(vreinterpretq_s16_m128i(b)));
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     int32x4_t high =
         vmull_high_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b));
 
@@ -4520,7 +4507,7 @@ FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pd
 FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
 #if SSE2NEON_PRECISE_MINMAX
     float64x2_t _a = vreinterpretq_f64_m128d(a);
     float64x2_t _b = vreinterpretq_f64_m128d(b);
@@ -4552,7 +4539,7 @@ FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_sd
 FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return _mm_move_sd(a, _mm_max_pd(a, b));
 #else
     double a0, a1, b0;
@@ -4587,7 +4574,7 @@ FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pd
 FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
 #if SSE2NEON_PRECISE_MINMAX
     float64x2_t _a = vreinterpretq_f64_m128d(a);
     float64x2_t _b = vreinterpretq_f64_m128d(b);
@@ -4618,7 +4605,7 @@ FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_sd
 FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return _mm_move_sd(a, _mm_min_pd(a, b));
 #else
     double a0, a1, b0;
@@ -4776,7 +4763,7 @@ FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_pd
 FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(
         vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
@@ -4843,7 +4830,7 @@ FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b)
     uint16x4_t a3210 = vget_low_u16(vreinterpretq_u16_m128i(a));
     uint16x4_t b3210 = vget_low_u16(vreinterpretq_u16_m128i(b));
     uint32x4_t ab3210 = vmull_u16(a3210, b3210);
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     uint32x4_t ab7654 =
         vmull_high_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b));
     uint16x8_t r = vuzp2q_u16(vreinterpretq_u16_u32(ab3210),
@@ -5013,7 +5000,7 @@ FORCE_INLINE __m128i _mm_set_epi8(signed char b15,
 FORCE_INLINE __m128d _mm_set_pd(double e1, double e0)
 {
     double ALIGN_STRUCT(16) data[2] = {e0, e1};
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data));
 #else
     return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) data));
@@ -5030,7 +5017,7 @@ FORCE_INLINE __m128d _mm_set_pd(double e1, double e0)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_sd
 FORCE_INLINE __m128d _mm_set_sd(double a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(vsetq_lane_f64(a, vdupq_n_f64(0), 0));
 #else
     return _mm_set_pd(0, a);
@@ -5077,7 +5064,7 @@ FORCE_INLINE __m128i _mm_set1_epi8(signed char w)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_pd
 FORCE_INLINE __m128d _mm_set1_pd(double d)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(vdupq_n_f64(d));
 #else
     int64_t _d = sse2neon_recast_f64_s64(d);
@@ -5154,7 +5141,7 @@ FORCE_INLINE __m128d _mm_setr_pd(double e1, double e0)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_pd
 FORCE_INLINE __m128d _mm_setzero_pd(void)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(vdupq_n_f64(0));
 #else
     return vreinterpretq_m128d_f32(vdupq_n_f32(0));
@@ -5241,12 +5228,12 @@ FORCE_INLINE __m128i _mm_setzero_si128(void)
 #define _mm_shuffle_pd(a, b, imm8)                                            \
     vreinterpretq_m128d_s64(                                                  \
         vshuffleq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b), \
-                      (imm8) & 0x1, (((imm8) & 0x2) >> 1) + 2))
+                      (imm8) & 0x1, (((imm8) & 0x2) >> 1) + 2))
 #else
-#define _mm_shuffle_pd(a, b, imm8)                                       \
-    _mm_castsi128_pd(_mm_set_epi64x(                                     \
-        vgetq_lane_s64(vreinterpretq_s64_m128d(b), ((imm8) & 0x2) >> 1), \
-        vgetq_lane_s64(vreinterpretq_s64_m128d(a), (imm8) & 0x1)))
+#define _mm_shuffle_pd(a, b, imm8)                                       \
+    _mm_castsi128_pd(_mm_set_epi64x(                                     \
+        vgetq_lane_s64(vreinterpretq_s64_m128d(b), ((imm8) & 0x2) >> 1), \
+        vgetq_lane_s64(vreinterpretq_s64_m128d(a), (imm8) & 0x1)))
 #endif
 
 // FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a,
@@ -5327,7 +5314,7 @@ FORCE_INLINE __m128i _mm_slli_epi16(__m128i a, int imm)
     if (_sse2neon_unlikely(imm & ~15))
         return _mm_setzero_si128();
     return vreinterpretq_m128i_s16(
-        vshlq_s16(vreinterpretq_s16_m128i(a), vdupq_n_s16((int16_t) imm)));
+        vshlq_s16(vreinterpretq_s16_m128i(a), vdupq_n_s16(imm)));
 }
 
 // Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and
@@ -5355,13 +5342,13 @@ FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm)
 // Shift a left by imm8 bytes while shifting in zeros, and store the results in
 // dst.
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_si128
-#define _mm_slli_si128(a, imm)                                                \
-    _sse2neon_define1(                                                        \
-        __m128i, a, int8x16_t ret;                                            \
-        if (_sse2neon_unlikely((imm) == 0)) ret = vreinterpretq_s8_m128i(_a); \
-        else if (_sse2neon_unlikely((imm) & ~15)) ret = vdupq_n_s8(0);        \
-        else ret = vextq_s8(vdupq_n_s8(0), vreinterpretq_s8_m128i(_a),        \
-                            (((imm) <= 0 || (imm) > 15) ? 0 : (16 - (imm)))); \
+#define _mm_slli_si128(a, imm)                                                \
+    _sse2neon_define1(                                                        \
+        __m128i, a, int8x16_t ret;                                            \
+        if (_sse2neon_unlikely((imm) == 0)) ret = vreinterpretq_s8_m128i(_a); \
+        else if (_sse2neon_unlikely((imm) & ~15)) ret = vdupq_n_s8(0);        \
+        else ret = vextq_s8(vdupq_n_s8(0), vreinterpretq_s8_m128i(_a),        \
+                            (((imm) <= 0 || (imm) > 15) ? 0 : (16 - (imm)))); \
         _sse2neon_return(vreinterpretq_m128i_s8(ret));)
 
 // Compute the square root of packed double-precision (64-bit) floating-point
@@ -5369,7 +5356,7 @@ FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_pd
 FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(vsqrtq_f64(vreinterpretq_f64_m128d(a)));
 #else
     double a0, a1;
@@ -5387,7 +5374,7 @@ FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_sd
 FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return _mm_move_sd(a, _mm_sqrt_pd(b));
 #else
     double _a, _b;
@@ -5406,7 +5393,7 @@ FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count)
     if (_sse2neon_unlikely(c & ~15))
         return _mm_cmplt_epi16(a, _mm_setzero_si128());
     return vreinterpretq_m128i_s16(
-        vshlq_s16((int16x8_t) a, vdupq_n_s16((int16_t) -c)));
+        vshlq_s16((int16x8_t) a, vdupq_n_s16((int) -c)));
 }
 
 // Shift packed 32-bit integers in a right by count while shifting in sign bits,
@@ -5426,7 +5413,7 @@ FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi16
 FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm)
 {
-    const int16_t count = (imm & ~15) ? 15 : (int16_t) imm;
+    const int count = (imm & ~15) ? 15 : imm;
     return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count));
 }
 
@@ -5488,13 +5475,13 @@ FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
 // Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and
 // store the results in dst.
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi16
-#define _mm_srli_epi16(a, imm)                                                 \
-    _sse2neon_define0(                                                         \
-        __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) & ~15)) {        \
-            ret = _mm_setzero_si128();                                         \
-        } else {                                                               \
-            ret = vreinterpretq_m128i_u16(vshlq_u16(                           \
-                vreinterpretq_u16_m128i(_a), vdupq_n_s16((int16_t) - (imm)))); \
+#define _mm_srli_epi16(a, imm)                                                \
+    _sse2neon_define0(                                                        \
+        __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) & ~15)) {       \
+            ret = _mm_setzero_si128();                                        \
+        } else {                                                              \
+            ret = vreinterpretq_m128i_u16(                                    \
+                vshlq_u16(vreinterpretq_u16_m128i(_a), vdupq_n_s16(-(imm)))); \
         } _sse2neon_return(ret);)
 
 // Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and
@@ -5530,7 +5517,7 @@ FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
         __m128i, a, int8x16_t ret;                                     \
         if (_sse2neon_unlikely((imm) & ~15)) ret = vdupq_n_s8(0);      \
         else ret = vextq_s8(vreinterpretq_s8_m128i(_a), vdupq_n_s8(0), \
-                            ((imm) > 15 ? 0 : (imm)));                 \
+                            (imm > 15 ? 0 : imm));                     \
         _sse2neon_return(vreinterpretq_m128i_s8(ret));)
 
 // Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
@@ -5539,7 +5526,7 @@ FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd
 FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a));
 #else
     vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(a));
@@ -5552,7 +5539,7 @@ FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd1
 FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     float64x1_t a_low = vget_low_f64(vreinterpretq_f64_m128d(a));
     vst1q_f64((float64_t *) mem_addr,
               vreinterpretq_f64_m128d(vcombine_f64(a_low, a_low)));
@@ -5568,7 +5555,7 @@ FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_store_sd
 FORCE_INLINE void _mm_store_sd(double *mem_addr, __m128d a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
 #else
     vst1_u64((uint64_t *) mem_addr, vget_low_u64(vreinterpretq_u64_m128d(a)));
@@ -5594,7 +5581,7 @@ FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pd
 FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     vst1_f64((float64_t *) mem_addr, vget_high_f64(vreinterpretq_f64_m128d(a)));
 #else
     vst1_f32((float32_t *) mem_addr, vget_high_f32(vreinterpretq_f32_m128d(a)));
@@ -5613,7 +5600,7 @@ FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pd
 FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
 #else
     vst1_f32((float32_t *) mem_addr, vget_low_f32(vreinterpretq_f32_m128d(a)));
@@ -5664,7 +5651,7 @@ FORCE_INLINE void _mm_stream_pd(double *p, __m128d a)
 {
 #if __has_builtin(__builtin_nontemporal_store)
     __builtin_nontemporal_store(a, (__m128d *) p);
-#elif defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#elif defined(__aarch64__) || defined(_M_ARM64)
     vst1q_f64(p, vreinterpretq_f64_m128d(a));
 #else
     vst1q_s64((int64_t *) p, vreinterpretq_s64_m128d(a));
@@ -5744,7 +5731,7 @@ FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b)
 //  https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_sub_pd
 FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(
         vsubq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
@@ -5847,7 +5834,7 @@ FORCE_INLINE __m128d _mm_undefined_pd(void)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi16
 FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128i_s16(
         vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
 #else
@@ -5863,7 +5850,7 @@ FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi32
 FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128i_s32(
         vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
 #else
@@ -5879,7 +5866,7 @@ FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi64
 FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128i_s64(
         vzip2q_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
 #else
@@ -5894,7 +5881,7 @@ FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi8
 FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128i_s8(
         vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
 #else
@@ -5912,7 +5899,7 @@ FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_pd
 FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(
         vzip2q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
@@ -5927,7 +5914,7 @@ FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi16
 FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128i_s16(
         vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
 #else
@@ -5943,7 +5930,7 @@ FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi32
 FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128i_s32(
         vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
 #else
@@ -5959,7 +5946,7 @@ FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi64
 FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128i_s64(
         vzip1q_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
 #else
@@ -5974,7 +5961,7 @@ FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi8
 FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128i_s8(
         vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
 #else
@@ -5990,7 +5977,7 @@ FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_pd
 FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(
         vzip1q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
@@ -6027,7 +6014,7 @@ FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
 FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b)
 {
     _sse2neon_const __m128d mask = _mm_set_pd(1.0f, -1.0f);
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(vfmaq_f64(vreinterpretq_f64_m128d(a),
                                              vreinterpretq_f64_m128d(b),
                                              vreinterpretq_f64_m128d(mask)));
@@ -6043,7 +6030,7 @@ FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b)
 FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b)
 {
     _sse2neon_const __m128 mask = _mm_setr_ps(-1.0f, 1.0f, -1.0f, 1.0f);
-#if (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) || \
+#if (defined(__aarch64__) || defined(_M_ARM64)) || \
     defined(__ARM_FEATURE_FMA) /* VFPv4+ */
     return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(a),
                                             vreinterpretq_f32_m128(mask),
@@ -6058,7 +6045,7 @@ FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pd
 FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(
         vpaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
@@ -6080,7 +6067,7 @@ FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_ps
 FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128_f32(
         vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
 #else
@@ -6098,7 +6085,7 @@ FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pd
 FORCE_INLINE __m128d _mm_hsub_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     float64x2_t _a = vreinterpretq_f64_m128d(a);
     float64x2_t _b = vreinterpretq_f64_m128d(b);
     return vreinterpretq_m128d_f64(
@@ -6124,7 +6111,7 @@ FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)
 {
     float32x4_t a = vreinterpretq_f32_m128(_a);
     float32x4_t b = vreinterpretq_f32_m128(_b);
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128_f32(
         vsubq_f32(vuzp1q_f32(a, b), vuzp2q_f32(a, b)));
 #else
@@ -6149,7 +6136,7 @@ FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movedup_pd
 FORCE_INLINE __m128d _mm_movedup_pd(__m128d a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(
         vdupq_laneq_f64(vreinterpretq_f64_m128d(a), 0));
 #else
@@ -6163,7 +6150,7 @@ FORCE_INLINE __m128d _mm_movedup_pd(__m128d a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehdup_ps
 FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128_f32(
         vtrn2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)));
 #elif defined(_sse2neon_shuffle)
@@ -6182,7 +6169,7 @@ FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_moveldup_ps
 FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128_f32(
         vtrn1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)));
 #elif defined(_sse2neon_shuffle)
@@ -6250,32 +6237,32 @@ FORCE_INLINE __m64 _mm_abs_pi8(__m64 a)
 // the result right by imm8 bytes, and store the low 16 bytes in dst.
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_epi8
 #if defined(__GNUC__) && !defined(__clang__)
-#define _mm_alignr_epi8(a, b, imm)                                 \
-    __extension__({                                                \
-        uint8x16_t _a = vreinterpretq_u8_m128i(a);                 \
-        uint8x16_t _b = vreinterpretq_u8_m128i(b);                 \
-        __m128i ret;                                               \
-        if (_sse2neon_unlikely((imm) & ~31))                       \
-            ret = vreinterpretq_m128i_u8(vdupq_n_u8(0));           \
-        else if ((imm) >= 16)                                      \
-            ret = _mm_srli_si128(a, (imm) >= 16 ? (imm) - 16 : 0); \
-        else                                                       \
-            ret = vreinterpretq_m128i_u8(                          \
-                vextq_u8(_b, _a, (imm) < 16 ? (imm) : 0));         \
-        ret;                                                       \
+#define _mm_alignr_epi8(a, b, imm) /* imm may be an expression */  \
+    __extension__({                                                \
+        uint8x16_t _a = vreinterpretq_u8_m128i(a);                 \
+        uint8x16_t _b = vreinterpretq_u8_m128i(b);                 \
+        __m128i ret;                                               \
+        if (_sse2neon_unlikely((imm) & ~31))                       \
+            ret = vreinterpretq_m128i_u8(vdupq_n_u8(0));           \
+        else if ((imm) >= 16)                                      \
+            ret = _mm_srli_si128(a, (imm) >= 16 ? (imm) - 16 : 0); \
+        else                                                       \
+            ret = vreinterpretq_m128i_u8(                          \
+                vextq_u8(_b, _a, (imm) < 16 ? (imm) : 0));         \
+        ret;                                                       \
     })
 
 #else
-#define _mm_alignr_epi8(a, b, imm)                                  \
-    _sse2neon_define2(                                              \
-        __m128i, a, b, uint8x16_t __a = vreinterpretq_u8_m128i(_a); \
-        uint8x16_t __b = vreinterpretq_u8_m128i(_b); __m128i ret;   \
-        if (_sse2neon_unlikely((imm) & ~31)) ret =                  \
-            vreinterpretq_m128i_u8(vdupq_n_u8(0));                  \
-        else if ((imm) >= 16) ret =                                 \
-            _mm_srli_si128(_a, (imm) >= 16 ? (imm) - 16 : 0);       \
-        else ret = vreinterpretq_m128i_u8(                          \
-            vextq_u8(__b, __a, (imm) < 16 ? (imm) : 0));            \
+#define _mm_alignr_epi8(a, b, imm) /* imm may be an expression */   \
+    _sse2neon_define2(                                              \
+        __m128i, a, b, uint8x16_t __a = vreinterpretq_u8_m128i(_a); \
+        uint8x16_t __b = vreinterpretq_u8_m128i(_b); __m128i ret;   \
+        if (_sse2neon_unlikely((imm) & ~31)) ret =                  \
+            vreinterpretq_m128i_u8(vdupq_n_u8(0));                  \
+        else if ((imm) >= 16) ret =                                 \
+            _mm_srli_si128(_a, (imm) >= 16 ? (imm) - 16 : 0);       \
+        else ret = vreinterpretq_m128i_u8(                          \
+            vextq_u8(__b, __a, (imm) < 16 ? (imm) : 0));            \
         _sse2neon_return(ret);)
 
 #endif
@@ -6310,7 +6297,7 @@ FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b)
 {
     int16x8_t a = vreinterpretq_s16_m128i(_a);
     int16x8_t b = vreinterpretq_s16_m128i(_b);
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128i_s16(vpaddq_s16(a, b));
 #else
     return vreinterpretq_m128i_s16(
@@ -6326,7 +6313,7 @@ FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b)
 {
     int32x4_t a = vreinterpretq_s32_m128i(_a);
     int32x4_t b = vreinterpretq_s32_m128i(_b);
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128i_s32(vpaddq_s32(a, b));
 #else
     return vreinterpretq_m128i_s32(
@@ -6358,7 +6345,7 @@ FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_epi16
 FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     int16x8_t a = vreinterpretq_s16_m128i(_a);
     int16x8_t b = vreinterpretq_s16_m128i(_b);
     return vreinterpretq_s64_s16(
@@ -6383,7 +6370,7 @@ FORCE_INLINE __m64 _mm_hadds_pi16(__m64 _a, __m64 _b)
 {
     int16x4_t a = vreinterpret_s16_m64(_a);
     int16x4_t b = vreinterpret_s16_m64(_b);
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpret_s64_s16(vqadd_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
 #else
     int16x4x2_t res = vuzp_s16(a, b);
@@ -6398,7 +6385,7 @@ FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)
 {
     int16x8_t a = vreinterpretq_s16_m128i(_a);
     int16x8_t b = vreinterpretq_s16_m128i(_b);
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128i_s16(
         vsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
 #else
@@ -6414,7 +6401,7 @@ FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b)
 {
     int32x4_t a = vreinterpretq_s32_m128i(_a);
     int32x4_t b = vreinterpretq_s32_m128i(_b);
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128i_s32(
         vsubq_s32(vuzp1q_s32(a, b), vuzp2q_s32(a, b)));
 #else
@@ -6430,7 +6417,7 @@ FORCE_INLINE __m64 _mm_hsub_pi16(__m64 _a, __m64 _b)
 {
     int16x4_t a = vreinterpret_s16_m64(_a);
     int16x4_t b = vreinterpret_s16_m64(_b);
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpret_m64_s16(vsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
 #else
     int16x4x2_t c = vuzp_s16(a, b);
@@ -6445,7 +6432,7 @@ FORCE_INLINE __m64 _mm_hsub_pi32(__m64 _a, __m64 _b)
 {
     int32x2_t a = vreinterpret_s32_m64(_a);
     int32x2_t b = vreinterpret_s32_m64(_b);
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpret_m64_s32(vsub_s32(vuzp1_s32(a, b), vuzp2_s32(a, b)));
 #else
     int32x2x2_t c = vuzp_s32(a, b);
@@ -6460,7 +6447,7 @@ FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b)
 {
     int16x8_t a = vreinterpretq_s16_m128i(_a);
     int16x8_t b = vreinterpretq_s16_m128i(_b);
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128i_s16(
         vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
 #else
@@ -6476,7 +6463,7 @@ FORCE_INLINE __m64 _mm_hsubs_pi16(__m64 _a, __m64 _b)
 {
     int16x4_t a = vreinterpret_s16_m64(_a);
     int16x4_t b = vreinterpret_s16_m64(_b);
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpret_m64_s16(vqsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
 #else
     int16x4x2_t c = vuzp_s16(a, b);
@@ -6491,7 +6478,7 @@ FORCE_INLINE __m64 _mm_hsubs_pi16(__m64 _a, __m64 _b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_epi16
 FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     uint8x16_t a = vreinterpretq_u8_m128i(_a);
     int8x16_t b = vreinterpretq_s8_m128i(_b);
     int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
@@ -6595,7 +6582,7 @@ FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
     uint8x16_t idx = vreinterpretq_u8_m128i(b);  // input b
     uint8x16_t idx_masked =
         vandq_u8(idx, vdupq_n_u8(0x8F));  // avoid using meaningless bits
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked));
 #elif defined(__GNUC__)
     int8x16_t ret;
@@ -6641,7 +6628,7 @@ FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)
     // (b < 0) ? 0xFFFF : 0
     uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15));
     // (b == 0) ? 0xFFFF : 0
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b));
 #else
     int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0)));
@@ -6670,7 +6657,7 @@ FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b)
     uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31));
 
     // (b == 0) ? 0xFFFFFFFF : 0
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b));
 #else
     int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0)));
@@ -6699,7 +6686,7 @@ FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b)
     uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7));
 
     // (b == 0) ? 0xFF : 0
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b));
 #else
     int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0)));
@@ -6728,7 +6715,7 @@ FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b)
     uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15));
 
     // (b == 0) ? 0xFFFF : 0
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b));
 #else
     int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0)));
@@ -6757,7 +6744,7 @@ FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b)
     uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31));
 
     // (b == 0) ? 0xFFFFFFFF : 0
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b));
 #else
     int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0)));
@@ -6786,7 +6773,7 @@ FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
     uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7));
 
     // (b == 0) ? 0xFF : 0
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b));
 #else
     int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0)));
@@ -6844,9 +6831,11 @@ FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_ps
 FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8)
 {
-    const uint32_t ALIGN_STRUCT(16) data[4] = {
-        (imm8 & (1 << 0)) ? UINT32_MAX : 0, (imm8 & (1 << 1)) ? UINT32_MAX : 0,
-        (imm8 & (1 << 2)) ? UINT32_MAX : 0, (imm8 & (1 << 3)) ? UINT32_MAX : 0};
+    const uint32_t
+        ALIGN_STRUCT(16) data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0,
+                                    ((imm8) & (1 << 1)) ? UINT32_MAX : 0,
+                                    ((imm8) & (1 << 2)) ? UINT32_MAX : 0,
+                                    ((imm8) & (1 << 3)) ? UINT32_MAX : 0};
     uint32x4_t mask = vld1q_u32(data);
     float32x4_t a = vreinterpretq_f32_m128(_a);
     float32x4_t b = vreinterpretq_f32_m128(_b);
@@ -6873,7 +6862,7 @@ FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask)
 {
     uint64x2_t mask =
         vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_m128d(_mask), 63));
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     float64x2_t a = vreinterpretq_f64_m128d(_a);
     float64x2_t b = vreinterpretq_f64_m128d(_b);
     return vreinterpretq_m128d_f64(vbslq_f64(mask, b, a));
@@ -6903,7 +6892,7 @@ FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_pd
 FORCE_INLINE __m128d _mm_ceil_pd(__m128d a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(vrndpq_f64(vreinterpretq_f64_m128d(a)));
 #else
     double a0, a1;
@@ -6919,7 +6908,7 @@ FORCE_INLINE __m128d _mm_ceil_pd(__m128d a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ps
 FORCE_INLINE __m128 _mm_ceil_ps(__m128 a)
 {
-#if (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) || \
+#if (defined(__aarch64__) || defined(_M_ARM64)) || \
     defined(__ARM_FEATURE_DIRECTED_ROUNDING)
     return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a)));
 #else
@@ -6952,7 +6941,7 @@ FORCE_INLINE __m128 _mm_ceil_ss(__m128 a, __m128 b)
 // in dst
 FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128i_u64(
         vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b)));
 #else
@@ -7109,7 +7098,7 @@ FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm)
         _mm_castsi128_pd(_mm_set_epi64x(bit5Mask, bit4Mask));
     __m128d tmp = _mm_and_pd(mul, mulMask);
 #else
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     double d0 = (imm & 0x10) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0) *
                                    vgetq_lane_f64(vreinterpretq_f64_m128d(b), 0)
                              : 0;
@@ -7131,7 +7120,7 @@ FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm)
     __m128d tmp = _mm_set_pd(d1, d0);
 #endif
     // Sum the products
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     double sum = vpaddd_f64(vreinterpretq_f64_m128d(tmp));
 #else
     double _tmp0 = sse2neon_recast_u64_f64(
@@ -7155,7 +7144,7 @@ FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
 {
     float32x4_t elementwise_prod = _mm_mul_ps(a, b);
 
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     /* shortcuts */
     if (imm == 0xFF) {
         return _mm_set1_ps(vaddvq_f32(elementwise_prod));
@@ -7225,7 +7214,7 @@ FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_pd
 FORCE_INLINE __m128d _mm_floor_pd(__m128d a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(vrndmq_f64(vreinterpretq_f64_m128d(a)));
 #else
     double a0, a1;
@@ -7241,7 +7230,7 @@ FORCE_INLINE __m128d _mm_floor_pd(__m128d a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ps
 FORCE_INLINE __m128 _mm_floor_ps(__m128 a)
 {
-#if (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) || \
+#if (defined(__aarch64__) || defined(_M_ARM64)) || \
     defined(__ARM_FEATURE_DIRECTED_ROUNDING)
     return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a)));
 #else
@@ -7300,24 +7289,24 @@ FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b)
 // element from b into tmp using the control in imm8. Store tmp to dst using
 // the mask in imm8 (elements are zeroed out when the corresponding bit is set).
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=insert_ps
-#define _mm_insert_ps(a, b, imm8)                                              \
-    _sse2neon_define2(                                                         \
-        __m128, a, b,                                                          \
-        float32x4_t tmp1 =                                                     \
-            vsetq_lane_f32(vgetq_lane_f32(_b, ((imm8) >> 6) & 0x3),            \
-                           vreinterpretq_f32_m128(_a), 0);                     \
-        float32x4_t tmp2 =                                                     \
-            vsetq_lane_f32(vgetq_lane_f32(tmp1, 0),                            \
-                           vreinterpretq_f32_m128(_a), (((imm8) >> 4) & 0x3)); \
-        const uint32_t data[4] =                                               \
-            _sse2neon_init(((imm8) & (1 << 0)) ? UINT32_MAX : 0,               \
-                           ((imm8) & (1 << 1)) ? UINT32_MAX : 0,               \
-                           ((imm8) & (1 << 2)) ? UINT32_MAX : 0,               \
-                           ((imm8) & (1 << 3)) ? UINT32_MAX : 0);              \
-        uint32x4_t mask = vld1q_u32(data);                                     \
-        float32x4_t all_zeros = vdupq_n_f32(0);                                \
-                                                                               \
-        _sse2neon_return(vreinterpretq_m128_f32(                               \
+#define _mm_insert_ps(a, b, imm8) /* imm8 may be an expression */              \
+    _sse2neon_define2(                                                         \
+        __m128, a, b,                                                          \
+        float32x4_t tmp1 =                                                     \
+            vsetq_lane_f32(vgetq_lane_f32(_b, ((imm8) >> 6) & 0x3),            \
+                           vreinterpretq_f32_m128(_a), 0);                     \
+        float32x4_t tmp2 =                                                     \
+            vsetq_lane_f32(vgetq_lane_f32(tmp1, 0),                            \
+                           vreinterpretq_f32_m128(_a), (((imm8) >> 4) & 0x3)); \
+        const uint32_t data[4] =                                               \
+            _sse2neon_init(((imm8) & (1 << 0)) ? UINT32_MAX : 0,               \
+                           ((imm8) & (1 << 1)) ? UINT32_MAX : 0,               \
+                           ((imm8) & (1 << 2)) ? UINT32_MAX : 0,               \
+                           ((imm8) & (1 << 3)) ? UINT32_MAX : 0);              \
+        uint32x4_t mask = vld1q_u32(data);                                     \
+        float32x4_t all_zeros = vdupq_n_f32(0);                                \
+                                                                               \
+        _sse2neon_return(vreinterpretq_m128_f32(                               \
             vbslq_f32(mask, all_zeros, vreinterpretq_f32_m128(tmp2))));)
 
 // Compare packed signed 32-bit integers in a and b, and store packed maximum
@@ -7399,7 +7388,7 @@ FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a)
 {
     __m128i dst;
     uint16_t min, idx = 0;
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     // Find the minimum value
     min = vminvq_u16(vreinterpretq_u16_m128i(a));
 
@@ -7502,7 +7491,7 @@ FORCE_INLINE __m128i _mm_mpsadbw_epu8(__m128i a, __m128i b, const int imm)
     c26 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_2), low_b));
     uint8x16_t _a_3 = vextq_u8(_a, _a, 3);
     c37 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_3), low_b));
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     // |0|4|2|6|
     c04 = vpaddq_s16(c04, c26);
     // |1|5|3|7|
@@ -7562,7 +7551,7 @@ FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_pd
 FORCE_INLINE __m128d _mm_round_pd(__m128d a, int rounding)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     switch (rounding) {
     case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
         return vreinterpretq_m128d_f64(vrndnq_f64(vreinterpretq_f64_m128d(a)));
@@ -7631,7 +7620,7 @@ FORCE_INLINE __m128d _mm_round_pd(__m128d a, int rounding)
 // software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps
 FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding)
 {
-#if (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) || \
+#if (defined(__aarch64__) || defined(_M_ARM64)) || \
     defined(__ARM_FEATURE_DIRECTED_ROUNDING)
     switch (rounding) {
     case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
@@ -7778,9 +7767,9 @@ FORCE_INLINE int _mm_test_mix_ones_zeros(__m128i a, __m128i mask)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_si128
 FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b)
 {
-    int64x2_t s64_vec =
+    int64x2_t s64 =
         vbicq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a));
-    return !(vgetq_lane_s64(s64_vec, 0) | vgetq_lane_s64(s64_vec, 1));
+    return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
 }
 
 // Compute the bitwise AND of 128 bits (representing integer data) in a and b,
@@ -7798,9 +7787,9 @@ FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_si128
 FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b)
 {
-    int64x2_t s64_vec =
+    int64x2_t s64 =
         vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b));
-    return !(vgetq_lane_s64(s64_vec, 0) | vgetq_lane_s64(s64_vec, 1));
+    return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
 }
 
 /* SSE4.2 */
@@ -7968,40 +7957,40 @@ static const uint8_t ALIGN_STRUCT(16) _sse2neon_cmpestr_mask8b[16] = {
                                       SSE2NEON_CAT(u, size)))                \
     } while (0)
 
-#define SSE2NEON_CMP_EQUAL_ANY_IMPL(type)                               \
-    static uint16_t _sse2neon_cmp_##type##_equal_any(__m128i a, int la, \
-                                                     __m128i b, int lb) \
-    {                                                                   \
-        __m128i mtx[16];                                                \
-        PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),    \
-                   SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type));      \
-        return SSE2NEON_CAT(                                            \
-            _sse2neon_aggregate_equal_any_,                             \
-            SSE2NEON_CAT(                                               \
-                SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),                  \
-                SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, \
-                                             type))))(la, lb, mtx);     \
+#define SSE2NEON_CMP_EQUAL_ANY_IMPL(type)                                     \
+    static int _sse2neon_cmp_##type##_equal_any(__m128i a, int la, __m128i b, \
+                                                int lb)                       \
+    {                                                                         \
+        __m128i mtx[16];                                                      \
+        PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),          \
+                   SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type));            \
+        return SSE2NEON_CAT(                                                  \
+            _sse2neon_aggregate_equal_any_,                                   \
+            SSE2NEON_CAT(                                                     \
+                SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),                        \
+                SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_,       \
+                                             type))))(la, lb, mtx);           \
     }
 
-#define SSE2NEON_CMP_RANGES_IMPL(type, data_type, us, byte_or_word)          \
-    static uint16_t _sse2neon_cmp_##us##type##_ranges(__m128i a, int la,     \
-                                                      __m128i b, int lb)     \
-    {                                                                        \
-        __m128i mtx[16];                                                     \
-        PCMPSTR_RANGES(                                                      \
-            a, b, mtx, data_type, us, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \
-            SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), byte_or_word);    \
-        return SSE2NEON_CAT(                                                 \
-            _sse2neon_aggregate_ranges_,                                     \
-            SSE2NEON_CAT(                                                    \
-                SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),                       \
-                SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_,      \
-                                             type))))(la, lb, mtx);          \
+#define SSE2NEON_CMP_RANGES_IMPL(type, data_type, us, byte_or_word)            \
+    static int _sse2neon_cmp_##us##type##_ranges(__m128i a, int la, __m128i b, \
+                                                 int lb)                       \
+    {                                                                          \
+        __m128i mtx[16];                                                       \
+        PCMPSTR_RANGES(                                                        \
+            a, b, mtx, data_type, us, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),   \
+            SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), byte_or_word);      \
+        return SSE2NEON_CAT(                                                   \
+            _sse2neon_aggregate_ranges_,                                       \
+            SSE2NEON_CAT(                                                      \
+                SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),                         \
+                SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_,        \
+                                             type))))(la, lb, mtx);            \
     }
 
 #define SSE2NEON_CMP_EQUAL_ORDERED_IMPL(type)                                  \
-    static uint16_t _sse2neon_cmp_##type##_equal_ordered(__m128i a, int la,    \
-                                                         __m128i b, int lb)    \
+    static int _sse2neon_cmp_##type##_equal_ordered(__m128i a, int la,         \
+                                                    __m128i b, int lb)         \
     {                                                                          \
         __m128i mtx[16];                                                       \
         PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),           \
@@ -8015,34 +8004,29 @@ static const uint8_t ALIGN_STRUCT(16) _sse2neon_cmpestr_mask8b[16] = {
             SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), la, lb, mtx);       \
     }
 
-static uint16_t _sse2neon_aggregate_equal_any_8x16(int la,
-                                                   int lb,
-                                                   __m128i mtx[16])
+static int _sse2neon_aggregate_equal_any_8x16(int la, int lb, __m128i mtx[16])
 {
-    uint16_t res = 0;
+    int res = 0;
     int m = (1 << la) - 1;
     uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
-    uint8x8_t t_lo = vtst_u8(vdup_n_u8((uint8_t) (m & 0xff)), vec_mask);
-    uint8x8_t t_hi = vtst_u8(vdup_n_u8((uint8_t) (m >> 8)), vec_mask);
+    uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask);
+    uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask);
     uint8x16_t vec = vcombine_u8(t_lo, t_hi);
     for (int j = 0; j < lb; j++) {
         mtx[j] = vreinterpretq_m128i_u8(
             vandq_u8(vec, vreinterpretq_u8_m128i(mtx[j])));
         mtx[j] = vreinterpretq_m128i_u8(
             vshrq_n_u8(vreinterpretq_u8_m128i(mtx[j]), 7));
-        uint16_t tmp =
-            _sse2neon_vaddvq_u8(vreinterpretq_u8_m128i(mtx[j])) ? 1 : 0;
+        int tmp = _sse2neon_vaddvq_u8(vreinterpretq_u8_m128i(mtx[j])) ? 1 : 0;
         res |= (tmp << j);
     }
     return res;
 }
 
-static uint16_t _sse2neon_aggregate_equal_any_16x8(int la,
-                                                   int lb,
-                                                   __m128i mtx[16])
+static int _sse2neon_aggregate_equal_any_16x8(int la, int lb, __m128i mtx[16])
 {
-    uint16_t res = 0;
-    uint16_t m = (uint16_t) (1 << la) - 1;
+    int res = 0;
+    int m = (1 << la) - 1;
     uint16x8_t vec =
         vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b));
     for (int j = 0; j < lb; j++) {
@@ -8050,8 +8034,7 @@ static uint16_t _sse2neon_aggregate_equal_any_16x8(int la,
             vandq_u16(vec, vreinterpretq_u16_m128i(mtx[j])));
         mtx[j] = vreinterpretq_m128i_u16(
             vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 15));
-        uint16_t tmp =
-            _sse2neon_vaddvq_u16(vreinterpretq_u16_m128i(mtx[j])) ? 1 : 0;
+        int tmp = _sse2neon_vaddvq_u16(vreinterpretq_u16_m128i(mtx[j])) ? 1 : 0;
         res |= (tmp << j);
     }
     return res;
@@ -8065,10 +8048,10 @@ static uint16_t _sse2neon_aggregate_equal_any_16x8(int la,
 
 SSE2NEON_GENERATE_CMP_EQUAL_ANY(SSE2NEON_CMP_EQUAL_ANY_)
 
-static uint16_t _sse2neon_aggregate_ranges_16x8(int la, int lb, __m128i mtx[16])
+static int _sse2neon_aggregate_ranges_16x8(int la, int lb, __m128i mtx[16])
 {
-    uint16_t res = 0;
-    uint16_t m = (uint16_t) (1 << la) - 1;
+    int res = 0;
+    int m = (1 << la) - 1;
     uint16x8_t vec =
         vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b));
     for (int j = 0; j < lb; j++) {
@@ -8080,24 +8063,24 @@ static uint16_t _sse2neon_aggregate_ranges_16x8(int la, int lb, __m128i mtx[16])
             vshrq_n_u32(vreinterpretq_u32_m128i(mtx[j]), 16));
         uint32x4_t vec_res = vandq_u32(vreinterpretq_u32_m128i(mtx[j]),
                                        vreinterpretq_u32_m128i(tmp));
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
-        uint16_t t = vaddvq_u32(vec_res) ? 1 : 0;
+#if defined(__aarch64__) || defined(_M_ARM64)
+        int t = vaddvq_u32(vec_res) ? 1 : 0;
 #else
         uint64x2_t sumh = vpaddlq_u32(vec_res);
-        uint16_t t = vgetq_lane_u64(sumh, 0) + vgetq_lane_u64(sumh, 1);
+        int t = vgetq_lane_u64(sumh, 0) + vgetq_lane_u64(sumh, 1);
 #endif
         res |= (t << j);
     }
     return res;
 }
 
-static uint16_t _sse2neon_aggregate_ranges_8x16(int la, int lb, __m128i mtx[16])
+static int _sse2neon_aggregate_ranges_8x16(int la, int lb, __m128i mtx[16])
 {
-    uint16_t res = 0;
-    uint16_t m = (uint16_t) ((1 << la) - 1);
+    int res = 0;
+    int m = (1 << la) - 1;
     uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
-    uint8x8_t t_lo = vtst_u8(vdup_n_u8((uint8_t) (m & 0xff)), vec_mask);
-    uint8x8_t t_hi = vtst_u8(vdup_n_u8((uint8_t) (m >> 8)), vec_mask);
+    uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask);
+    uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask);
     uint8x16_t vec = vcombine_u8(t_lo, t_hi);
     for (int j = 0; j < lb; j++) {
         mtx[j] = vreinterpretq_m128i_u8(
@@ -8108,7 +8091,7 @@ static uint16_t _sse2neon_aggregate_ranges_8x16(int la, int lb, __m128i mtx[16])
             vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 8));
         uint16x8_t vec_res = vandq_u16(vreinterpretq_u16_m128i(mtx[j]),
                                        vreinterpretq_u16_m128i(tmp));
-        uint16_t t = _sse2neon_vaddvq_u16(vec_res) ? 1 : 0;
+        int t = _sse2neon_vaddvq_u16(vec_res) ? 1 : 0;
         res |= (t << j);
     }
     return res;
@@ -8130,25 +8113,22 @@ SSE2NEON_GENERATE_CMP_RANGES(SSE2NEON_CMP_RANGES_)
 #undef SSE2NEON_CMP_RANGES_IS_BYTE
 #undef SSE2NEON_CMP_RANGES_IS_WORD
 
-static uint16_t _sse2neon_cmp_byte_equal_each(__m128i a,
-                                              int la,
-                                              __m128i b,
-                                              int lb)
+static int _sse2neon_cmp_byte_equal_each(__m128i a, int la, __m128i b, int lb)
 {
     uint8x16_t mtx =
         vceqq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b));
-    uint16_t m0 = (la < lb) ? 0 : (uint16_t) ((1 << la) - (1 << lb));
-    uint16_t m1 = (uint16_t) (0x10000 - (1 << la));
-    uint16_t tb = (uint16_t) (0x10000 - (1 << lb));
+    int m0 = (la < lb) ? 0 : ((1 << la) - (1 << lb));
+    int m1 = 0x10000 - (1 << la);
+    int tb = 0x10000 - (1 << lb);
     uint8x8_t vec_mask, vec0_lo, vec0_hi, vec1_lo, vec1_hi;
     uint8x8_t tmp_lo, tmp_hi, res_lo, res_hi;
     vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
-    vec0_lo = vtst_u8(vdup_n_u8((uint8_t) m0), vec_mask);
-    vec0_hi = vtst_u8(vdup_n_u8((uint8_t) (m0 >> 8)), vec_mask);
-    vec1_lo = vtst_u8(vdup_n_u8((uint8_t) m1), vec_mask);
-    vec1_hi = vtst_u8(vdup_n_u8((uint8_t) (m1 >> 8)), vec_mask);
-    tmp_lo = vtst_u8(vdup_n_u8((uint8_t) tb), vec_mask);
-    tmp_hi = vtst_u8(vdup_n_u8((uint8_t) (tb >> 8)), vec_mask);
+    vec0_lo = vtst_u8(vdup_n_u8(m0), vec_mask);
+    vec0_hi = vtst_u8(vdup_n_u8(m0 >> 8), vec_mask);
+    vec1_lo = vtst_u8(vdup_n_u8(m1), vec_mask);
+    vec1_hi = vtst_u8(vdup_n_u8(m1 >> 8), vec_mask);
+    tmp_lo = vtst_u8(vdup_n_u8(tb), vec_mask);
+    tmp_hi = vtst_u8(vdup_n_u8(tb >> 8), vec_mask);
 
     res_lo = vbsl_u8(vec0_lo, vdup_n_u8(0), vget_low_u8(mtx));
     res_hi = vbsl_u8(vec0_hi, vdup_n_u8(0), vget_high_u8(mtx));
@@ -8157,20 +8137,17 @@ static uint16_t _sse2neon_cmp_byte_equal_each(__m128i a,
     res_lo = vand_u8(res_lo, vec_mask);
     res_hi = vand_u8(res_hi, vec_mask);
 
-    return _sse2neon_vaddv_u8(res_lo) +
-           (uint16_t) (_sse2neon_vaddv_u8(res_hi) << 8);
+    int res = _sse2neon_vaddv_u8(res_lo) + (_sse2neon_vaddv_u8(res_hi) << 8);
+    return res;
 }
 
-static uint16_t _sse2neon_cmp_word_equal_each(__m128i a,
-                                              int la,
-                                              __m128i b,
-                                              int lb)
+static int _sse2neon_cmp_word_equal_each(__m128i a, int la, __m128i b, int lb)
 {
     uint16x8_t mtx =
         vceqq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b));
-    uint16_t m0 = (uint16_t) ((la < lb) ? 0 : ((1 << la) - (1 << lb)));
-    uint16_t m1 = (uint16_t) (0x100 - (1 << la));
-    uint16_t tb = (uint16_t) (0x100 - (1 << lb));
+    int m0 = (la < lb) ? 0 : ((1 << la) - (1 << lb));
+    int m1 = 0x100 - (1 << la);
+    int tb = 0x100 - (1 << lb);
     uint16x8_t vec_mask = vld1q_u16(_sse2neon_cmpestr_mask16b);
     uint16x8_t vec0 = vtstq_u16(vdupq_n_u16(m0), vec_mask);
     uint16x8_t vec1 = vtstq_u16(vdupq_n_u16(m1), vec_mask);
@@ -8185,22 +8162,18 @@ static uint16_t _sse2neon_cmp_word_equal_each(__m128i a,
 #define SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UWORD 0
 
 #define SSE2NEON_AGGREGATE_EQUAL_ORDER_IMPL(size, number_of_lanes, data_type)  \
-    static uint16_t                                                            \
-        _sse2neon_aggregate_equal_ordered_##size##x##number_of_lanes(          \
-            int bound, int la, int lb, __m128i mtx[16])                        \
+    static int _sse2neon_aggregate_equal_ordered_##size##x##number_of_lanes(   \
+        int bound, int la, int lb, __m128i mtx[16])                            \
     {                                                                          \
-        uint16_t res = 0;                                                      \
-        uint16_t m1 =                                                          \
-            (uint16_t) (SSE2NEON_IIF(data_type)(0x10000, 0x100) - (1 << la));  \
+        int res = 0;                                                           \
+        int m1 = SSE2NEON_IIF(data_type)(0x10000, 0x100) - (1 << la);          \
         uint##size##x8_t vec_mask = SSE2NEON_IIF(data_type)(                   \
             vld1_u##size(_sse2neon_cmpestr_mask##size##b),                     \
             vld1q_u##size(_sse2neon_cmpestr_mask##size##b));                   \
         uint##size##x##number_of_lanes##_t vec1 = SSE2NEON_IIF(data_type)(     \
-            vcombine_u##size(                                                  \
-                vtst_u##size(vdup_n_u##size((uint##size##_t) m1), vec_mask),   \
-                vtst_u##size(vdup_n_u##size((uint##size##_t)(m1 >> 8)),        \
-                             vec_mask)),                                       \
-            vtstq_u##size(vdupq_n_u##size((uint##size##_t) m1), vec_mask));    \
+            vcombine_u##size(vtst_u##size(vdup_n_u##size(m1), vec_mask),       \
+                             vtst_u##size(vdup_n_u##size(m1 >> 8), vec_mask)), \
+            vtstq_u##size(vdupq_n_u##size(m1), vec_mask));                     \
         uint##size##x##number_of_lanes##_t vec_minusone = vdupq_n_u##size(-1); \
         uint##size##x##number_of_lanes##_t vec_zero = vdupq_n_u##size(0);      \
         for (int j = 0; j < lb; j++) {                                         \
@@ -8217,7 +8190,7 @@ static uint16_t _sse2neon_cmp_word_equal_each(__m128i a,
             int val = 1;                                                       \
             for (int j = 0, k = i; j < bound - i && k < bound; j++, k++)       \
                 val &= ptr[k * bound + j];                                     \
-            res += (uint16_t) (val << i);                                      \
+            res += val << i;                                                   \
         }                                                                      \
         return res;                                                            \
     }
@@ -8264,17 +8237,14 @@ enum {
     SSE2NEON_CMPESTR_LIST
 #undef _
 };
-typedef uint16_t (*cmpestr_func_t)(__m128i a, int la, __m128i b, int lb);
+typedef int (*cmpestr_func_t)(__m128i a, int la, __m128i b, int lb);
 static cmpestr_func_t _sse2neon_cmpfunc_table[] = {
 #define _(name, func_suffix) _sse2neon_##func_suffix,
     SSE2NEON_CMPESTR_LIST
 #undef _
 };
 
-FORCE_INLINE uint16_t _sse2neon_sido_negative(int res,
-                                              int lb,
-                                              int imm8,
-                                              int bound)
+FORCE_INLINE int _sse2neon_sido_negative(int res, int lb, int imm8, int bound)
 {
     switch (imm8 & 0x30) {
     case _SIDD_NEGATIVE_POLARITY:
@@ -8287,7 +8257,7 @@ FORCE_INLINE uint16_t _sse2neon_sido_negative(int res,
         break;
     }
 
-    return (uint16_t) (res & ((bound == 8) ? 0xFF : 0xFFFF));
+    return res & ((bound == 8) ? 0xFF : 0xFFFF);
 }
 
 FORCE_INLINE int _sse2neon_clz(unsigned int x)
@@ -8336,7 +8306,7 @@ FORCE_INLINE int _sse2neon_ctzll(unsigned long long x)
 #define SSE2NEON_MIN(x, y) (x) < (y) ? (x) : (y)
 
 #define SSE2NEON_CMPSTR_SET_UPPER(var, imm) \
-    const int var = ((imm) & 0x01) ? 8 : 16
+    const int var = (imm & 0x01) ? 8 : 16
 
 #define SSE2NEON_CMPESTRX_LEN_PAIR(a, b, la, lb) \
     int tmp1 = la ^ (la >> 31);                  \
@@ -8351,28 +8321,28 @@ FORCE_INLINE int _sse2neon_ctzll(unsigned long long x)
 // As the only difference of PCMPESTR* and PCMPISTR* is the way to calculate the
 // length of string, we use SSE2NEON_CMP{I,E}STRX_GET_LEN to get the length of
 // string a and b.
-#define SSE2NEON_COMP_AGG(a, b, la, lb, imm8, IE)                         \
-    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);                               \
-    SSE2NEON_##IE##_LEN_PAIR(a, b, la, lb);                               \
-    uint16_t r2 = (_sse2neon_cmpfunc_table[(imm8) & 0x0f])(a, la, b, lb); \
+#define SSE2NEON_COMP_AGG(a, b, la, lb, imm8, IE)                  \
+    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);                        \
+    SSE2NEON_##IE##_LEN_PAIR(a, b, la, lb);                        \
+    int r2 = (_sse2neon_cmpfunc_table[imm8 & 0x0f])(a, la, b, lb); \
     r2 = _sse2neon_sido_negative(r2, lb, imm8, bound)
 
-#define SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8)            \
-    return (r2 == 0) ? bound                                       \
-                     : (((imm8) & 0x40) ? (31 - _sse2neon_clz(r2)) \
-                                        : _sse2neon_ctz(r2))
+#define SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8)          \
+    return (r2 == 0) ? bound                                     \
+                     : ((imm8 & 0x40) ? (31 - _sse2neon_clz(r2)) \
+                                      : _sse2neon_ctz(r2))
 
 #define SSE2NEON_CMPSTR_GENERATE_MASK(dst)                                     \
     __m128i dst = vreinterpretq_m128i_u8(vdupq_n_u8(0));                       \
-    if ((imm8) & 0x40) {                                                       \
+    if (imm8 & 0x40) {                                                         \
         if (bound == 8) {                                                      \
             uint16x8_t tmp = vtstq_u16(vdupq_n_u16(r2),                        \
                                        vld1q_u16(_sse2neon_cmpestr_mask16b));  \
             dst = vreinterpretq_m128i_u16(vbslq_u16(                           \
                 tmp, vdupq_n_u16(-1), vreinterpretq_u16_m128i(dst)));          \
         } else {                                                               \
-            uint8x16_t vec_r2 = vcombine_u8(vdup_n_u8((uint8_t) r2),           \
-                                            vdup_n_u8((uint8_t) (r2 >> 8)));   \
+            uint8x16_t vec_r2 =                                                \
+                vcombine_u8(vdup_n_u8(r2), vdup_n_u8(r2 >> 8));                \
             uint8x16_t tmp =                                                   \
                 vtstq_u8(vec_r2, vld1q_u8(_sse2neon_cmpestr_mask8b));          \
             dst = vreinterpretq_m128i_u8(                                      \
@@ -8383,8 +8353,8 @@ FORCE_INLINE int _sse2neon_ctzll(unsigned long long x)
             dst = vreinterpretq_m128i_u16(                                     \
                 vsetq_lane_u16(r2 & 0xffff, vreinterpretq_u16_m128i(dst), 0)); \
         } else {                                                               \
-            dst = vreinterpretq_m128i_u8(vsetq_lane_u8(                        \
-                (uint8_t) (r2 & 0xff), vreinterpretq_u8_m128i(dst), 0));       \
+            dst = vreinterpretq_m128i_u8(                                      \
+                vsetq_lane_u8(r2 & 0xff, vreinterpretq_u8_m128i(dst), 0));     \
         }                                                                      \
     }                                                                          \
     return dst
@@ -8487,7 +8457,7 @@ FORCE_INLINE int _mm_cmpestrz(__m128i a,
 
 #define SSE2NEON_CMPISTRX_LENGTH(str, len, imm8)                         \
     do {                                                                 \
-        if ((imm8) & 0x01) {                                             \
+        if (imm8 & 0x01) {                                               \
             uint16x8_t equal_mask_##str =                                \
                 vceqq_u16(vreinterpretq_u16_m128i(str), vdupq_n_u16(0)); \
             uint8x8_t res_##str = vshrn_n_u16(equal_mask_##str, 4);      \
@@ -8585,7 +8555,7 @@ FORCE_INLINE int _mm_cmpistrz(__m128i a, __m128i b, const int imm8)
 // in b for greater than.
 FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128i_u64(
         vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
 #else
@@ -8605,11 +8575,11 @@ FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v)
                          : [c] "+r"(crc)
                          : [v] "r"(v));
 #elif ((__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)) || \
-    ((defined(_M_ARM64) || defined(_M_ARM64EC)) && !defined(__clang__))
+    (defined(_M_ARM64) && !defined(__clang__))
     crc = __crc32ch(crc, v);
 #else
-    crc = _mm_crc32_u8(crc, (uint8_t) (v & 0xff));
-    crc = _mm_crc32_u8(crc, (uint8_t) ((v >> 8) & 0xff));
+    crc = _mm_crc32_u8(crc, v & 0xff);
+    crc = _mm_crc32_u8(crc, (v >> 8) & 0xff);
 #endif
     return crc;
 }
@@ -8624,11 +8594,11 @@ FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v)
                          : [c] "+r"(crc)
                          : [v] "r"(v));
 #elif ((__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)) || \
-    ((defined(_M_ARM64) || defined(_M_ARM64EC)) && !defined(__clang__))
+    (defined(_M_ARM64) && !defined(__clang__))
     crc = __crc32cw(crc, v);
 #else
-    crc = _mm_crc32_u16(crc, (uint16_t) (v & 0xffff));
-    crc = _mm_crc32_u16(crc, (uint16_t) ((v >> 16) & 0xffff));
+    crc = _mm_crc32_u16(crc, v & 0xffff);
+    crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff);
 #endif
     return crc;
 }
@@ -8642,11 +8612,11 @@ FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v)
     __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t"
                          : [c] "+r"(crc)
                          : [v] "r"(v));
-#elif ((defined(_M_ARM64) || defined(_M_ARM64EC)) && !defined(__clang__))
+#elif (defined(_M_ARM64) && !defined(__clang__))
     crc = __crc32cd((uint32_t) crc, v);
 #else
-    crc = _mm_crc32_u32((uint32_t) (crc), (uint32_t) (v & 0xffffffff));
-    crc = _mm_crc32_u32((uint32_t) (crc), (uint32_t) ((v >> 32) & 0xffffffff));
+    crc = _mm_crc32_u32((uint32_t) (crc), v & 0xffffffff);
+    crc = _mm_crc32_u32((uint32_t) (crc), (v >> 32) & 0xffffffff);
 #endif
     return crc;
 }
@@ -8661,7 +8631,7 @@ FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
                          : [c] "+r"(crc)
                          : [v] "r"(v));
 #elif ((__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)) || \
-    ((defined(_M_ARM64) || defined(_M_ARM64EC)) && !defined(__clang__))
+    (defined(_M_ARM64) && !defined(__clang__))
     crc = __crc32cb(crc, v);
 #else
     crc ^= v;
@@ -8712,8 +8682,7 @@ FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
 
 /* AES */
 
-#if !defined(__ARM_FEATURE_CRYPTO) && \
-    ((!defined(_M_ARM64) && !defined(_M_ARM64EC)) || defined(__clang__))
+#if !defined(__ARM_FEATURE_CRYPTO) && (!defined(_M_ARM64) || defined(__clang__))
 /* clang-format off */
 #define SSE2NEON_AES_SBOX(w)                                           \
     {                                                                  \
@@ -8804,7 +8773,7 @@ static const uint8_t _sse2neon_rsbox[256] = SSE2NEON_AES_RSBOX(SSE2NEON_AES_H0);
 #undef SSE2NEON_AES_H0
 
 /* x_time function and matrix multiply function */
-#if !defined(__aarch64__)
+#if !defined(__aarch64__) && !defined(_M_ARM64)
 #define SSE2NEON_XT(x) (((x) << 1) ^ ((((x) >> 7) & 1) * 0x1b))
 #define SSE2NEON_MULTIPLY(x, y)                                  \
     (((y & 1) * x) ^ ((y >> 1 & 1) * SSE2NEON_XT(x)) ^           \
@@ -8820,7 +8789,7 @@ static const uint8_t _sse2neon_rsbox[256] = SSE2NEON_AES_RSBOX(SSE2NEON_AES_H0);
 // for more information.
 FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i RoundKey)
 {
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
     static const uint8_t shift_rows[] = {
         0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
         0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb,
@@ -8979,8 +8948,7 @@ FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey)
                   SSE2NEON_MULTIPLY(g, 0x09) ^ SSE2NEON_MULTIPLY(h, 0x0e);
     }
 
-    return _mm_xor_si128(vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)),
-                         RoundKey);
+    return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)) ^ RoundKey;
 #endif
 }
 
@@ -9030,7 +8998,7 @@ FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
         _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 11)],
     };
 
-    return _mm_xor_si128(vreinterpretq_m128i_u8(vld1q_u8(v)), RoundKey);
+    return vreinterpretq_m128i_u8(vld1q_u8(v)) ^ RoundKey;
 #endif
 }
 
@@ -9068,8 +9036,7 @@ FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey)
         v[((i / 4) + (i % 4)) % 4][i % 4] = _sse2neon_rsbox[_a[i]];
     }
 
-    return _mm_xor_si128(vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)),
-                         RoundKey);
+    return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)) ^ RoundKey;
 #endif
 }
 
@@ -9294,14 +9261,14 @@ FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode(void)
 {
     union {
         fpcr_bitfield field;
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
         uint64_t value;
 #else
         uint32_t value;
 #endif
     } r;
 
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     r.value = _sse2neon_get_fpcr();
 #else
     __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
@@ -9315,7 +9282,7 @@ FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode(void)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u32
 FORCE_INLINE int _mm_popcnt_u32(unsigned int a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
 #if __has_builtin(__builtin_popcount)
     return __builtin_popcount(a);
 #elif defined(_MSC_VER)
@@ -9344,7 +9311,7 @@ FORCE_INLINE int _mm_popcnt_u32(unsigned int a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u64
 FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
 #if __has_builtin(__builtin_popcountll)
     return __builtin_popcountll(a);
 #elif defined(_MSC_VER)
@@ -9375,14 +9342,14 @@ FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag)
     // regardless of the value of the FZ bit.
     union {
         fpcr_bitfield field;
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
         uint64_t value;
 #else
         uint32_t value;
 #endif
     } r;
 
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     r.value = _sse2neon_get_fpcr();
 #else
     __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
@@ -9390,7 +9357,7 @@ FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag)
 
     r.field.bit24 = (flag & _MM_DENORMALS_ZERO_MASK) == _MM_DENORMALS_ZERO_ON;
 
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     _sse2neon_set_fpcr(r.value);
 #else
     __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
@@ -9401,7 +9368,7 @@ FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=rdtsc
 FORCE_INLINE uint64_t _rdtsc(void)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     uint64_t val;
 
     /* According to ARM DDI 0487F.c, from Armv8.0 to Armv8.5 inclusive, the

From 47b703067e51b6d418f561eae261529442d13f28 Mon Sep 17 00:00:00 2001
From: lizzie <lizzie@eden-emu.dev>
Date: Sat, 30 Aug 2025 15:17:30 +0200
Subject: [PATCH 03/38] [settings] fix unreachable code warning in fastmem bool
 (#347)

Signed-off-by: lizzie <lizzie@eden-emu.dev>
Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/347
Reviewed-by: Shinmegumi <shinmegumi@eden-emu.dev>
Reviewed-by: CamilleLaVey <camillelavey99@gmail.com>
Co-authored-by: lizzie <lizzie@eden-emu.dev>
Co-committed-by: lizzie <lizzie@eden-emu.dev>
---
 src/common/settings.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/common/settings.cpp b/src/common/settings.cpp
index 19140bce0d..d4f16f4853 100644
--- a/src/common/settings.cpp
+++ b/src/common/settings.cpp
@@ -163,8 +163,9 @@ bool IsFastmemEnabled() {
     }
 #if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__sun__)
     return false;
-#endif
+#else
     return true;
+#endif
 }
 
 static bool is_nce_enabled = false;

From f005f6a3abcf59641af47f420569457d20c58d05 Mon Sep 17 00:00:00 2001
From: lizzie <lizzie@eden-emu.dev>
Date: Sat, 30 Aug 2025 17:03:56 +0200
Subject: [PATCH 04/38] [compat] fix freebsd mmap virtual base (#354)

Signed-off-by: lizzie <lizzie@eden-emu.dev>
Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/354
Reviewed-by: Shinmegumi <shinmegumi@eden-emu.dev>
Co-authored-by: lizzie <lizzie@eden-emu.dev>
Co-committed-by: lizzie <lizzie@eden-emu.dev>
---
 src/common/host_memory.cpp | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/src/common/host_memory.cpp b/src/common/host_memory.cpp
index edb64de8ec..e70ac216cb 100644
--- a/src/common/host_memory.cpp
+++ b/src/common/host_memory.cpp
@@ -417,14 +417,11 @@ static void* ChooseVirtualBase(size_t virtual_size) {
 #else
 
 static void* ChooseVirtualBase(size_t virtual_size) {
-#if defined(__OpenBSD__) || defined(__sun__) || defined(__HAIKU__) || defined(__managarm__)
+#if defined(__FreeBSD__) || defined(__DragonFly__) || defined(__OpenBSD__) || defined(__sun__) || defined(__HAIKU__) || defined(__managarm__) || defined(__AIX__)
     void* virtual_base = mmap(nullptr, virtual_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE | MAP_ALIGNED_SUPER, -1, 0);
-
-    if (virtual_base != MAP_FAILED) {
+    if (virtual_base != MAP_FAILED)
         return virtual_base;
-    }
 #endif
-
     return mmap(nullptr, virtual_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, -1, 0);
 }
 

From ab015bc7305d7eec23451ec2f7b577e26f1e1ba1 Mon Sep 17 00:00:00 2001
From: JPikachu <jpikachu@eden-emu.dev>
Date: Sat, 30 Aug 2025 19:35:53 +0200
Subject: [PATCH 05/38] [VK] Fix asserts with incorrect memory allocations
 (#357)

This fixes many assertions with incorrect memory allocations. Regression introduced in PR 334.

Co-authored-by: JPikachu <jpikachu.eden@gmail.com>
Co-authored-by: MaranBr <maranbr@outlook.com>
Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/357
Reviewed-by: Lizzie <lizzie@eden-emu.dev>
Reviewed-by: crueter <crueter@eden-emu.dev>
Co-authored-by: JPikachu <jpikachu@eden-emu.dev>
Co-committed-by: JPikachu <jpikachu@eden-emu.dev>
---
 src/video_core/vulkan_common/vulkan_device.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/video_core/vulkan_common/vulkan_device.cpp b/src/video_core/vulkan_common/vulkan_device.cpp
index cfa88850a0..95c0d974cc 100644
--- a/src/video_core/vulkan_common/vulkan_device.cpp
+++ b/src/video_core/vulkan_common/vulkan_device.cpp
@@ -1378,13 +1378,13 @@ void Device::CollectPhysicalMemoryInfo() {
         device_access_memory += mem_properties.memoryHeaps[element].size;
     }
     if (!is_integrated) {
-        const u64 reserve_memory = std::min<u64>(device_access_memory / 4, 2_GiB);
+        const u64 reserve_memory = std::min<u64>(device_access_memory / 8, 1_GiB);
         device_access_memory -= reserve_memory;
 
         if (Settings::values.vram_usage_mode.GetValue() != Settings::VramUsageMode::Aggressive) {
             // Account for resolution scaling in memory limits
-            const size_t normal_memory = 8_GiB;
-            const size_t scaler_memory = 2_GiB * Settings::values.resolution_info.ScaleUp(1);
+            const size_t normal_memory = 6_GiB;
+            const size_t scaler_memory = 1_GiB * Settings::values.resolution_info.ScaleUp(1);
             device_access_memory =
                 std::min<u64>(device_access_memory, normal_memory + scaler_memory);
         }
@@ -1393,7 +1393,7 @@ void Device::CollectPhysicalMemoryInfo() {
     }
     const s64 available_memory = static_cast<s64>(device_access_memory - device_initial_usage);
     device_access_memory = static_cast<u64>(std::max<s64>(
-        std::min<s64>(available_memory - 4_GiB, 6_GiB), std::min<s64>(local_memory, 6_GiB)));
+        std::min<s64>(available_memory - 8_GiB, 6_GiB), std::min<s64>(local_memory, 6_GiB)));
 }
 
 void Device::CollectToolingInfo() {

From 76de9d6c8c5adf73170d99ba74f8d0c61f0c60dd Mon Sep 17 00:00:00 2001
From: lizzie <lizzie@eden-emu.dev>
Date: Sat, 30 Aug 2025 20:32:21 +0200
Subject: [PATCH 06/38] [cmake, compat] fix solaris boost build once and for
 all (#364)

Signed-off-by: lizzie <lizzie@eden-emu.dev>
Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/364
Reviewed-by: CamilleLaVey <camillelavey99@gmail.com>
Reviewed-by: crueter <crueter@eden-emu.dev>
Co-authored-by: lizzie <lizzie@eden-emu.dev>
Co-committed-by: lizzie <lizzie@eden-emu.dev>
---
 CMakeLists.txt | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index d11b58bf1f..55ed83c929 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -406,8 +406,10 @@ if (YUZU_USE_CPM)
 
         if (NOT MSVC)
             # boost sucks
-            if (NOT PLATFORM_LINUX AND NOT ANDROID)
-                target_compile_definitions(boost_container INTERFACE BOOST_HAS_PTHREADS)
+            # Solaris (and probably other *nix platforms) needs the -pthreads flag passed explicitly
+            if (PLATFORM_SUN)
+                set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthreads")
+                set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -pthreads")
             endif()
 
             target_compile_options(boost_heap INTERFACE -Wno-shadow)

From 3b4c1beb0c400c566058063b9a77af033532d080 Mon Sep 17 00:00:00 2001
From: crueter <crueter@eden-emu.dev>
Date: Sat, 30 Aug 2025 20:32:28 +0200
Subject: [PATCH 07/38] [desktop] only warn on firmware for qlaunch/games
 (#363)

- only warns about too-new/missing firmware for the home menu
- only warns about missing firmware for games that need it (mk8dx)

Signed-off-by: crueter <crueter@eden-emu.dev>

Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/363
Reviewed-by: CamilleLaVey <camillelavey99@gmail.com>
Reviewed-by: MaranBr <maranbr@outlook.com>
---
 src/yuzu/main.cpp     | 58 ++++++++++++++++++++++++++-----------------
 src/yuzu/main.h       |  1 -
 src/yuzu/uisettings.h |  3 +++
 3 files changed, 38 insertions(+), 24 deletions(-)

diff --git a/src/yuzu/main.cpp b/src/yuzu/main.cpp
index 2c3c46114d..a080132958 100644
--- a/src/yuzu/main.cpp
+++ b/src/yuzu/main.cpp
@@ -553,9 +553,6 @@ GMainWindow::GMainWindow(bool has_broken_vulkan)
     // Gen keys if necessary
     OnCheckFirmwareDecryption();
 
-    // Check firmware
-    OnCheckFirmware();
-
     game_list->LoadCompatibilityList();
     // force reload on first load to ensure add-ons get updated
     game_list->PopulateAsync(UISettings::values.game_dirs, false);
@@ -4459,7 +4456,6 @@ void GMainWindow::InstallFirmware(const QString& location, bool recursive) {
 
     progress.close();
     OnCheckFirmwareDecryption();
-    OnCheckFirmware();
 }
 
 void GMainWindow::OnInstallFirmware() {
@@ -4580,7 +4576,6 @@ void GMainWindow::OnInstallDecryptionKeys() {
     }
 
     OnCheckFirmwareDecryption();
-    OnCheckFirmware();
 }
 
 void GMainWindow::OnAbout() {
@@ -4609,6 +4604,7 @@ void GMainWindow::OnToggleStatusBar() {
 void GMainWindow::OnGameListRefresh() {
     // force reload add-ons etc
     game_list->ForceRefreshGameDirectory();
+    SetFirmwareVersion();
 }
 
 void GMainWindow::OnAlbum() {
@@ -4707,13 +4703,42 @@ void GMainWindow::OnOpenControllerMenu() {
 }
 
 void GMainWindow::OnHomeMenu() {
+    auto result = FirmwareManager::VerifyFirmware(*system.get());
+
+    switch (result) {
+    case FirmwareManager::ErrorFirmwareMissing:
+        QMessageBox::warning(this, tr("No firmware available"),
+                             tr("Please install firmware to use the Home Menu."));
+        return;
+    case FirmwareManager::ErrorFirmwareCorrupted:
+        QMessageBox::warning(this, tr("Firmware Corrupted"),
+                             tr(FirmwareManager::GetFirmwareCheckString(result)));
+        return;
+    case FirmwareManager::ErrorFirmwareTooNew: {
+        if (!UISettings::values.show_fw_warning.GetValue()) break;
+
+        QMessageBox box(QMessageBox::Warning,
+                        tr("Firmware Too New"),
+                        tr(FirmwareManager::GetFirmwareCheckString(result)) + tr("\nContinue anyways?"),
+                        QMessageBox::Yes | QMessageBox::No,
+                        this);
+
+        QCheckBox *checkbox = new QCheckBox(tr("Don't show again"));
+        box.setCheckBox(checkbox);
+
+        int button = box.exec();
+        if (checkbox->isChecked()) {
+            UISettings::values.show_fw_warning.SetValue(false);
+        }
+
+        if (button == static_cast<int>(QMessageBox::No)) return;
+        break;
+    } default:
+        break;
+    }
+
     constexpr u64 QLaunchId = static_cast<u64>(Service::AM::AppletProgramId::QLaunch);
     auto bis_system = system->GetFileSystemController().GetSystemNANDContents();
-    if (!bis_system) {
-        QMessageBox::warning(this, tr("No firmware available"),
-                             tr("Please install the firmware to use the Home Menu."));
-        return;
-    }
 
     auto qlaunch_applet_nca = bis_system->GetEntry(QLaunchId, FileSys::ContentRecordType::Program);
     if (!qlaunch_applet_nca) {
@@ -5240,19 +5265,6 @@ void GMainWindow::OnCheckFirmwareDecryption() {
     UpdateMenuState();
 }
 
-void GMainWindow::OnCheckFirmware() {
-    auto result = FirmwareManager::VerifyFirmware(*system.get());
-
-    switch (result) {
-    case FirmwareManager::FirmwareGood:
-        break;
-    default:
-        QMessageBox::warning(this, tr("Firmware Read Error"),
-                             tr(FirmwareManager::GetFirmwareCheckString(result)));
-        break;
-    }
-}
-
 bool GMainWindow::CheckFirmwarePresence() {
     return FirmwareManager::CheckFirmwarePresence(*system.get());
 }
diff --git a/src/yuzu/main.h b/src/yuzu/main.h
index 8a34a9f075..b1c5669a41 100644
--- a/src/yuzu/main.h
+++ b/src/yuzu/main.h
@@ -424,7 +424,6 @@ private slots:
     void OnCreateHomeMenuShortcut(GameListShortcutTarget target);
     void OnCaptureScreenshot();
     void OnCheckFirmwareDecryption();
-    void OnCheckFirmware();
     void OnLanguageChanged(const QString& locale);
     void OnMouseActivity();
     bool OnShutdownBegin();
diff --git a/src/yuzu/uisettings.h b/src/yuzu/uisettings.h
index 85de0ae72d..3322b31ca3 100644
--- a/src/yuzu/uisettings.h
+++ b/src/yuzu/uisettings.h
@@ -212,6 +212,9 @@ struct Values {
     // Play time
     Setting<bool> show_play_time{linkage, true, "show_play_time", Category::UiGameList};
 
+    // misc
+    Setting<bool> show_fw_warning{linkage, true, "show_fw_warning", Category::Miscellaneous};
+
     bool configuration_applied;
     bool reset_to_defaults;
     bool shortcut_already_warned{false};

From 7ca197d90084406b07fe3b9b1b65780a6c7147ff Mon Sep 17 00:00:00 2001
From: lizzie <lizzie@eden-emu.dev>
Date: Sat, 30 Aug 2025 23:08:04 +0200
Subject: [PATCH 08/38] [qt, compat] fix freedesktop stuffs on Solaris/OpenBSD
 (#360)

Signed-off-by: lizzie <lizzie@eden-emu.dev>
Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/360
Reviewed-by: crueter <crueter@eden-emu.dev>
Co-authored-by: lizzie <lizzie@eden-emu.dev>
Co-committed-by: lizzie <lizzie@eden-emu.dev>
---
 src/yuzu/main.cpp      | 72 ++++++++++++++++++++----------------------
 src/yuzu/util/util.cpp |  5 ++-
 2 files changed, 38 insertions(+), 39 deletions(-)

diff --git a/src/yuzu/main.cpp b/src/yuzu/main.cpp
index a080132958..7600b8b5da 100644
--- a/src/yuzu/main.cpp
+++ b/src/yuzu/main.cpp
@@ -3091,34 +3091,7 @@ bool GMainWindow::CreateShortcutLink(const std::filesystem::path& shortcut_path,
                                      const std::filesystem::path& command,
                                      const std::string& arguments, const std::string& categories,
                                      const std::string& keywords, const std::string& name) try {
-#if defined(__linux__) || defined(__FreeBSD__) // Linux and FreeBSD
-    std::filesystem::path shortcut_path_full = shortcut_path / (name + ".desktop");
-    std::ofstream shortcut_stream(shortcut_path_full, std::ios::binary | std::ios::trunc);
-    if (!shortcut_stream.is_open()) {
-        LOG_ERROR(Frontend, "Failed to create shortcut");
-        return false;
-    }
-    // TODO: Migrate fmt::print to std::print in futures STD C++ 23.
-    fmt::print(shortcut_stream, "[Desktop Entry]\n");
-    fmt::print(shortcut_stream, "Type=Application\n");
-    fmt::print(shortcut_stream, "Version=1.0\n");
-    fmt::print(shortcut_stream, "Name={}\n", name);
-    if (!comment.empty()) {
-        fmt::print(shortcut_stream, "Comment={}\n", comment);
-    }
-    if (std::filesystem::is_regular_file(icon_path)) {
-        fmt::print(shortcut_stream, "Icon={}\n", icon_path.string());
-    }
-    fmt::print(shortcut_stream, "TryExec={}\n", command.string());
-    fmt::print(shortcut_stream, "Exec={} {}\n", command.string(), arguments);
-    if (!categories.empty()) {
-        fmt::print(shortcut_stream, "Categories={}\n", categories);
-    }
-    if (!keywords.empty()) {
-        fmt::print(shortcut_stream, "Keywords={}\n", keywords);
-    }
-    return true;
-#elif defined(_WIN32) // Windows
+#ifdef _WIN32 // Windows
     HRESULT hr = CoInitialize(nullptr);
     if (FAILED(hr)) {
         LOG_ERROR(Frontend, "CoInitialize failed");
@@ -3180,7 +3153,34 @@ bool GMainWindow::CreateShortcutLink(const std::filesystem::path& shortcut_path,
         return false;
     }
     return true;
-#else                 // Unsupported platform
+#elif defined(__unix__) && !defined(__APPLE__) && !defined(__ANDROID__) // Any desktop NIX
+    std::filesystem::path shortcut_path_full = shortcut_path / (name + ".desktop");
+    std::ofstream shortcut_stream(shortcut_path_full, std::ios::binary | std::ios::trunc);
+    if (!shortcut_stream.is_open()) {
+        LOG_ERROR(Frontend, "Failed to create shortcut");
+        return false;
+    }
+    // TODO: Migrate fmt::print to std::print in futures STD C++ 23.
+    fmt::print(shortcut_stream, "[Desktop Entry]\n");
+    fmt::print(shortcut_stream, "Type=Application\n");
+    fmt::print(shortcut_stream, "Version=1.0\n");
+    fmt::print(shortcut_stream, "Name={}\n", name);
+    if (!comment.empty()) {
+        fmt::print(shortcut_stream, "Comment={}\n", comment);
+    }
+    if (std::filesystem::is_regular_file(icon_path)) {
+        fmt::print(shortcut_stream, "Icon={}\n", icon_path.string());
+    }
+    fmt::print(shortcut_stream, "TryExec={}\n", command.string());
+    fmt::print(shortcut_stream, "Exec={} {}\n", command.string(), arguments);
+    if (!categories.empty()) {
+        fmt::print(shortcut_stream, "Categories={}\n", categories);
+    }
+    if (!keywords.empty()) {
+        fmt::print(shortcut_stream, "Keywords={}\n", keywords);
+    }
+    return true;
+#else // Unsupported platform
     return false;
 #endif
 } catch (const std::exception& e) {
@@ -3225,7 +3225,7 @@ bool GMainWindow::MakeShortcutIcoPath(const u64 program_id, const std::string_vi
 #if defined(_WIN32)
     out_icon_path = Common::FS::GetEdenPath(Common::FS::EdenPath::IconsDir);
     ico_extension = "ico";
-#elif defined(__linux__) || defined(__FreeBSD__)
+#elif defined(__unix__) && !defined(__APPLE__) && !defined(__ANDROID__)
     out_icon_path = Common::FS::GetDataDirectory("XDG_DATA_HOME") / "icons/hicolor/256x256";
 #endif
     // Create icons directory if it doesn't exist
@@ -4878,7 +4878,7 @@ void GMainWindow::CreateShortcut(const std::string& game_path, const u64 program
         }
     }
 
-#if defined(__linux__)
+#if defined(__unix__) && !defined(__APPLE__) && !defined(__ANDROID__)
     // Special case for AppImages
     // Warn once if we are making a shortcut to a volatile AppImage
     if (command.string().ends_with(".AppImage") && !UISettings::values.shortcut_already_warned) {
@@ -4888,7 +4888,7 @@ void GMainWindow::CreateShortcut(const std::string& game_path, const u64 program
         }
         UISettings::values.shortcut_already_warned = true;
     }
-#endif // __linux__
+#endif
 
     // Create shortcut
     std::string arguments{arguments_};
@@ -5742,17 +5742,13 @@ int main(int argc, char* argv[]) {
 #ifdef _WIN32
     // Increases the maximum open file limit to 8192
     _setmaxstdio(8192);
-#endif
-
-#ifdef __APPLE__
+#elif defined(__APPLE__)
     // If you start a bundle (binary) on OSX without the Terminal, the working directory is "/".
     // But since we require the working directory to be the executable path for the location of
     // the user folder in the Qt Frontend, we need to cd into that working directory
     const auto bin_path = Common::FS::GetBundleDirectory() / "..";
     chdir(Common::FS::PathToUTF8String(bin_path).c_str());
-#endif
-
-#ifdef __linux__
+#elif defined(__unix__) && !defined(__ANDROID__)
     // Set the DISPLAY variable in order to open web browsers
     // TODO (lat9nq): Find a better solution for AppImages to start external applications
     if (QString::fromLocal8Bit(qgetenv("DISPLAY")).isEmpty()) {
diff --git a/src/yuzu/util/util.cpp b/src/yuzu/util/util.cpp
index e22cf84bf1..551df7b4cd 100644
--- a/src/yuzu/util/util.cpp
+++ b/src/yuzu/util/util.cpp
@@ -1,3 +1,6 @@
+// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
+
 // SPDX-FileCopyrightText: 2015 Citra Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later
 
@@ -138,7 +141,7 @@ bool SaveIconToFile(const std::filesystem::path& icon_path, const QImage& image)
     icon_file.Close();
 
     return true;
-#elif defined(__linux__) || defined(__FreeBSD__)
+#elif defined(__unix__) && !defined(__APPLE__) && !defined(__ANDROID__)
     // Convert and write the icon as a PNG
     if (!image.save(QString::fromStdString(icon_path.string()))) {
         LOG_ERROR(Frontend, "Could not write icon as PNG to file");

From 1c3ca17cfb0beaf4696598115ca0c80473a929d0 Mon Sep 17 00:00:00 2001
From: crueter <crueter@eden-emu.dev>
Date: Sun, 31 Aug 2025 00:12:06 +0200
Subject: [PATCH 09/38] [dynarmic] fix annoying gcc/clang error (#365)

caused qt creator to crash somehow geg

Signed-off-by: crueter <crueter@eden-emu.dev>

Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/365
Reviewed-by: Lizzie <lizzie@eden-emu.dev>
Reviewed-by: MaranBr <maranbr@outlook.com>
---
 .../src/dynarmic/common/lut_from_list.h       | 20 +++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/src/dynarmic/src/dynarmic/common/lut_from_list.h b/src/dynarmic/src/dynarmic/common/lut_from_list.h
index ed9e3dc046..c904e2c041 100644
--- a/src/dynarmic/src/dynarmic/common/lut_from_list.h
+++ b/src/dynarmic/src/dynarmic/common/lut_from_list.h
@@ -1,3 +1,6 @@
+// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
+
 /* This file is part of the dynarmic project.
  * Copyright (c) 2018 MerryMage
  * SPDX-License-Identifier: 0BSD
@@ -19,6 +22,16 @@
 
 namespace Dynarmic::Common {
 
+// silences GCC's -Wstack-usage= here: for this template the warning text is ~56,000 characters
+#ifdef __GNUC__
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wstack-usage="
+#endif
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wno-stack-usage"
+#endif
+
 template<typename Function, typename... Values>
 inline auto GenerateLookupTableFromList(Function f, mcl::mp::list<Values...>) {
 #ifdef _MSC_VER
@@ -34,4 +47,11 @@ inline auto GenerateLookupTableFromList(Function f, mcl::mp::list<Values...>) {
     return MapT(pair_array.begin(), pair_array.end());
 }
 
+#ifdef __GNUC__
+#pragma GCC diagnostic pop
+#endif
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+
 }  // namespace Dynarmic::Common

From 21c77bdcac2390c6b15091296786a9239c6ca529 Mon Sep 17 00:00:00 2001
From: crueter <crueter@eden-emu.dev>
Date: Sun, 31 Aug 2025 03:10:34 +0200
Subject: [PATCH 10/38] [cmake] fix ffmpeg libdrm on macos (#367)

Signed-off-by: crueter <crueter@eden-emu.dev>
Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/367
Reviewed-by: Shinmegumi <shinmegumi@eden-emu.dev>
Reviewed-by: Lizzie <lizzie@eden-emu.dev>
---
 externals/ffmpeg/CMakeLists.txt | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/externals/ffmpeg/CMakeLists.txt b/externals/ffmpeg/CMakeLists.txt
index 54c852f831..ff35c8dc2c 100644
--- a/externals/ffmpeg/CMakeLists.txt
+++ b/externals/ffmpeg/CMakeLists.txt
@@ -63,20 +63,22 @@ if (NOT WIN32 AND NOT ANDROID)
     set(FFmpeg_HWACCEL_INCLUDE_DIRS)
     set(FFmpeg_HWACCEL_LDFLAGS)
 
-    # In Solaris needs explicit linking for ffmpeg which links to /lib/amd64/libX11.so
-    if(PLATFORM_SUN)
-        list(APPEND FFmpeg_HWACCEL_LIBRARIES
-            X11
-            "/usr/lib/xorg/amd64/libdrm.so")
-    else()
-        pkg_check_modules(LIBDRM libdrm REQUIRED)
-        list(APPEND FFmpeg_HWACCEL_LIBRARIES
-            ${LIBDRM_LIBRARIES})
-        list(APPEND FFmpeg_HWACCEL_INCLUDE_DIRS
-            ${LIBDRM_INCLUDE_DIRS})
+    if (NOT APPLE)
+        # In Solaris needs explicit linking for ffmpeg which links to /lib/amd64/libX11.so
+        if(PLATFORM_SUN)
+            list(APPEND FFmpeg_HWACCEL_LIBRARIES
+                X11
+                "/usr/lib/xorg/amd64/libdrm.so")
+        else()
+            pkg_check_modules(LIBDRM libdrm REQUIRED)
+            list(APPEND FFmpeg_HWACCEL_LIBRARIES
+                ${LIBDRM_LIBRARIES})
+            list(APPEND FFmpeg_HWACCEL_INCLUDE_DIRS
+                ${LIBDRM_INCLUDE_DIRS})
+        endif()
+        list(APPEND FFmpeg_HWACCEL_FLAGS
+            --enable-libdrm)
     endif()
-    list(APPEND FFmpeg_HWACCEL_FLAGS
-        --enable-libdrm)
 
     if(LIBVA_FOUND)
         find_package(X11 REQUIRED)

From 39e27bc954dd16d296266eaa08ea613e8e5da38d Mon Sep 17 00:00:00 2001
From: Producdevity <y.gherbi.dev@gmail.com>
Date: Sun, 31 Aug 2025 03:33:54 +0200
Subject: [PATCH 11/38] [android] fix intent-auto-driver-install (#369)

Resolving drivers based on the artifact name was too buggy and inconsistent; this PR improves it. Well, I like to think it does.

Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/369
Reviewed-by: crueter <crueter@eden-emu.dev>
Reviewed-by: Lizzie <lizzie@eden-emu.dev>
Co-authored-by: Producdevity <y.gherbi.dev@gmail.com>
Co-committed-by: Producdevity <y.gherbi.dev@gmail.com>
---
 .../yuzu_emu/utils/CustomSettingsHandler.kt   | 27 ++++--
 .../org/yuzu/yuzu_emu/utils/DriverResolver.kt | 94 +++++++++++++++++--
 2 files changed, 102 insertions(+), 19 deletions(-)

diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/utils/CustomSettingsHandler.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/utils/CustomSettingsHandler.kt
index a317be14d5..377313d0aa 100644
--- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/utils/CustomSettingsHandler.kt
+++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/utils/CustomSettingsHandler.kt
@@ -124,11 +124,16 @@ object CustomSettingsHandler {
 
         // Check for driver requirements if activity and driverViewModel are provided
         if (activity != null && driverViewModel != null) {
-            val driverPath = extractDriverPath(customSettings)
-            if (driverPath != null) {
-                Log.info("[CustomSettingsHandler] Custom settings specify driver: $driverPath")
+            val rawDriverPath = extractDriverPath(customSettings)
+            if (rawDriverPath != null) {
+                // Normalize to local storage path (we only store drivers under driverStoragePath)
+                val driverFilename = rawDriverPath.substringAfterLast('/')
+                    .substringAfterLast('\\')
+                val localDriverPath = "${GpuDriverHelper.driverStoragePath}$driverFilename"
+                Log.info("[CustomSettingsHandler] Custom settings specify driver: $rawDriverPath (normalized: $localDriverPath)")
+
                 // Check if driver exists in the driver storage
-                val driverFile = File(driverPath)
+                val driverFile = File(localDriverPath)
                 if (!driverFile.exists()) {
                     Log.info("[CustomSettingsHandler] Driver not found locally: ${driverFile.name}")
 
@@ -182,7 +187,7 @@ object CustomSettingsHandler {
                         }
 
                         // Attempt to download and install the driver
-                        val driverUri = DriverResolver.ensureDriverAvailable(driverPath, activity) { progress ->
+                        val driverUri = DriverResolver.ensureDriverAvailable(driverFilename, activity) { progress ->
                             progressChannel.trySend(progress.toInt())
                         }
 
@@ -209,12 +214,12 @@ object CustomSettingsHandler {
                             return null
                         }
 
-                        // Verify the downloaded driver
-                        val installedFile = File(driverPath)
+                        // Verify the downloaded driver (from normalized local path)
+                        val installedFile = File(localDriverPath)
                         val metadata = GpuDriverHelper.getMetadataFromZip(installedFile)
                         if (metadata.name == null) {
                             Log.error(
-                                "[CustomSettingsHandler] Downloaded driver is invalid: $driverPath"
+                                "[CustomSettingsHandler] Downloaded driver is invalid: $localDriverPath"
                             )
                             Toast.makeText(
                                 activity,
@@ -232,7 +237,7 @@ object CustomSettingsHandler {
                         }
 
                         // Add to driver list
-                        driverViewModel.onDriverAdded(Pair(driverPath, metadata))
+                        driverViewModel.onDriverAdded(Pair(localDriverPath, metadata))
                         Log.info(
                             "[CustomSettingsHandler] Successfully downloaded and installed driver: ${metadata.name}"
                         )
@@ -268,7 +273,7 @@ object CustomSettingsHandler {
                     // Driver exists, verify it's valid
                     val metadata = GpuDriverHelper.getMetadataFromZip(driverFile)
                     if (metadata.name == null) {
-                        Log.error("[CustomSettingsHandler] Invalid driver file: $driverPath")
+                        Log.error("[CustomSettingsHandler] Invalid driver file: $localDriverPath")
                         Toast.makeText(
                             activity,
                             activity.getString(
@@ -459,6 +464,8 @@ object CustomSettingsHandler {
 
             if (inGpuDriverSection && trimmed.startsWith("driver_path=")) {
                 return trimmed.substringAfter("driver_path=")
+                    .trim()
+                    .removeSurrounding("\"", "\"")
             }
         }
 
diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/utils/DriverResolver.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/utils/DriverResolver.kt
index 74f98ccbd2..2072344bdf 100644
--- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/utils/DriverResolver.kt
+++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/utils/DriverResolver.kt
@@ -68,6 +68,48 @@ object DriverResolver {
         val filename: String
     )
 
+    // Matching helpers
+    private val KNOWN_SUFFIXES = listOf(
+        ".adpkg.zip",
+        ".zip",
+        ".7z",
+        ".tar.gz",
+        ".tar.xz",
+        ".rar"
+    )
+
+    private fun stripKnownSuffixes(name: String): String {
+        var result = name
+        var changed: Boolean
+        do {
+            changed = false
+            for (s in KNOWN_SUFFIXES) {
+                if (result.endsWith(s, ignoreCase = true)) {
+                    result = result.dropLast(s.length)
+                    changed = true
+                }
+            }
+        } while (changed)
+        return result
+    }
+
+    private fun normalizeName(name: String): String {
+        val base = stripKnownSuffixes(name.lowercase())
+        // Remove non-alphanumerics to make substring checks resilient
+        return base.replace(Regex("[^a-z0-9]+"), " ").trim()
+    }
+
+    private fun tokenize(name: String): Set<String> =
+        normalizeName(name).split(Regex("\\s+")).filter { it.isNotBlank() }.toSet()
+
+    // Jaccard similarity between two sets
+    private fun jaccard(a: Set<String>, b: Set<String>): Double {
+        if (a.isEmpty() || b.isEmpty()) return 0.0
+        val inter = a.intersect(b).size.toDouble()
+        val uni = a.union(b).size.toDouble()
+        return if (uni == 0.0) 0.0 else inter / uni
+    }
+
     /**
      * Resolve a driver download URL from its filename
      * @param filename The driver filename (e.g., "turnip_mrpurple-T19-toasted.adpkg.zip")
@@ -98,7 +140,7 @@ object DriverResolver {
                 async {
                     searchRepository(repoPath, filename)
                 }
-            }.mapNotNull { it.await() }.firstOrNull().also { resolved ->
+            }.firstNotNullOfOrNull { it.await() }.also { resolved ->
                 // Cache the result if found
                 resolved?.let {
                     urlCache[filename] = it
@@ -119,22 +161,56 @@ object DriverResolver {
                     releaseCache[repoPath] = it
                 }
 
-                // Search through all releases and artifacts
+                // First pass: exact name (case-insensitive) against asset filenames
+                val target = filename.lowercase()
                 for (release in releases) {
                     for (artifact in release.artifacts) {
-                        if (artifact.name == filename) {
-                            Log.info(
-                                "[DriverResolver] Found $filename in $repoPath/${release.tagName}"
-                            )
+                        if (artifact.name.equals(filename, ignoreCase = true) || artifact.name.lowercase() == target) {
+                            Log.info("[DriverResolver] Found $filename in $repoPath/${release.tagName}")
                             return@withContext ResolvedDriver(
                                 downloadUrl = artifact.url.toString(),
                                 repoPath = repoPath,
                                 releaseTag = release.tagName,
-                                filename = filename
+                                filename = artifact.name
                             )
                         }
                     }
                 }
+
+                // Second pass: fuzzy match by asset filenames only
+                val reqNorm = normalizeName(filename)
+                val reqTokens = tokenize(filename)
+                var best: ResolvedDriver? = null
+                var bestScore = 0.0
+
+                for (release in releases) {
+                    for (artifact in release.artifacts) {
+                        val artNorm = normalizeName(artifact.name)
+                        val artTokens = tokenize(artifact.name)
+
+                        var score = jaccard(reqTokens, artTokens)
+                        // Boost on containment; skip empty names, since contains("") is always true
+                        if (reqNorm.isNotEmpty() && artNorm.isNotEmpty() && (artNorm.contains(reqNorm) || reqNorm.contains(artNorm))) {
+                            score = maxOf(score, 0.92)
+                        }
+
+                        if (score > bestScore) {
+                            bestScore = score
+                            best = ResolvedDriver(
+                                downloadUrl = artifact.url.toString(),
+                                repoPath = repoPath,
+                                releaseTag = release.tagName,
+                                filename = artifact.name
+                            )
+                        }
+                    }
+                }
+
+                // Threshold to avoid bad guesses, this worked fine in testing but might need tuning
+                if (best != null && bestScore >= 0.6) {
+                    Log.info("[DriverResolver] Fuzzy matched $filename -> ${best.filename} in ${best.repoPath} (score=%.2f)".format(bestScore))
+                    return@withContext best
+                }
                 null
             } catch (e: Exception) {
                 Log.error("[DriverResolver] Failed to search $repoPath: ${e.message}")
@@ -296,8 +372,8 @@ object DriverResolver {
         context: Context,
         onProgress: ((Float) -> Unit)? = null
     ): Uri? {
-        // Extract filename from path
-        val filename = driverPath.substringAfterLast('/')
+        // Extract filename from path (support both separators)
+        val filename = driverPath.substringAfterLast('/').substringAfterLast('\\')
 
         // Check if driver already exists locally
         val localPath = "${GpuDriverHelper.driverStoragePath}$filename"

From 4b5a8e06219ad0de3d27faac19b7ecb986175bbc Mon Sep 17 00:00:00 2001
From: Guo Yunhe <i@guoyunhe.me>
Date: Sun, 31 Aug 2025 04:56:23 +0200
Subject: [PATCH 12/38] [cmake] changed app id from org.eden_emu.eden to
 dev.eden_emu.eden (#237)

it is better to match app id with website domain

Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/237
Reviewed-by: crueter <crueter@eden-emu.dev>
Co-authored-by: Guo Yunhe <i@guoyunhe.me>
Co-committed-by: Guo Yunhe <i@guoyunhe.me>
---
 .ci/linux/eden.dwfsprof                                   | 2 +-
 .ci/linux/package.sh                                      | 8 ++++----
 .ci/update-icons.sh                                       | 2 +-
 CMakeLists.txt                                            | 8 ++++----
 ...rg.eden_emu.eden.desktop => dev.eden_emu.eden.desktop} | 2 +-
 ...u.eden.metainfo.xml => dev.eden_emu.eden.metainfo.xml} | 0
 dist/{org.eden_emu.eden.svg => dev.eden_emu.eden.svg}     | 0
 dist/{org.eden_emu.eden.xml => dev.eden_emu.eden.xml}     | 0
 src/yuzu/main.cpp                                         | 2 +-
 9 files changed, 12 insertions(+), 12 deletions(-)
 rename dist/{org.eden_emu.eden.desktop => dev.eden_emu.eden.desktop} (95%)
 rename dist/{org.eden_emu.eden.metainfo.xml => dev.eden_emu.eden.metainfo.xml} (100%)
 rename dist/{org.eden_emu.eden.svg => dev.eden_emu.eden.svg} (100%)
 rename dist/{org.eden_emu.eden.xml => dev.eden_emu.eden.xml} (100%)

diff --git a/.ci/linux/eden.dwfsprof b/.ci/linux/eden.dwfsprof
index bc360f0d46..9a3bee6f14 100644
--- a/.ci/linux/eden.dwfsprof
+++ b/.ci/linux/eden.dwfsprof
@@ -1,6 +1,6 @@
 AppRun
 eden.desktop
-org.eden_emu.eden.desktop
+dev.eden_emu.eden.desktop
 shared/bin/eden
 shared/lib/lib.path
 shared/lib/ld-linux-x86-64.so.2
diff --git a/.ci/linux/package.sh b/.ci/linux/package.sh
index 911fea2f7b..837cfe07ef 100755
--- a/.ci/linux/package.sh
+++ b/.ci/linux/package.sh
@@ -59,15 +59,15 @@ VERSION="$(echo "$EDEN_TAG")"
 mkdir -p ./AppDir
 cd ./AppDir
 
-cp ../dist/org.eden_emu.eden.desktop .
-cp ../dist/org.eden_emu.eden.svg .
+cp ../dist/dev.eden_emu.eden.desktop .
+cp ../dist/dev.eden_emu.eden.svg .
 
-ln -sf ./org.eden_emu.eden.svg ./.DirIcon
+ln -sf ./dev.eden_emu.eden.svg ./.DirIcon
 
 UPINFO='gh-releases-zsync|eden-emulator|Releases|latest|*.AppImage.zsync'
 
 if [ "$DEVEL" = 'true' ]; then
-	sed -i 's|Name=Eden|Name=Eden Nightly|' ./org.eden_emu.eden.desktop
+	sed -i 's|Name=Eden|Name=Eden Nightly|' ./dev.eden_emu.eden.desktop
  	UPINFO="$(echo "$UPINFO" | sed 's|Releases|nightly|')"
 fi
 
diff --git a/.ci/update-icons.sh b/.ci/update-icons.sh
index 99adbfae66..4feb2abd24 100755
--- a/.ci/update-icons.sh
+++ b/.ci/update-icons.sh
@@ -6,7 +6,7 @@
 which png2icns || [ which yay && yay libicns ] || exit
 which magick || exit
 
-export EDEN_SVG_ICO="dist/org.eden_emu.eden.svg"
+export EDEN_SVG_ICO="dist/dev.eden_emu.eden.svg"
 svgo --multipass $EDEN_SVG_ICO
 
 magick -density 256x256 -background transparent $EDEN_SVG_ICO \
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 55ed83c929..9abca561f3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -858,14 +858,14 @@ endif()
 # https://specifications.freedesktop.org/shared-mime-info-spec/shared-mime-info-spec-latest.html
 # https://www.freedesktop.org/software/appstream/docs/
 if(ENABLE_QT AND UNIX AND NOT APPLE)
-    install(FILES "dist/org.eden_emu.eden.desktop"
+    install(FILES "dist/dev.eden_emu.eden.desktop"
         DESTINATION "share/applications")
-    install(FILES "dist/org.eden_emu.eden.svg"
+    install(FILES "dist/dev.eden_emu.eden.svg"
         DESTINATION "share/icons/hicolor/scalable/apps")
 
     # TODO: these files need to be updated.
-    install(FILES "dist/org.eden_emu.eden.xml"
+    install(FILES "dist/dev.eden_emu.eden.xml"
         DESTINATION "share/mime/packages")
-    install(FILES "dist/org.eden_emu.eden.metainfo.xml"
+    install(FILES "dist/dev.eden_emu.eden.metainfo.xml"
         DESTINATION "share/metainfo")
 endif()
diff --git a/dist/org.eden_emu.eden.desktop b/dist/dev.eden_emu.eden.desktop
similarity index 95%
rename from dist/org.eden_emu.eden.desktop
rename to dist/dev.eden_emu.eden.desktop
index d012ab6d07..5d2d7cd8c5 100644
--- a/dist/org.eden_emu.eden.desktop
+++ b/dist/dev.eden_emu.eden.desktop
@@ -10,7 +10,7 @@ Type=Application
 Name=Eden
 GenericName=Switch Emulator
 Comment=Nintendo Switch video game console emulator
-Icon=org.eden_emu.eden
+Icon=dev.eden_emu.eden
 TryExec=eden
 Exec=eden %f
 Categories=Game;Emulator;Qt;
diff --git a/dist/org.eden_emu.eden.metainfo.xml b/dist/dev.eden_emu.eden.metainfo.xml
similarity index 100%
rename from dist/org.eden_emu.eden.metainfo.xml
rename to dist/dev.eden_emu.eden.metainfo.xml
diff --git a/dist/org.eden_emu.eden.svg b/dist/dev.eden_emu.eden.svg
similarity index 100%
rename from dist/org.eden_emu.eden.svg
rename to dist/dev.eden_emu.eden.svg
diff --git a/dist/org.eden_emu.eden.xml b/dist/dev.eden_emu.eden.xml
similarity index 100%
rename from dist/org.eden_emu.eden.xml
rename to dist/dev.eden_emu.eden.xml
diff --git a/src/yuzu/main.cpp b/src/yuzu/main.cpp
index 7600b8b5da..4c6b176c56 100644
--- a/src/yuzu/main.cpp
+++ b/src/yuzu/main.cpp
@@ -5757,7 +5757,7 @@ int main(int argc, char* argv[]) {
 
     // Fix the Wayland appId. This needs to match the name of the .desktop file without the .desktop
     // suffix.
-    QGuiApplication::setDesktopFileName(QStringLiteral("org.eden_emu.eden"));
+    QGuiApplication::setDesktopFileName(QStringLiteral("dev.eden_emu.eden"));
 #endif
 
     SetHighDPIAttributes();

From 8dba6a2cb46baf14c8c8f9c44a24f94fa26ecfe0 Mon Sep 17 00:00:00 2001
From: SDK-Chan <sdkchan@eden-emu.dev>
Date: Sun, 31 Aug 2025 07:32:54 +0200
Subject: [PATCH 13/38] [gpu/NVDRV] Finalize, improve AllocObjCtx (#333)

Improves object allocation per channel, allowing a maximum of 6 object contexts per channel.
Previously, objects were stored in a heap-allocated vector, which is sub-optimal for performance reasons.
The new implementation instead uses a stack-based array with an O(1) approach.
This should boost performance in games that rely heavily on object context creation.

Co-authored-by: MaranBr <maranbr@outlook.com>
Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/333
Reviewed-by: crueter <crueter@eden-emu.dev>
Reviewed-by: CamilleLaVey <camillelavey99@gmail.com>
Co-authored-by: SDK-Chan <sdkchan@eden-emu.dev>
Co-committed-by: SDK-Chan <sdkchan@eden-emu.dev>
---
 .../hle/service/nvdrv/devices/nvhost_gpu.cpp  | 57 ++++++++++++++-----
 .../hle/service/nvdrv/devices/nvhost_gpu.h    |  5 +-
 2 files changed, 46 insertions(+), 16 deletions(-)

diff --git a/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp b/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp
index 95bf18dbf7..5f754650d9 100644
--- a/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp
@@ -219,28 +219,55 @@ NvResult nvhost_gpu::AllocGPFIFOEx2(IoctlAllocGpfifoEx& params, DeviceFD fd) {
     return NvResult::Success;
 }
 
-NvResult nvhost_gpu::AllocateObjectContext(IoctlAllocObjCtx& params) {
-    LOG_DEBUG(Service_NVDRV, "called, class_num={:X}, flags={:X}, obj_id={:X}", params.class_num,
-                params.flags, params.obj_id);
+s32_le nvhost_gpu::GetObjectContextClassNumberIndex(CtxClasses class_number) {
+    constexpr s32_le invalid_class_number_index = -1;
+    switch (class_number) {
+    case CtxClasses::Ctx2D: return 0;
+    case CtxClasses::Ctx3D: return 1;
+    case CtxClasses::CtxCompute: return 2;
+    case CtxClasses::CtxKepler: return 3;
+    case CtxClasses::CtxDMA: return 4;
+    case CtxClasses::CtxChannelGPFIFO: return 5;
+    default: return invalid_class_number_index;
+    }
+}
 
-    if (!channel_state->initialized) {
+NvResult nvhost_gpu::AllocateObjectContext(IoctlAllocObjCtx& params) {
+    LOG_DEBUG(Service_NVDRV, "called, class_num={:#X}, flags={:#X}, obj_id={:#X}", params.class_num,
+              params.flags, params.obj_id);
+
+    if (!channel_state || !channel_state->initialized) {
         LOG_CRITICAL(Service_NVDRV, "No address space bound to allocate a object context!");
         return NvResult::NotInitialized;
     }
 
-    switch (static_cast<CtxClasses>(params.class_num)) { 
-    case CtxClasses::Ctx2D:
-    case CtxClasses::Ctx3D:
-    case CtxClasses::CtxCompute:
-    case CtxClasses::CtxKepler:
-    case CtxClasses::CtxDMA:
-    case CtxClasses::CtxChannelGPFIFO:
-        ctxObj_params.push_back(params);
-        return NvResult::Success;
-    default:
-        LOG_ERROR(Service_NVDRV, "Invalid class number for object context: {:X}", params.class_num);
+    std::scoped_lock lk(channel_mutex);
+
+    if (params.flags) {
+        LOG_WARNING(Service_NVDRV, "non-zero flags={:#X} for class={:#X}", params.flags,
+                    params.class_num);
+
+        constexpr u32 allowed_mask{};
+        params.flags = allowed_mask;
+    }
+
+    s32_le ctx_class_number_index = 
+        GetObjectContextClassNumberIndex(static_cast<CtxClasses>(params.class_num));
+    if (ctx_class_number_index < 0) {
+        LOG_ERROR(Service_NVDRV, "Invalid class number for object context: {:#X}",
+                  params.class_num);
         return NvResult::BadParameter;
     }
+
+    if (ctxObjs[ctx_class_number_index].has_value()) {
+        LOG_ERROR(Service_NVDRV, "Object context for class {:#X} already allocated on this channel",
+                  params.class_num);
+        return NvResult::AlreadyAllocated;
+    }
+
+    ctxObjs[ctx_class_number_index] = params;
+
+    return NvResult::Success;
 }
 
 static boost::container::small_vector<Tegra::CommandHeader, 512> BuildWaitCommandList(
diff --git a/src/core/hle/service/nvdrv/devices/nvhost_gpu.h b/src/core/hle/service/nvdrv/devices/nvhost_gpu.h
index a017cc50d0..fb0a5be959 100644
--- a/src/core/hle/service/nvdrv/devices/nvhost_gpu.h
+++ b/src/core/hle/service/nvdrv/devices/nvhost_gpu.h
@@ -172,7 +172,7 @@ private:
     s32_le nvmap_fd{};
     u64_le user_data{};
     IoctlZCullBind zcull_params{};
-    std::vector<IoctlAllocObjCtx> ctxObj_params{};
+    std::array<std::optional<IoctlAllocObjCtx>, 6> ctxObjs{};
     u32_le channel_priority{};
     u32_le channel_timeslice{};
 
@@ -184,9 +184,12 @@ private:
     NvResult SetChannelPriority(IoctlChannelSetPriority& params);
     NvResult AllocGPFIFOEx(IoctlAllocGpfifoEx& params, DeviceFD fd);
     NvResult AllocGPFIFOEx2(IoctlAllocGpfifoEx& params, DeviceFD fd);
+
+    s32_le GetObjectContextClassNumberIndex(CtxClasses class_number);
     NvResult AllocateObjectContext(IoctlAllocObjCtx& params);
 
     NvResult SubmitGPFIFOImpl(IoctlSubmitGpfifo& params, Tegra::CommandList&& entries);
+
     NvResult SubmitGPFIFOBase1(IoctlSubmitGpfifo& params,
                                std::span<Tegra::CommandListHeader> commands, bool kickoff = false);
     NvResult SubmitGPFIFOBase2(IoctlSubmitGpfifo& params,

From 10c76568b89d1cf2b8bc950a6b633443bd2bb728 Mon Sep 17 00:00:00 2001
From: Caio Oliveira <caiooliveirafarias0@gmail.com>
Date: Sun, 31 Aug 2025 08:40:46 +0200
Subject: [PATCH 14/38] [common, fs] include missing header introduced on #330
 (#370)

Signed-off-by: Caio Oliveira <caiooliveirafarias0@gmail.com>
Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/370
Reviewed-by: crueter <crueter@eden-emu.dev>
Reviewed-by: Lizzie <lizzie@eden-emu.dev>
Co-authored-by: Caio Oliveira <caiooliveirafarias0@gmail.com>
Co-committed-by: Caio Oliveira <caiooliveirafarias0@gmail.com>
---
 src/common/string_util.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/common/string_util.h b/src/common/string_util.h
index 8ed87cdadc..4358541b14 100644
--- a/src/common/string_util.h
+++ b/src/common/string_util.h
@@ -7,6 +7,7 @@
 
 #pragma once
 
+#include <algorithm>
 #include <cstddef>
 #include <span>
 #include <string>

From e60fd4b68bc8d43d141bfba73086518861044678 Mon Sep 17 00:00:00 2001
From: wildcard <wildcard@eden-emu.dev>
Date: Mon, 1 Sep 2025 00:20:03 +0200
Subject: [PATCH 15/38] [VMA] Phase 3:- Hand all allocation & binding to VMA
 (#362)

This patch completely replaces the custom sub-allocator with VMA and delegates all allocation and binding to VMA.
Overall, the patch integrates VMA and simplifies memory management.
Once these changes pass testing, they will be used as a base for further improvements.
Note to testers: please test for stability and performance.

Co-authored-by: crueter <crueter@eden-emu.dev>
Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/362
Reviewed-by: crueter <crueter@eden-emu.dev>
Reviewed-by: MaranBr <maranbr@outlook.com>
Co-authored-by: wildcard <wildcard@eden-emu.dev>
Co-committed-by: wildcard <wildcard@eden-emu.dev>
---
 externals/CMakeLists.txt                      |   4 +
 src/android/app/src/main/jni/CMakeLists.txt   |   2 +-
 src/video_core/vulkan_common/vma.h            |   4 +-
 .../vulkan_common/vulkan_device.cpp           |  30 +-
 .../vulkan_common/vulkan_memory_allocator.cpp | 642 ++++++++----------
 .../vulkan_common/vulkan_memory_allocator.h   | 203 +++---
 6 files changed, 411 insertions(+), 474 deletions(-)

diff --git a/externals/CMakeLists.txt b/externals/CMakeLists.txt
index b209b48db9..f66423a672 100644
--- a/externals/CMakeLists.txt
+++ b/externals/CMakeLists.txt
@@ -147,6 +147,10 @@ add_subdirectory(nx_tzdb)
 # VMA
 AddJsonPackage(vulkan-memory-allocator)
 
+if (VulkanMemoryAllocator_ADDED AND MSVC)
+    target_compile_options(VulkanMemoryAllocator INTERFACE /wd4189)
+endif()
+
 if (NOT TARGET LLVM::Demangle)
     add_library(demangle demangle/ItaniumDemangle.cpp)
     target_include_directories(demangle PUBLIC ./demangle)
diff --git a/src/android/app/src/main/jni/CMakeLists.txt b/src/android/app/src/main/jni/CMakeLists.txt
index 1e30b16d96..9dbee1fcef 100644
--- a/src/android/app/src/main/jni/CMakeLists.txt
+++ b/src/android/app/src/main/jni/CMakeLists.txt
@@ -17,7 +17,7 @@ add_library(yuzu-android SHARED
 
 set_property(TARGET yuzu-android PROPERTY IMPORTED_LOCATION ${FFmpeg_LIBRARY_DIR})
 
-target_link_libraries(yuzu-android PRIVATE audio_core common core input_common frontend_common Vulkan::Headers)
+target_link_libraries(yuzu-android PRIVATE audio_core common core input_common frontend_common Vulkan::Headers GPUOpen::VulkanMemoryAllocator)
 target_link_libraries(yuzu-android PRIVATE android camera2ndk EGL glad jnigraphics log)
 if (ARCHITECTURE_arm64)
     target_link_libraries(yuzu-android PRIVATE adrenotools)
diff --git a/src/video_core/vulkan_common/vma.h b/src/video_core/vulkan_common/vma.h
index 6e25aa1bdf..911c1114b2 100644
--- a/src/video_core/vulkan_common/vma.h
+++ b/src/video_core/vulkan_common/vma.h
@@ -1,3 +1,5 @@
+// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
 // SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later
 
@@ -8,4 +10,4 @@
 #define VMA_STATIC_VULKAN_FUNCTIONS 0
 #define VMA_DYNAMIC_VULKAN_FUNCTIONS 1
 
-#include <vk_mem_alloc.h>
+#include "vk_mem_alloc.h"
diff --git a/src/video_core/vulkan_common/vulkan_device.cpp b/src/video_core/vulkan_common/vulkan_device.cpp
index 95c0d974cc..4d74bf00a5 100644
--- a/src/video_core/vulkan_common/vulkan_device.cpp
+++ b/src/video_core/vulkan_common/vulkan_device.cpp
@@ -753,18 +753,24 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR
     functions.vkGetInstanceProcAddr = dld.vkGetInstanceProcAddr;
     functions.vkGetDeviceProcAddr = dld.vkGetDeviceProcAddr;
 
-    const VmaAllocatorCreateInfo allocator_info = {
-        .flags = VMA_ALLOCATOR_CREATE_EXTERNALLY_SYNCHRONIZED_BIT,
-        .physicalDevice = physical,
-        .device = *logical,
-        .preferredLargeHeapBlockSize = 0,
-        .pAllocationCallbacks = nullptr,
-        .pDeviceMemoryCallbacks = nullptr,
-        .pHeapSizeLimit = nullptr,
-        .pVulkanFunctions = &functions,
-        .instance = instance,
-        .vulkanApiVersion = VK_API_VERSION_1_1,
-        .pTypeExternalMemoryHandleTypes = nullptr,
+    VmaAllocatorCreateFlags flags = VMA_ALLOCATOR_CREATE_EXTERNALLY_SYNCHRONIZED_BIT;
+    if (extensions.memory_budget) {
+        flags |= VMA_ALLOCATOR_CREATE_EXT_MEMORY_BUDGET_BIT;
+    }
+    const VmaAllocatorCreateInfo allocator_info{
+            .flags = flags,
+            .physicalDevice = physical,
+            .device = *logical,
+            .preferredLargeHeapBlockSize = is_integrated
+                                           ? (64u * 1024u * 1024u)
+                                           : (256u * 1024u * 1024u),
+            .pAllocationCallbacks = nullptr,
+            .pDeviceMemoryCallbacks = nullptr,
+            .pHeapSizeLimit = nullptr,
+            .pVulkanFunctions = &functions,
+            .instance = instance,
+            .vulkanApiVersion = ApiVersion(),
+            .pTypeExternalMemoryHandleTypes = nullptr,
     };
 
     vk::Check(vmaCreateAllocator(&allocator_info, &allocator));
diff --git a/src/video_core/vulkan_common/vulkan_memory_allocator.cpp b/src/video_core/vulkan_common/vulkan_memory_allocator.cpp
index 2e37615f99..4ab420afea 100644
--- a/src/video_core/vulkan_common/vulkan_memory_allocator.cpp
+++ b/src/video_core/vulkan_common/vulkan_memory_allocator.cpp
@@ -6,7 +6,10 @@
 
 #include <algorithm>
 #include <bit>
+#include <limits>
 #include <optional>
+#include <type_traits>
+#include <utility>
 #include <vector>
 
 #include "common/alignment.h"
@@ -21,379 +24,302 @@
 #include "video_core/vulkan_common/vulkan_wrapper.h"
 
 namespace Vulkan {
-namespace {
-struct Range {
-    u64 begin;
-    u64 end;
+    namespace {
 
-    [[nodiscard]] bool Contains(u64 iterator, u64 size) const noexcept {
-        return iterator < end && begin < iterator + size;
-    }
-};
+// Helpers translating MemoryUsage to flags/usage
 
-[[nodiscard]] u64 AllocationChunkSize(u64 required_size) {
-    static constexpr std::array sizes{
-        0x1000ULL << 10,  0x1400ULL << 10,  0x1800ULL << 10,  0x1c00ULL << 10, 0x2000ULL << 10,
-        0x3200ULL << 10,  0x4000ULL << 10,  0x6000ULL << 10,  0x8000ULL << 10, 0xA000ULL << 10,
-        0x10000ULL << 10, 0x18000ULL << 10, 0x20000ULL << 10,
-    };
-    static_assert(std::is_sorted(sizes.begin(), sizes.end()));
-
-    const auto it = std::ranges::lower_bound(sizes, required_size);
-    return it != sizes.end() ? *it : Common::AlignUp(required_size, 4ULL << 20);
-}
-
-[[nodiscard]] VkMemoryPropertyFlags MemoryUsagePropertyFlags(MemoryUsage usage) {
-    switch (usage) {
-    case MemoryUsage::DeviceLocal:
-        return VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
-    case MemoryUsage::Upload:
-        return VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
-    case MemoryUsage::Download:
-        return VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT |
-               VK_MEMORY_PROPERTY_HOST_CACHED_BIT;
-    case MemoryUsage::Stream:
-        return VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
-               VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
-    }
-    ASSERT_MSG(false, "Invalid memory usage={}", usage);
-    return VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
-}
-
-[[nodiscard]] VkMemoryPropertyFlags MemoryUsagePreferredVmaFlags(MemoryUsage usage) {
-    return usage != MemoryUsage::DeviceLocal ? VK_MEMORY_PROPERTY_HOST_COHERENT_BIT
-                                             : VkMemoryPropertyFlagBits{};
-}
-
-[[nodiscard]] VmaAllocationCreateFlags MemoryUsageVmaFlags(MemoryUsage usage) {
-    switch (usage) {
-    case MemoryUsage::Upload:
-    case MemoryUsage::Stream:
-        return VMA_ALLOCATION_CREATE_MAPPED_BIT |
-               VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT;
-    case MemoryUsage::Download:
-        return VMA_ALLOCATION_CREATE_MAPPED_BIT | VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT;
-    case MemoryUsage::DeviceLocal:
-        return {};
-    }
-    return {};
-}
-
-[[nodiscard]] VmaMemoryUsage MemoryUsageVma(MemoryUsage usage) {
-    switch (usage) {
-    case MemoryUsage::DeviceLocal:
-    case MemoryUsage::Stream:
-        return VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE;
-    case MemoryUsage::Upload:
-    case MemoryUsage::Download:
-        return VMA_MEMORY_USAGE_AUTO_PREFER_HOST;
-    }
-    return VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE;
-}
-
-} // Anonymous namespace
-
-class MemoryAllocation {
-public:
-    explicit MemoryAllocation(MemoryAllocator* const allocator_, vk::DeviceMemory memory_,
-                              VkMemoryPropertyFlags properties, u64 allocation_size_, u32 type)
-        : allocator{allocator_}, memory{std::move(memory_)}, allocation_size{allocation_size_},
-          property_flags{properties}, shifted_memory_type{1U << type} {}
-
-    MemoryAllocation& operator=(const MemoryAllocation&) = delete;
-    MemoryAllocation(const MemoryAllocation&) = delete;
-
-    MemoryAllocation& operator=(MemoryAllocation&&) = delete;
-    MemoryAllocation(MemoryAllocation&&) = delete;
-
-    [[nodiscard]] std::optional<MemoryCommit> Commit(VkDeviceSize size, VkDeviceSize alignment) {
-        const std::optional<u64> alloc = FindFreeRegion(size, alignment);
-        if (!alloc) {
-            // Signal out of memory, it'll try to do more allocations.
-            return std::nullopt;
+        [[maybe_unused]] VkMemoryPropertyFlags MemoryUsagePropertyFlags(MemoryUsage usage) {
+            switch (usage) {
+                case MemoryUsage::DeviceLocal:
+                    return VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
+                case MemoryUsage::Upload:
+                    return VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
+                           VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
+                case MemoryUsage::Download:
+                    return VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
+                           VK_MEMORY_PROPERTY_HOST_COHERENT_BIT |
+                           VK_MEMORY_PROPERTY_HOST_CACHED_BIT;
+                case MemoryUsage::Stream:
+                    return VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
+                           VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
+                           VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
+            }
+            ASSERT_MSG(false, "Invalid memory usage={}", usage);
+            return VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
         }
-        const Range range{
-            .begin = *alloc,
-            .end = *alloc + size,
+
+        [[nodiscard]] VkMemoryPropertyFlags MemoryUsagePreferredVmaFlags(MemoryUsage usage) {
+            return usage != MemoryUsage::DeviceLocal ? VK_MEMORY_PROPERTY_HOST_COHERENT_BIT
+                                                     : VkMemoryPropertyFlagBits{};
+        }
+
+        [[nodiscard]] VmaAllocationCreateFlags MemoryUsageVmaFlags(MemoryUsage usage) {
+            switch (usage) {
+                case MemoryUsage::Upload:
+                case MemoryUsage::Stream:
+                    return VMA_ALLOCATION_CREATE_MAPPED_BIT |
+                           VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT;
+                case MemoryUsage::Download:
+                    return VMA_ALLOCATION_CREATE_MAPPED_BIT |
+                           VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT;
+                case MemoryUsage::DeviceLocal:
+                    return {};
+            }
+            return {};
+        }
+
+        [[nodiscard]] VmaMemoryUsage MemoryUsageVma(MemoryUsage usage) {
+            switch (usage) {
+                case MemoryUsage::DeviceLocal:
+                case MemoryUsage::Stream:
+                    return VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE;
+                case MemoryUsage::Upload:
+                case MemoryUsage::Download:
+                    return VMA_MEMORY_USAGE_AUTO_PREFER_HOST;
+            }
+            return VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE;
+        }
+
+
+// This avoids calling vkGetBufferMemoryRequirements* directly.
+        template<typename T>
+        static VkBuffer GetVkHandleFromBuffer(const T &buf) {
+            if constexpr (requires { static_cast<VkBuffer>(buf); }) {
+                return static_cast<VkBuffer>(buf);
+            } else if constexpr (requires {{ buf.GetHandle() } -> std::convertible_to<VkBuffer>; }) {
+                return buf.GetHandle();
+            } else if constexpr (requires {{ buf.Handle() } -> std::convertible_to<VkBuffer>; }) {
+                return buf.Handle();
+            } else if constexpr (requires {{ buf.vk_handle() } -> std::convertible_to<VkBuffer>; }) {
+                return buf.vk_handle();
+            } else {
+                static_assert(sizeof(T) == 0, "Cannot extract VkBuffer handle from vk::Buffer");
+                return VK_NULL_HANDLE;
+            }
+        }
+
+    } // namespace
+
+//MemoryCommit is now VMA-backed
+    MemoryCommit::MemoryCommit(VmaAllocator alloc, VmaAllocation a,
+                               const VmaAllocationInfo &info) noexcept
+            : allocator{alloc}, allocation{a}, memory{info.deviceMemory},
+              offset{info.offset}, size{info.size}, mapped_ptr{info.pMappedData} {}
+
+    MemoryCommit::~MemoryCommit() { Release(); }
+
+    MemoryCommit::MemoryCommit(MemoryCommit &&rhs) noexcept
+            : allocator{std::exchange(rhs.allocator, nullptr)},
+              allocation{std::exchange(rhs.allocation, nullptr)},
+              memory{std::exchange(rhs.memory, VK_NULL_HANDLE)},
+              offset{std::exchange(rhs.offset, 0)},
+              size{std::exchange(rhs.size, 0)},
+              mapped_ptr{std::exchange(rhs.mapped_ptr, nullptr)} {}
+
+    MemoryCommit &MemoryCommit::operator=(MemoryCommit &&rhs) noexcept {
+        if (this != &rhs) {
+            Release();
+            allocator = std::exchange(rhs.allocator, nullptr);
+            allocation = std::exchange(rhs.allocation, nullptr);
+            memory = std::exchange(rhs.memory, VK_NULL_HANDLE);
+            offset = std::exchange(rhs.offset, 0);
+            size = std::exchange(rhs.size, 0);
+            mapped_ptr = std::exchange(rhs.mapped_ptr, nullptr);
+        }
+        return *this;
+    }
+
+    std::span<u8> MemoryCommit::Map()
+    {
+        if (!allocation) return {};
+        if (!mapped_ptr) {
+            if (vmaMapMemory(allocator, allocation, &mapped_ptr) != VK_SUCCESS) return {};
+        }
+        const size_t n = static_cast<size_t>(std::min<VkDeviceSize>(size,
+                                                                    std::numeric_limits<size_t>::max()));
+        return std::span<u8>{static_cast<u8 *>(mapped_ptr), n};
+    }
+
+    std::span<const u8> MemoryCommit::Map() const
+    {
+        if (!allocation) return {};
+        if (!mapped_ptr) {
+            void *p = nullptr;
+            if (vmaMapMemory(allocator, allocation, &p) != VK_SUCCESS) return {};
+            const_cast<MemoryCommit *>(this)->mapped_ptr = p;
+        }
+        const size_t n = static_cast<size_t>(std::min<VkDeviceSize>(size,
+                                                                    std::numeric_limits<size_t>::max()));
+        return std::span<const u8>{static_cast<const u8 *>(mapped_ptr), n};
+    }
+
+    void MemoryCommit::Unmap()
+    {
+        if (allocation && mapped_ptr) {
+            vmaUnmapMemory(allocator, allocation);
+            mapped_ptr = nullptr;
+        }
+    }
+
+    void MemoryCommit::Release() {
+        if (allocation && allocator) {
+            if (mapped_ptr) {
+                vmaUnmapMemory(allocator, allocation);
+                mapped_ptr = nullptr;
+            }
+            vmaFreeMemory(allocator, allocation);
+        }
+        allocation = nullptr;
+        allocator = nullptr;
+        memory = VK_NULL_HANDLE;
+        offset = 0;
+        size = 0;
+    }
+
+    MemoryAllocator::MemoryAllocator(const Device &device_)
+            : device{device_}, allocator{device.GetAllocator()},
+              properties{device_.GetPhysical().GetMemoryProperties().memoryProperties},
+              buffer_image_granularity{
+                      device_.GetPhysical().GetProperties().limits.bufferImageGranularity} {
+
+        // Preserve the previous "RenderDoc small heap" trimming behavior that we had in original vma minus the heap bug
+        if (device.HasDebuggingToolAttached())
+        {
+            using namespace Common::Literals;
+            ForEachDeviceLocalHostVisibleHeap(device, [this](size_t heap_idx, VkMemoryHeap &heap) {
+                if (heap.size <= 256_MiB) {
+                    for (u32 t = 0; t < properties.memoryTypeCount; ++t) {
+                        if (properties.memoryTypes[t].heapIndex == heap_idx) {
+                            valid_memory_types &= ~(1u << t);
+                        }
+                    }
+                }
+            });
+        }
+    }
+
+    MemoryAllocator::~MemoryAllocator() = default;
+
+    vk::Image MemoryAllocator::CreateImage(const VkImageCreateInfo &ci) const
+    {
+        const VmaAllocationCreateInfo alloc_ci = {
+                .flags = VMA_ALLOCATION_CREATE_WITHIN_BUDGET_BIT,
+                .usage = VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE,
+                .requiredFlags = 0,
+                .preferredFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
+                .memoryTypeBits = 0,
+                .pool = VK_NULL_HANDLE,
+                .pUserData = nullptr,
+                .priority = 0.f,
         };
-        commits.insert(std::ranges::upper_bound(commits, *alloc, {}, &Range::begin), range);
-        return std::make_optional<MemoryCommit>(this, *memory, *alloc, *alloc + size);
+
+        VkImage handle{};
+        VmaAllocation allocation{};
+        vk::Check(vmaCreateImage(allocator, &ci, &alloc_ci, &handle, &allocation, nullptr));
+        return vk::Image(handle, ci.usage, *device.GetLogical(), allocator, allocation,
+                         device.GetDispatchLoader());
     }
 
-    void Free(u64 begin) {
-        const auto it = std::ranges::find(commits, begin, &Range::begin);
-        ASSERT_MSG(it != commits.end(), "Invalid commit");
-        commits.erase(it);
-        if (commits.empty()) {
-            // Do not call any code involving 'this' after this call, the object will be destroyed
-            allocator->ReleaseMemory(this);
-        }
+    vk::Buffer
+    MemoryAllocator::CreateBuffer(const VkBufferCreateInfo &ci, MemoryUsage usage) const
+    {
+        const VmaAllocationCreateInfo alloc_ci = {
+                .flags = VMA_ALLOCATION_CREATE_WITHIN_BUDGET_BIT | MemoryUsageVmaFlags(usage),
+                .usage = MemoryUsageVma(usage),
+                .requiredFlags = 0,
+                .preferredFlags = MemoryUsagePreferredVmaFlags(usage),
+                .memoryTypeBits = usage == MemoryUsage::Stream ? 0u : valid_memory_types,
+                .pool = VK_NULL_HANDLE,
+                .pUserData = nullptr,
+                .priority = 0.f,
+        };
+
+        VkBuffer handle{};
+        VmaAllocationInfo alloc_info{};
+        VmaAllocation allocation{};
+        VkMemoryPropertyFlags property_flags{};
+
+        vk::Check(vmaCreateBuffer(allocator, &ci, &alloc_ci, &handle, &allocation, &alloc_info));
+        vmaGetAllocationMemoryProperties(allocator, allocation, &property_flags);
+
+        u8 *data = reinterpret_cast<u8 *>(alloc_info.pMappedData);
+        const std::span<u8> mapped_data = data ? std::span<u8>{data, ci.size} : std::span<u8>{};
+        const bool is_coherent = (property_flags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT) != 0;
+
+        return vk::Buffer(handle, *device.GetLogical(), allocator, allocation, mapped_data,
+                          is_coherent,
+                          device.GetDispatchLoader());
     }
 
-    [[nodiscard]] std::span<u8> Map() {
-        if (memory_mapped_span.empty()) {
-            u8* const raw_pointer = memory.Map(0, allocation_size);
-            memory_mapped_span = std::span<u8>(raw_pointer, allocation_size);
-        }
-        return memory_mapped_span;
-    }
+    MemoryCommit MemoryAllocator::Commit(const VkMemoryRequirements &reqs, MemoryUsage usage)
+    {
+        const auto vma_usage = MemoryUsageVma(usage);
+        VmaAllocationCreateInfo ci{};
+        ci.flags = VMA_ALLOCATION_CREATE_WITHIN_BUDGET_BIT | MemoryUsageVmaFlags(usage);
+        ci.usage = vma_usage;
+        ci.memoryTypeBits = reqs.memoryTypeBits & valid_memory_types;
+        ci.requiredFlags = 0;
+        ci.preferredFlags = MemoryUsagePreferredVmaFlags(usage);
 
-    /// Returns whether this allocation is compatible with the arguments.
-    [[nodiscard]] bool IsCompatible(VkMemoryPropertyFlags flags, u32 type_mask) const {
-        return (flags & property_flags) == flags && (type_mask & shifted_memory_type) != 0;
-    }
+        VmaAllocation a{};
+        VmaAllocationInfo info{};
 
+        VkResult res = vmaAllocateMemory(allocator, &reqs, &ci, &a, &info);
 
-private:
-    [[nodiscard]] static constexpr u32 ShiftType(u32 type) {
-        return 1U << type;
-    }
+        if (res != VK_SUCCESS) {
+            // Relax 1: drop budget constraint
+            auto ci2 = ci;
+            ci2.flags &= ~VMA_ALLOCATION_CREATE_WITHIN_BUDGET_BIT;
+            res = vmaAllocateMemory(allocator, &reqs, &ci2, &a, &info);
 
-    [[nodiscard]] std::optional<u64> FindFreeRegion(u64 size, u64 alignment) noexcept {
-        ASSERT(std::has_single_bit(alignment));
-        const u64 alignment_log2 = std::countr_zero(alignment);
-        std::optional<u64> candidate;
-        u64 iterator = 0;
-        auto commit = commits.begin();
-        while (iterator + size <= allocation_size) {
-            candidate = candidate.value_or(iterator);
-            if (commit == commits.end()) {
-                break;
+            // Relax 2: if we preferred DEVICE_LOCAL, drop that preference
+            if (res != VK_SUCCESS && (ci.preferredFlags & VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT)) {
+                auto ci3 = ci2;
+                ci3.preferredFlags &= ~VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
+                res = vmaAllocateMemory(allocator, &reqs, &ci3, &a, &info);
             }
-            if (commit->Contains(*candidate, size)) {
-                candidate = std::nullopt;
+        }
+
+        vk::Check(res);
+        return MemoryCommit(allocator, a, info);
+    }
+
+    MemoryCommit MemoryAllocator::Commit(const vk::Buffer &buffer, MemoryUsage usage) {
+        // Allocate memory appropriate for this buffer automatically
+        const auto vma_usage = MemoryUsageVma(usage);
+
+        VmaAllocationCreateInfo ci{};
+        ci.flags = VMA_ALLOCATION_CREATE_WITHIN_BUDGET_BIT | MemoryUsageVmaFlags(usage);
+        ci.usage = vma_usage;
+        ci.requiredFlags = 0;
+        ci.preferredFlags = MemoryUsagePreferredVmaFlags(usage);
+        ci.pool = VK_NULL_HANDLE;
+        ci.pUserData = nullptr;
+        ci.priority = 0.0f;
+
+        const VkBuffer raw = *buffer;
+
+        VmaAllocation a{};
+        VmaAllocationInfo info{};
+
+        // Let VMA infer memory requirements from the buffer
+        VkResult res = vmaAllocateMemoryForBuffer(allocator, raw, &ci, &a, &info);
+
+        if (res != VK_SUCCESS) {
+            auto ci2 = ci;
+            ci2.flags &= ~VMA_ALLOCATION_CREATE_WITHIN_BUDGET_BIT;
+            res = vmaAllocateMemoryForBuffer(allocator, raw, &ci2, &a, &info);
+
+            if (res != VK_SUCCESS && (ci.preferredFlags & VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT)) {
+                auto ci3 = ci2;
+                ci3.preferredFlags &= ~VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
+                res = vmaAllocateMemoryForBuffer(allocator, raw, &ci3, &a, &info);
             }
-            iterator = Common::AlignUpLog2(commit->end, alignment_log2);
-            ++commit;
         }
-        return candidate;
+
+        vk::Check(res);
+        vk::Check(vmaBindBufferMemory2(allocator, a, 0, raw, nullptr));
+        return MemoryCommit(allocator, a, info);
     }
 
-    MemoryAllocator* const allocator;           ///< Parent memory allocation.
-    const vk::DeviceMemory memory;              ///< Vulkan memory allocation handler.
-    const u64 allocation_size;                  ///< Size of this allocation.
-    const VkMemoryPropertyFlags property_flags; ///< Vulkan memory property flags.
-    const u32 shifted_memory_type;              ///< Shifted Vulkan memory type.
-    std::vector<Range> commits;                 ///< All commit ranges done from this allocation.
-    std::span<u8> memory_mapped_span; ///< Memory mapped span. Empty if not queried before.
-};
-
-MemoryCommit::MemoryCommit(MemoryAllocation* allocation_, VkDeviceMemory memory_, u64 begin_,
-                           u64 end_) noexcept
-    : allocation{allocation_}, memory{memory_}, begin{begin_}, end{end_} {}
-
-MemoryCommit::~MemoryCommit() {
-    Release();
-}
-
-MemoryCommit& MemoryCommit::operator=(MemoryCommit&& rhs) noexcept {
-    Release();
-    allocation = std::exchange(rhs.allocation, nullptr);
-    memory = rhs.memory;
-    begin = rhs.begin;
-    end = rhs.end;
-    span = std::exchange(rhs.span, std::span<u8>{});
-    return *this;
-}
-
-MemoryCommit::MemoryCommit(MemoryCommit&& rhs) noexcept
-    : allocation{std::exchange(rhs.allocation, nullptr)}, memory{rhs.memory}, begin{rhs.begin},
-      end{rhs.end}, span{std::exchange(rhs.span, std::span<u8>{})} {}
-
-std::span<u8> MemoryCommit::Map() {
-    if (span.empty()) {
-        span = allocation->Map().subspan(begin, end - begin);
-    }
-    return span;
-}
-
-void MemoryCommit::Release() {
-    if (allocation) {
-        allocation->Free(begin);
-    }
-}
-
-MemoryAllocator::MemoryAllocator(const Device& device_)
-    : device{device_}, allocator{device.GetAllocator()},
-      properties{device_.GetPhysical().GetMemoryProperties().memoryProperties},
-      buffer_image_granularity{
-          device_.GetPhysical().GetProperties().limits.bufferImageGranularity} {
-    // GPUs not supporting rebar may only have a region with less than 256MB host visible/device
-    // local memory. In that case, opening 2 RenderDoc captures side-by-side is not possible due to
-    // the heap running out of memory. With RenderDoc attached and only a small host/device region,
-    // only allow the stream buffer in this memory heap.
-    if (device.HasDebuggingToolAttached()) {
-        using namespace Common::Literals;
-        ForEachDeviceLocalHostVisibleHeap(device, [this](size_t index, VkMemoryHeap& heap) {
-            if (heap.size <= 256_MiB) {
-                valid_memory_types &= ~(1u << index);
-            }
-        });
-    }
-}
-
-MemoryAllocator::~MemoryAllocator() = default;
-
-vk::Image MemoryAllocator::CreateImage(const VkImageCreateInfo& ci) const {
-    const VmaAllocationCreateInfo alloc_ci = {
-        .flags = VMA_ALLOCATION_CREATE_WITHIN_BUDGET_BIT,
-        .usage = VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE,
-        .requiredFlags = 0,
-        .preferredFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
-        .memoryTypeBits = 0,
-        .pool = VK_NULL_HANDLE,
-        .pUserData = nullptr,
-        .priority = 0.f,
-    };
-
-    VkImage handle{};
-    VmaAllocation allocation{};
-
-    vk::Check(vmaCreateImage(allocator, &ci, &alloc_ci, &handle, &allocation, nullptr));
-
-    return vk::Image(handle, ci.usage, *device.GetLogical(), allocator, allocation,
-                     device.GetDispatchLoader());
-}
-
-vk::Buffer MemoryAllocator::CreateBuffer(const VkBufferCreateInfo& ci, MemoryUsage usage) const {
-    const VmaAllocationCreateInfo alloc_ci = {
-        .flags = VMA_ALLOCATION_CREATE_WITHIN_BUDGET_BIT | MemoryUsageVmaFlags(usage),
-        .usage = MemoryUsageVma(usage),
-        .requiredFlags = 0,
-        .preferredFlags = MemoryUsagePreferredVmaFlags(usage),
-        .memoryTypeBits = usage == MemoryUsage::Stream ? 0u : valid_memory_types,
-        .pool = VK_NULL_HANDLE,
-        .pUserData = nullptr,
-        .priority = 0.f,
-    };
-
-    VkBuffer handle{};
-    VmaAllocationInfo alloc_info{};
-    VmaAllocation allocation{};
-    VkMemoryPropertyFlags property_flags{};
-
-    vk::Check(vmaCreateBuffer(allocator, &ci, &alloc_ci, &handle, &allocation, &alloc_info));
-    vmaGetAllocationMemoryProperties(allocator, allocation, &property_flags);
-
-    u8* data = reinterpret_cast<u8*>(alloc_info.pMappedData);
-    const std::span<u8> mapped_data = data ? std::span<u8>{data, ci.size} : std::span<u8>{};
-    const bool is_coherent = property_flags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
-
-    return vk::Buffer(handle, *device.GetLogical(), allocator, allocation, mapped_data, is_coherent,
-                      device.GetDispatchLoader());
-}
-
-MemoryCommit MemoryAllocator::Commit(const VkMemoryRequirements& requirements, MemoryUsage usage) {
-        // Find the fastest memory flags we can afford with the current requirements
-        const u32 type_mask = requirements.memoryTypeBits;
-        const VkMemoryPropertyFlags usage_flags = MemoryUsagePropertyFlags(usage);
-        const VkMemoryPropertyFlags flags = MemoryPropertyFlags(type_mask, usage_flags);
-        if (std::optional<MemoryCommit> commit = TryCommit(requirements, flags)) {
-            return std::move(*commit);
-        }
-        // Commit has failed, allocate more memory.
-        const u64 chunk_size = AllocationChunkSize(requirements.size);
-        if (!TryAllocMemory(flags, type_mask, chunk_size)) {
-            // TODO(Rodrigo): Handle out of memory situations in some way like flushing to guest memory.
-            throw vk::Exception(VK_ERROR_OUT_OF_DEVICE_MEMORY);
-        }
-        // Commit again, this time it won't fail since there's a fresh allocation above.
-        // If it does, there's a bug.
-        return TryCommit(requirements, flags).value();
-    }
-
-bool MemoryAllocator::TryAllocMemory(VkMemoryPropertyFlags flags, u32 type_mask, u64 size) {
-    const auto type_opt = FindType(flags, type_mask);
-    if (!type_opt) {
-        return false;
-    }
-
-    // Adreno stands firm
-    const u64 aligned_size = (device.GetDriverID() == VK_DRIVER_ID_QUALCOMM_PROPRIETARY) ?
-                            Common::AlignUp(size, 4096) :
-                            size;
-
-    vk::DeviceMemory memory = device.GetLogical().TryAllocateMemory({
-        .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
-        .pNext = nullptr,
-        .allocationSize = aligned_size,
-        .memoryTypeIndex = *type_opt,
-    });
-
-    if (!memory) {
-        return false;
-    }
-
-    allocations.push_back(
-        std::make_unique<MemoryAllocation>(this, std::move(memory), flags, aligned_size, *type_opt));
-    return true;
-}
-
-void MemoryAllocator::ReleaseMemory(MemoryAllocation* alloc) {
-    const auto it = std::ranges::find(allocations, alloc, &std::unique_ptr<MemoryAllocation>::get);
-    ASSERT(it != allocations.end());
-    allocations.erase(it);
-}
-
-std::optional<MemoryCommit> MemoryAllocator::TryCommit(const VkMemoryRequirements& requirements,
-                                                       VkMemoryPropertyFlags flags) {
-    // Conservative, spec-compliant alignment for suballocation
-    VkDeviceSize eff_align = requirements.alignment;
-    const auto& limits = device.GetPhysical().GetProperties().limits;
-    if ((flags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) &&
-        !(flags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)) {
-        // Non-coherent memory must be invalidated on atom boundary
-        if (limits.nonCoherentAtomSize > eff_align) eff_align = limits.nonCoherentAtomSize;
-    }
-    // Separate buffers to avoid stalls on tilers
-    if (buffer_image_granularity > eff_align) {
-        eff_align = buffer_image_granularity;
-    }
-    eff_align = std::bit_ceil(eff_align);
-
-    for (auto& allocation : allocations) {
-        if (!allocation->IsCompatible(flags, requirements.memoryTypeBits)) {
-            continue;
-        }
-        if (auto commit = allocation->Commit(requirements.size, eff_align)) {
-            return commit;
-        }
-    }
-    if ((flags & VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT) != 0) {
-        // Look for non device local commits on failure
-        return TryCommit(requirements, flags & ~VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
-    }
-    return std::nullopt;
-}
-
-VkMemoryPropertyFlags MemoryAllocator::MemoryPropertyFlags(u32 type_mask,
-                                                           VkMemoryPropertyFlags flags) const {
-    if (FindType(flags, type_mask)) {
-        // Found a memory type with those requirements
-        return flags;
-    }
-    if ((flags & VK_MEMORY_PROPERTY_HOST_CACHED_BIT) != 0) {
-        // Remove host cached bit in case it's not supported
-        return MemoryPropertyFlags(type_mask, flags & ~VK_MEMORY_PROPERTY_HOST_CACHED_BIT);
-    }
-    if ((flags & VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT) != 0) {
-        // Remove device local, if it's not supported by the requested resource
-        return MemoryPropertyFlags(type_mask, flags & ~VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
-    }
-    ASSERT_MSG(false, "No compatible memory types found");
-    return 0;
-}
-
-std::optional<u32> MemoryAllocator::FindType(VkMemoryPropertyFlags flags, u32 type_mask) const {
-    for (u32 type_index = 0; type_index < properties.memoryTypeCount; ++type_index) {
-        const VkMemoryPropertyFlags type_flags = properties.memoryTypes[type_index].propertyFlags;
-        if ((type_mask & (1U << type_index)) != 0 && (type_flags & flags) == flags) {
-            // The type matches in type and in the wanted properties.
-            return type_index;
-        }
-    }
-    // Failed to find index
-    return std::nullopt;
-}
-
 } // namespace Vulkan
diff --git a/src/video_core/vulkan_common/vulkan_memory_allocator.h b/src/video_core/vulkan_common/vulkan_memory_allocator.h
index 38a182bcba..581f2e66d2 100644
--- a/src/video_core/vulkan_common/vulkan_memory_allocator.h
+++ b/src/video_core/vulkan_common/vulkan_memory_allocator.h
@@ -1,3 +1,6 @@
+// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
+
 // SPDX-FileCopyrightText: Copyright 2019 yuzu Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later
 
@@ -6,138 +9,134 @@
 #include <memory>
 #include <span>
 #include <vector>
+
 #include "common/common_types.h"
 #include "video_core/vulkan_common/vulkan_device.h"
 #include "video_core/vulkan_common/vulkan_wrapper.h"
-
-VK_DEFINE_HANDLE(VmaAllocator)
+#include "video_core/vulkan_common/vma.h"
 
 namespace Vulkan {
 
-class Device;
-class MemoryMap;
-class MemoryAllocation;
+    class Device;
 
 /// Hints and requirements for the backing memory type of a commit
-enum class MemoryUsage {
-    DeviceLocal, ///< Requests device local host visible buffer, falling back to device local
-                 ///< memory.
-    Upload,      ///< Requires a host visible memory type optimized for CPU to GPU uploads
-    Download,    ///< Requires a host visible memory type optimized for GPU to CPU readbacks
-    Stream,      ///< Requests device local host visible buffer, falling back host memory.
-};
+    enum class MemoryUsage {
+        DeviceLocal, ///< Requests device local host visible buffer, falling back to device local memory.
+        Upload,      ///< Requires a host visible memory type optimized for CPU to GPU uploads
+        Download,    ///< Requires a host visible memory type optimized for GPU to CPU readbacks
+        Stream,      ///< Requests device local host visible buffer, falling back host memory.
+    };
 
-template <typename F>
-void ForEachDeviceLocalHostVisibleHeap(const Device& device, F&& f) {
-    auto memory_props = device.GetPhysical().GetMemoryProperties().memoryProperties;
-    for (size_t i = 0; i < memory_props.memoryTypeCount; i++) {
-        auto& memory_type = memory_props.memoryTypes[i];
-        if ((memory_type.propertyFlags & VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT) &&
-            (memory_type.propertyFlags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT)) {
-            f(memory_type.heapIndex, memory_props.memoryHeaps[memory_type.heapIndex]);
+    template<typename F>
+    void ForEachDeviceLocalHostVisibleHeap(const Device &device, F &&f) {
+        auto memory_props = device.GetPhysical().GetMemoryProperties().memoryProperties;
+        for (size_t i = 0; i < memory_props.memoryTypeCount; i++) {
+            auto &memory_type = memory_props.memoryTypes[i];
+            if ((memory_type.propertyFlags & VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT) &&
+                (memory_type.propertyFlags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT)) {
+                f(memory_type.heapIndex, memory_props.memoryHeaps[memory_type.heapIndex]);
+            }
         }
     }
-}
 
-/// Ownership handle of a memory commitment.
-/// Points to a subregion of a memory allocation.
-class MemoryCommit {
-public:
-    explicit MemoryCommit() noexcept = default;
-    explicit MemoryCommit(MemoryAllocation* allocation_, VkDeviceMemory memory_, u64 begin_,
-                          u64 end_) noexcept;
-    ~MemoryCommit();
+/// Ownership handle of a memory commitment (real VMA allocation).
+    class MemoryCommit {
+    public:
+        MemoryCommit() noexcept = default;
 
-    MemoryCommit& operator=(MemoryCommit&&) noexcept;
-    MemoryCommit(MemoryCommit&&) noexcept;
+        MemoryCommit(VmaAllocator allocator, VmaAllocation allocation,
+                     const VmaAllocationInfo &info) noexcept;
 
-    MemoryCommit& operator=(const MemoryCommit&) = delete;
-    MemoryCommit(const MemoryCommit&) = delete;
+        ~MemoryCommit();
 
-    /// Returns a host visible memory map.
-    /// It will map the backing allocation if it hasn't been mapped before.
-    std::span<u8> Map();
+        MemoryCommit(const MemoryCommit &) = delete;
 
-    /// Returns the Vulkan memory handler.
-    VkDeviceMemory Memory() const {
-        return memory;
-    }
+        MemoryCommit &operator=(const MemoryCommit &) = delete;
 
-    /// Returns the start position of the commit relative to the allocation.
-    VkDeviceSize Offset() const {
-        return static_cast<VkDeviceSize>(begin);
-    }
+        MemoryCommit(MemoryCommit &&) noexcept;
 
-private:
-    void Release();
+        MemoryCommit &operator=(MemoryCommit &&) noexcept;
 
-    MemoryAllocation* allocation{}; ///< Pointer to the large memory allocation.
-    VkDeviceMemory memory{};        ///< Vulkan device memory handler.
-    u64 begin{};                    ///< Beginning offset in bytes to where the commit exists.
-    u64 end{};                      ///< Offset in bytes where the commit ends.
-    std::span<u8> span;             ///< Host visible memory span. Empty if not queried before.
-};
+        [[nodiscard]] std::span<u8> Map();
+
+        [[nodiscard]] std::span<const u8> Map() const;
+
+        void Unmap();
+
+        explicit operator bool() const noexcept { return allocation != nullptr; }
+
+        VkDeviceMemory Memory() const noexcept { return memory; }
+
+        VkDeviceSize Offset() const noexcept { return offset; }
+
+        VkDeviceSize Size() const noexcept { return size; }
+
+        VmaAllocation Allocation() const noexcept { return allocation; }
+
+    private:
+        void Release();
+
+        VmaAllocator allocator{};   ///< VMA allocator
+        VmaAllocation allocation{};  ///< VMA allocation handle
+        VkDeviceMemory memory{};      ///< Underlying VkDeviceMemory chosen by VMA
+        VkDeviceSize offset{};      ///< Offset of this allocation inside VkDeviceMemory
+        VkDeviceSize size{};        ///< Size of the allocation
+        void *mapped_ptr{};  ///< Optional persistent mapped pointer
+    };
 
 /// Memory allocator container.
 /// Allocates and releases memory allocations on demand.
-class MemoryAllocator {
-    friend MemoryAllocation;
+    class MemoryAllocator {
+    public:
+        /**
+         * Construct memory allocator
+         *
+         * @param device_  Device to allocate from
+         *
+         * @throw vk::Exception on failure
+         */
+        explicit MemoryAllocator(const Device &device_);
 
-public:
-    /**
-     * Construct memory allocator
-     *
-     * @param device_             Device to allocate from
-     *
-     * @throw vk::Exception on failure
-     */
-    explicit MemoryAllocator(const Device& device_);
-    ~MemoryAllocator();
+        ~MemoryAllocator();
 
-    MemoryAllocator& operator=(const MemoryAllocator&) = delete;
-    MemoryAllocator(const MemoryAllocator&) = delete;
+        MemoryAllocator &operator=(const MemoryAllocator &) = delete;
 
-    vk::Image CreateImage(const VkImageCreateInfo& ci) const;
+        MemoryAllocator(const MemoryAllocator &) = delete;
 
-    vk::Buffer CreateBuffer(const VkBufferCreateInfo& ci, MemoryUsage usage) const;
+        vk::Image CreateImage(const VkImageCreateInfo &ci) const;
 
-    /**
-     * Commits a memory with the specified requirements.
-     *
-     * @param requirements Requirements returned from a Vulkan call.
-     * @param usage        Indicates how the memory will be used.
-     *
-     * @returns A memory commit.
-     */
-    MemoryCommit Commit(const VkMemoryRequirements& requirements, MemoryUsage usage);
+        vk::Buffer CreateBuffer(const VkBufferCreateInfo &ci, MemoryUsage usage) const;
 
-    /// Commits memory required by the buffer and binds it.
-    MemoryCommit Commit(const vk::Buffer& buffer, MemoryUsage usage);
+        /**
+         * Commits a memory with the specified requirements.
+         *
+         * @param requirements Requirements returned from a Vulkan call.
+         * @param usage        Indicates how the memory will be used.
+         *
+         * @returns A memory commit.
+         */
+        MemoryCommit Commit(const VkMemoryRequirements &requirements, MemoryUsage usage);
 
-private:
-    /// Tries to allocate a chunk of memory.
-    bool TryAllocMemory(VkMemoryPropertyFlags flags, u32 type_mask, u64 size);
+        /// Commits memory required by the buffer and binds it (for buffers created outside VMA).
+        MemoryCommit Commit(const vk::Buffer &buffer, MemoryUsage usage);
 
-    /// Releases a chunk of memory.
-    void ReleaseMemory(MemoryAllocation* alloc);
+    private:
+        static bool IsAutoUsage(VmaMemoryUsage u) noexcept {
+            switch (u) {
+                case VMA_MEMORY_USAGE_AUTO:
+                case VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE:
+                case VMA_MEMORY_USAGE_AUTO_PREFER_HOST:
+                    return true;
+                default:
+                    return false;
+            }
+        }
 
-    /// Tries to allocate a memory commit.
-    std::optional<MemoryCommit> TryCommit(const VkMemoryRequirements& requirements,
-                                          VkMemoryPropertyFlags flags);
-
-    /// Returns the fastest compatible memory property flags from the wanted flags.
-    VkMemoryPropertyFlags MemoryPropertyFlags(u32 type_mask, VkMemoryPropertyFlags flags) const;
-
-    /// Returns index to the fastest memory type compatible with the passed requirements.
-    std::optional<u32> FindType(VkMemoryPropertyFlags flags, u32 type_mask) const;
-
-    const Device& device;                                       ///< Device handle.
-    VmaAllocator allocator;                                     ///< Vma allocator.
-    const VkPhysicalDeviceMemoryProperties properties;          ///< Physical device properties.
-    std::vector<std::unique_ptr<MemoryAllocation>> allocations; ///< Current allocations.
-    VkDeviceSize buffer_image_granularity; // The granularity for adjacent offsets between buffers
-                                           // and optimal images
-    u32 valid_memory_types{~0u};
-};
+        const Device &device;                              ///< Device handle.
+        VmaAllocator allocator;                           ///< VMA allocator.
+        const VkPhysicalDeviceMemoryProperties properties; ///< Physical device memory properties.
+        VkDeviceSize buffer_image_granularity;            ///< Adjacent buffer/image granularity
+        u32 valid_memory_types{~0u};
+    };
 
 } // namespace Vulkan

From 6fcfe7f4f38f7f411ce8aa0a7752e272c65dc1f3 Mon Sep 17 00:00:00 2001
From: innix <dev@innix.space>
Date: Mon, 1 Sep 2025 09:23:03 +0200
Subject: [PATCH 16/38] [macOS, compat] Allow games to boot in MacOS (#372)

This fixes the crashes on game launch caused by macOS not being handled in host_memory.cpp, and enables primitiveRestart for MoltenVK to suppress a bunch of errors in the log about MoltenVK requiring primitiveRestart. It also fixes a crash when switching kingdoms in Mario Odyssey.

EDS is forced to 0, otherwise games do not show graphics

Note: For now only dynarmic is working, so performance will be slow.
Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/372
Reviewed-by: Lizzie <lizzie@eden-emu.dev>
Reviewed-by: CamilleLaVey <camillelavey99@gmail.com>
Reviewed-by: MaranBr <maranbr@outlook.com>
Co-authored-by: innix <dev@innix.space>
Co-committed-by: innix <dev@innix.space>
---
 src/common/host_memory.cpp                    | 29 ++++++++++---
 src/common/settings.h                         |  2 +
 src/video_core/renderer_vulkan/blit_image.cpp | 41 ++++++++++++-------
 .../renderer_vulkan/present/util.cpp          |  4 +-
 .../renderer_vulkan/vk_graphics_pipeline.cpp  |  6 ++-
 .../vulkan_common/vulkan_device.cpp           | 16 +++++++-
 src/video_core/vulkan_common/vulkan_device.h  |  4 ++
 .../vulkan_common/vulkan_wrapper.cpp          |  2 +
 .../configure_graphics_extensions.cpp         |  4 ++
 9 files changed, 82 insertions(+), 26 deletions(-)

diff --git a/src/common/host_memory.cpp b/src/common/host_memory.cpp
index e70ac216cb..15a198e216 100644
--- a/src/common/host_memory.cpp
+++ b/src/common/host_memory.cpp
@@ -12,7 +12,7 @@
 #include <windows.h>
 #include "common/dynamic_library.h"
 
-#elif defined(__linux__) || defined(__FreeBSD__) || defined(__sun__) // ^^^ Windows ^^^ vvv Linux vvv
+#elif defined(__linux__) || defined(__FreeBSD__) || defined(__sun__) || defined(__APPLE__) // ^^^ Windows ^^^ vvv POSIX vvv
 
 #ifndef _GNU_SOURCE
 #define _GNU_SOURCE
@@ -20,10 +20,18 @@
 #include <boost/icl/interval_set.hpp>
 #include <fcntl.h>
 #include <sys/mman.h>
-#include <sys/random.h>
 #include <unistd.h>
 #include "common/scope_exit.h"
 
+#if defined(__linux__)
+#include <sys/random.h>
+#elif defined(__APPLE__)
+#include <sys/types.h>
+#include <sys/random.h>
+#include <mach/vm_map.h>
+#include <mach/mach.h>
+#endif
+
 // FreeBSD
 #ifndef MAP_NORESERVE
 #define MAP_NORESERVE 0
@@ -32,8 +40,12 @@
 #ifndef MAP_ALIGNED_SUPER
 #define MAP_ALIGNED_SUPER 0
 #endif
+// macOS
+#ifndef MAP_ANONYMOUS
+#define MAP_ANONYMOUS MAP_ANON
+#endif
 
-#endif // ^^^ Linux ^^^
+#endif // ^^^ POSIX ^^^
 
 #include <mutex>
 #include <random>
@@ -372,7 +384,7 @@ private:
     std::unordered_map<size_t, size_t> placeholder_host_pointers; ///< Placeholder backing offset
 };
 
-#elif defined(__linux__) || defined(__FreeBSD__) || defined(__sun__) // ^^^ Windows ^^^ vvv Linux vvv
+#elif defined(__linux__) || defined(__FreeBSD__) || defined(__sun__) || defined(__APPLE__) // ^^^ Windows ^^^ vvv POSIX vvv
 
 #ifdef ARCHITECTURE_arm64
 
@@ -489,6 +501,13 @@ public:
 #elif defined(__FreeBSD__) && __FreeBSD__ < 13
         // XXX Drop after FreeBSD 12.* reaches EOL on 2024-06-30
         fd = shm_open(SHM_ANON, O_RDWR, 0600);
+#elif defined(__APPLE__)
+        // macOS doesn't have memfd_create, use anonymous temporary file
+        char template_path[] = "/tmp/eden_mem_XXXXXX";
+        fd = mkstemp(template_path);
+        if (fd >= 0) {
+            unlink(template_path);
+        }
 #else
         fd = memfd_create("HostMemory", 0);
 #endif
@@ -645,7 +664,7 @@ private:
     FreeRegionManager free_manager{};
 };
 
-#else // ^^^ Linux ^^^ vvv Generic vvv
+#else // ^^^ POSIX ^^^ vvv Generic vvv
 
 class HostMemory::Impl {
 public:
diff --git a/src/common/settings.h b/src/common/settings.h
index 64545d10ff..b657dc8658 100644
--- a/src/common/settings.h
+++ b/src/common/settings.h
@@ -551,6 +551,8 @@ struct Values {
                                            3,
 #elif defined (ANDROID)
                                            0,
+#elif defined (__APPLE__)
+                                           0,
 #else
                                            2,
 #endif
diff --git a/src/video_core/renderer_vulkan/blit_image.cpp b/src/video_core/renderer_vulkan/blit_image.cpp
index 596323fb32..37213912e3 100644
--- a/src/video_core/renderer_vulkan/blit_image.cpp
+++ b/src/video_core/renderer_vulkan/blit_image.cpp
@@ -102,13 +102,16 @@ constexpr VkPipelineVertexInputStateCreateInfo PIPELINE_VERTEX_INPUT_STATE_CREAT
     .vertexAttributeDescriptionCount = 0,
     .pVertexAttributeDescriptions = nullptr,
 };
-constexpr VkPipelineInputAssemblyStateCreateInfo PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO{
-    .sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO,
-    .pNext = nullptr,
-    .flags = 0,
-    .topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST,
-    .primitiveRestartEnable = VK_FALSE,
-};
+
+VkPipelineInputAssemblyStateCreateInfo GetPipelineInputAssemblyStateCreateInfo(const Device& device) {
+    return VkPipelineInputAssemblyStateCreateInfo{
+        .sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO,
+        .pNext = nullptr,
+        .flags = 0,
+        .topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST,
+        .primitiveRestartEnable = device.IsMoltenVK() ? VK_TRUE : VK_FALSE,
+    };
+}
 constexpr VkPipelineViewportStateCreateInfo PIPELINE_VIEWPORT_STATE_CREATE_INFO{
     .sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO,
     .pNext = nullptr,
@@ -802,6 +805,7 @@ VkPipeline BlitImageHelper::FindOrEmplaceColorPipeline(const BlitImagePipelineKe
         .pAttachments = &blend_attachment,
         .blendConstants = {0.0f, 0.0f, 0.0f, 0.0f},
     };
+    const VkPipelineInputAssemblyStateCreateInfo input_assembly_ci = GetPipelineInputAssemblyStateCreateInfo(device);
     blit_color_pipelines.push_back(device.GetLogical().CreateGraphicsPipeline({
         .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO,
         .pNext = nullptr,
@@ -809,7 +813,7 @@ VkPipeline BlitImageHelper::FindOrEmplaceColorPipeline(const BlitImagePipelineKe
         .stageCount = static_cast<u32>(stages.size()),
         .pStages = stages.data(),
         .pVertexInputState = &PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO,
-        .pInputAssemblyState = &PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO,
+        .pInputAssemblyState = &input_assembly_ci,
         .pTessellationState = nullptr,
         .pViewportState = &PIPELINE_VIEWPORT_STATE_CREATE_INFO,
         .pRasterizationState = &PIPELINE_RASTERIZATION_STATE_CREATE_INFO,
@@ -833,6 +837,7 @@ VkPipeline BlitImageHelper::FindOrEmplaceDepthStencilPipeline(const BlitImagePip
     }
     blit_depth_stencil_keys.push_back(key);
     const std::array stages = MakeStages(*full_screen_vert, *blit_depth_stencil_frag);
+    const VkPipelineInputAssemblyStateCreateInfo input_assembly_ci = GetPipelineInputAssemblyStateCreateInfo(device);
     blit_depth_stencil_pipelines.push_back(device.GetLogical().CreateGraphicsPipeline({
         .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO,
         .pNext = nullptr,
@@ -840,7 +845,7 @@ VkPipeline BlitImageHelper::FindOrEmplaceDepthStencilPipeline(const BlitImagePip
         .stageCount = static_cast<u32>(stages.size()),
         .pStages = stages.data(),
         .pVertexInputState = &PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO,
-        .pInputAssemblyState = &PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO,
+        .pInputAssemblyState = &input_assembly_ci,
         .pTessellationState = nullptr,
         .pViewportState = &PIPELINE_VIEWPORT_STATE_CREATE_INFO,
         .pRasterizationState = &PIPELINE_RASTERIZATION_STATE_CREATE_INFO,
@@ -885,6 +890,7 @@ VkPipeline BlitImageHelper::FindOrEmplaceClearColorPipeline(const BlitImagePipel
         .pAttachments = &color_blend_attachment_state,
         .blendConstants = {0.0f, 0.0f, 0.0f, 0.0f},
     };
+    const VkPipelineInputAssemblyStateCreateInfo input_assembly_ci = GetPipelineInputAssemblyStateCreateInfo(device);
     clear_color_pipelines.push_back(device.GetLogical().CreateGraphicsPipeline({
         .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO,
         .pNext = nullptr,
@@ -892,7 +898,7 @@ VkPipeline BlitImageHelper::FindOrEmplaceClearColorPipeline(const BlitImagePipel
         .stageCount = static_cast<u32>(stages.size()),
         .pStages = stages.data(),
         .pVertexInputState = &PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO,
-        .pInputAssemblyState = &PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO,
+        .pInputAssemblyState = &input_assembly_ci,
         .pTessellationState = nullptr,
         .pViewportState = &PIPELINE_VIEWPORT_STATE_CREATE_INFO,
         .pRasterizationState = &PIPELINE_RASTERIZATION_STATE_CREATE_INFO,
@@ -940,6 +946,7 @@ VkPipeline BlitImageHelper::FindOrEmplaceClearStencilPipeline(
         .minDepthBounds = 0.0f,
         .maxDepthBounds = 0.0f,
     };
+    const VkPipelineInputAssemblyStateCreateInfo input_assembly_ci = GetPipelineInputAssemblyStateCreateInfo(device);
     clear_stencil_pipelines.push_back(device.GetLogical().CreateGraphicsPipeline({
         .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO,
         .pNext = nullptr,
@@ -947,7 +954,7 @@ VkPipeline BlitImageHelper::FindOrEmplaceClearStencilPipeline(
         .stageCount = static_cast<u32>(stages.size()),
         .pStages = stages.data(),
         .pVertexInputState = &PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO,
-        .pInputAssemblyState = &PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO,
+        .pInputAssemblyState = &input_assembly_ci,
         .pTessellationState = nullptr,
         .pViewportState = &PIPELINE_VIEWPORT_STATE_CREATE_INFO,
         .pRasterizationState = &PIPELINE_RASTERIZATION_STATE_CREATE_INFO,
@@ -970,6 +977,7 @@ void BlitImageHelper::ConvertDepthToColorPipeline(vk::Pipeline& pipeline, VkRend
     }
     VkShaderModule frag_shader = *convert_float_to_depth_frag;
     const std::array stages = MakeStages(*full_screen_vert, frag_shader);
+    const VkPipelineInputAssemblyStateCreateInfo input_assembly_ci = GetPipelineInputAssemblyStateCreateInfo(device);
     pipeline = device.GetLogical().CreateGraphicsPipeline({
         .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO,
         .pNext = nullptr,
@@ -977,7 +985,7 @@ void BlitImageHelper::ConvertDepthToColorPipeline(vk::Pipeline& pipeline, VkRend
         .stageCount = static_cast<u32>(stages.size()),
         .pStages = stages.data(),
         .pVertexInputState = &PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO,
-        .pInputAssemblyState = &PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO,
+        .pInputAssemblyState = &input_assembly_ci,
         .pTessellationState = nullptr,
         .pViewportState = &PIPELINE_VIEWPORT_STATE_CREATE_INFO,
         .pRasterizationState = &PIPELINE_RASTERIZATION_STATE_CREATE_INFO,
@@ -999,6 +1007,7 @@ void BlitImageHelper::ConvertColorToDepthPipeline(vk::Pipeline& pipeline, VkRend
     }
     VkShaderModule frag_shader = *convert_depth_to_float_frag;
     const std::array stages = MakeStages(*full_screen_vert, frag_shader);
+    const VkPipelineInputAssemblyStateCreateInfo input_assembly_ci = GetPipelineInputAssemblyStateCreateInfo(device);
     pipeline = device.GetLogical().CreateGraphicsPipeline({
         .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO,
         .pNext = nullptr,
@@ -1006,7 +1015,7 @@ void BlitImageHelper::ConvertColorToDepthPipeline(vk::Pipeline& pipeline, VkRend
         .stageCount = static_cast<u32>(stages.size()),
         .pStages = stages.data(),
         .pVertexInputState = &PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO,
-        .pInputAssemblyState = &PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO,
+        .pInputAssemblyState = &input_assembly_ci,
         .pTessellationState = nullptr,
         .pViewportState = &PIPELINE_VIEWPORT_STATE_CREATE_INFO,
         .pRasterizationState = &PIPELINE_RASTERIZATION_STATE_CREATE_INFO,
@@ -1029,6 +1038,7 @@ void BlitImageHelper::ConvertPipelineEx(vk::Pipeline& pipeline, VkRenderPass ren
         return;
     }
     const std::array stages = MakeStages(*full_screen_vert, *module);
+    const VkPipelineInputAssemblyStateCreateInfo input_assembly_ci = GetPipelineInputAssemblyStateCreateInfo(device);
     pipeline = device.GetLogical().CreateGraphicsPipeline({
         .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO,
         .pNext = nullptr,
@@ -1036,7 +1046,7 @@ void BlitImageHelper::ConvertPipelineEx(vk::Pipeline& pipeline, VkRenderPass ren
         .stageCount = static_cast<u32>(stages.size()),
         .pStages = stages.data(),
         .pVertexInputState = &PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO,
-        .pInputAssemblyState = &PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO,
+        .pInputAssemblyState = &input_assembly_ci,
         .pTessellationState = nullptr,
         .pViewportState = &PIPELINE_VIEWPORT_STATE_CREATE_INFO,
         .pRasterizationState = &PIPELINE_RASTERIZATION_STATE_CREATE_INFO,
@@ -1070,6 +1080,7 @@ void BlitImageHelper::ConvertPipeline(vk::Pipeline& pipeline, VkRenderPass rende
     VkShaderModule frag_shader =
         is_target_depth ? *convert_float_to_depth_frag : *convert_depth_to_float_frag;
     const std::array stages = MakeStages(*full_screen_vert, frag_shader);
+    const VkPipelineInputAssemblyStateCreateInfo input_assembly_ci = GetPipelineInputAssemblyStateCreateInfo(device);
     pipeline = device.GetLogical().CreateGraphicsPipeline({
         .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO,
         .pNext = nullptr,
@@ -1077,7 +1088,7 @@ void BlitImageHelper::ConvertPipeline(vk::Pipeline& pipeline, VkRenderPass rende
         .stageCount = static_cast<u32>(stages.size()),
         .pStages = stages.data(),
         .pVertexInputState = &PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO,
-        .pInputAssemblyState = &PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO,
+        .pInputAssemblyState = &input_assembly_ci,
         .pTessellationState = nullptr,
         .pViewportState = &PIPELINE_VIEWPORT_STATE_CREATE_INFO,
         .pRasterizationState = &PIPELINE_RASTERIZATION_STATE_CREATE_INFO,
diff --git a/src/video_core/renderer_vulkan/present/util.cpp b/src/video_core/renderer_vulkan/present/util.cpp
index 6874bbae99..07b6a41c5c 100644
--- a/src/video_core/renderer_vulkan/present/util.cpp
+++ b/src/video_core/renderer_vulkan/present/util.cpp
@@ -400,12 +400,12 @@ static vk::Pipeline CreateWrappedPipelineImpl(
         .pVertexAttributeDescriptions = nullptr,
     };
 
-    constexpr VkPipelineInputAssemblyStateCreateInfo input_assembly_ci{
+    const VkPipelineInputAssemblyStateCreateInfo input_assembly_ci{
         .sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO,
         .pNext = nullptr,
         .flags = 0,
         .topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP,
-        .primitiveRestartEnable = VK_FALSE,
+        .primitiveRestartEnable = device.IsMoltenVK() ? VK_TRUE : VK_FALSE,
     };
 
     constexpr VkPipelineViewportStateCreateInfo viewport_state_ci{
diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp
index 0226eb2c14..dc068c5e52 100644
--- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp
+++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp
@@ -635,14 +635,16 @@ void GraphicsPipeline::MakePipeline(VkRenderPass render_pass) {
         .flags = 0,
         .topology = input_assembly_topology,
         .primitiveRestartEnable =
-        dynamic.primitive_restart_enable != 0 &&
+        // MoltenVK/Metal always has primitive restart enabled and cannot disable it
+        device.IsMoltenVK() ? VK_TRUE :
+        (dynamic.primitive_restart_enable != 0 &&
                 ((input_assembly_topology != VK_PRIMITIVE_TOPOLOGY_PATCH_LIST &&
                   device.IsTopologyListPrimitiveRestartSupported()) ||
                  SupportsPrimitiveRestart(input_assembly_topology) ||
                  (input_assembly_topology == VK_PRIMITIVE_TOPOLOGY_PATCH_LIST &&
                   device.IsPatchListPrimitiveRestartSupported()))
             ? VK_TRUE
-            : VK_FALSE,
+            : VK_FALSE),
     };
     const VkPipelineTessellationStateCreateInfo tessellation_ci{
         .sType = VK_STRUCTURE_TYPE_PIPELINE_TESSELLATION_STATE_CREATE_INFO,
diff --git a/src/video_core/vulkan_common/vulkan_device.cpp b/src/video_core/vulkan_common/vulkan_device.cpp
index 4d74bf00a5..6fdf1e7874 100644
--- a/src/video_core/vulkan_common/vulkan_device.cpp
+++ b/src/video_core/vulkan_common/vulkan_device.cpp
@@ -725,6 +725,11 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR
         dynamic_state3_enables = true;
     }
 
+    if (is_mvk && Settings::values.dyna_state.GetValue() != 0) {
+        LOG_WARNING(Render_Vulkan, "MoltenVK detected: Forcing dynamic state to 0 to prevent black screen issues");
+        Settings::values.dyna_state.SetValue(0);
+    }
+
     if (Settings::values.dyna_state.GetValue() == 0) {
         must_emulate_scaled_formats = true;
         LOG_INFO(Render_Vulkan, "Dynamic state is disabled (dyna_state = 0), forcing scaled format emulation ON");
@@ -1096,8 +1101,15 @@ bool Device::GetSuitability(bool requires_swapchain) {
 // Some features are mandatory. Check those.
 #define CHECK_FEATURE(feature, name)                                                               \
     if (!features.feature.name) {                                                                  \
-            LOG_ERROR(Render_Vulkan, "Missing required feature {}", #name);                            \
-            suitable = false;                                                                          \
+        if (IsMoltenVK() && (strcmp(#name, "geometryShader") == 0 ||                               \
+                            strcmp(#name, "logicOp") == 0 ||                                       \
+                            strcmp(#name, "shaderCullDistance") == 0 ||                            \
+                            strcmp(#name, "wideLines") == 0)) {                                    \
+            LOG_INFO(Render_Vulkan, "MoltenVK missing feature {} - using fallback", #name);       \
+        } else {                                                                                    \
+            LOG_ERROR(Render_Vulkan, "Missing required feature {}", #name);                        \
+            suitable = false;                                                                       \
+        }                                                                                           \
     }
 
 #define LOG_FEATURE(feature, name)                                                                 \
diff --git a/src/video_core/vulkan_common/vulkan_device.h b/src/video_core/vulkan_common/vulkan_device.h
index 9b78f2e599..bd54144480 100644
--- a/src/video_core/vulkan_common/vulkan_device.h
+++ b/src/video_core/vulkan_common/vulkan_device.h
@@ -717,6 +717,10 @@ public:
         return properties.driver.driverID == VK_DRIVER_ID_NVIDIA_PROPRIETARY;
     }
 
+    bool IsMoltenVK() const noexcept {
+        return properties.driver.driverID == VK_DRIVER_ID_MOLTENVK;
+    }
+
     NvidiaArchitecture GetNvidiaArch() const noexcept {
         return nvidia_arch;
     }
diff --git a/src/video_core/vulkan_common/vulkan_wrapper.cpp b/src/video_core/vulkan_common/vulkan_wrapper.cpp
index 106630182f..949b91499d 100644
--- a/src/video_core/vulkan_common/vulkan_wrapper.cpp
+++ b/src/video_core/vulkan_common/vulkan_wrapper.cpp
@@ -580,6 +580,7 @@ DescriptorSets DescriptorPool::Allocate(const VkDescriptorSetAllocateInfo& ai) c
     case VK_SUCCESS:
         return DescriptorSets(std::move(sets), num, owner, handle, *dld);
     case VK_ERROR_OUT_OF_POOL_MEMORY:
+    case VK_ERROR_FRAGMENTED_POOL:
         return {};
     default:
         throw Exception(result);
@@ -604,6 +605,7 @@ CommandBuffers CommandPool::Allocate(std::size_t num_buffers, VkCommandBufferLev
     case VK_SUCCESS:
         return CommandBuffers(std::move(buffers), num_buffers, owner, handle, *dld);
     case VK_ERROR_OUT_OF_POOL_MEMORY:
+    case VK_ERROR_FRAGMENTED_POOL:
         return {};
     default:
         throw Exception(result);
diff --git a/src/yuzu/configuration/configure_graphics_extensions.cpp b/src/yuzu/configuration/configure_graphics_extensions.cpp
index c8dee6b073..322fa9ea08 100644
--- a/src/yuzu/configuration/configure_graphics_extensions.cpp
+++ b/src/yuzu/configuration/configure_graphics_extensions.cpp
@@ -60,6 +60,10 @@ void ConfigureGraphicsExtensions::Setup(const ConfigurationShared::Builder& buil
         if (setting->Id() == Settings::values.dyna_state.Id()) {
             widget->slider->setTickInterval(1);
             widget->slider->setTickPosition(QSlider::TicksAbove);
+#ifdef __APPLE__
+            widget->setEnabled(false);
+            widget->setToolTip(tr("Extended Dynamic State is disabled on macOS due to MoltenVK compatibility issues that cause black screens."));
+#endif
         }
     }
 

From e28b0d2590f9f0bd81d22c3410e254620e8fbd87 Mon Sep 17 00:00:00 2001
From: innix <dev@innix.space>
Date: Mon, 1 Sep 2025 14:18:30 +0200
Subject: [PATCH 17/38] [android]: Force app to use the display's max set
 refresh rate (#373)

Since Android 15, Google automatically forces "games" to run at 60 Hz. This ensures the display's max refresh rate is actually used. Tested on a Google Pixel 7 Pro with Android 16.

Co-authored-by: innix <dev@innix.space>
Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/373
Co-committed-by: innix <dev@innix.space>
---
 .../org/yuzu/yuzu_emu/ui/main/MainActivity.kt     | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/ui/main/MainActivity.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/ui/main/MainActivity.kt
index fffaa1e3ba..e8dd566f79 100644
--- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/ui/main/MainActivity.kt
+++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/ui/main/MainActivity.kt
@@ -38,6 +38,7 @@ import org.yuzu.yuzu_emu.model.DriverViewModel
 import org.yuzu.yuzu_emu.model.GamesViewModel
 import org.yuzu.yuzu_emu.model.HomeViewModel
 import org.yuzu.yuzu_emu.model.InstallResult
+import android.os.Build
 import org.yuzu.yuzu_emu.model.TaskState
 import org.yuzu.yuzu_emu.model.TaskViewModel
 import org.yuzu.yuzu_emu.utils.*
@@ -47,6 +48,7 @@ import java.io.BufferedOutputStream
 import java.util.zip.ZipEntry
 import java.util.zip.ZipInputStream
 import androidx.core.content.edit
+import kotlin.text.compareTo
 
 class MainActivity : AppCompatActivity(), ThemeProvider {
     private lateinit var binding: ActivityMainBinding
@@ -110,6 +112,19 @@ class MainActivity : AppCompatActivity(), ThemeProvider {
 
         binding = ActivityMainBinding.inflate(layoutInflater)
 
+        // Since Android 15, Google automatically forces "games" to run at 60 Hz
+        // This ensures the display's max refresh rate is actually used
+        display?.let {
+            val supportedModes = it.supportedModes
+            val maxRefreshRate = supportedModes.maxByOrNull { mode -> mode.refreshRate }
+
+            if (maxRefreshRate != null) {
+                val layoutParams = window.attributes
+                layoutParams.preferredDisplayModeId = maxRefreshRate.modeId
+                window.attributes = layoutParams
+            }
+        }
+
         setContentView(binding.root)
 
         checkAndRequestBluetoothPermissions()

From 6aa8be1da8af896a0413290625690ec9b63f9ef6 Mon Sep 17 00:00:00 2001
From: lizzie <lizzie@eden-emu.dev>
Date: Mon, 1 Sep 2025 21:14:54 +0200
Subject: [PATCH 18/38] [cmake] fix gh dependencies (#377)

Signed-off-by: lizzie <lizzie@eden-emu.dev>
Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/377
Reviewed-by: crueter <crueter@eden-emu.dev>
Co-authored-by: lizzie <lizzie@eden-emu.dev>
Co-committed-by: lizzie <lizzie@eden-emu.dev>
---
 .patch/unordered-dense/0001-cmake.patch | 22 ++++++++++++++++++++++
 externals/cpmfile.json                  |  8 ++++----
 src/dynarmic/externals/cpmfile.json     |  5 ++++-
 3 files changed, 30 insertions(+), 5 deletions(-)
 create mode 100644 .patch/unordered-dense/0001-cmake.patch

diff --git a/.patch/unordered-dense/0001-cmake.patch b/.patch/unordered-dense/0001-cmake.patch
new file mode 100644
index 0000000000..39e7794b1f
--- /dev/null
+++ b/.patch/unordered-dense/0001-cmake.patch
@@ -0,0 +1,22 @@
+From e59d30b7b12e1d04cc2fc9c6219e35bda447c17e Mon Sep 17 00:00:00 2001
+From: Lizzie <159065448+Lizzie841@users.noreply.github.com>
+Date: Fri, 16 May 2025 04:12:13 +0100
+Subject: [PATCH] Update CMakeLists.txt
+
+---
+ CMakeLists.txt | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+index b5f4c4f..c5c6f31 100644
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -24,7 +24,7 @@ target_include_directories(
+ 
+ target_compile_features(unordered_dense INTERFACE cxx_std_17)
+ 
+-if(_unordered_dense_is_toplevel_project)
++if(_unordered_dense_is_toplevel_project OR UNORDERED_DENSE_INSTALL)
+     # locations are provided by GNUInstallDirs
+     install(
+         TARGETS unordered_dense
diff --git a/externals/cpmfile.json b/externals/cpmfile.json
index effcbcc01f..bbc6b056c1 100644
--- a/externals/cpmfile.json
+++ b/externals/cpmfile.json
@@ -74,14 +74,14 @@
     },
     "xbyak_sun": {
         "package": "xbyak",
-        "repo": "Lizzie841/xbyak",
-        "sha": "51f507b0b3",
-        "hash": "4a29a3c2f97f7d5adf667a21a008be03c951fb6696b0d7ba27e7e4afa037bc76eb5e059bb84860e01baf741d4d3ac851b840cd54c99d038812fbe0f1fa6d38a4",
+        "repo": "herumi/xbyak",
+        "sha": "9bb219333a",
+        "hash": "303165d45c8c19387ec49d9fda7d7a4e0d86d4c0153898c23f25ce2d58ece567f44c0bbbfe348239b933edb6e1a1e34f4bc1c0ab3a285bee5da0e548879387b0",
         "bundled": true
     },
     "xbyak": {
         "package": "xbyak",
-        "repo": "Lizzie841/xbyak",
+        "repo": "herumi/xbyak",
         "sha": "4e44f4614d",
         "hash": "5824e92159e07fa36a774aedd3b3ef3541d0241371d522cffa4ab3e1f215fa5097b1b77865b47b2481376c704fa079875557ea463ca63d0a7fd6a8a20a589e70",
         "bundled": true
diff --git a/src/dynarmic/externals/cpmfile.json b/src/dynarmic/externals/cpmfile.json
index b934856af2..cebcdf5232 100644
--- a/src/dynarmic/externals/cpmfile.json
+++ b/src/dynarmic/externals/cpmfile.json
@@ -16,12 +16,15 @@
     },
     "unordered-dense": {
         "package": "unordered_dense",
-        "repo": "Lizzie841/unordered_dense",
+        "repo": "martinus/unordered_dense",
         "sha": "e59d30b7b1",
         "hash": "71eff7bd9ba4b9226967bacd56a8ff000946f8813167cb5664bb01e96fb79e4e220684d824fe9c59c4d1cc98c606f13aff05b7940a1ed8ab3c95d6974ee34fa0",
         "find_args": "CONFIG",
         "options": [
             "UNORDERED_DENSE_INSTALL OFF"
+        ],
+        "patches": [
+            "0001-cmake.patch"
         ]
     },
     "zycore": {

From be7a3e1e86431728b86a33ef7de0a587771d11be Mon Sep 17 00:00:00 2001
From: Bix <bix@bixed.xyz>
Date: Mon, 1 Sep 2025 22:25:26 +0200
Subject: [PATCH 19/38] [Hotfix] Update recommended driver from T21 to T22
 (#379)

Helps reduce crueter's workload.
Signed-off-by: Bix <bix@bixed.xyz>

Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/379
Reviewed-by: crueter <crueter@eden-emu.dev>
Co-authored-by: Bix <bix@bixed.xyz>
Co-committed-by: Bix <bix@bixed.xyz>
---
 .../java/org/yuzu/yuzu_emu/fragments/DriverFetcherFragment.kt   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/DriverFetcherFragment.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/DriverFetcherFragment.kt
index b8d0f2197e..dea762dc17 100644
--- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/DriverFetcherFragment.kt
+++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/DriverFetcherFragment.kt
@@ -79,7 +79,7 @@ class DriverFetcherFragment : Fragment() {
         IntRange(600, 639) to "Mr. Purple EOL-24.3.4",
         IntRange(640, 699) to "Mr. Purple T19",
         IntRange(700, 710) to "KIMCHI 25.2.0_r5",
-        IntRange(711, 799) to "Mr. Purple T21",
+        IntRange(711, 799) to "Mr. Purple T22",
         IntRange(800, 899) to "GameHub Adreno 8xx",
         IntRange(900, Int.MAX_VALUE) to "Unsupported"
     )

From 84fadd1506e3b5f8f4ad4e283bbcd8b77d8309b8 Mon Sep 17 00:00:00 2001
From: lizzie <lizzie@eden-emu.dev>
Date: Tue, 2 Sep 2025 03:25:27 +0200
Subject: [PATCH 20/38] [cmake] fix unordered-dense deps (#380)

Signed-off-by: lizzie <lizzie@eden-emu.dev>
Co-authored-by: crueter <crueter@eden-emu.dev>
Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/380
Reviewed-by: crueter <crueter@eden-emu.dev>
Co-authored-by: lizzie <lizzie@eden-emu.dev>
Co-committed-by: lizzie <lizzie@eden-emu.dev>
---
 externals/CMakeLists.txt              | 10 +---------
 externals/cpmfile.json                | 13 +++++++++++++
 src/dynarmic/externals/CMakeLists.txt | 10 +++++-----
 src/dynarmic/externals/cpmfile.json   | 13 -------------
 src/video_core/CMakeLists.txt         |  3 ++-
 5 files changed, 21 insertions(+), 28 deletions(-)

diff --git a/externals/CMakeLists.txt b/externals/CMakeLists.txt
index f66423a672..e917e4e7d8 100644
--- a/externals/CMakeLists.txt
+++ b/externals/CMakeLists.txt
@@ -96,15 +96,7 @@ if (ENABLE_WEB_SERVICE)
 endif()
 
 # unordered_dense
-AddPackage(
-    NAME unordered_dense
-    REPO "Lizzie841/unordered_dense"
-    SHA e59d30b7b1
-    HASH 71eff7bd9ba4b9226967bacd56a8ff000946f8813167cb5664bb01e96fb79e4e220684d824fe9c59c4d1cc98c606f13aff05b7940a1ed8ab3c95d6974ee34fa0
-    FIND_PACKAGE_ARGUMENTS "CONFIG"
-    OPTIONS
-        "UNORDERED_DENSE_INSTALL OFF"
-)
+AddJsonPackage(unordered-dense)
 
 # FFMpeg
 if (YUZU_USE_BUNDLED_FFMPEG)
diff --git a/externals/cpmfile.json b/externals/cpmfile.json
index bbc6b056c1..4bc4a97ca4 100644
--- a/externals/cpmfile.json
+++ b/externals/cpmfile.json
@@ -105,5 +105,18 @@
         "sha": "2bc873e53c",
         "hash": "02329058a7f9cf7d5039afaae5ab170d9f42f60f4c01e21eaf4f46073886922b057a9ae30eeac040b3ac182f51b9c1bfe9fe1050a2c9f6ce567a1a9a0ec2c768",
         "bundled": true
+    },
+    "unordered-dense": {
+        "package": "unordered_dense",
+        "repo": "martinus/unordered_dense",
+        "sha": "73f3cbb237",
+        "hash": "c08c03063938339d61392b687562909c1a92615b6ef39ec8df19ea472aa6b6478e70d7d5e33d4a27b5d23f7806daf57fe1bacb8124c8a945c918c7663a9e8532",
+        "find_args": "CONFIG",
+        "options": [
+            "UNORDERED_DENSE_INSTALL OFF"
+        ],
+        "patches": [
+            "0001-cmake.patch"
+        ]
     }
 }
diff --git a/src/dynarmic/externals/CMakeLists.txt b/src/dynarmic/externals/CMakeLists.txt
index 23cfd42236..73c97d8f06 100644
--- a/src/dynarmic/externals/CMakeLists.txt
+++ b/src/dynarmic/externals/CMakeLists.txt
@@ -60,12 +60,12 @@ AddJsonPackage(
 #     endif()
 # endif()
 
-# unordered_dense
+# unordered_dense - already in root
 
-AddJsonPackage(
-    NAME unordered-dense
-    BUNDLED_PACKAGE ${DYNARMIC_USE_BUNDLED_EXTERNALS}
-)
+# AddJsonPackage(
+#     NAME unordered-dense
+#     BUNDLED_PACKAGE ${DYNARMIC_USE_BUNDLED_EXTERNALS}
+# )
 
 # xbyak
 # uncomment if in an independent repo
diff --git a/src/dynarmic/externals/cpmfile.json b/src/dynarmic/externals/cpmfile.json
index cebcdf5232..e9406cbe81 100644
--- a/src/dynarmic/externals/cpmfile.json
+++ b/src/dynarmic/externals/cpmfile.json
@@ -14,19 +14,6 @@
             "MCL_INSTALL OFF"
         ]
     },
-    "unordered-dense": {
-        "package": "unordered_dense",
-        "repo": "martinus/unordered_dense",
-        "sha": "e59d30b7b1",
-        "hash": "71eff7bd9ba4b9226967bacd56a8ff000946f8813167cb5664bb01e96fb79e4e220684d824fe9c59c4d1cc98c606f13aff05b7940a1ed8ab3c95d6974ee34fa0",
-        "find_args": "CONFIG",
-        "options": [
-            "UNORDERED_DENSE_INSTALL OFF"
-        ],
-        "patches": [
-            "0001-cmake.patch"
-        ]
-    },
     "zycore": {
         "package": "Zycore",
         "repo": "zyantific/zycore-c",
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index 89fe7a35f9..8131d42aae 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -332,7 +332,8 @@ target_link_options(video_core PRIVATE ${FFmpeg_LDFLAGS})
 add_dependencies(video_core host_shaders)
 target_include_directories(video_core PRIVATE ${HOST_SHADERS_INCLUDE})
 
-target_link_libraries(video_core PRIVATE sirit Vulkan::Headers Vulkan::UtilityHeaders GPUOpen::VulkanMemoryAllocator)
+target_link_libraries(video_core PRIVATE sirit Vulkan::Headers Vulkan::UtilityHeaders)
+target_link_libraries(video_core PUBLIC GPUOpen::VulkanMemoryAllocator)
 
 if (ENABLE_NSIGHT_AFTERMATH)
     if (NOT DEFINED ENV{NSIGHT_AFTERMATH_SDK})

From e7560183fa39c961dd87355c7c61827c5c745bc5 Mon Sep 17 00:00:00 2001
From: xbzk <xbzk@eden-emu.dev>
Date: Wed, 3 Sep 2025 03:55:19 +0200
Subject: [PATCH 21/38] [android] minor ui tweaks + translations (#326)

CHANGES:
Fix drawer pause/unpause sync (upon leaving/returning to the app).
Add a quick controller-overlay toggle to the drawer (for players who use multiple gear styles).
Add translations for emulation_hide_overlay.
Change "Show overlay" to "Show controller" in all languages.
Add missing translations for values-de.

WHAT TO TEST:
sync of pause/resume when you leave eden (screenshot, home, alt tab, etc).
show controller toggle: if it works it works.

Co-authored-by: Allison Cunha <allisonbzk@gmail.com>
Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/326
Reviewed-by: crueter <crueter@eden-emu.dev>
Co-authored-by: xbzk <xbzk@eden-emu.dev>
Co-committed-by: xbzk <xbzk@eden-emu.dev>
---
 .../yuzu_emu/fragments/EmulationFragment.kt   | 72 +++++++++++++++----
 .../app/src/main/res/menu/menu_in_game.xml    |  5 ++
 .../app/src/main/res/values-ar/strings.xml    |  3 +-
 .../app/src/main/res/values-ckb/strings.xml   |  3 +-
 .../app/src/main/res/values-cs/strings.xml    |  3 +-
 .../app/src/main/res/values-de/strings.xml    |  7 ++
 .../app/src/main/res/values-es/strings.xml    |  3 +-
 .../app/src/main/res/values-fa/strings.xml    |  3 +-
 .../app/src/main/res/values-fr/strings.xml    |  3 +-
 .../app/src/main/res/values-he/strings.xml    |  3 +-
 .../app/src/main/res/values-hu/strings.xml    |  3 +-
 .../app/src/main/res/values-id/strings.xml    |  3 +-
 .../app/src/main/res/values-it/strings.xml    |  3 +-
 .../app/src/main/res/values-ja/strings.xml    |  3 +-
 .../app/src/main/res/values-ko/strings.xml    |  1 +
 .../app/src/main/res/values-nb/strings.xml    |  3 +-
 .../app/src/main/res/values-pl/strings.xml    |  3 +-
 .../src/main/res/values-pt-rBR/strings.xml    |  3 +-
 .../src/main/res/values-pt-rPT/strings.xml    |  3 +-
 .../app/src/main/res/values-ru/strings.xml    |  3 +-
 .../app/src/main/res/values-sr/strings.xml    |  3 +-
 .../app/src/main/res/values-uk/strings.xml    |  3 +-
 .../app/src/main/res/values-vi/strings.xml    |  3 +-
 .../src/main/res/values-zh-rCN/strings.xml    |  3 +-
 .../src/main/res/values-zh-rTW/strings.xml    |  3 +-
 .../app/src/main/res/values/strings.xml       |  3 +-
 26 files changed, 117 insertions(+), 34 deletions(-)

diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/EmulationFragment.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/EmulationFragment.kt
index 96015e58ec..5cc912fbbe 100644
--- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/EmulationFragment.kt
+++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/EmulationFragment.kt
@@ -509,6 +509,8 @@ class EmulationFragment : Fragment(), SurfaceHolder.Callback {
         gpuModel = GpuDriverHelper.getGpuModel().toString()
         fwVersion = NativeLibrary.firmwareVersion()
 
+        updateQuickOverlayMenuEntry(BooleanSetting.SHOW_INPUT_OVERLAY.getBoolean())
+
         binding.surfaceEmulation.holder.addCallback(this)
         binding.doneControlConfig.setOnClickListener { stopConfiguringControls() }
 
@@ -530,6 +532,7 @@ class EmulationFragment : Fragment(), SurfaceHolder.Callback {
                 binding.drawerLayout.setDrawerLockMode(DrawerLayout.LOCK_MODE_UNLOCKED)
                 binding.inGameMenu.requestFocus()
                 emulationViewModel.setDrawerOpen(true)
+                updateQuickOverlayMenuEntry(BooleanSetting.SHOW_INPUT_OVERLAY.getBoolean())
             }
 
             override fun onDrawerClosed(drawerView: View) {
@@ -571,25 +574,24 @@ class EmulationFragment : Fragment(), SurfaceHolder.Callback {
                 R.id.menu_pause_emulation -> {
                     if (emulationState.isPaused) {
                         emulationState.run(false)
-                        it.title = resources.getString(R.string.emulation_pause)
-                        it.icon = ResourcesCompat.getDrawable(
-                            resources,
-                            R.drawable.ic_pause,
-                            requireContext().theme
-                        )
+                        updatePauseMenuEntry(false)
                     } else {
                         emulationState.pause()
-                        it.title = resources.getString(R.string.emulation_unpause)
-                        it.icon = ResourcesCompat.getDrawable(
-                            resources,
-                            R.drawable.ic_play,
-                            requireContext().theme
-                        )
+                        updatePauseMenuEntry(true)
                     }
                     binding.inGameMenu.requestFocus()
                     true
                 }
 
+                R.id.menu_quick_overlay -> {
+                    val newState = !BooleanSetting.SHOW_INPUT_OVERLAY.getBoolean()
+                    BooleanSetting.SHOW_INPUT_OVERLAY.setBoolean(newState)
+                    updateQuickOverlayMenuEntry(newState)
+                    binding.surfaceInputOverlay.refreshControls()
+                    NativeConfig.saveGlobalConfig()
+                    true
+                }
+
                 R.id.menu_settings -> {
                     val action = HomeNavigationDirections.actionGlobalSettingsActivity(
                         null,
@@ -844,9 +846,50 @@ class EmulationFragment : Fragment(), SurfaceHolder.Callback {
         }
     }
 
+    private fun updateQuickOverlayMenuEntry(isVisible: Boolean) {
+        val menu = binding.inGameMenu.menu
+        val item = menu.findItem(R.id.menu_quick_overlay)
+        if (isVisible) {
+            item.title = getString(R.string.emulation_hide_overlay)
+            item.icon = ResourcesCompat.getDrawable(
+                resources,
+                R.drawable.ic_controller_disconnected,
+                requireContext().theme
+            )
+        } else {
+            item.title = getString(R.string.emulation_show_overlay)
+            item.icon = ResourcesCompat.getDrawable(
+                resources,
+                R.drawable.ic_controller,
+                requireContext().theme
+            )
+        }
+    }
+
+    private fun updatePauseMenuEntry(isPaused: Boolean) {
+        val menu = binding.inGameMenu.menu
+        val pauseItem = menu.findItem(R.id.menu_pause_emulation)
+        if (isPaused) {
+            pauseItem.title = getString(R.string.emulation_unpause)
+            pauseItem.icon = ResourcesCompat.getDrawable(
+                resources,
+                R.drawable.ic_play,
+                requireContext().theme
+            )
+        } else {
+            pauseItem.title = getString(R.string.emulation_pause)
+            pauseItem.icon = ResourcesCompat.getDrawable(
+                resources,
+                R.drawable.ic_pause,
+                requireContext().theme
+            )
+        }
+    }
+
     override fun onPause() {
         if (emulationState.isRunning && emulationActivity?.isInPictureInPictureMode != true) {
             emulationState.pause()
+            updatePauseMenuEntry(true)
         }
         super.onPause()
     }
@@ -869,6 +912,10 @@ class EmulationFragment : Fragment(), SurfaceHolder.Callback {
 
         val socPosition = IntSetting.SOC_OVERLAY_POSITION.getInt()
         updateSocPosition(socPosition)
+
+        binding.inGameMenu.post {
+            emulationState?.isPaused?.let { updatePauseMenuEntry(it) }
+        }
     }
 
     private fun resetInputOverlay() {
@@ -1391,6 +1438,7 @@ class EmulationFragment : Fragment(), SurfaceHolder.Callback {
                 R.id.menu_show_overlay -> {
                     it.isChecked = !it.isChecked
                     BooleanSetting.SHOW_INPUT_OVERLAY.setBoolean(it.isChecked)
+                    updateQuickOverlayMenuEntry(it.isChecked)
                     binding.surfaceInputOverlay.refreshControls()
                     true
                 }
diff --git a/src/android/app/src/main/res/menu/menu_in_game.xml b/src/android/app/src/main/res/menu/menu_in_game.xml
index ce3f11f6e6..d45699638c 100644
--- a/src/android/app/src/main/res/menu/menu_in_game.xml
+++ b/src/android/app/src/main/res/menu/menu_in_game.xml
@@ -8,6 +8,11 @@
         android:icon="@drawable/ic_pause"
         android:title="@string/emulation_pause" />
 
+    <item
+        android:id="@+id/menu_quick_overlay"
+        android:icon="@drawable/ic_controller"
+        android:title="@string/emulation_show_overlay"/>
+
     <item
         android:id="@+id/menu_settings"
         android:icon="@drawable/ic_settings"
diff --git a/src/android/app/src/main/res/values-ar/strings.xml b/src/android/app/src/main/res/values-ar/strings.xml
index 7373abdf76..ed3fc76f3b 100644
--- a/src/android/app/src/main/res/values-ar/strings.xml
+++ b/src/android/app/src/main/res/values-ar/strings.xml
@@ -733,7 +733,8 @@
     <string name="emulation_rel_stick_center">مركز العصا النسبي</string>
     <string name="emulation_dpad_slide">مزلاق الأسهم</string>
     <string name="emulation_haptics">الاهتزازات الديناميكية</string>
-    <string name="emulation_show_overlay">عرض التراكب</string>
+    <string name="emulation_show_overlay">إظهار وحدة التحكم</string>
+    <string name="emulation_hide_overlay">إخفاء وحدة التحكم</string>
     <string name="emulation_toggle_all">الكل</string>
     <string name="emulation_control_adjust">ضبط التراكب</string>
     <string name="emulation_control_scale">الحجم</string>
diff --git a/src/android/app/src/main/res/values-ckb/strings.xml b/src/android/app/src/main/res/values-ckb/strings.xml
index 221a197843..34b1ae6252 100644
--- a/src/android/app/src/main/res/values-ckb/strings.xml
+++ b/src/android/app/src/main/res/values-ckb/strings.xml
@@ -710,7 +710,8 @@
     <string name="emulation_rel_stick_center">ناوەندی گێڕ بەنزیکەیی</string>
     <string name="emulation_dpad_slide">خلیسکانی 4 دوگمەکە</string>
     <string name="emulation_haptics">لەرینەوەی پەنجەلێدان</string>
-    <string name="emulation_show_overlay">نیشاندانی داپۆشەر</string>
+    <string name="emulation_show_overlay">نیشاندانی کۆنتڕۆڵەر</string>
+    <string name="emulation_hide_overlay">شاردنەوەی کۆنتڕۆڵەر</string>
     <string name="emulation_toggle_all">گۆڕینی سەرجەم</string>
     <string name="emulation_control_adjust">ڕێکخستنی داپۆشەر</string>
     <string name="emulation_control_scale">پێوەر</string>
diff --git a/src/android/app/src/main/res/values-cs/strings.xml b/src/android/app/src/main/res/values-cs/strings.xml
index 61e389f9fc..293524271e 100644
--- a/src/android/app/src/main/res/values-cs/strings.xml
+++ b/src/android/app/src/main/res/values-cs/strings.xml
@@ -691,7 +691,8 @@
     <string name="emulation_rel_stick_center">Relativní střed joysticku</string>
     <string name="emulation_dpad_slide">D-pad slide</string>
     <string name="emulation_haptics">Haptická odezva</string>
-    <string name="emulation_show_overlay">Zobrazit překryv</string>
+    <string name="emulation_show_overlay">Zobrazit ovladač</string>
+    <string name="emulation_hide_overlay">Skrýt ovladač</string>
     <string name="emulation_toggle_all">Přepnout vše</string>
     <string name="emulation_control_adjust">Upravit překryv</string>
     <string name="emulation_control_scale">Měřítko</string>
diff --git a/src/android/app/src/main/res/values-de/strings.xml b/src/android/app/src/main/res/values-de/strings.xml
index bff0b0379e..46ae9ba7fe 100644
--- a/src/android/app/src/main/res/values-de/strings.xml
+++ b/src/android/app/src/main/res/values-de/strings.xml
@@ -762,6 +762,13 @@ Wirklich fortfahren?</string>
     <string name="emulation_exit">Emulation beenden</string>
     <string name="emulation_done">Fertig</string>
     <string name="emulation_fps_counter">FPS Zähler</string>
+    <string name="emulation_thermal_indicator">Temperaturanzeige</string>
+    <string name="emulation_toggle_controls">Steuerung umschalten</string>
+    <string name="emulation_rel_stick_center">Relatives Stick-Zentrum</string>
+    <string name="emulation_dpad_slide">D-Pad-Scrollen</string>
+    <string name="emulation_haptics">Haptisches Feedback</string>
+    <string name="emulation_show_overlay">Controller anzeigen</string>
+    <string name="emulation_hide_overlay">Controller ausblenden</string>
     <string name="emulation_toggle_all">Alle umschalten</string>
     <string name="emulation_control_adjust">Overlay anpassen</string>
     <string name="emulation_control_scale">Größe</string>
diff --git a/src/android/app/src/main/res/values-es/strings.xml b/src/android/app/src/main/res/values-es/strings.xml
index 888d6d1684..8712f455de 100644
--- a/src/android/app/src/main/res/values-es/strings.xml
+++ b/src/android/app/src/main/res/values-es/strings.xml
@@ -806,7 +806,8 @@
     <string name="emulation_rel_stick_center">Centro relativo del stick</string>
     <string name="emulation_dpad_slide">Deslizamiento de la cruceta</string>
     <string name="emulation_haptics">Toques hápticos</string>
-    <string name="emulation_show_overlay">Mostrar overlay</string>
+    <string name="emulation_show_overlay">Mostrar controlador</string>
+    <string name="emulation_hide_overlay">Ocultar controlador</string>
     <string name="emulation_toggle_all">Alternar todo</string>
     <string name="emulation_control_adjust">Ajustar overlay</string>
     <string name="emulation_control_scale">Escala</string>
diff --git a/src/android/app/src/main/res/values-fa/strings.xml b/src/android/app/src/main/res/values-fa/strings.xml
index 60b1626aa5..07ff8ff4e0 100644
--- a/src/android/app/src/main/res/values-fa/strings.xml
+++ b/src/android/app/src/main/res/values-fa/strings.xml
@@ -805,7 +805,8 @@
     <string name="emulation_rel_stick_center">مرکز نسبی استیک</string>
     <string name="emulation_dpad_slide">لغزش دکمه‌های جهتی</string>
     <string name="emulation_haptics">لرزش لمسی</string>
-    <string name="emulation_show_overlay">نشان دادن نمایش روی صفحه</string>
+    <string name="emulation_show_overlay">نمایش کنترلر</string>
+    <string name="emulation_hide_overlay">پنهان کردن کنترلر</string>
     <string name="emulation_toggle_all">تغییر همه</string>
     <string name="emulation_control_adjust">تنظیم نمایش روی صفحه</string>
     <string name="emulation_control_scale">مقیاس</string>
diff --git a/src/android/app/src/main/res/values-fr/strings.xml b/src/android/app/src/main/res/values-fr/strings.xml
index fde02d1aa8..2e06ac98e1 100644
--- a/src/android/app/src/main/res/values-fr/strings.xml
+++ b/src/android/app/src/main/res/values-fr/strings.xml
@@ -854,7 +854,8 @@
     <string name="emulation_rel_stick_center">Centre du stick relatif</string>
     <string name="emulation_dpad_slide">Glissement du D-pad</string>
     <string name="emulation_haptics">Toucher haptique</string>
-    <string name="emulation_show_overlay">Afficher l\'overlay</string>
+    <string name="emulation_show_overlay">Afficher la manette</string>
+    <string name="emulation_hide_overlay">Masquer la manette</string>
     <string name="emulation_toggle_all">Tout basculer</string>
     <string name="emulation_control_adjust">Ajuster l\'overlay</string>
     <string name="emulation_control_scale">Échelle</string>
diff --git a/src/android/app/src/main/res/values-he/strings.xml b/src/android/app/src/main/res/values-he/strings.xml
index 59312086e9..c0c835d633 100644
--- a/src/android/app/src/main/res/values-he/strings.xml
+++ b/src/android/app/src/main/res/values-he/strings.xml
@@ -739,7 +739,8 @@
     <string name="emulation_rel_stick_center">מרכז ג׳ויסטיק יחסי</string>
     <string name="emulation_dpad_slide">החלקת D-pad</string>
     <string name="emulation_haptics">רטט מגע</string>
-    <string name="emulation_show_overlay">הצג את שכבת-העל</string>
+    <string name="emulation_show_overlay">הצג בקר</string>
+    <string name="emulation_hide_overlay">הסתר בקר</string>
     <string name="emulation_toggle_all">החלף הכל</string>
     <string name="emulation_control_adjust">התאם את שכבת-העל</string>
     <string name="emulation_control_scale">קנה מידה</string>
diff --git a/src/android/app/src/main/res/values-hu/strings.xml b/src/android/app/src/main/res/values-hu/strings.xml
index f95e2d3f97..46a5ac7cce 100644
--- a/src/android/app/src/main/res/values-hu/strings.xml
+++ b/src/android/app/src/main/res/values-hu/strings.xml
@@ -843,7 +843,8 @@
     <string name="emulation_toggle_controls">Irányítás átkapcsolása</string>
     <string name="emulation_dpad_slide">D-pad csúsztatása</string>
     <string name="emulation_haptics">Érintés haptikája</string>
-    <string name="emulation_show_overlay">Átfedés mutatása</string>
+    <string name="emulation_show_overlay">Vezérlő megjelenítése</string>
+    <string name="emulation_hide_overlay">Vezérlő elrejtése</string>
     <string name="emulation_toggle_all">Összes átkapcsolása</string>
     <string name="emulation_control_adjust">Átfedés testreszabása</string>
     <string name="emulation_control_scale">Skálázás</string>
diff --git a/src/android/app/src/main/res/values-id/strings.xml b/src/android/app/src/main/res/values-id/strings.xml
index dae77d53af..cffb526ad5 100644
--- a/src/android/app/src/main/res/values-id/strings.xml
+++ b/src/android/app/src/main/res/values-id/strings.xml
@@ -798,7 +798,8 @@
     <string name="emulation_rel_stick_center">Pusat stick relatif</string>
     <string name="emulation_dpad_slide">Geser Dpad</string>
     <string name="emulation_haptics">Haptik</string>
-    <string name="emulation_show_overlay">Tampilkan Hamparan</string>
+    <string name="emulation_show_overlay">Tampilkan Kontroler</string>
+    <string name="emulation_hide_overlay">Sembunyikan Kontroler</string>
     <string name="emulation_toggle_all">Alihkan Semua</string>
     <string name="emulation_control_adjust">Menyesuaikan</string>
     <string name="emulation_control_scale">Skala</string>
diff --git a/src/android/app/src/main/res/values-it/strings.xml b/src/android/app/src/main/res/values-it/strings.xml
index dd184e9d9a..cb234cf61e 100644
--- a/src/android/app/src/main/res/values-it/strings.xml
+++ b/src/android/app/src/main/res/values-it/strings.xml
@@ -770,7 +770,8 @@
     <string name="emulation_rel_stick_center">Centro relativo degli Stick</string>
     <string name="emulation_dpad_slide">DPad A Scorrimento</string>
     <string name="emulation_haptics">Feedback Aptico</string>
-    <string name="emulation_show_overlay">Mostra l\'overlay</string>
+    <string name="emulation_show_overlay">Mostra il controller</string>
+    <string name="emulation_hide_overlay">Nascondi il controller</string>
     <string name="emulation_toggle_all">Attiva/Disattiva tutto</string>
     <string name="emulation_control_adjust">Regola l\'overlay</string>
     <string name="emulation_control_scale">Scala</string>
diff --git a/src/android/app/src/main/res/values-ja/strings.xml b/src/android/app/src/main/res/values-ja/strings.xml
index 873d433fc0..abedb1e0bc 100644
--- a/src/android/app/src/main/res/values-ja/strings.xml
+++ b/src/android/app/src/main/res/values-ja/strings.xml
@@ -729,7 +729,8 @@
     <string name="emulation_rel_stick_center">スティックを固定しない</string>
     <string name="emulation_dpad_slide">十字キーをスライド操作</string>
     <string name="emulation_haptics">タッチ振動</string>
-    <string name="emulation_show_overlay">ボタンを表示</string>
+    <string name="emulation_show_overlay">コントローラーを表示</string>
+    <string name="emulation_hide_overlay">コントローラーを非表示</string>
     <string name="emulation_toggle_all">すべて切替</string>
     <string name="emulation_control_adjust">見た目を調整</string>
     <string name="emulation_control_scale">大きさ</string>
diff --git a/src/android/app/src/main/res/values-ko/strings.xml b/src/android/app/src/main/res/values-ko/strings.xml
index 3f3a4a96c0..c6d9457744 100644
--- a/src/android/app/src/main/res/values-ko/strings.xml
+++ b/src/android/app/src/main/res/values-ko/strings.xml
@@ -798,6 +798,7 @@
     <string name="emulation_dpad_slide">십자키 슬라이드</string>
     <string name="emulation_haptics">터치 햅틱</string>
     <string name="emulation_show_overlay">컨트롤러 표시</string>
+    <string name="emulation_hide_overlay">컨트롤러 숨기기</string>
     <string name="emulation_toggle_all">모두 선택</string>
     <string name="emulation_control_adjust">컨트롤러 조정</string>
     <string name="emulation_control_scale">크기</string>
diff --git a/src/android/app/src/main/res/values-nb/strings.xml b/src/android/app/src/main/res/values-nb/strings.xml
index 1e898fca79..3cc4c6d12c 100644
--- a/src/android/app/src/main/res/values-nb/strings.xml
+++ b/src/android/app/src/main/res/values-nb/strings.xml
@@ -720,7 +720,8 @@
     <string name="emulation_rel_stick_center">Relativt pinnesenter</string>
     <string name="emulation_dpad_slide">D-pad-skyving</string>
     <string name="emulation_haptics">Berøringshaptikk</string>
-    <string name="emulation_show_overlay">Vis overlegg</string>
+    <string name="emulation_show_overlay">Vis kontroller</string>
+    <string name="emulation_hide_overlay">Skjul kontroller</string>
     <string name="emulation_toggle_all">Veksle mellom alle</string>
     <string name="emulation_control_adjust">Juster overlegg</string>
     <string name="emulation_control_scale">Skaler</string>
diff --git a/src/android/app/src/main/res/values-pl/strings.xml b/src/android/app/src/main/res/values-pl/strings.xml
index 724d7608b6..b9858838e8 100644
--- a/src/android/app/src/main/res/values-pl/strings.xml
+++ b/src/android/app/src/main/res/values-pl/strings.xml
@@ -718,7 +718,8 @@
     <string name="emulation_rel_stick_center">Wycentruj gałki</string>
     <string name="emulation_dpad_slide">Ruchomy D-pad</string>
     <string name="emulation_haptics">Wibracje haptyczne</string>
-    <string name="emulation_show_overlay">Pokaż przyciski</string>
+    <string name="emulation_show_overlay">Pokaż kontroler</string>
+    <string name="emulation_hide_overlay">Ukryj kontroler</string>
     <string name="emulation_toggle_all">Włącz wszystkie</string>
     <string name="emulation_control_adjust">Dostosuj nakładkę</string>
     <string name="emulation_control_scale">Skala</string>
diff --git a/src/android/app/src/main/res/values-pt-rBR/strings.xml b/src/android/app/src/main/res/values-pt-rBR/strings.xml
index a3fd3fe13a..1296fad889 100644
--- a/src/android/app/src/main/res/values-pt-rBR/strings.xml
+++ b/src/android/app/src/main/res/values-pt-rBR/strings.xml
@@ -855,7 +855,8 @@ uma tentativa de mapeamento automático</string>
     <string name="emulation_rel_stick_center">Centro Relativo do Analógico</string>
     <string name="emulation_dpad_slide">Deslizamento dos Botões Direcionais</string>
     <string name="emulation_haptics">Vibração ao tocar</string>
-    <string name="emulation_show_overlay">Mostrar overlay</string>
+    <string name="emulation_show_overlay">Mostrar controle</string>
+    <string name="emulation_hide_overlay">Ocultar controle</string>
     <string name="emulation_toggle_all">Marcar/Desmarcar tudo</string>
     <string name="emulation_control_adjust">Ajustar overlay</string>
     <string name="emulation_control_scale">Escala</string>
diff --git a/src/android/app/src/main/res/values-pt-rPT/strings.xml b/src/android/app/src/main/res/values-pt-rPT/strings.xml
index 7adce075cf..a166907877 100644
--- a/src/android/app/src/main/res/values-pt-rPT/strings.xml
+++ b/src/android/app/src/main/res/values-pt-rPT/strings.xml
@@ -855,7 +855,8 @@ uma tentativa de mapeamento automático</string>
     <string name="emulation_rel_stick_center">Centro Relativo de Analógico</string>
     <string name="emulation_dpad_slide">Deslizamento dos Botões Direcionais</string>
     <string name="emulation_haptics">Vibração ao tocar</string>
-    <string name="emulation_show_overlay">Mostrar overlay</string>
+    <string name="emulation_show_overlay">Mostrar comando</string>
+    <string name="emulation_hide_overlay">Ocultar comando</string>
     <string name="emulation_toggle_all">Marcar/Desmarcar tudo</string>
     <string name="emulation_control_adjust">Ajustar overlay</string>
     <string name="emulation_control_scale">Escala</string>
diff --git a/src/android/app/src/main/res/values-ru/strings.xml b/src/android/app/src/main/res/values-ru/strings.xml
index 8d02ff7b58..dc68c7b817 100644
--- a/src/android/app/src/main/res/values-ru/strings.xml
+++ b/src/android/app/src/main/res/values-ru/strings.xml
@@ -856,7 +856,8 @@
     <string name="emulation_rel_stick_center">Относительный центр стика</string>
     <string name="emulation_dpad_slide">Слайд крестовиной</string>
     <string name="emulation_haptics">Обратная связь от нажатий</string>
-    <string name="emulation_show_overlay">Показать оверлей</string>
+    <string name="emulation_show_overlay">Показать контроллер</string>
+    <string name="emulation_hide_overlay">Скрыть контроллер</string>
     <string name="emulation_toggle_all">Переключить всё</string>
     <string name="emulation_control_adjust">Регулировка оверлея</string>
     <string name="emulation_control_scale">Масштаб</string>
diff --git a/src/android/app/src/main/res/values-sr/strings.xml b/src/android/app/src/main/res/values-sr/strings.xml
index 2294033550..c547b3f761 100644
--- a/src/android/app/src/main/res/values-sr/strings.xml
+++ b/src/android/app/src/main/res/values-sr/strings.xml
@@ -812,7 +812,8 @@
     <string name="emulation_rel_stick_center">Релативни центар за штапић</string>
     <string name="emulation_dpad_slide">Д-Пад Слиде</string>
     <string name="emulation_haptics">Додирните ХАптицс</string>
-    <string name="emulation_show_overlay">Приказати прекривање</string>
+    <string name="emulation_show_overlay">Приказати контролер</string>
+    <string name="emulation_hide_overlay">Сакрити контролер</string>
     <string name="emulation_toggle_all">Пребацивати све</string>
     <string name="emulation_control_adjust">Подесити прекривање</string>
     <string name="emulation_control_scale">Скала</string>
diff --git a/src/android/app/src/main/res/values-uk/strings.xml b/src/android/app/src/main/res/values-uk/strings.xml
index ebb5493f12..b48a8a4a58 100644
--- a/src/android/app/src/main/res/values-uk/strings.xml
+++ b/src/android/app/src/main/res/values-uk/strings.xml
@@ -749,7 +749,8 @@
     <string name="emulation_rel_stick_center">Відносний центр джойстика</string>
     <string name="emulation_dpad_slide">Ковзання D-pad</string>
     <string name="emulation_haptics">Тактильний відгук</string>
-    <string name="emulation_show_overlay">Показати накладання</string>
+    <string name="emulation_show_overlay">Показати контролер</string>
+    <string name="emulation_hide_overlay">Сховати контролер</string>
     <string name="emulation_toggle_all">Перемкнути все</string>
     <string name="emulation_control_adjust">Налаштувати накладання</string>
     <string name="emulation_control_scale">Масштаб</string>
diff --git a/src/android/app/src/main/res/values-vi/strings.xml b/src/android/app/src/main/res/values-vi/strings.xml
index 102c720835..b19d437ceb 100644
--- a/src/android/app/src/main/res/values-vi/strings.xml
+++ b/src/android/app/src/main/res/values-vi/strings.xml
@@ -723,7 +723,8 @@
     <string name="emulation_rel_stick_center">Trung tâm nút cần xoay tương đối</string>
     <string name="emulation_dpad_slide">Trượt D-pad</string>
     <string name="emulation_haptics">Chạm haptics</string>
-    <string name="emulation_show_overlay">Hiện lớp phủ</string>
+    <string name="emulation_show_overlay">Hiện bộ điều khiển</string>
+    <string name="emulation_hide_overlay">Ẩn bộ điều khiển</string>
     <string name="emulation_toggle_all">Chuyển đổi tất cả</string>
     <string name="emulation_control_adjust">Điều chỉnh lớp phủ</string>
     <string name="emulation_control_scale">Tỉ lệ thu phóng</string>
diff --git a/src/android/app/src/main/res/values-zh-rCN/strings.xml b/src/android/app/src/main/res/values-zh-rCN/strings.xml
index a0dab375d0..95ab14abd0 100644
--- a/src/android/app/src/main/res/values-zh-rCN/strings.xml
+++ b/src/android/app/src/main/res/values-zh-rCN/strings.xml
@@ -848,7 +848,8 @@
     <string name="emulation_rel_stick_center">相对摇杆中心</string>
     <string name="emulation_dpad_slide">十字方向键滑动</string>
     <string name="emulation_haptics">触觉反馈</string>
-    <string name="emulation_show_overlay">显示虚拟按键</string>
+    <string name="emulation_show_overlay">显示控制器</string>
+    <string name="emulation_hide_overlay">隐藏控制器</string>
     <string name="emulation_toggle_all">全部切换</string>
     <string name="emulation_control_adjust">调整虚拟按键</string>
     <string name="emulation_control_scale">缩放</string>
diff --git a/src/android/app/src/main/res/values-zh-rTW/strings.xml b/src/android/app/src/main/res/values-zh-rTW/strings.xml
index 851483668a..8640875f2c 100644
--- a/src/android/app/src/main/res/values-zh-rTW/strings.xml
+++ b/src/android/app/src/main/res/values-zh-rTW/strings.xml
@@ -853,7 +853,8 @@
     <string name="emulation_rel_stick_center">相對搖桿中心</string>
     <string name="emulation_dpad_slide">方向鍵滑動</string>
     <string name="emulation_haptics">觸覺回饋技術</string>
-    <string name="emulation_show_overlay">顯示覆疊</string>
+    <string name="emulation_show_overlay">顯示控制器</string>
+    <string name="emulation_hide_overlay">隱藏控制器</string>
     <string name="emulation_toggle_all">全部切換</string>
     <string name="emulation_control_adjust">調整覆疊</string>
     <string name="emulation_control_scale">縮放</string>
diff --git a/src/android/app/src/main/res/values/strings.xml b/src/android/app/src/main/res/values/strings.xml
index f73fc1d9aa..7124ba41b4 100644
--- a/src/android/app/src/main/res/values/strings.xml
+++ b/src/android/app/src/main/res/values/strings.xml
@@ -835,7 +835,8 @@
     <string name="emulation_rel_stick_center">Relative stick center</string>
     <string name="emulation_dpad_slide">D-pad slide</string>
     <string name="emulation_haptics">Touch haptics</string>
-    <string name="emulation_show_overlay">Show overlay</string>
+    <string name="emulation_show_overlay">Show controller</string>
+    <string name="emulation_hide_overlay">Hide controller</string>
     <string name="emulation_toggle_all">Toggle all</string>
     <string name="emulation_control_adjust">Adjust overlay</string>
     <string name="emulation_control_scale">Scale</string>

From 2bc792e211a157c69b809adf11762a68a343f343 Mon Sep 17 00:00:00 2001
From: crueter <crueter@eden-emu.dev>
Date: Wed, 3 Sep 2025 04:36:21 +0200
Subject: [PATCH 22/38] [cmake] fix yuzu_cmd, bundled overrides (#381)

Fixes yuzu_cmd not linking against VMA (VulkanMemoryAllocator). For now
it links to VMA directly, though it should perhaps link through
video_core instead.

Also fixes the odd precedence of bundled packages, especially for JSON
packages, where an effectively garbage value was passed into the
BUNDLED_PACKAGE argument (it was forced to ON).

Signed-off-by: crueter <crueter@eden-emu.dev>

Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/381
---
 CMakeModules/CPMUtil.cmake       | 7 +++----
 src/android/app/build.gradle.kts | 1 +
 src/yuzu_cmd/CMakeLists.txt      | 1 +
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/CMakeModules/CPMUtil.cmake b/CMakeModules/CPMUtil.cmake
index 4d7db6ed61..9daada47ad 100644
--- a/CMakeModules/CPMUtil.cmake
+++ b/CMakeModules/CPMUtil.cmake
@@ -184,8 +184,6 @@ function(AddJsonPackage)
     # system/bundled
     if (bundled STREQUAL "unset" AND DEFINED JSON_BUNDLED_PACKAGE)
         set(bundled ${JSON_BUNDLED_PACKAGE})
-    else()
-        set(bundled ON)
     endif()
 
     AddPackage(
@@ -259,6 +257,7 @@ function(AddPackage)
 
         KEY
         BUNDLED_PACKAGE
+        FIND_PACKAGE_ARGUMENTS
     )
 
     set(multiValueArgs OPTIONS PATCHES)
@@ -409,9 +408,9 @@ function(AddPackage)
         set_precedence(OFF OFF)
     elseif (CPMUTIL_FORCE_SYSTEM)
         set_precedence(ON ON)
-    elseif(NOT CPMUTIL_FORCE_BUNDLED)
+    elseif(CPMUTIL_FORCE_BUNDLED)
         set_precedence(OFF OFF)
-    elseif (DEFINED PKG_ARGS_BUNDLED_PACKAGE)
+    elseif (DEFINED PKG_ARGS_BUNDLED_PACKAGE AND NOT PKG_ARGS_BUNDLED_PACKAGE STREQUAL "unset")
         if (PKG_ARGS_BUNDLED_PACKAGE)
             set(local OFF)
         else()
diff --git a/src/android/app/build.gradle.kts b/src/android/app/build.gradle.kts
index d907284bb7..e91d2e8c52 100644
--- a/src/android/app/build.gradle.kts
+++ b/src/android/app/build.gradle.kts
@@ -173,6 +173,7 @@ android {
                     "-DENABLE_OPENSSL=ON",
                     "-DANDROID_ARM_NEON=true", // cryptopp requires Neon to work
                     "-DYUZU_USE_CPM=ON",
+                    "-DCPMUTIL_FORCE_BUNDLED=ON",
                     "-DYUZU_USE_BUNDLED_FFMPEG=ON",
                     "-DYUZU_ENABLE_LTO=ON",
                     "-DCMAKE_EXPORT_COMPILE_COMMANDS=ON",
diff --git a/src/yuzu_cmd/CMakeLists.txt b/src/yuzu_cmd/CMakeLists.txt
index ebd8fd7387..a7cf6d204c 100644
--- a/src/yuzu_cmd/CMakeLists.txt
+++ b/src/yuzu_cmd/CMakeLists.txt
@@ -39,6 +39,7 @@ create_resource("../../dist/yuzu.bmp" "yuzu_cmd/yuzu_icon.h" "yuzu_icon")
 target_include_directories(yuzu-cmd PRIVATE ${RESOURCES_DIR})
 
 target_link_libraries(yuzu-cmd PRIVATE SDL2::SDL2 Vulkan::Headers)
+target_link_libraries(yuzu-cmd PRIVATE GPUOpen::VulkanMemoryAllocator)
 
 if(UNIX AND NOT APPLE)
     install(TARGETS yuzu-cmd)

From bbcd8aded6ad61b02277d34a782cbcaedbc46ee2 Mon Sep 17 00:00:00 2001
From: crueter <crueter@eden-emu.dev>
Date: Thu, 4 Sep 2025 16:04:42 +0200
Subject: [PATCH 23/38] Revert "[heap_tracker] Use ankerl map instead of rb
 tree (#249)" (#382)

This reverts commit c9a3baab5d5ba524778492027ef8961da947df2d.

The reverted commit reportedly caused issues in Ender Magnolia. The
revert itself still needs to be double-checked for mistakes.

Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/382
Reviewed-by: Shinmegumi <shinmegumi@eden-emu.dev>
Reviewed-by: Lizzie <lizzie@eden-emu.dev>
Reviewed-by: MaranBr <maranbr@outlook.com>
---
 src/common/CMakeLists.txt                 |  11 +-
 src/common/heap_tracker.cpp               | 200 ++++++++++++++++------
 src/common/heap_tracker.h                 |  82 ++++++---
 src/core/arm/dynarmic/arm_dynarmic.cpp    |  44 ++++-
 src/core/arm/dynarmic/arm_dynarmic.h      |  20 +++
 src/core/arm/dynarmic/arm_dynarmic_32.cpp |   5 +
 src/core/arm/dynarmic/arm_dynarmic_64.cpp |   5 +
 src/core/hle/kernel/k_process.cpp         |   4 +
 src/core/memory.cpp                       |  21 ++-
 src/core/memory.h                         |   5 +
 src/dynarmic/externals/CMakeLists.txt     |   2 +-
 11 files changed, 306 insertions(+), 93 deletions(-)

diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt
index 6173e29f45..1aa433db32 100644
--- a/src/common/CMakeLists.txt
+++ b/src/common/CMakeLists.txt
@@ -262,23 +262,18 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
 endif()
 
 if (BOOST_NO_HEADERS)
-  target_link_libraries(common PUBLIC Boost::algorithm Boost::icl Boost::pool)
+    target_link_libraries(common PUBLIC Boost::algorithm Boost::icl Boost::pool)
 else()
-  target_link_libraries(common PUBLIC Boost::headers)
+    target_link_libraries(common PUBLIC Boost::headers)
 endif()
 
 if (lz4_ADDED)
-  target_include_directories(common PRIVATE ${lz4_SOURCE_DIR}/lib)
+    target_include_directories(common PRIVATE ${lz4_SOURCE_DIR}/lib)
 endif()
 
 target_link_libraries(common PUBLIC fmt::fmt stb::headers Threads::Threads)
 target_link_libraries(common PRIVATE lz4::lz4 LLVM::Demangle zstd::zstd)
 
-if (TARGET unordered_dense::unordered_dense)
-  # weird quirk of system installs
-  target_link_libraries(common PUBLIC unordered_dense::unordered_dense)
-endif()
-
 if(ANDROID)
   # For ASharedMemory_create
   target_link_libraries(common PRIVATE android)
diff --git a/src/common/heap_tracker.cpp b/src/common/heap_tracker.cpp
index c147c279bd..c875683f0f 100644
--- a/src/common/heap_tracker.cpp
+++ b/src/common/heap_tracker.cpp
@@ -1,5 +1,3 @@
-// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
-// SPDX-License-Identifier: GPL-3.0-or-later
 // SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later
 
@@ -35,60 +33,68 @@ HeapTracker::~HeapTracker() = default;
 
 void HeapTracker::Map(size_t virtual_offset, size_t host_offset, size_t length,
                       MemoryPermission perm, bool is_separate_heap) {
-    bool rebuild_required = false;
     // When mapping other memory, map pages immediately.
     if (!is_separate_heap) {
         m_buffer.Map(virtual_offset, host_offset, length, perm, false);
         return;
     }
+
     {
-        // We are mapping part of a separate heap and insert into mappings.
+        // We are mapping part of a separate heap.
         std::scoped_lock lk{m_lock};
-        m_map_count++;
-        const auto it = m_mappings.insert_or_assign(virtual_offset, SeparateHeapMap{
+
+        auto* const map = new SeparateHeapMap{
+            .vaddr = virtual_offset,
             .paddr = host_offset,
             .size = length,
             .tick = m_tick++,
             .perm = perm,
             .is_resident = false,
-        });
-        // Update tick before possible rebuild.
-        it.first->second.tick = m_tick++;
-        // Check if we need to rebuild.
-        if (m_resident_map_count >= m_max_resident_map_count)
-            rebuild_required = true;
-        // Map the area.
-        m_buffer.Map(it.first->first, it.first->second.paddr, it.first->second.size, it.first->second.perm, false);
-        // This map is now resident.
-        it.first->second.is_resident = true;
-        m_resident_map_count++;
-        m_resident_mappings.insert(*it.first);
+        };
+
+        // Insert into mappings.
+        m_map_count++;
+        m_mappings.insert(*map);
     }
-    // A rebuild was required, so perform it now.
-    if (rebuild_required)
-        this->RebuildSeparateHeapAddressSpace();
+
+    // Finally, map.
+    this->DeferredMapSeparateHeap(virtual_offset);
 }
 
 void HeapTracker::Unmap(size_t virtual_offset, size_t size, bool is_separate_heap) {
     // If this is a separate heap...
     if (is_separate_heap) {
         std::scoped_lock lk{m_lock};
+
+        const SeparateHeapMap key{
+            .vaddr = virtual_offset,
+        };
+
         // Split at the boundaries of the region we are removing.
         this->SplitHeapMapLocked(virtual_offset);
         this->SplitHeapMapLocked(virtual_offset + size);
+
         // Erase all mappings in range.
-        auto it = m_mappings.find(virtual_offset);
-        while (it != m_mappings.end() && it->first < virtual_offset + size) {
+        auto it = m_mappings.find(key);
+        while (it != m_mappings.end() && it->vaddr < virtual_offset + size) {
+            // Get underlying item.
+            auto* const item = std::addressof(*it);
+
             // If resident, erase from resident map.
-            if (it->second.is_resident) {
+            if (item->is_resident) {
                 ASSERT(--m_resident_map_count >= 0);
-                m_resident_mappings.erase(m_resident_mappings.find(it->first));
+                m_resident_mappings.erase(m_resident_mappings.iterator_to(*item));
             }
+
             // Erase from map.
             ASSERT(--m_map_count >= 0);
             it = m_mappings.erase(it);
+
+            // Free the item.
+            delete item;
         }
     }
+
     // Unmap pages.
     m_buffer.Unmap(virtual_offset, size, false);
 }
@@ -110,51 +116,110 @@ void HeapTracker::Protect(size_t virtual_offset, size_t size, MemoryPermission p
 
         {
             std::scoped_lock lk2{m_lock};
+
+            const SeparateHeapMap key{
+                .vaddr = next,
+            };
+
             // Try to get the next mapping corresponding to this address.
-            const auto it = m_mappings.find(next);
+            const auto it = m_mappings.nfind(key);
+
             if (it == m_mappings.end()) {
                 // There are no separate heap mappings remaining.
                 next = end;
                 should_protect = true;
-            } else if (it->first == cur) {
+            } else if (it->vaddr == cur) {
                 // We are in range.
                 // Update permission bits.
-                it->second.perm = perm;
+                it->perm = perm;
 
                 // Determine next address and whether we should protect.
-                next = cur + it->second.size;
-                should_protect = it->second.is_resident;
+                next = cur + it->size;
+                should_protect = it->is_resident;
             } else /* if (it->vaddr > cur) */ {
                 // We weren't in range, but there is a block coming up that will be.
-                next = it->first;
+                next = it->vaddr;
                 should_protect = true;
             }
         }
 
         // Clamp to end.
         next = std::min(next, end);
+
         // Reprotect, if we need to.
-        if (should_protect)
+        if (should_protect) {
             m_buffer.Protect(cur, next - cur, perm);
+        }
+
         // Advance.
         cur = next;
     }
 }
 
+bool HeapTracker::DeferredMapSeparateHeap(u8* fault_address) {
+    if (m_buffer.IsInVirtualRange(fault_address)) {
+        return this->DeferredMapSeparateHeap(fault_address - m_buffer.VirtualBasePointer());
+    }
+
+    return false;
+}
+
+bool HeapTracker::DeferredMapSeparateHeap(size_t virtual_offset) {
+    bool rebuild_required = false;
+
+    {
+        std::scoped_lock lk{m_lock};
+
+        // Check to ensure this was a non-resident separate heap mapping.
+        const auto it = this->GetNearestHeapMapLocked(virtual_offset);
+        if (it == m_mappings.end() || it->is_resident) {
+            return false;
+        }
+
+        // Update tick before possible rebuild.
+        it->tick = m_tick++;
+
+        // Check if we need to rebuild.
+        if (m_resident_map_count > m_max_resident_map_count) {
+            rebuild_required = true;
+        }
+
+        // Map the area.
+        m_buffer.Map(it->vaddr, it->paddr, it->size, it->perm, false);
+
+        // This map is now resident.
+        it->is_resident = true;
+        m_resident_map_count++;
+        m_resident_mappings.insert(*it);
+    }
+
+    if (rebuild_required) {
+        // A rebuild was required, so perform it now.
+        this->RebuildSeparateHeapAddressSpace();
+    }
+
+    return true;
+}
+
 void HeapTracker::RebuildSeparateHeapAddressSpace() {
     std::scoped_lock lk{m_rebuild_lock, m_lock};
+
     ASSERT(!m_resident_mappings.empty());
+
     // Dump half of the mappings.
+    //
     // Despite being worse in theory, this has proven to be better in practice than more
     // regularly dumping a smaller amount, because it significantly reduces average case
     // lock contention.
-    std::size_t const desired_count = std::min(m_resident_map_count, m_max_resident_map_count) / 2;
-    std::size_t const evict_count = m_resident_map_count - desired_count;
+    const size_t desired_count = std::min(m_resident_map_count, m_max_resident_map_count) / 2;
+    const size_t evict_count = m_resident_map_count - desired_count;
     auto it = m_resident_mappings.begin();
-    for (std::size_t i = 0; i < evict_count && it != m_resident_mappings.end(); i++) {
+
+    for (size_t i = 0; i < evict_count && it != m_resident_mappings.end(); i++) {
         // Unmark and unmap.
-        it->second.is_resident = false;
-        m_buffer.Unmap(it->first, it->second.size, false);
+        it->is_resident = false;
+        m_buffer.Unmap(it->vaddr, it->size, false);
+
         // Advance.
         ASSERT(--m_resident_map_count >= 0);
         it = m_resident_mappings.erase(it);
@@ -163,32 +228,53 @@ void HeapTracker::RebuildSeparateHeapAddressSpace() {
 
 void HeapTracker::SplitHeapMap(VAddr offset, size_t size) {
     std::scoped_lock lk{m_lock};
+
     this->SplitHeapMapLocked(offset);
     this->SplitHeapMapLocked(offset + size);
 }
 
 void HeapTracker::SplitHeapMapLocked(VAddr offset) {
-    auto it = this->GetNearestHeapMapLocked(offset);
-    if (it != m_mappings.end() && it->first != offset) {
-        // Adjust left iterator
-        auto const orig_size = it->second.size;
-        auto const left_size = offset - it->first;
-        it->second.size = left_size;
-        // Insert the new right map.
-        auto const right = SeparateHeapMap{
-            .paddr = it->second.paddr + left_size,
-            .size = orig_size - left_size,
-            .tick = it->second.tick,
-            .perm = it->second.perm,
-            .is_resident = it->second.is_resident,
-        };
-        m_map_count++;
-        auto rit = m_mappings.insert_or_assign(it->first + left_size, right);
-        if (rit.first->second.is_resident) {
-            m_resident_map_count++;
-            m_resident_mappings.insert(*rit.first);
-        }
+    const auto it = this->GetNearestHeapMapLocked(offset);
+    if (it == m_mappings.end() || it->vaddr == offset) {
+        // Not contained or no split required.
+        return;
+    }
+
+    // Cache the original values.
+    auto* const left = std::addressof(*it);
+    const size_t orig_size = left->size;
+
+    // Adjust the left map.
+    const size_t left_size = offset - left->vaddr;
+    left->size = left_size;
+
+    // Create the new right map.
+    auto* const right = new SeparateHeapMap{
+        .vaddr = left->vaddr + left_size,
+        .paddr = left->paddr + left_size,
+        .size = orig_size - left_size,
+        .tick = left->tick,
+        .perm = left->perm,
+        .is_resident = left->is_resident,
+    };
+
+    // Insert the new right map.
+    m_map_count++;
+    m_mappings.insert(*right);
+
+    // If resident, also insert into resident map.
+    if (right->is_resident) {
+        m_resident_map_count++;
+        m_resident_mappings.insert(*right);
     }
 }
 
+HeapTracker::AddrTree::iterator HeapTracker::GetNearestHeapMapLocked(VAddr offset) {
+    const SeparateHeapMap key{
+        .vaddr = offset,
+    };
+
+    return m_mappings.find(key);
+}
+
 } // namespace Common
diff --git a/src/common/heap_tracker.h b/src/common/heap_tracker.h
index 14b5401c18..ee5b0bf43a 100644
--- a/src/common/heap_tracker.h
+++ b/src/common/heap_tracker.h
@@ -1,55 +1,93 @@
-// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
-// SPDX-License-Identifier: GPL-3.0-or-later
 // SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later
 
 #pragma once
 
+#include <atomic>
 #include <mutex>
+#include <set>
 #include <shared_mutex>
-#include <ankerl/unordered_dense.h>
+
 #include "common/host_memory.h"
+#include "common/intrusive_red_black_tree.h"
 
 namespace Common {
 
 struct SeparateHeapMap {
-    PAddr paddr{}; //8
-    std::size_t size{}; //8 (16)
-    std::size_t tick{}; //8 (24)
-    // 4 bits needed, sync with host_memory.h if needed
-    MemoryPermission perm : 4 = MemoryPermission::Read;
-    bool is_resident : 1 = false;
+    Common::IntrusiveRedBlackTreeNode addr_node{};
+    Common::IntrusiveRedBlackTreeNode tick_node{};
+    VAddr vaddr{};
+    PAddr paddr{};
+    size_t size{};
+    size_t tick{};
+    MemoryPermission perm{};
+    bool is_resident{};
+};
+
+struct SeparateHeapMapAddrComparator {
+    static constexpr int Compare(const SeparateHeapMap& lhs, const SeparateHeapMap& rhs) {
+        if (lhs.vaddr < rhs.vaddr) {
+            return -1;
+        } else if (lhs.vaddr <= (rhs.vaddr + rhs.size - 1)) {
+            return 0;
+        } else {
+            return 1;
+        }
+    }
+};
+
+struct SeparateHeapMapTickComparator {
+    static constexpr int Compare(const SeparateHeapMap& lhs, const SeparateHeapMap& rhs) {
+        if (lhs.tick < rhs.tick) {
+            return -1;
+        } else if (lhs.tick > rhs.tick) {
+            return 1;
+        } else {
+            return SeparateHeapMapAddrComparator::Compare(lhs, rhs);
+        }
+    }
 };
-static_assert(sizeof(SeparateHeapMap) == 32); //half a cache line! good for coherency
 
 class HeapTracker {
 public:
     explicit HeapTracker(Common::HostMemory& buffer);
     ~HeapTracker();
-    void Map(size_t virtual_offset, size_t host_offset, size_t length, MemoryPermission perm, bool is_separate_heap);
+
+    void Map(size_t virtual_offset, size_t host_offset, size_t length, MemoryPermission perm,
+             bool is_separate_heap);
     void Unmap(size_t virtual_offset, size_t size, bool is_separate_heap);
     void Protect(size_t virtual_offset, size_t length, MemoryPermission perm);
-    inline u8* VirtualBasePointer() noexcept {
+    u8* VirtualBasePointer() {
         return m_buffer.VirtualBasePointer();
     }
+
+    bool DeferredMapSeparateHeap(u8* fault_address);
+    bool DeferredMapSeparateHeap(size_t virtual_offset);
+
 private:
-    // TODO: You may want to "fake-map" the first 2GB of 64-bit address space
-    // and dedicate it entirely to a recursive PTE mapping :)
-    // However Ankerl is way better than using an RB tree, in all senses
-    using AddrTree = ankerl::unordered_dense::map<VAddr, SeparateHeapMap>;
-    AddrTree m_mappings;
-    using TicksTree = ankerl::unordered_dense::map<VAddr, SeparateHeapMap>;
-    TicksTree m_resident_mappings;
+    using AddrTreeTraits =
+        Common::IntrusiveRedBlackTreeMemberTraitsDeferredAssert<&SeparateHeapMap::addr_node>;
+    using AddrTree = AddrTreeTraits::TreeType<SeparateHeapMapAddrComparator>;
+
+    using TickTreeTraits =
+        Common::IntrusiveRedBlackTreeMemberTraitsDeferredAssert<&SeparateHeapMap::tick_node>;
+    using TickTree = TickTreeTraits::TreeType<SeparateHeapMapTickComparator>;
+
+    AddrTree m_mappings{};
+    TickTree m_resident_mappings{};
+
 private:
     void SplitHeapMap(VAddr offset, size_t size);
     void SplitHeapMapLocked(VAddr offset);
+
+    AddrTree::iterator GetNearestHeapMapLocked(VAddr offset);
+
     void RebuildSeparateHeapAddressSpace();
-    inline HeapTracker::AddrTree::iterator GetNearestHeapMapLocked(VAddr offset) noexcept {
-        return m_mappings.find(offset);
-    }
+
 private:
     Common::HostMemory& m_buffer;
     const s64 m_max_resident_map_count;
+
     std::shared_mutex m_rebuild_lock{};
     std::mutex m_lock{};
     s64 m_map_count{};
diff --git a/src/core/arm/dynarmic/arm_dynarmic.cpp b/src/core/arm/dynarmic/arm_dynarmic.cpp
index 9d26db51f7..e6e9fc45be 100644
--- a/src/core/arm/dynarmic/arm_dynarmic.cpp
+++ b/src/core/arm/dynarmic/arm_dynarmic.cpp
@@ -3,9 +3,47 @@
 
 #ifdef __linux__
 
-//#include "common/signal_chain.h"
+#include "common/signal_chain.h"
+
 #include "core/arm/dynarmic/arm_dynarmic.h"
-//#include "core/hle/kernel/k_process.h"
-//#include "core/memory.h"
+#include "core/hle/kernel/k_process.h"
+#include "core/memory.h"
+
+namespace Core {
+
+namespace {
+
+thread_local Core::Memory::Memory* g_current_memory{};
+std::once_flag g_registered{};
+struct sigaction g_old_segv {};
+
+void HandleSigSegv(int sig, siginfo_t* info, void* ctx) {
+    if (g_current_memory && g_current_memory->InvalidateSeparateHeap(info->si_addr)) {
+        return;
+    }
+
+    return g_old_segv.sa_sigaction(sig, info, ctx);
+}
+
+} // namespace
+
+ScopedJitExecution::ScopedJitExecution(Kernel::KProcess* process) {
+    g_current_memory = std::addressof(process->GetMemory());
+}
+
+ScopedJitExecution::~ScopedJitExecution() {
+    g_current_memory = nullptr;
+}
+
+void ScopedJitExecution::RegisterHandler() {
+    std::call_once(g_registered, [] {
+        struct sigaction sa {};
+        sa.sa_sigaction = &HandleSigSegv;
+        sa.sa_flags = SA_SIGINFO | SA_ONSTACK;
+        Common::SigAction(SIGSEGV, std::addressof(sa), std::addressof(g_old_segv));
+    });
+}
+
+} // namespace Core
 
 #endif
diff --git a/src/core/arm/dynarmic/arm_dynarmic.h b/src/core/arm/dynarmic/arm_dynarmic.h
index eef7c31160..53dd188151 100644
--- a/src/core/arm/dynarmic/arm_dynarmic.h
+++ b/src/core/arm/dynarmic/arm_dynarmic.h
@@ -26,4 +26,24 @@ constexpr HaltReason TranslateHaltReason(Dynarmic::HaltReason hr) {
     return static_cast<HaltReason>(hr);
 }
 
+#ifdef __linux__
+
+class ScopedJitExecution {
+public:
+    explicit ScopedJitExecution(Kernel::KProcess* process);
+    ~ScopedJitExecution();
+    static void RegisterHandler();
+};
+
+#else
+
+class ScopedJitExecution {
+public:
+    explicit ScopedJitExecution(Kernel::KProcess* process) {}
+    ~ScopedJitExecution() {}
+    static void RegisterHandler() {}
+};
+
+#endif
+
 } // namespace Core
diff --git a/src/core/arm/dynarmic/arm_dynarmic_32.cpp b/src/core/arm/dynarmic/arm_dynarmic_32.cpp
index 7123497682..2c2c54a1ad 100644
--- a/src/core/arm/dynarmic/arm_dynarmic_32.cpp
+++ b/src/core/arm/dynarmic/arm_dynarmic_32.cpp
@@ -343,11 +343,15 @@ bool ArmDynarmic32::IsInThumbMode() const {
 }
 
 HaltReason ArmDynarmic32::RunThread(Kernel::KThread* thread) {
+    ScopedJitExecution sj(thread->GetOwnerProcess());
+
     m_jit->ClearExclusiveState();
     return TranslateHaltReason(m_jit->Run());
 }
 
 HaltReason ArmDynarmic32::StepThread(Kernel::KThread* thread) {
+    ScopedJitExecution sj(thread->GetOwnerProcess());
+
     m_jit->ClearExclusiveState();
     return TranslateHaltReason(m_jit->Step());
 }
@@ -389,6 +393,7 @@ ArmDynarmic32::ArmDynarmic32(System& system, bool uses_wall_clock, Kernel::KProc
       m_cp15(std::make_shared<DynarmicCP15>(*this)), m_core_index{core_index} {
     auto& page_table_impl = process->GetPageTable().GetBasePageTable().GetImpl();
     m_jit = MakeJit(&page_table_impl);
+    ScopedJitExecution::RegisterHandler();
 }
 
 ArmDynarmic32::~ArmDynarmic32() = default;
diff --git a/src/core/arm/dynarmic/arm_dynarmic_64.cpp b/src/core/arm/dynarmic/arm_dynarmic_64.cpp
index 2745aeb862..438b7b691c 100644
--- a/src/core/arm/dynarmic/arm_dynarmic_64.cpp
+++ b/src/core/arm/dynarmic/arm_dynarmic_64.cpp
@@ -374,11 +374,15 @@ std::shared_ptr<Dynarmic::A64::Jit> ArmDynarmic64::MakeJit(Common::PageTable* pa
 }
 
 HaltReason ArmDynarmic64::RunThread(Kernel::KThread* thread) {
+    ScopedJitExecution sj(thread->GetOwnerProcess());
+
     m_jit->ClearExclusiveState();
     return TranslateHaltReason(m_jit->Run());
 }
 
 HaltReason ArmDynarmic64::StepThread(Kernel::KThread* thread) {
+    ScopedJitExecution sj(thread->GetOwnerProcess());
+
     m_jit->ClearExclusiveState();
     return TranslateHaltReason(m_jit->Step());
 }
@@ -418,6 +422,7 @@ ArmDynarmic64::ArmDynarmic64(System& system, bool uses_wall_clock, Kernel::KProc
     auto& page_table = process->GetPageTable().GetBasePageTable();
     auto& page_table_impl = page_table.GetImpl();
     m_jit = MakeJit(&page_table_impl, page_table.GetAddressSpaceWidth());
+    ScopedJitExecution::RegisterHandler();
 }
 
 ArmDynarmic64::~ArmDynarmic64() = default;
diff --git a/src/core/hle/kernel/k_process.cpp b/src/core/hle/kernel/k_process.cpp
index cf03353f84..80566b7e77 100644
--- a/src/core/hle/kernel/k_process.cpp
+++ b/src/core/hle/kernel/k_process.cpp
@@ -1266,6 +1266,10 @@ void KProcess::InitializeInterfaces() {
 
 #ifdef HAS_NCE
     if (this->IsApplication() && Settings::IsNceEnabled()) {
+        // Register the scoped JIT handler before creating any NCE instances
+        // so that its signal handler will appear first in the signal chain.
+        Core::ScopedJitExecution::RegisterHandler();
+
         for (size_t i = 0; i < Core::Hardware::NUM_CPU_CORES; i++) {
             m_arm_interfaces[i] = std::make_unique<Core::ArmNce>(m_kernel.System(), true, i);
         }
diff --git a/src/core/memory.cpp b/src/core/memory.cpp
index 08391cd815..0035c626e2 100644
--- a/src/core/memory.cpp
+++ b/src/core/memory.cpp
@@ -61,7 +61,8 @@ struct Memory::Impl {
         }
 
 #ifdef __linux__
-        buffer.emplace(system.DeviceMemory().buffer);
+        heap_tracker.emplace(system.DeviceMemory().buffer);
+        buffer = std::addressof(*heap_tracker);
 #else
         buffer = std::addressof(system.DeviceMemory().buffer);
 #endif
@@ -1023,8 +1024,9 @@ struct Memory::Impl {
     std::span<Core::GPUDirtyMemoryManager> gpu_dirty_managers;
     std::mutex sys_core_guard;
 
+    std::optional<Common::HeapTracker> heap_tracker;
 #ifdef __linux__
-    std::optional<Common::HeapTracker> buffer;
+    Common::HeapTracker* buffer{};
 #else
     Common::HostMemory* buffer{};
 #endif
@@ -1228,7 +1230,22 @@ bool Memory::InvalidateNCE(Common::ProcessAddress vaddr, size_t size) {
     if (rasterizer) {
         impl->InvalidateGPUMemory(ptr, size);
     }
+
+#ifdef __linux__
+    if (!rasterizer && mapped) {
+        impl->buffer->DeferredMapSeparateHeap(GetInteger(vaddr));
+    }
+#endif
+
     return mapped && ptr != nullptr;
 }
 
+bool Memory::InvalidateSeparateHeap(void* fault_address) {
+#ifdef __linux__
+    return impl->buffer->DeferredMapSeparateHeap(static_cast<u8*>(fault_address));
+#else
+    return false;
+#endif
+}
+
 } // namespace Core::Memory
diff --git a/src/core/memory.h b/src/core/memory.h
index 99108ecf0d..dcca26892b 100644
--- a/src/core/memory.h
+++ b/src/core/memory.h
@@ -487,8 +487,13 @@ public:
      *              marked as debug or non-debug.
      */
     void MarkRegionDebug(Common::ProcessAddress vaddr, u64 size, bool debug);
+
     void SetGPUDirtyManagers(std::span<Core::GPUDirtyMemoryManager> managers);
+
     bool InvalidateNCE(Common::ProcessAddress vaddr, size_t size);
+
+    bool InvalidateSeparateHeap(void* fault_address);
+
 private:
     Core::System& system;
 
diff --git a/src/dynarmic/externals/CMakeLists.txt b/src/dynarmic/externals/CMakeLists.txt
index 73c97d8f06..ba70797a84 100644
--- a/src/dynarmic/externals/CMakeLists.txt
+++ b/src/dynarmic/externals/CMakeLists.txt
@@ -60,7 +60,7 @@ AddJsonPackage(
 #     endif()
 # endif()
 
-# unordered_dense - already in root
+# unordered_dense
 
 # AddJsonPackage(
 #     NAME unordered-dense

From 718891d11f53a8496ce1462ce37a3c0d4083ba33 Mon Sep 17 00:00:00 2001
From: Maufeat <sahyno1996@gmail.com>
Date: Fri, 5 Sep 2025 00:04:37 +0200
Subject: [PATCH 24/38] [fs] temporarily disable nca verification (#298)

This adds a passthrough that effectively disables NCA verification for newer NCAs. This fixes the (tested) Pokemon 4.0.0 update and other newer SDK games and updates (as reported on Discord).

This is implemented as a toggle that is enabled by default; a proper implementation is needed in the future.

Co-authored-by: crueter <crueter@eden-emu.dev>
Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/298
Reviewed-by: MaranBr <maranbr@eden-emu.dev>
Reviewed-by: crueter <crueter@eden-emu.dev>
Co-authored-by: Maufeat <sahyno1996@gmail.com>
Co-committed-by: Maufeat <sahyno1996@gmail.com>
---
 src/android/app/build.gradle.kts              |   4 +-
 .../org/yuzu/yuzu_emu/adapters/GameAdapter.kt |  22 ++
 .../features/settings/model/BooleanSetting.kt |   1 +
 .../features/settings/model/Settings.kt       |   1 +
 .../settings/model/view/SettingsItem.kt       |   8 +-
 .../settings/ui/SettingsFragmentPresenter.kt  |   1 +
 .../org/yuzu/yuzu_emu/model/HomeViewModel.kt  |   7 -
 .../org/yuzu/yuzu_emu/ui/main/MainActivity.kt |  34 ---
 src/android/app/src/main/jni/native.cpp       |   2 +
 .../app/src/main/res/values-ar/strings.xml    |   2 +
 .../app/src/main/res/values-ckb/strings.xml   |   2 +
 .../app/src/main/res/values-cs/strings.xml    |   2 +
 .../app/src/main/res/values-de/strings.xml    |   2 +
 .../app/src/main/res/values-es/strings.xml    |   2 +
 .../app/src/main/res/values-fa/strings.xml    |   2 +
 .../app/src/main/res/values-fr/strings.xml    |   2 +
 .../app/src/main/res/values-he/strings.xml    |   2 +
 .../app/src/main/res/values-hu/strings.xml    |   2 +
 .../app/src/main/res/values-id/strings.xml    |   2 +
 .../app/src/main/res/values-it/strings.xml    |   2 +
 .../app/src/main/res/values-ja/strings.xml    |   2 +
 .../app/src/main/res/values-ko/strings.xml    |   2 +
 .../app/src/main/res/values-nb/strings.xml    |   2 +
 .../app/src/main/res/values-pl/strings.xml    |   2 +
 .../src/main/res/values-pt-rBR/strings.xml    |   2 +
 .../src/main/res/values-pt-rPT/strings.xml    |   2 +
 .../app/src/main/res/values-ru/strings.xml    |   2 +
 .../app/src/main/res/values-sr/strings.xml    |   2 +
 .../app/src/main/res/values-uk/strings.xml    |   2 +
 .../app/src/main/res/values-vi/strings.xml    |   2 +
 .../src/main/res/values-zh-rCN/strings.xml    |   2 +
 .../src/main/res/values-zh-rTW/strings.xml    |   2 +
 .../app/src/main/res/values/strings.xml       |   5 +
 src/audio_core/common/feature_support.h       |   2 +-
 src/common/settings.h                         |   9 +-
 src/core/CMakeLists.txt                       |   3 +
 .../fssystem/fssystem_bucket_tree.cpp         |   9 +-
 .../fssystem_hierarchical_sha256_storage.cpp  |  20 +-
 .../fssystem_hierarchical_sha3_storage.cpp    |  57 +++++
 .../fssystem_hierarchical_sha3_storage.h      |  44 ++++
 .../fssystem_nca_file_system_driver.cpp       | 221 +++++++++++++-----
 .../fssystem_nca_file_system_driver.h         |   7 +
 .../file_sys/fssystem/fssystem_nca_header.cpp |   9 +-
 .../fssystem/fssystem_passthrough_storage.h   |  32 +++
 src/core/hle/service/am/applet.cpp            |   9 +-
 src/core/hle/service/am/applet.h              |   4 +
 .../all_system_applet_proxies_service.cpp     |  10 +
 .../all_system_applet_proxies_service.h       |   1 +
 .../am/service/applet_common_functions.cpp    |   7 +
 .../am/service/applet_common_functions.h      |   1 +
 .../am/service/application_functions.cpp      |  11 +
 .../am/service/application_functions.h        |   4 +
 .../am/service/library_applet_creator.cpp     |   4 +-
 .../ns/application_manager_interface.cpp      |  16 ++
 .../ns/application_manager_interface.h        |   2 +
 .../service/pctl/parental_control_service.cpp |  14 +-
 .../service/pctl/parental_control_service.h   |   2 +
 src/yuzu/applets/qt_web_browser.cpp           |   9 +-
 src/yuzu/configuration/shared_translation.cpp |   6 +
 src/yuzu/main.cpp                             |  41 +++-
 src/yuzu/main.h                               |   3 +
 61 files changed, 559 insertions(+), 129 deletions(-)
 create mode 100644 src/core/file_sys/fssystem/fssystem_hierarchical_sha3_storage.cpp
 create mode 100644 src/core/file_sys/fssystem/fssystem_hierarchical_sha3_storage.h
 create mode 100644 src/core/file_sys/fssystem/fssystem_passthrough_storage.h

diff --git a/src/android/app/build.gradle.kts b/src/android/app/build.gradle.kts
index e91d2e8c52..3f1a7c102b 100644
--- a/src/android/app/build.gradle.kts
+++ b/src/android/app/build.gradle.kts
@@ -30,8 +30,8 @@ val autoVersion = (((System.currentTimeMillis() / 1000) - 1451606400) / 10).toIn
 android {
     namespace = "org.yuzu.yuzu_emu"
 
-    compileSdkVersion = "android-35"
-    ndkVersion = "26.1.10909125"
+    compileSdkVersion = "android-36"
+    ndkVersion = "28.2.13676358"
 
     buildFeatures {
         viewBinding = true
diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/adapters/GameAdapter.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/adapters/GameAdapter.kt
index 11b81a01a6..98f342c274 100644
--- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/adapters/GameAdapter.kt
+++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/adapters/GameAdapter.kt
@@ -36,6 +36,9 @@ import androidx.core.net.toUri
 import androidx.core.content.edit
 import com.google.android.material.dialog.MaterialAlertDialogBuilder
 import org.yuzu.yuzu_emu.NativeLibrary
+import org.yuzu.yuzu_emu.features.settings.model.BooleanSetting
+import org.yuzu.yuzu_emu.features.settings.model.Settings
+import org.yuzu.yuzu_emu.utils.NativeConfig
 
 class GameAdapter(private val activity: AppCompatActivity) :
     AbstractDiffAdapter<Game, GameAdapter.GameViewHolder>(exact = false) {
@@ -229,6 +232,8 @@ class GameAdapter(private val activity: AppCompatActivity) :
                 binding.root.findNavController().navigate(action)
             }
 
+            val preferences = PreferenceManager.getDefaultSharedPreferences(YuzuApplication.appContext)
+
             if (NativeLibrary.gameRequiresFirmware(game.programId) && !NativeLibrary.isFirmwareAvailable()) {
                 MaterialAlertDialogBuilder(activity)
                     .setTitle(R.string.loader_requires_firmware)
@@ -243,6 +248,23 @@ class GameAdapter(private val activity: AppCompatActivity) :
                     }
                     .setNegativeButton(android.R.string.cancel) { _, _ -> }
                     .show()
+            } else if (BooleanSetting.DISABLE_NCA_VERIFICATION.getBoolean(false) && !preferences.getBoolean(
+                    Settings.PREF_HIDE_NCA_POPUP, false)) {
+                MaterialAlertDialogBuilder(activity)
+                    .setTitle(R.string.nca_verification_disabled)
+                    .setMessage(activity.getString(R.string.nca_verification_disabled_description))
+                    .setPositiveButton(android.R.string.ok) { _, _ ->
+                        launch()
+                    }
+                    .setNeutralButton(R.string.dont_show_again) { _, _ ->
+                        preferences.edit {
+                            putBoolean(Settings.PREF_HIDE_NCA_POPUP, true)
+                        }
+
+                        launch()
+                    }
+                    .setNegativeButton(android.R.string.cancel) { _, _ -> }
+                    .show()
             } else {
                 launch()
             }
diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/model/BooleanSetting.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/model/BooleanSetting.kt
index 3c5b9003de..6d4bfd97ac 100644
--- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/model/BooleanSetting.kt
+++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/model/BooleanSetting.kt
@@ -35,6 +35,7 @@ enum class BooleanSetting(override val key: String) : AbstractBooleanSetting {
     RENDERER_SAMPLE_SHADING("sample_shading"),
     PICTURE_IN_PICTURE("picture_in_picture"),
     USE_CUSTOM_RTC("custom_rtc_enabled"),
+    DISABLE_NCA_VERIFICATION("disable_nca_verification"),
     BLACK_BACKGROUNDS("black_backgrounds"),
     JOYSTICK_REL_CENTER("joystick_rel_center"),
     DPAD_SLIDE("dpad_slide"),
diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/model/Settings.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/model/Settings.kt
index a52f582031..2564849ef4 100644
--- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/model/Settings.kt
+++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/model/Settings.kt
@@ -37,6 +37,7 @@ object Settings {
     const val PREF_SHOULD_SHOW_PRE_ALPHA_WARNING = "ShouldShowPreAlphaWarning"
     const val PREF_SHOULD_SHOW_EDENS_VEIL_DIALOG = "ShouldShowEdensVeilDialog"
     const val PREF_MEMORY_WARNING_SHOWN = "MemoryWarningShown"
+    const val PREF_HIDE_NCA_POPUP = "HideNCAVerificationPopup"
     const val SECTION_STATS_OVERLAY = "Stats Overlay"
 
     // Deprecated input overlay preference keys
diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/model/view/SettingsItem.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/model/view/SettingsItem.kt
index a689b6ce76..883d8efaef 100644
--- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/model/view/SettingsItem.kt
+++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/model/view/SettingsItem.kt
@@ -297,7 +297,13 @@ abstract class SettingsItem(
                     descriptionId = R.string.use_custom_rtc_description
                 )
             )
-
+            put(
+                SwitchSetting(
+                    BooleanSetting.DISABLE_NCA_VERIFICATION,
+                    titleId = R.string.disable_nca_verification,
+                    descriptionId = R.string.disable_nca_verification_description
+                )
+            )
             put(
                 StringInputSetting(
                     StringSetting.WEB_TOKEN,
diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/ui/SettingsFragmentPresenter.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/ui/SettingsFragmentPresenter.kt
index 14d62ceec3..630bcb0d74 100644
--- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/ui/SettingsFragmentPresenter.kt
+++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/ui/SettingsFragmentPresenter.kt
@@ -210,6 +210,7 @@ class SettingsFragmentPresenter(
             add(IntSetting.LANGUAGE_INDEX.key)
             add(BooleanSetting.USE_CUSTOM_RTC.key)
             add(LongSetting.CUSTOM_RTC.key)
+            add(BooleanSetting.DISABLE_NCA_VERIFICATION.key)
 
             add(HeaderSetting(R.string.network))
             add(StringSetting.WEB_TOKEN.key)
diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/model/HomeViewModel.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/model/HomeViewModel.kt
index 97a60ee184..a06abb394f 100644
--- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/model/HomeViewModel.kt
+++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/model/HomeViewModel.kt
@@ -31,9 +31,6 @@ class HomeViewModel : ViewModel() {
     private val _checkKeys = MutableStateFlow(false)
     val checkKeys = _checkKeys.asStateFlow()
 
-    private val _checkFirmware = MutableStateFlow(false)
-    val checkFirmware = _checkFirmware.asStateFlow()
-
     var navigatedToSetup = false
 
     fun setStatusBarShadeVisibility(visible: Boolean) {
@@ -66,8 +63,4 @@ class HomeViewModel : ViewModel() {
     fun setCheckKeys(value: Boolean) {
         _checkKeys.value = value
     }
-
-    fun setCheckFirmware(value: Boolean) {
-        _checkFirmware.value = value
-    }
 }
diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/ui/main/MainActivity.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/ui/main/MainActivity.kt
index e8dd566f79..cfed4d08ec 100644
--- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/ui/main/MainActivity.kt
+++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/ui/main/MainActivity.kt
@@ -142,16 +142,6 @@ class MainActivity : AppCompatActivity(), ThemeProvider {
             checkedDecryption = true
         }
 
-        if (!checkedFirmware) {
-            val firstTimeSetup = PreferenceManager.getDefaultSharedPreferences(applicationContext)
-                .getBoolean(Settings.PREF_FIRST_APP_LAUNCH, true)
-            if (!firstTimeSetup) {
-                checkFirmware()
-                showPreAlphaWarningDialog()
-            }
-            checkedFirmware = true
-        }
-
         WindowCompat.setDecorFitsSystemWindows(window, false)
         window.setSoftInputMode(WindowManager.LayoutParams.SOFT_INPUT_ADJUST_NOTHING)
 
@@ -198,13 +188,6 @@ class MainActivity : AppCompatActivity(), ThemeProvider {
             if (it) checkKeys()
         }
 
-        homeViewModel.checkFirmware.collect(
-            this,
-            resetState = { homeViewModel.setCheckFirmware(false) }
-        ) {
-            if (it) checkFirmware()
-        }
-
         setInsets()
     }
 
@@ -243,21 +226,6 @@ class MainActivity : AppCompatActivity(), ThemeProvider {
             ).show(supportFragmentManager, MessageDialogFragment.TAG)
         }
     }
-
-    private fun checkFirmware() {
-        val resultCode: Int = NativeLibrary.verifyFirmware()
-        if (resultCode == 0) return
-
-        val resultString: String =
-            resources.getStringArray(R.array.verifyFirmwareResults)[resultCode]
-
-        MessageDialogFragment.newInstance(
-            titleId = R.string.firmware_invalid,
-            descriptionString = resultString,
-            helpLinkId = R.string.firmware_missing_help
-        ).show(supportFragmentManager, MessageDialogFragment.TAG)
-    }
-
     override fun onSaveInstanceState(outState: Bundle) {
         super.onSaveInstanceState(outState)
         outState.putBoolean(CHECKED_DECRYPTION, checkedDecryption)
@@ -434,7 +402,6 @@ class MainActivity : AppCompatActivity(), ThemeProvider {
                     cacheFirmwareDir.copyRecursively(firmwarePath, true)
                     NativeLibrary.initializeSystem(true)
                     homeViewModel.setCheckKeys(true)
-                    homeViewModel.setCheckFirmware(true)
                     getString(R.string.save_file_imported_success)
                 }
             } catch (e: Exception) {
@@ -464,7 +431,6 @@ class MainActivity : AppCompatActivity(), ThemeProvider {
                     // Optionally reinitialize the system or perform other necessary steps
                     NativeLibrary.initializeSystem(true)
                     homeViewModel.setCheckKeys(true)
-                    homeViewModel.setCheckFirmware(true)
                     messageToShow = getString(R.string.firmware_uninstalled_success)
                 } else {
                     messageToShow = getString(R.string.firmware_uninstalled_failure)
diff --git a/src/android/app/src/main/jni/native.cpp b/src/android/app/src/main/jni/native.cpp
index f8f175d313..306b7e2a4c 100644
--- a/src/android/app/src/main/jni/native.cpp
+++ b/src/android/app/src/main/jni/native.cpp
@@ -596,6 +596,8 @@ jstring Java_org_yuzu_yuzu_1emu_utils_GpuDriverHelper_getGpuModel(JNIEnv *env, j
 
     const std::string model_name{device.GetModelName()};
 
+    window.release();
+
     return Common::Android::ToJString(env, model_name);
 }
 
diff --git a/src/android/app/src/main/res/values-ar/strings.xml b/src/android/app/src/main/res/values-ar/strings.xml
index ed3fc76f3b..c705ae08f1 100644
--- a/src/android/app/src/main/res/values-ar/strings.xml
+++ b/src/android/app/src/main/res/values-ar/strings.xml
@@ -498,6 +498,8 @@
     <string name="use_custom_rtc">ساعة مخصصة في الوقت الحقيقي</string>
     <string name="use_custom_rtc_description">يسمح لك بتعيين ساعة مخصصة في الوقت الفعلي منفصلة عن وقت النظام الحالي لديك</string>
     <string name="set_custom_rtc">تعيين ساعة مخصصة في الوقت الحقيقي</string>
+    <string name="disable_nca_verification">تعطيل التحقق من NCA</string>
+    <string name="disable_nca_verification_description">يعطل التحقق من سلامة أرشيفات محتوى NCA. قد يحسن هذا من سرعة التحميل لكنه يخاطر بتلف البيانات أو تمرير ملفات غير صالحة دون اكتشاف. ضروري لجعل الألعاب والتحديثات التي تتطلب نظامًا أساسيًا 20+ تعمل.</string>
 
     <!-- Network settings strings -->
     <string name="generate">توليد</string>
diff --git a/src/android/app/src/main/res/values-ckb/strings.xml b/src/android/app/src/main/res/values-ckb/strings.xml
index 34b1ae6252..af0eeeaa45 100644
--- a/src/android/app/src/main/res/values-ckb/strings.xml
+++ b/src/android/app/src/main/res/values-ckb/strings.xml
@@ -482,6 +482,8 @@
     <string name="use_custom_rtc">RTCی تایبەتمەند</string>
     <string name="use_custom_rtc_description">ڕێگەت پێدەدات کاتژمێرێکی کاتی ڕاستەقینەی تایبەتمەند دابنێیت کە جیاوازە لە کاتی ئێستای سیستەمەکەت.</string>
     <string name="set_custom_rtc">دانانی RTCی تایبەتمەند</string>
+    <string name="disable_nca_verification">ناچالاککردنی پشکنینی NCA</string>
+    <string name="disable_nca_verification_description">پشکنینی پێکهاتەی ئارشیڤەکانی ناوەڕۆکی NCA ناچالاک دەکات. ئەمە لەوانەیە خێرایی بارکردن به‌ره‌وپێش ببات، بەڵام مەترسی لەناوچوونی داتا یان ئەوەی فایلە نادروستەکان بەبێ ئەوەی دۆزرایەوە تێپەڕبن زیاتر دەکات. بۆ ئەوەی یاری و نوێکردنەوەکان کار بکەن کە پێویستی بە فریموێری 20+ هەیە زۆر پێویستە.</string>
 
     <!-- Network settings strings -->
     <string name="generate">بەرهەم هێنان</string>
diff --git a/src/android/app/src/main/res/values-cs/strings.xml b/src/android/app/src/main/res/values-cs/strings.xml
index 293524271e..8d42b8303f 100644
--- a/src/android/app/src/main/res/values-cs/strings.xml
+++ b/src/android/app/src/main/res/values-cs/strings.xml
@@ -458,6 +458,8 @@
     <string name="use_custom_rtc">Vlastní RTC</string>
     <string name="use_custom_rtc_description">Vlastní nastavení času</string>
     <string name="set_custom_rtc">Nastavit vlastní RTC</string>
+    <string name="disable_nca_verification">Zakázat ověřování NCA</string>
+    <string name="disable_nca_verification_description">Zakáže ověřování integrity archivů obsahu NCA. To může zlepšit rychlost načítání, ale hrozí poškození dat nebo neodhalení neplatných souborů. Je nutné, aby fungovaly hry a aktualizace vyžadující firmware 20+.</string>
 
     <!-- Network settings strings -->
     <string name="generate">Generovat</string>
diff --git a/src/android/app/src/main/res/values-de/strings.xml b/src/android/app/src/main/res/values-de/strings.xml
index 46ae9ba7fe..907b114388 100644
--- a/src/android/app/src/main/res/values-de/strings.xml
+++ b/src/android/app/src/main/res/values-de/strings.xml
@@ -486,6 +486,8 @@ Wird der Handheld-Modus verwendet, verringert es die Auflösung und erhöht die
     <string name="select_rtc_date">RTC-Datum auswählen</string>
     <string name="select_rtc_time">RTC-Zeit auswählen</string>
     <string name="use_custom_rtc">Benutzerdefinierte Echtzeituhr</string>
+    <string name="disable_nca_verification">NCA-Verifizierung deaktivieren</string>
+    <string name="disable_nca_verification_description">Deaktiviert die Integritätsprüfung von NCA-Inhaltsarchiven. Dies kann die Ladegeschwindigkeit verbessern, riskiert jedoch Datenbeschädigung oder dass ungültige Dateien unentdeckt bleiben. Ist notwendig, um Spiele und Updates, die Firmware 20+ benötigen, zum Laufen zu bringen.</string>
 
     <!-- Network settings strings -->
     <string name="generate">Generieren</string>
diff --git a/src/android/app/src/main/res/values-es/strings.xml b/src/android/app/src/main/res/values-es/strings.xml
index 8712f455de..12b7183cae 100644
--- a/src/android/app/src/main/res/values-es/strings.xml
+++ b/src/android/app/src/main/res/values-es/strings.xml
@@ -506,6 +506,8 @@
     <string name="use_custom_rtc">RTC personalizado</string>
     <string name="use_custom_rtc_description">Te permite tener un reloj personalizado en tiempo real diferente del tiempo del propio sistema.</string>
     <string name="set_custom_rtc">Configurar RTC personalizado</string>
+    <string name="disable_nca_verification">Desactivar verificación NCA</string>
+    <string name="disable_nca_verification_description">Desactiva la verificación de integridad de los archivos de contenido NCA. Esto puede mejorar la velocidad de carga, pero arriesga corrupción de datos o que archivos inválidos pasen desapercibidos. Es necesario para que funcionen juegos y actualizaciones que requieren firmware 20+.</string>
 
     <!-- Network settings strings -->
     <string name="generate">Generar</string>
diff --git a/src/android/app/src/main/res/values-fa/strings.xml b/src/android/app/src/main/res/values-fa/strings.xml
index 07ff8ff4e0..d7ac1b770a 100644
--- a/src/android/app/src/main/res/values-fa/strings.xml
+++ b/src/android/app/src/main/res/values-fa/strings.xml
@@ -504,6 +504,8 @@
     <string name="use_custom_rtc">زمان سفارشی</string>
     <string name="use_custom_rtc_description">به شما امکان می‌دهد یک ساعت سفارشی جدا از زمان فعلی سیستم خود تنظیم کنید.</string>
     <string name="set_custom_rtc">تنظیم زمان سفارشی</string>
+    <string name="disable_nca_verification">غیرفعال کردن تأیید اعتبار NCA</string>
+    <string name="disable_nca_verification_description">بررسی صحت آرشیوهای محتوای NCA را غیرفعال می‌کند. این ممکن است سرعت بارگذاری را بهبود بخشد اما خطر خرابی داده یا تشخیص داده نشدن فایل‌های نامعتبر را به همراه دارد. برای کار کردن بازی‌ها و به‌روزرسانی‌هایی که به فرمور ۲۰+ نیاز دارند، ضروری است.</string>
 
     <!-- Network settings strings -->
     <string name="generate">تولید</string>
diff --git a/src/android/app/src/main/res/values-fr/strings.xml b/src/android/app/src/main/res/values-fr/strings.xml
index 2e06ac98e1..62e67a6fee 100644
--- a/src/android/app/src/main/res/values-fr/strings.xml
+++ b/src/android/app/src/main/res/values-fr/strings.xml
@@ -506,6 +506,8 @@
     <string name="use_custom_rtc">RTC personnalisé</string>
     <string name="use_custom_rtc_description">Vous permet de définir une horloge en temps réel personnalisée distincte de l\'heure actuelle de votre système.</string>
     <string name="set_custom_rtc">Définir l\'horloge RTC personnalisée</string>
+    <string name="disable_nca_verification">Désactiver la vérification NCA</string>
+    <string name="disable_nca_verification_description">Désactive la vérification d\'intégrité des archives de contenu NCA. Cela peut améliorer la vitesse de chargement mais risque une corruption des données ou que des fichiers invalides ne soient pas détectés. Est nécessaire pour faire fonctionner les jeux et mises à jour nécessitant un firmware 20+.</string>
 
     <!-- Network settings strings -->
     <string name="generate">Générer</string>
diff --git a/src/android/app/src/main/res/values-he/strings.xml b/src/android/app/src/main/res/values-he/strings.xml
index c0c835d633..ec5b526ce0 100644
--- a/src/android/app/src/main/res/values-he/strings.xml
+++ b/src/android/app/src/main/res/values-he/strings.xml
@@ -505,6 +505,8 @@
     <string name="use_custom_rtc">RTC מותאם אישית</string>
     <string name="use_custom_rtc_description">מאפשר לך לקבוע שעון זמן אמת נפרד משעון המערכת שלך.</string>
     <string name="set_custom_rtc">קבע RTC מותאם אישית</string>
+    <string name="disable_nca_verification">השבת אימות NCA</string>
+    <string name="disable_nca_verification_description">משבית את אימות השלמות של ארכיוני התוכן של NCA. זה עשוי לשפר את מהירות הטעינה אך מסתכן בהשחתת נתונים או בכך שקבצים לא חוקיים יעברו ללא זיהוי. זה הכרחי כדי לגרום למשחקים ועדכונים הדורשים firmware 20+ לעבוד.</string>
 
     <!-- Network settings strings -->
     <string name="generate">יצירה</string>
diff --git a/src/android/app/src/main/res/values-hu/strings.xml b/src/android/app/src/main/res/values-hu/strings.xml
index 46a5ac7cce..b45692f147 100644
--- a/src/android/app/src/main/res/values-hu/strings.xml
+++ b/src/android/app/src/main/res/values-hu/strings.xml
@@ -501,6 +501,8 @@
     <string name="use_custom_rtc">Egyéni RTC</string>
     <string name="use_custom_rtc_description">Megadhatsz egy valós idejű órát, amely eltér a rendszer által használt órától.</string>
     <string name="set_custom_rtc">Egyéni RTC beállítása</string>
+    <string name="disable_nca_verification">NCA ellenőrzés letiltása</string>
+    <string name="disable_nca_verification_description">Letiltja az NCA tartalomarchívumok integritás-ellenőrzését. Ez javíthatja a betöltési sebességet, de az adatsérülés vagy az érvénytelen fájlok észrevétlen maradásának kockázatával jár. Elengedhetetlen a 20+ firmware-et igénylő játékok és frissítések működtetéséhez.</string>
 
     <!-- Network settings strings -->
     <string name="generate">Generálás</string>
diff --git a/src/android/app/src/main/res/values-id/strings.xml b/src/android/app/src/main/res/values-id/strings.xml
index cffb526ad5..1817fc654a 100644
--- a/src/android/app/src/main/res/values-id/strings.xml
+++ b/src/android/app/src/main/res/values-id/strings.xml
@@ -502,6 +502,8 @@
     <string name="use_custom_rtc">RTC Kustom</string>
     <string name="use_custom_rtc_description">Memungkinkan Anda untuk mengatur jam waktu nyata kustom yang terpisah dari waktu sistem saat ini Anda.</string>
     <string name="set_custom_rtc">Setel RTC Kustom</string>
+    <string name="disable_nca_verification">Nonaktifkan Verifikasi NCA</string>
+    <string name="disable_nca_verification_description">Menonaktifkan verifikasi integritas arsip konten NCA. Ini dapat meningkatkan kecepatan pemuatan tetapi berisiko kerusakan data atau file yang tidak valid tidak terdeteksi. Diperlukan untuk membuat game dan pembaruan yang membutuhkan firmware 20+ bekerja.</string>
 
     <!-- Network settings strings -->
     <string name="generate">Hasilkan</string>
diff --git a/src/android/app/src/main/res/values-it/strings.xml b/src/android/app/src/main/res/values-it/strings.xml
index cb234cf61e..ff3706dd40 100644
--- a/src/android/app/src/main/res/values-it/strings.xml
+++ b/src/android/app/src/main/res/values-it/strings.xml
@@ -505,6 +505,8 @@
     <string name="use_custom_rtc">RTC Personalizzato</string>
     <string name="use_custom_rtc_description">Ti permette di impostare un orologio in tempo reale personalizzato, completamente separato da quello di sistema.</string>
     <string name="set_custom_rtc">Imposta un orologio in tempo reale personalizzato</string>
+    <string name="disable_nca_verification">Disabilita verifica NCA</string>
+    <string name="disable_nca_verification_description">Disabilita la verifica dell\'integrità degli archivi di contenuto NCA. Può migliorare la velocità di caricamento ma rischia il danneggiamento dei dati o che file non validi passino inosservati. Necessario per far funzionare giochi e aggiornamenti che richiedono il firmware 20+.</string>
 
     <!-- Network settings strings -->
     <string name="generate">Genera</string>
diff --git a/src/android/app/src/main/res/values-ja/strings.xml b/src/android/app/src/main/res/values-ja/strings.xml
index abedb1e0bc..a6c1ebf8ab 100644
--- a/src/android/app/src/main/res/values-ja/strings.xml
+++ b/src/android/app/src/main/res/values-ja/strings.xml
@@ -491,6 +491,8 @@
     <string name="use_custom_rtc">カスタム RTC</string>
     <string name="use_custom_rtc_description">現在のシステム時間とは別に、任意のリアルタイムクロックを設定できます。</string>
     <string name="set_custom_rtc">カスタムRTCを設定</string>
+    <string name="disable_nca_verification">NCA検証を無効化</string>
+    <string name="disable_nca_verification_description">NCAコンテンツアーカイブの整合性検証を無効にします。読み込み速度が向上する可能性がありますが、データ破損や不正なファイルが検出されないリスクがあります。ファームウェア20以上が必要なゲームや更新を動作させるために必要です。</string>
 
     <!-- Network settings strings -->
     <string name="generate">生成</string>
diff --git a/src/android/app/src/main/res/values-ko/strings.xml b/src/android/app/src/main/res/values-ko/strings.xml
index c6d9457744..01e2c5f4c0 100644
--- a/src/android/app/src/main/res/values-ko/strings.xml
+++ b/src/android/app/src/main/res/values-ko/strings.xml
@@ -501,6 +501,8 @@
     <string name="use_custom_rtc">사용자 지정 RTC</string>
     <string name="use_custom_rtc_description">현재 시스템 시간과 별도로 사용자 지정 RTC를 설정할 수 있습니다.</string>
     <string name="set_custom_rtc">사용자 지정 RTC 설정</string>
+    <string name="disable_nca_verification">NCA 검증 비활성화</string>
+    <string name="disable_nca_verification_description">NCA 콘텐츠 아카이브의 무결성 검증을 비활성화합니다. 로딩 속도를 향상시킬 수 있지만 데이터 손상이나 유효하지 않은 파일이 감지되지 않을 위험이 있습니다. 펌웨어 20+가 필요한 게임 및 업데이트를 실행하려면 필요합니다.</string>
 
     <!-- Network settings strings -->
     <string name="generate">생성</string>
diff --git a/src/android/app/src/main/res/values-nb/strings.xml b/src/android/app/src/main/res/values-nb/strings.xml
index 3cc4c6d12c..90c0dbf05e 100644
--- a/src/android/app/src/main/res/values-nb/strings.xml
+++ b/src/android/app/src/main/res/values-nb/strings.xml
@@ -482,6 +482,8 @@
     <string name="use_custom_rtc">Tilpasset Sannhetstidsklokke</string>
     <string name="use_custom_rtc_description">Gjør det mulig å stille inn en egendefinert sanntidsklokke separat fra den gjeldende systemtiden.</string>
     <string name="set_custom_rtc">Angi tilpasset RTC</string>
+    <string name="disable_nca_verification">Deaktiver NCA-verifisering</string>
+    <string name="disable_nca_verification_description">Deaktiverer integritetsverifisering av NCA-innholdsarkiv. Dette kan forbedre lastehastigheten, men medfører risiko for datakorrupsjon eller at ugyldige filer ikke oppdages. Er nødvendig for å få spill og oppdateringer som trenger firmware 20+ til å fungere.</string>
 
     <!-- Network settings strings -->
     <string name="generate">Generer</string>
diff --git a/src/android/app/src/main/res/values-pl/strings.xml b/src/android/app/src/main/res/values-pl/strings.xml
index b9858838e8..080064edaf 100644
--- a/src/android/app/src/main/res/values-pl/strings.xml
+++ b/src/android/app/src/main/res/values-pl/strings.xml
@@ -482,6 +482,8 @@
     <string name="use_custom_rtc">Niestandardowy RTC</string>
     <string name="use_custom_rtc_description">Ta opcja pozwala na wybranie własnych ustawień czasu używanych w czasie emulacji, innych niż czas systemu Android.</string>
     <string name="set_custom_rtc">Ustaw niestandardowy czas RTC</string>
+    <string name="disable_nca_verification">Wyłącz weryfikację NCA</string>
+    <string name="disable_nca_verification_description">Wyłącza weryfikację integralności archiwów zawartości NCA. Może to poprawić szybkość ładowania, ale niesie ryzyko uszkodzenia danych lub niezauważenia nieprawidłowych plików. Konieczne, aby działały gry i aktualizacje wymagające firmware\'u 20+.</string>
 
     <!-- Network settings strings -->
     <string name="generate">Generuj</string>
diff --git a/src/android/app/src/main/res/values-pt-rBR/strings.xml b/src/android/app/src/main/res/values-pt-rBR/strings.xml
index 1296fad889..3dc1f83e6e 100644
--- a/src/android/app/src/main/res/values-pt-rBR/strings.xml
+++ b/src/android/app/src/main/res/values-pt-rBR/strings.xml
@@ -506,6 +506,8 @@
     <string name="use_custom_rtc">Data e hora personalizadas</string>
     <string name="use_custom_rtc_description">Permite a você configurar um relógio em tempo real separado do relógio do seu dispositivo.</string>
     <string name="set_custom_rtc">Definir um relógio em tempo real personalizado</string>
+    <string name="disable_nca_verification">Desativar verificação NCA</string>
+    <string name="disable_nca_verification_description">Desativa a verificação de integridade de arquivos de conteúdo NCA. Pode melhorar a velocidade de carregamento, mas arrisca corromper dados ou que arquivos inválidos passem despercebidos. É necessário para fazer jogos e atualizações que exigem firmware 20+ funcionarem.</string>
 
     <!-- Network settings strings -->
     <string name="generate">Gerar</string>
diff --git a/src/android/app/src/main/res/values-pt-rPT/strings.xml b/src/android/app/src/main/res/values-pt-rPT/strings.xml
index a166907877..feb4950fcb 100644
--- a/src/android/app/src/main/res/values-pt-rPT/strings.xml
+++ b/src/android/app/src/main/res/values-pt-rPT/strings.xml
@@ -506,6 +506,8 @@
     <string name="use_custom_rtc">RTC personalizado</string>
     <string name="use_custom_rtc_description">Permite a você configurar um relógio em tempo real separado do relógio do seu dispositivo.</string>
     <string name="set_custom_rtc">Defina um relógio em tempo real personalizado</string>
+    <string name="disable_nca_verification">Desativar verificação NCA</string>
+    <string name="disable_nca_verification_description">Desativa a verificação de integridade dos arquivos de conteúdo NCA. Pode melhorar a velocidade de carregamento, mas arrisca a corrupção de dados ou que ficheiros inválidos passem despercebidos. É necessário para que jogos e atualizações que necessitam de firmware 20+ funcionem.</string>
 
     <!-- Network settings strings -->
     <string name="generate">Gerar</string>
diff --git a/src/android/app/src/main/res/values-ru/strings.xml b/src/android/app/src/main/res/values-ru/strings.xml
index dc68c7b817..d56c94f10b 100644
--- a/src/android/app/src/main/res/values-ru/strings.xml
+++ b/src/android/app/src/main/res/values-ru/strings.xml
@@ -508,6 +508,8 @@
     <string name="use_custom_rtc">Пользовательский RTC</string>
     <string name="use_custom_rtc_description">Позволяет установить пользовательские часы реального времени отдельно от текущего системного времени.</string>
     <string name="set_custom_rtc">Установить пользовательский RTC</string>
+    <string name="disable_nca_verification">Отключить проверку NCA</string>
+    <string name="disable_nca_verification_description">Отключает проверку целостности архивов содержимого NCA. Может улучшить скорость загрузки, но есть риск повреждения данных или того, что недействительные файлы останутся незамеченными. Необходимо для работы игр и обновлений, требующих прошивку 20+.</string>
 
     <!-- Network settings strings -->
     <string name="generate">Создать</string>
diff --git a/src/android/app/src/main/res/values-sr/strings.xml b/src/android/app/src/main/res/values-sr/strings.xml
index c547b3f761..5b444f6187 100644
--- a/src/android/app/src/main/res/values-sr/strings.xml
+++ b/src/android/app/src/main/res/values-sr/strings.xml
@@ -457,6 +457,8 @@
     <string name="use_custom_rtc">Цустом РТЦ</string>
     <string name="use_custom_rtc_description">Омогућава вам да поставите прилагођени сат у реалном времену одвојено од тренутног времена система.</string>
     <string name="set_custom_rtc">Подесите прилагођени РТЦ</string>
+    <string name="disable_nca_verification">Искључи верификацију НЦА</string>
+    <string name="disable_nca_verification_description">Искључује верификацију интегритета НЦА архива садржаја. Ово може побољшати брзину учитавања, али ризикује оштећење података или да неважећи фајлови прођу незапажено. Неопходно је да би игре и ажурирања која захтевају firmware 20+ радили.</string>
 
     <!-- Network settings strings -->
     <string name="generate">Генериши</string>
diff --git a/src/android/app/src/main/res/values-uk/strings.xml b/src/android/app/src/main/res/values-uk/strings.xml
index b48a8a4a58..4ae61340dc 100644
--- a/src/android/app/src/main/res/values-uk/strings.xml
+++ b/src/android/app/src/main/res/values-uk/strings.xml
@@ -495,6 +495,8 @@
     <string name="use_custom_rtc">Свій RTC</string>
     <string name="use_custom_rtc_description">Дозволяє встановити власний час (Real-time clock, або RTC), відмінний від системного.</string>
     <string name="set_custom_rtc">Встановити RTC</string>
+    <string name="disable_nca_verification">Вимкнути перевірку NCA</string>
+    <string name="disable_nca_verification_description">Вимикає перевірку цілісності архівів вмісту NCA. Може покращити швидкість завантаження, але ризикує пошкодженням даних або тим, що недійсні файли залишаться непоміченими. Необхідно для роботи ігор та оновлень, які вимагають прошивки 20+.</string>
 
     <!-- Network settings strings -->
     <string name="generate">Створити</string>
diff --git a/src/android/app/src/main/res/values-vi/strings.xml b/src/android/app/src/main/res/values-vi/strings.xml
index b19d437ceb..dd64fbca55 100644
--- a/src/android/app/src/main/res/values-vi/strings.xml
+++ b/src/android/app/src/main/res/values-vi/strings.xml
@@ -482,6 +482,8 @@
     <string name="use_custom_rtc">RTC tuỳ chỉnh</string>
     <string name="use_custom_rtc_description">Cho phép bạn thiết lập một đồng hồ thời gian thực tùy chỉnh riêng biệt so với thời gian hệ thống hiện tại.</string>
     <string name="set_custom_rtc">Thiết lập RTC tùy chỉnh</string>
+    <string name="disable_nca_verification">Tắt xác minh NCA</string>
+    <string name="disable_nca_verification_description">Tắt xác minh tính toàn vẹn của kho lưu trữ nội dung NCA. Có thể cải thiện tốc độ tải nhưng có nguy cơ hỏng dữ liệu hoặc các tệp không hợp lệ không bị phát hiện. Cần thiết để các trò chơi và bản cập nhật yêu cầu firmware 20+ hoạt động.</string>
 
     <!-- Network settings strings -->
     <string name="generate">Tạo</string>
diff --git a/src/android/app/src/main/res/values-zh-rCN/strings.xml b/src/android/app/src/main/res/values-zh-rCN/strings.xml
index 95ab14abd0..a12c00063f 100644
--- a/src/android/app/src/main/res/values-zh-rCN/strings.xml
+++ b/src/android/app/src/main/res/values-zh-rCN/strings.xml
@@ -500,6 +500,8 @@
     <string name="use_custom_rtc">自定义系统时间</string>
     <string name="use_custom_rtc_description">此选项允许您设置与目前系统时间相独立的自定义系统时钟。</string>
     <string name="set_custom_rtc">设置自定义系统时间</string>
+    <string name="disable_nca_verification">禁用NCA验证</string>
+    <string name="disable_nca_verification_description">禁用NCA内容存档的完整性验证。可能会提高加载速度，但存在数据损坏或无效文件未被检测到的风险。对于需要固件20+的游戏和更新是必需的。</string>
 
     <!-- Network settings strings -->
     <string name="generate">生成</string>
diff --git a/src/android/app/src/main/res/values-zh-rTW/strings.xml b/src/android/app/src/main/res/values-zh-rTW/strings.xml
index 8640875f2c..a125553102 100644
--- a/src/android/app/src/main/res/values-zh-rTW/strings.xml
+++ b/src/android/app/src/main/res/values-zh-rTW/strings.xml
@@ -505,6 +505,8 @@
     <string name="use_custom_rtc">自訂 RTC</string>
     <string name="use_custom_rtc_description">允許您設定與您的目前系統時間相互獨立的自訂即時時鐘。</string>
     <string name="set_custom_rtc">設定自訂 RTC</string>
+    <string name="disable_nca_verification">停用NCA驗證</string>
+    <string name="disable_nca_verification_description">停用NCA內容存檔的完整性驗證。可能會提高載入速度，但存在資料損毀或無效檔案未被偵測到的風險。對於需要韌體20+的遊戲和更新是必需的。</string>
 
     <!-- Network settings strings -->
     <string name="generate">生成</string>
diff --git a/src/android/app/src/main/res/values/strings.xml b/src/android/app/src/main/res/values/strings.xml
index 7124ba41b4..2c7923d5a3 100644
--- a/src/android/app/src/main/res/values/strings.xml
+++ b/src/android/app/src/main/res/values/strings.xml
@@ -474,6 +474,8 @@
     <string name="use_custom_rtc">Custom RTC</string>
     <string name="use_custom_rtc_description">Allows you to set a custom real-time clock separate from your current system time.</string>
     <string name="set_custom_rtc">Set custom RTC</string>
+    <string name="disable_nca_verification">Disable NCA Verification</string>
+    <string name="disable_nca_verification_description">Disables integrity verification of NCA content archives. This may improve loading speed but risks data corruption or invalid files going undetected. Some games that require firmware versions 20+ may need this as well.</string>
 
     <string name="generate">Generate</string>
 
@@ -782,6 +784,9 @@
     <string name="loader_requires_firmware">Game Requires Firmware</string>
     <string name="loader_requires_firmware_description"><![CDATA[The game you are trying to launch requires firmware to boot or to get past the opening menu. Please <a href="https://yuzu-mirror.github.io/help/quickstart"> dump and install firmware</a>, or press "OK" to launch anyways.]]></string>
 
+    <string name="nca_verification_disabled">NCA Verification Disabled</string>
+    <string name="nca_verification_disabled_description">This is required to run new games and updates, but may cause instability or crashes if NCA files are corrupt, modified, or tampered with. If unsure, re-enable verification in Advanced Settings -> System, and use firmware version 19.0.1 or below.</string>
+
     <!-- Intent Launch strings -->
     <string name="searching_for_game">Searching for game...</string>
     <string name="game_not_found_for_title_id">Game not found for Title ID: %1$s</string>
diff --git a/src/audio_core/common/feature_support.h b/src/audio_core/common/feature_support.h
index eef2a844ba..cd83df3832 100644
--- a/src/audio_core/common/feature_support.h
+++ b/src/audio_core/common/feature_support.h
@@ -13,7 +13,7 @@
 #include "common/polyfill_ranges.h"
 
 namespace AudioCore {
-constexpr u32 CurrentRevision = 13;
+constexpr u32 CurrentRevision = 15;
 
 enum class SupportTags {
     CommandProcessingTimeEstimatorVersion4,
diff --git a/src/common/settings.h b/src/common/settings.h
index b657dc8658..047dfc800a 100644
--- a/src/common/settings.h
+++ b/src/common/settings.h
@@ -217,7 +217,8 @@ struct Values {
                                              true,
                                              true,
                                              &use_speed_limit};
-    SwitchableSetting<bool> sync_core_speed{linkage, false, "sync_core_speed", Category::Core, Specialization::Default};
+    SwitchableSetting<bool> sync_core_speed{linkage, false, "sync_core_speed", Category::Core,
+                                            Specialization::Default};
 
     // Memory
 #ifdef HAS_NCE
@@ -624,7 +625,11 @@ struct Values {
                                     linkage, 0,    "rng_seed",       Category::System, Specialization::Hex,
                                     true,    true, &rng_seed_enabled};
     Setting<std::string> device_name{
-                                     linkage, "Eden", "device_name", Category::System, Specialization::Default, true, true};
+        linkage, "Eden", "device_name", Category::System, Specialization::Default, true, true};
+    SwitchableSetting<bool> disable_nca_verification{linkage, true, "disable_nca_verification",
+                                                     Category::System, Specialization::Default};
+    Setting<bool> hide_nca_verification_popup{
+        linkage, false, "hide_nca_verification_popup", Category::System, Specialization::Default};
 
     Setting<s32> current_user{linkage, 0, "current_user", Category::System};
 
diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
index eab506f194..33990d61a5 100644
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -88,6 +88,8 @@ add_library(core STATIC
     file_sys/fssystem/fssystem_crypto_configuration.h
     file_sys/fssystem/fssystem_hierarchical_integrity_verification_storage.cpp
     file_sys/fssystem/fssystem_hierarchical_integrity_verification_storage.h
+    file_sys/fssystem/fssystem_hierarchical_sha3_storage.cpp
+    file_sys/fssystem/fssystem_hierarchical_sha3_storage.h
     file_sys/fssystem/fssystem_hierarchical_sha256_storage.cpp
     file_sys/fssystem/fssystem_hierarchical_sha256_storage.h
     file_sys/fssystem/fssystem_indirect_storage.cpp
@@ -102,6 +104,7 @@ add_library(core STATIC
     file_sys/fssystem/fssystem_nca_header.cpp
     file_sys/fssystem/fssystem_nca_header.h
     file_sys/fssystem/fssystem_nca_reader.cpp
+    file_sys/fssystem/fssystem_passthrough_storage.h
     file_sys/fssystem/fssystem_pooled_buffer.cpp
     file_sys/fssystem/fssystem_pooled_buffer.h
     file_sys/fssystem/fssystem_sparse_storage.cpp
diff --git a/src/core/file_sys/fssystem/fssystem_bucket_tree.cpp b/src/core/file_sys/fssystem/fssystem_bucket_tree.cpp
index af8541009e..615a624f4f 100644
--- a/src/core/file_sys/fssystem/fssystem_bucket_tree.cpp
+++ b/src/core/file_sys/fssystem/fssystem_bucket_tree.cpp
@@ -1,6 +1,10 @@
+// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
+
 // SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later
 
+#include "common/settings.h"
 #include "core/file_sys/errors.h"
 #include "core/file_sys/fssystem/fssystem_bucket_tree.h"
 #include "core/file_sys/fssystem/fssystem_bucket_tree_utils.h"
@@ -233,7 +237,10 @@ Result BucketTree::Initialize(VirtualFile node_storage, VirtualFile entry_storag
 void BucketTree::Initialize(size_t node_size, s64 end_offset) {
     ASSERT(NodeSizeMin <= node_size && node_size <= NodeSizeMax);
     ASSERT(Common::IsPowerOfTwo(node_size));
-    ASSERT(end_offset > 0);
+
+    if (!Settings::values.disable_nca_verification.GetValue()) {
+        ASSERT(end_offset > 0);
+    }
     ASSERT(!this->IsInitialized());
 
     m_node_size = node_size;
diff --git a/src/core/file_sys/fssystem/fssystem_hierarchical_sha256_storage.cpp b/src/core/file_sys/fssystem/fssystem_hierarchical_sha256_storage.cpp
index a68fd973c9..e8669a4a7d 100644
--- a/src/core/file_sys/fssystem/fssystem_hierarchical_sha256_storage.cpp
+++ b/src/core/file_sys/fssystem/fssystem_hierarchical_sha256_storage.cpp
@@ -5,23 +5,10 @@
 #include "common/scope_exit.h"
 #include "core/file_sys/fssystem/fssystem_hierarchical_sha256_storage.h"
 
+#include <cmath>
+
 namespace FileSys {
 
-namespace {
-
-s32 Log2(s32 value) {
-    ASSERT(value > 0);
-    ASSERT(Common::IsPowerOfTwo(value));
-
-    s32 log = 0;
-    while ((value >>= 1) > 0) {
-        ++log;
-    }
-    return log;
-}
-
-} // namespace
-
 Result HierarchicalSha256Storage::Initialize(VirtualFile* base_storages, s32 layer_count,
                                              size_t htbs, void* hash_buf, size_t hash_buf_size) {
     // Validate preconditions.
@@ -31,7 +18,8 @@ Result HierarchicalSha256Storage::Initialize(VirtualFile* base_storages, s32 lay
 
     // Set size tracking members.
     m_hash_target_block_size = static_cast<s32>(htbs);
-    m_log_size_ratio = Log2(m_hash_target_block_size / HashSize);
+    m_log_size_ratio =
+        static_cast<s32>(std::log2(static_cast<double>(m_hash_target_block_size) / HashSize));
 
     // Get the base storage size.
     m_base_storage_size = base_storages[2]->GetSize();
diff --git a/src/core/file_sys/fssystem/fssystem_hierarchical_sha3_storage.cpp b/src/core/file_sys/fssystem/fssystem_hierarchical_sha3_storage.cpp
new file mode 100644
index 0000000000..d58f2ee9be
--- /dev/null
+++ b/src/core/file_sys/fssystem/fssystem_hierarchical_sha3_storage.cpp
@@ -0,0 +1,57 @@
+// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#include "common/alignment.h"
+#include "common/scope_exit.h"
+#include "core/file_sys/fssystem/fssystem_hierarchical_sha3_storage.h"
+
+#include <cmath>
+
+namespace FileSys {
+// Records the three layer storages' sizes and buffers; no hash comparison is done here.
+Result HierarchicalSha3Storage::Initialize(VirtualFile* base_storages, s32 layer_count, size_t htbs,
+                                           void* hash_buf, size_t hash_buf_size) {
+    ASSERT(layer_count == LayerCount);
+    ASSERT(Common::IsPowerOfTwo(htbs));
+    ASSERT(hash_buf != nullptr);
+    // htbs is the hash target block size; the ratio is log2(block size / hash size).
+    m_hash_target_block_size = static_cast<s32>(htbs);
+    m_log_size_ratio =
+        static_cast<s32>(std::log2(static_cast<double>(m_hash_target_block_size) / HashSize));
+    // Layer [2] is the base storage; its size is capped at HashSize << (2 * ratio) bytes.
+    m_base_storage_size = base_storages[2]->GetSize();
+    {
+        auto size_guard = SCOPE_GUARD {
+            m_base_storage_size = 0;
+        };
+        R_UNLESS(m_base_storage_size <= static_cast<s64>(HashSize)
+                                            << m_log_size_ratio << m_log_size_ratio,
+                 ResultHierarchicalSha256BaseStorageTooLarge);
+        size_guard.Cancel(); // Size accepted; presumably disarms the reset above -- confirm.
+    }
+    // The caller-provided hash buffer is kept by pointer, not copied.
+    m_base_storage = base_storages[2];
+    m_hash_buffer = static_cast<char*>(hash_buf);
+    m_hash_buffer_size = hash_buf_size;
+    // NOTE(review): layer [0] is read into master_hash, which is not used afterwards.
+    std::array<u8, HashSize> master_hash{};
+    base_storages[0]->ReadObject(std::addressof(master_hash));
+    // Layer [1] must be hash-aligned and fit in both one target block and the hash buffer.
+    s64 hash_storage_size = base_storages[1]->GetSize();
+    ASSERT(Common::IsAligned(hash_storage_size, HashSize));
+    ASSERT(hash_storage_size <= m_hash_target_block_size);
+    ASSERT(hash_storage_size <= static_cast<s64>(m_hash_buffer_size));
+    // Load the whole of layer [1] into the hash buffer.
+    base_storages[1]->Read(reinterpret_cast<u8*>(m_hash_buffer),
+                           static_cast<size_t>(hash_storage_size), 0);
+    R_SUCCEED();
+}
+
+size_t HierarchicalSha3Storage::Read(u8* buffer, size_t size, size_t offset) const {
+    if (size == 0) // Zero-length reads return early without touching the base storage.
+        return size;
+    ASSERT(buffer != nullptr);
+    return m_base_storage->Read(buffer, size, offset); // Forwarded as-is; no hash is checked.
+}
+
+} // namespace FileSys
diff --git a/src/core/file_sys/fssystem/fssystem_hierarchical_sha3_storage.h b/src/core/file_sys/fssystem/fssystem_hierarchical_sha3_storage.h
new file mode 100644
index 0000000000..2db7bb28e1
--- /dev/null
+++ b/src/core/file_sys/fssystem/fssystem_hierarchical_sha3_storage.h
@@ -0,0 +1,44 @@
+// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#pragma once
+
+#include <mutex>
+
+#include "core/file_sys/errors.h"
+#include "core/file_sys/fssystem/fs_i_storage.h"
+#include "core/file_sys/vfs/vfs.h"
+
+namespace FileSys {
+// Read-only storage over a hierarchical SHA3-256 layout; sizes come from the base storage.
+class HierarchicalSha3Storage : public IReadOnlyStorage {
+    YUZU_NON_COPYABLE(HierarchicalSha3Storage);
+    YUZU_NON_MOVEABLE(HierarchicalSha3Storage);
+
+public:
+    static constexpr s32 LayerCount = 3; // Layer count Initialize() asserts on (see the .cpp).
+    static constexpr size_t HashSize = 256 / 8; // SHA3-256
+
+public:
+    HierarchicalSha3Storage() : m_mutex() {}
+    // htbs: hash target block size; hash_buf / hash_buf_size: caller-supplied hash buffer.
+    Result Initialize(VirtualFile* base_storages, s32 layer_count, size_t htbs, void* hash_buf,
+                      size_t hash_buf_size);
+    // Size of the base storage; dereferences m_base_storage, so it must be set first.
+    virtual size_t GetSize() const override {
+        return m_base_storage->GetSize();
+    }
+    // Reads from the storage into `buffer`; the implementation lives in the .cpp.
+    virtual size_t Read(u8* buffer, size_t length, size_t offset) const override;
+
+private:
+    VirtualFile m_base_storage; // Storage whose size GetSize() reports.
+    s64 m_base_storage_size{};
+    char* m_hash_buffer{}; // Raw pointer; no destructor is declared, so it is not freed here.
+    size_t m_hash_buffer_size{};
+    s32 m_hash_target_block_size{};
+    s32 m_log_size_ratio{};
+    std::mutex m_mutex; // NOTE(review): never locked by the code shown here.
+};
+
+} // namespace FileSys
diff --git a/src/core/file_sys/fssystem/fssystem_nca_file_system_driver.cpp b/src/core/file_sys/fssystem/fssystem_nca_file_system_driver.cpp
index ab5a7984e3..1bc7039318 100644
--- a/src/core/file_sys/fssystem/fssystem_nca_file_system_driver.cpp
+++ b/src/core/file_sys/fssystem/fssystem_nca_file_system_driver.cpp
@@ -1,6 +1,10 @@
+// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
+
 // SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later
 
+#include "common/settings.h"
 #include "core/file_sys/fssystem/fssystem_aes_ctr_counter_extended_storage.h"
 #include "core/file_sys/fssystem/fssystem_aes_ctr_storage.h"
 #include "core/file_sys/fssystem/fssystem_aes_xts_storage.h"
@@ -10,10 +14,12 @@
 #include "core/file_sys/fssystem/fssystem_hierarchical_sha256_storage.h"
 #include "core/file_sys/fssystem/fssystem_indirect_storage.h"
 #include "core/file_sys/fssystem/fssystem_integrity_romfs_storage.h"
+#include "core/file_sys/fssystem/fssystem_passthrough_storage.h"
 #include "core/file_sys/fssystem/fssystem_memory_resource_buffer_hold_storage.h"
 #include "core/file_sys/fssystem/fssystem_nca_file_system_driver.h"
 #include "core/file_sys/fssystem/fssystem_sparse_storage.h"
 #include "core/file_sys/fssystem/fssystem_switch_storage.h"
+#include "core/file_sys/fssystem/fssystem_hierarchical_sha3_storage.h"
 #include "core/file_sys/vfs/vfs_offset.h"
 #include "core/file_sys/vfs/vfs_vector.h"
 
@@ -299,18 +305,24 @@ Result NcaFileSystemDriver::CreateStorageByRawStorage(VirtualFile* out,
     // Process hash/integrity layer.
     switch (header_reader->GetHashType()) {
     case NcaFsHeader::HashType::HierarchicalSha256Hash:
-        R_TRY(this->CreateSha256Storage(std::addressof(storage), std::move(storage),
-                                        header_reader->GetHashData().hierarchical_sha256_data));
+        R_TRY(CreateSha256Storage(&storage, std::move(storage),
+                                  header_reader->GetHashData().hierarchical_sha256_data));
         break;
     case NcaFsHeader::HashType::HierarchicalIntegrityHash:
-        R_TRY(this->CreateIntegrityVerificationStorage(
-            std::addressof(storage), std::move(storage),
-            header_reader->GetHashData().integrity_meta_info));
+        R_TRY(CreateIntegrityVerificationStorage(&storage, std::move(storage),
+                                                 header_reader->GetHashData().integrity_meta_info));
+        break;
+    case NcaFsHeader::HashType::HierarchicalSha3256Hash:
+        R_TRY(CreateSha3Storage(&storage, std::move(storage),
+                                header_reader->GetHashData().hierarchical_sha256_data));
         break;
     default:
+        LOG_ERROR(Loader, "Unhandled Fs HashType enum={}",
+                  static_cast<int>(header_reader->GetHashType()));
         R_THROW(ResultInvalidNcaFsHeaderHashType);
     }
 
+
     // Process compression layer.
     if (header_reader->ExistsCompressionLayer()) {
         R_TRY(this->CreateCompressedStorage(
@@ -679,6 +691,7 @@ Result NcaFileSystemDriver::CreateSparseStorageMetaStorageWithVerification(
 
     // Create the verification storage.
     VirtualFile integrity_storage;
+
     Result rc = this->CreateIntegrityVerificationStorageForMeta(
         std::addressof(integrity_storage), out_layer_info_storage, std::move(decrypted_storage),
         meta_offset, meta_data_hash_data_info);
@@ -734,8 +747,26 @@ Result NcaFileSystemDriver::CreateSparseStorageWithVerification(
                                  NcaHeader::CtrBlockSize)));
 
         // Check the meta data hash type.
-        R_UNLESS(meta_data_hash_type == NcaFsHeader::MetaDataHashType::HierarchicalIntegrity,
-                 ResultRomNcaInvalidSparseMetaDataHashType);
+        if (meta_data_hash_type != NcaFsHeader::MetaDataHashType::HierarchicalIntegrity) {
+            LOG_ERROR(Loader, "Sparse meta hash type {} not supported for verification; mounting sparse data WITHOUT verification (temporary).", static_cast<int>(meta_data_hash_type));
+
+            R_TRY(this->CreateBodySubStorage(std::addressof(body_substorage),
+                                             sparse_info.physical_offset,
+                                             sparse_info.GetPhysicalSize()));
+
+            // Create sparse core directly (no meta verification)
+            std::shared_ptr<SparseStorage> sparse_storage_fallback;
+            R_TRY(this->CreateSparseStorageCore(std::addressof(sparse_storage_fallback),
+                                                body_substorage, sparse_info.GetPhysicalSize(),
+                                                /*meta_storage*/ body_substorage, // dummy; not used
+                                                sparse_info, false));
+
+            if (out_sparse_storage)
+                *out_sparse_storage = sparse_storage_fallback;
+            *out_fs_data_offset = fs_offset;
+            *out = std::move(sparse_storage_fallback);
+            R_SUCCEED();
+        }
 
         // Create the meta storage.
         VirtualFile meta_storage;
@@ -1093,6 +1124,56 @@ Result NcaFileSystemDriver::CreatePatchMetaStorage(
     R_SUCCEED();
 }
 
+Result NcaFileSystemDriver::CreateSha3Storage(
+    VirtualFile* out, VirtualFile base_storage,
+    const NcaFsHeader::HashData::HierarchicalSha256Data& hash_data) {
+    ASSERT(out != nullptr);
+    ASSERT(base_storage != nullptr);
+
+    using VerificationStorage = HierarchicalSha3Storage;
+
+    R_UNLESS(Common::IsPowerOfTwo(hash_data.hash_block_size),
+             ResultInvalidHierarchicalSha256BlockSize);
+    R_UNLESS(hash_data.hash_layer_count == VerificationStorage::LayerCount - 1,
+             ResultInvalidHierarchicalSha256LayerCount);
+
+    const auto& hash_region = hash_data.hash_layer_region[0];
+    const auto& data_region = hash_data.hash_layer_region[1];
+
+    constexpr s32 CacheBlockCount = 2;
+    const auto hash_buffer_size = static_cast<size_t>(hash_region.size);
+    const auto cache_buffer_size = CacheBlockCount * hash_data.hash_block_size;
+    const auto total_buffer_size = hash_buffer_size + cache_buffer_size;
+
+    auto buffer_hold_storage = std::make_shared<MemoryResourceBufferHoldStorage>(
+        std::move(base_storage), total_buffer_size);
+    R_UNLESS(buffer_hold_storage != nullptr, ResultAllocationMemoryFailedAllocateShared);
+    R_UNLESS(buffer_hold_storage->IsValid(), ResultAllocationMemoryFailedInNcaFileSystemDriverI);
+
+    s64 base_size = buffer_hold_storage->GetSize();
+    R_UNLESS(hash_region.offset + hash_region.size <= base_size, ResultNcaBaseStorageOutOfRangeC);
+    R_UNLESS(data_region.offset + data_region.size <= base_size, ResultNcaBaseStorageOutOfRangeC);
+
+    auto master_hash_storage =
+        std::make_shared<ArrayVfsFile<sizeof(Hash)>>(hash_data.fs_data_master_hash.value);
+
+    auto verification_storage = std::make_shared<VerificationStorage>();
+    R_UNLESS(verification_storage != nullptr, ResultAllocationMemoryFailedAllocateShared);
+
+    std::array<VirtualFile, VerificationStorage::LayerCount> layer_storages{
+        std::make_shared<OffsetVfsFile>(master_hash_storage, sizeof(Hash), 0),
+        std::make_shared<OffsetVfsFile>(buffer_hold_storage, hash_region.size, hash_region.offset),
+        std::make_shared<OffsetVfsFile>(buffer_hold_storage, data_region.size, data_region.offset),
+    };
+
+    R_TRY(verification_storage->Initialize(layer_storages.data(), VerificationStorage::LayerCount,
+                                           hash_data.hash_block_size,
+                                           buffer_hold_storage->GetBuffer(), hash_buffer_size));
+
+    *out = std::move(verification_storage);
+    R_SUCCEED();
+}
+
 Result NcaFileSystemDriver::CreateSha256Storage(
     VirtualFile* out, VirtualFile base_storage,
     const NcaFsHeader::HashData::HierarchicalSha256Data& hash_data) {
@@ -1160,6 +1241,7 @@ Result NcaFileSystemDriver::CreateSha256Storage(
 Result NcaFileSystemDriver::CreateIntegrityVerificationStorage(
     VirtualFile* out, VirtualFile base_storage,
     const NcaFsHeader::HashData::IntegrityMetaInfo& meta_info) {
+
     R_RETURN(this->CreateIntegrityVerificationStorageImpl(
         out, base_storage, meta_info, 0, IntegrityDataCacheCount, IntegrityHashCacheCount,
         HierarchicalIntegrityVerificationStorage::GetDefaultDataCacheBufferLevel(
@@ -1209,63 +1291,96 @@ Result NcaFileSystemDriver::CreateIntegrityVerificationStorageImpl(
     VirtualFile* out, VirtualFile base_storage,
     const NcaFsHeader::HashData::IntegrityMetaInfo& meta_info, s64 layer_info_offset,
     int max_data_cache_entries, int max_hash_cache_entries, s8 buffer_level) {
-    // Validate preconditions.
+    // Preconditions
     ASSERT(out != nullptr);
     ASSERT(base_storage != nullptr);
     ASSERT(layer_info_offset >= 0);
 
-    // Define storage types.
-    using VerificationStorage = HierarchicalIntegrityVerificationStorage;
-    using StorageInfo = VerificationStorage::HierarchicalStorageInformation;
+    if (!Settings::values.disable_nca_verification.GetValue()) {
+        // Define storage types.
+        using VerificationStorage = HierarchicalIntegrityVerificationStorage;
+        using StorageInfo = VerificationStorage::HierarchicalStorageInformation;
 
-    // Validate the meta info.
-    HierarchicalIntegrityVerificationInformation level_hash_info;
-    std::memcpy(std::addressof(level_hash_info), std::addressof(meta_info.level_hash_info),
-                sizeof(level_hash_info));
+        // Validate the meta info.
+        HierarchicalIntegrityVerificationInformation level_hash_info;
+        std::memcpy(std::addressof(level_hash_info), std::addressof(meta_info.level_hash_info),
+                    sizeof(level_hash_info));
 
-    R_UNLESS(IntegrityMinLayerCount <= level_hash_info.max_layers,
-             ResultInvalidNcaHierarchicalIntegrityVerificationLayerCount);
-    R_UNLESS(level_hash_info.max_layers <= IntegrityMaxLayerCount,
-             ResultInvalidNcaHierarchicalIntegrityVerificationLayerCount);
+        R_UNLESS(IntegrityMinLayerCount <= level_hash_info.max_layers,
+                 ResultInvalidNcaHierarchicalIntegrityVerificationLayerCount);
+        R_UNLESS(level_hash_info.max_layers <= IntegrityMaxLayerCount,
+                 ResultInvalidNcaHierarchicalIntegrityVerificationLayerCount);
 
-    // Get the base storage size.
-    s64 base_storage_size = base_storage->GetSize();
+        // Get the base storage size.
+        s64 base_storage_size = base_storage->GetSize();
 
-    // Create storage info.
-    StorageInfo storage_info;
-    for (s32 i = 0; i < static_cast<s32>(level_hash_info.max_layers - 2); ++i) {
-        const auto& layer_info = level_hash_info.info[i];
-        R_UNLESS(layer_info_offset + layer_info.offset + layer_info.size <= base_storage_size,
+        // Create storage info.
+        StorageInfo storage_info;
+        for (s32 i = 0; i < static_cast<s32>(level_hash_info.max_layers - 2); ++i) {
+            const auto& layer_info = level_hash_info.info[i];
+            R_UNLESS(layer_info_offset + layer_info.offset + layer_info.size <= base_storage_size,
+                     ResultNcaBaseStorageOutOfRangeD);
+
+            storage_info[i + 1] = std::make_shared<OffsetVfsFile>(
+                base_storage, layer_info.size, layer_info_offset + layer_info.offset);
+        }
+
+        // Set the last layer info.
+        const auto& layer_info = level_hash_info.info[level_hash_info.max_layers - 2];
+        const s64 last_layer_info_offset = layer_info_offset > 0 ? 0LL : layer_info.offset.Get();
+        R_UNLESS(last_layer_info_offset + layer_info.size <= base_storage_size,
                  ResultNcaBaseStorageOutOfRangeD);
+        if (layer_info_offset > 0) {
+            R_UNLESS(last_layer_info_offset + layer_info.size <= layer_info_offset,
+                     ResultRomNcaInvalidIntegrityLayerInfoOffset);
+        }
+        storage_info.SetDataStorage(std::make_shared<OffsetVfsFile>(
+            std::move(base_storage), layer_info.size, last_layer_info_offset));
 
-        storage_info[i + 1] = std::make_shared<OffsetVfsFile>(
-            base_storage, layer_info.size, layer_info_offset + layer_info.offset);
+        // Make the integrity romfs storage.
+        auto integrity_storage = std::make_shared<IntegrityRomFsStorage>();
+        R_UNLESS(integrity_storage != nullptr, ResultAllocationMemoryFailedAllocateShared);
+
+        // Initialize the integrity storage.
+        R_TRY(integrity_storage->Initialize(level_hash_info, meta_info.master_hash, storage_info,
+                                            max_data_cache_entries, max_hash_cache_entries,
+                                            buffer_level));
+
+        // Set the output.
+        *out = std::move(integrity_storage);
+        R_SUCCEED();
+    } else {
+        // Read IVFC layout
+        HierarchicalIntegrityVerificationInformation lhi{};
+        std::memcpy(std::addressof(lhi), std::addressof(meta_info.level_hash_info), sizeof(lhi));
+
+        R_UNLESS(IntegrityMinLayerCount <= lhi.max_layers,
+                 ResultInvalidNcaHierarchicalIntegrityVerificationLayerCount);
+        R_UNLESS(lhi.max_layers <= IntegrityMaxLayerCount,
+                 ResultInvalidNcaHierarchicalIntegrityVerificationLayerCount);
+
+        const auto& data_li = lhi.info[lhi.max_layers - 2];
+
+        const s64 base_size = base_storage->GetSize();
+
+        // Compute the data layer window
+        const s64 data_off = (layer_info_offset > 0) ? 0LL : data_li.offset.Get();
+        R_UNLESS(data_off + data_li.size <= base_size, ResultNcaBaseStorageOutOfRangeD);
+        if (layer_info_offset > 0) {
+            R_UNLESS(data_off + data_li.size <= layer_info_offset,
+                     ResultRomNcaInvalidIntegrityLayerInfoOffset);
+        }
+
+        // TODO: Temporary passthrough; data layer is exposed with integrity checks disabled.
+        auto data_view = std::make_shared<OffsetVfsFile>(base_storage, data_li.size, data_off);
+        R_UNLESS(data_view != nullptr, ResultAllocationMemoryFailedAllocateShared);
+
+        auto passthrough = std::make_shared<PassthroughStorage>(std::move(data_view));
+        R_UNLESS(passthrough != nullptr, ResultAllocationMemoryFailedAllocateShared);
+
+        *out = std::move(passthrough);
+        R_SUCCEED();
     }
-
-    // Set the last layer info.
-    const auto& layer_info = level_hash_info.info[level_hash_info.max_layers - 2];
-    const s64 last_layer_info_offset = layer_info_offset > 0 ? 0LL : layer_info.offset.Get();
-    R_UNLESS(last_layer_info_offset + layer_info.size <= base_storage_size,
-             ResultNcaBaseStorageOutOfRangeD);
-    if (layer_info_offset > 0) {
-        R_UNLESS(last_layer_info_offset + layer_info.size <= layer_info_offset,
-                 ResultRomNcaInvalidIntegrityLayerInfoOffset);
-    }
-    storage_info.SetDataStorage(std::make_shared<OffsetVfsFile>(
-        std::move(base_storage), layer_info.size, last_layer_info_offset));
-
-    // Make the integrity romfs storage.
-    auto integrity_storage = std::make_shared<IntegrityRomFsStorage>();
-    R_UNLESS(integrity_storage != nullptr, ResultAllocationMemoryFailedAllocateShared);
-
-    // Initialize the integrity storage.
-    R_TRY(integrity_storage->Initialize(level_hash_info, meta_info.master_hash, storage_info,
-                                        max_data_cache_entries, max_hash_cache_entries,
-                                        buffer_level));
-
-    // Set the output.
-    *out = std::move(integrity_storage);
-    R_SUCCEED();
 }
 
 Result NcaFileSystemDriver::CreateRegionSwitchStorage(VirtualFile* out,
diff --git a/src/core/file_sys/fssystem/fssystem_nca_file_system_driver.h b/src/core/file_sys/fssystem/fssystem_nca_file_system_driver.h
index 5bc838de64..e09bfc588a 100644
--- a/src/core/file_sys/fssystem/fssystem_nca_file_system_driver.h
+++ b/src/core/file_sys/fssystem/fssystem_nca_file_system_driver.h
@@ -1,3 +1,6 @@
+// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
+
 // SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later
 
@@ -329,6 +332,10 @@ private:
                                   const NcaPatchInfo& patch_info,
                                   const NcaMetaDataHashDataInfo& meta_data_hash_data_info);
 
+
+    Result CreateSha3Storage(VirtualFile* out, VirtualFile base_storage,
+                             const NcaFsHeader::HashData::HierarchicalSha256Data& hash_data);
+
     Result CreateSha256Storage(VirtualFile* out, VirtualFile base_storage,
                                const NcaFsHeader::HashData::HierarchicalSha256Data& sha256_data);
 
diff --git a/src/core/file_sys/fssystem/fssystem_nca_header.cpp b/src/core/file_sys/fssystem/fssystem_nca_header.cpp
index bf5742d39f..cef0f0bb94 100644
--- a/src/core/file_sys/fssystem/fssystem_nca_header.cpp
+++ b/src/core/file_sys/fssystem/fssystem_nca_header.cpp
@@ -1,3 +1,6 @@
+// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
+
 // SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later
 
@@ -10,11 +13,13 @@ u8 NcaHeader::GetProperKeyGeneration() const {
 }
 
 bool NcaPatchInfo::HasIndirectTable() const {
-    return this->indirect_size != 0;
+    static constexpr unsigned char BKTR[4] = {'B', 'K', 'T', 'R'};
+    return std::memcmp(indirect_header.data(), BKTR, sizeof(BKTR)) == 0;
 }
 
 bool NcaPatchInfo::HasAesCtrExTable() const {
-    return this->aes_ctr_ex_size != 0;
+    static constexpr unsigned char BKTR[4] = {'B', 'K', 'T', 'R'};
+    return std::memcmp(aes_ctr_ex_header.data(), BKTR, sizeof(BKTR)) == 0;
 }
 
 } // namespace FileSys
diff --git a/src/core/file_sys/fssystem/fssystem_passthrough_storage.h b/src/core/file_sys/fssystem/fssystem_passthrough_storage.h
new file mode 100644
index 0000000000..8fc6f4962a
--- /dev/null
+++ b/src/core/file_sys/fssystem/fssystem_passthrough_storage.h
@@ -0,0 +1,32 @@
+// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#pragma once
+#include "core/file_sys/fssystem/fs_i_storage.h"
+#include "core/file_sys/vfs/vfs.h"
+
+namespace FileSys {
+
+// Read-only storage forwarding Read/GetSize to the wrapped file. TODO: no integrity verification.
+class PassthroughStorage final : public IReadOnlyStorage {
+    YUZU_NON_COPYABLE(PassthroughStorage);
+    YUZU_NON_MOVEABLE(PassthroughStorage);
+
+public:
+    explicit PassthroughStorage(VirtualFile base) : base_(std::move(base)) {}
+    ~PassthroughStorage() override = default;
+
+    size_t Read(u8* buffer, size_t size, size_t offset) const override {
+        if (!base_ || size == 0)
+            return 0;
+        return base_->Read(buffer, size, offset);
+    }
+    size_t GetSize() const override {
+        return base_ ? base_->GetSize() : 0;
+    }
+
+private:
+    VirtualFile base_{};
+};
+
+} // namespace FileSys
diff --git a/src/core/hle/service/am/applet.cpp b/src/core/hle/service/am/applet.cpp
index 59ade29c8e..aa355b06d5 100644
--- a/src/core/hle/service/am/applet.cpp
+++ b/src/core/hle/service/am/applet.cpp
@@ -1,5 +1,8 @@
-// SPDX-FileCopyrightText: Copyright 2024 yuzu Emulator Project
-// SPDX-License-Identifier: GPL-2.0-or-later
+// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+// SPDX-FileCopyrightText: Copyright 2024 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
 
 #include "core/core.h"
 #include "core/hle/service/am/applet.h"
@@ -12,7 +15,7 @@ Applet::Applet(Core::System& system, std::unique_ptr<Process> process_, bool is_
       process(std::move(process_)), hid_registration(system, *process),
       gpu_error_detected_event(context), friend_invitation_storage_channel_event(context),
       notification_storage_channel_event(context), health_warning_disappeared_system_event(context),
-      acquired_sleep_lock_event(context), pop_from_general_channel_event(context),
+      unknown_event(context), acquired_sleep_lock_event(context), pop_from_general_channel_event(context),
       library_applet_launchable_event(context), accumulated_suspended_tick_changed_event(context),
       sleep_lock_event(context), state_changed_event(context) {
 
diff --git a/src/core/hle/service/am/applet.h b/src/core/hle/service/am/applet.h
index 835cfe6ec8..6cc8cdf741 100644
--- a/src/core/hle/service/am/applet.h
+++ b/src/core/hle/service/am/applet.h
@@ -1,3 +1,6 @@
+// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
+
 // SPDX-FileCopyrightText: Copyright 2024 yuzu Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later
 
@@ -120,6 +123,7 @@ struct Applet {
     Event friend_invitation_storage_channel_event;
     Event notification_storage_channel_event;
     Event health_warning_disappeared_system_event;
+    Event unknown_event;
     Event acquired_sleep_lock_event;
     Event pop_from_general_channel_event;
     Event library_applet_launchable_event;
diff --git a/src/core/hle/service/am/service/all_system_applet_proxies_service.cpp b/src/core/hle/service/am/service/all_system_applet_proxies_service.cpp
index 5a787494a8..d99482a45c 100644
--- a/src/core/hle/service/am/service/all_system_applet_proxies_service.cpp
+++ b/src/core/hle/service/am/service/all_system_applet_proxies_service.cpp
@@ -18,6 +18,7 @@ IAllSystemAppletProxiesService::IAllSystemAppletProxiesService(Core::System& sys
     // clang-format off
     static const FunctionInfo functions[] = {
         {100, D<&IAllSystemAppletProxiesService::OpenSystemAppletProxy>, "OpenSystemAppletProxy"},
+        {110, D<&IAllSystemAppletProxiesService::OpenSystemAppletProxy>, "OpenSystemAppletProxyEx"},
         {200, D<&IAllSystemAppletProxiesService::OpenLibraryAppletProxyOld>, "OpenLibraryAppletProxyOld"},
         {201, D<&IAllSystemAppletProxiesService::OpenLibraryAppletProxy>, "OpenLibraryAppletProxy"},
         {300, nullptr, "OpenOverlayAppletProxy"},
@@ -25,6 +26,7 @@ IAllSystemAppletProxiesService::IAllSystemAppletProxiesService(Core::System& sys
         {400, nullptr, "CreateSelfLibraryAppletCreatorForDevelop"},
         {410, nullptr, "GetSystemAppletControllerForDebug"},
         {450, D<&IAllSystemAppletProxiesService::GetSystemProcessCommonFunctions>, "GetSystemProcessCommonFunctions"}, // 19.0.0+
+        {460, D<&IAllSystemAppletProxiesService::GetAppletAlternativeFunctions>, "GetAppletAlternativeFunctions"}, // 20.0.0+
         {1000, nullptr, "GetDebugFunctions"},
     };
     // clang-format on
@@ -99,6 +101,14 @@ Result IAllSystemAppletProxiesService::GetSystemProcessCommonFunctions() {
     R_SUCCEED();
 }
 
+Result IAllSystemAppletProxiesService::GetAppletAlternativeFunctions() {
+    LOG_DEBUG(Service_AM, "(STUBBED) called.");
+
+    // TODO (maufeat): implement; this stub currently succeeds without producing any output.
+
+    R_SUCCEED();
+}
+
 std::shared_ptr<Applet> IAllSystemAppletProxiesService::GetAppletFromProcessId(
     ProcessId process_id) {
     return m_window_system.GetByAppletResourceUserId(process_id.pid);
diff --git a/src/core/hle/service/am/service/all_system_applet_proxies_service.h b/src/core/hle/service/am/service/all_system_applet_proxies_service.h
index a3111c4c9b..525525c795 100644
--- a/src/core/hle/service/am/service/all_system_applet_proxies_service.h
+++ b/src/core/hle/service/am/service/all_system_applet_proxies_service.h
@@ -39,6 +39,7 @@ private:
         InCopyHandle<Kernel::KProcess> process_handle,
         InLargeData<AppletAttribute, BufferAttr_HipcMapAlias> attribute);
     Result GetSystemProcessCommonFunctions();
+    Result GetAppletAlternativeFunctions();
 
 private:
     std::shared_ptr<Applet> GetAppletFromProcessId(ProcessId pid);
diff --git a/src/core/hle/service/am/service/applet_common_functions.cpp b/src/core/hle/service/am/service/applet_common_functions.cpp
index 1c9cd74533..6a73a896f9 100644
--- a/src/core/hle/service/am/service/applet_common_functions.cpp
+++ b/src/core/hle/service/am/service/applet_common_functions.cpp
@@ -35,6 +35,7 @@ IAppletCommonFunctions::IAppletCommonFunctions(Core::System& system_,
         {310, nullptr, "IsSystemAppletHomeMenu"}, //19.0.0+
         {320, nullptr, "SetGpuTimeSliceBoost"}, //19.0.0+
         {321, nullptr, "SetGpuTimeSliceBoostDueToApplication"}, //19.0.0+
+        {350, D<&IAppletCommonFunctions::Unknown350>, "Unknown350"} //20.0.0+
     };
     // clang-format on
 
@@ -70,4 +71,10 @@ Result IAppletCommonFunctions::GetCurrentApplicationId(Out<u64> out_application_
     R_SUCCEED();
 }
 
+Result IAppletCommonFunctions::Unknown350(Out<u16> out_unknown) {
+    LOG_WARNING(Service_AM, "(STUBBED) called");
+    *out_unknown = 0;
+    R_SUCCEED();
+}
+
 } // namespace Service::AM
diff --git a/src/core/hle/service/am/service/applet_common_functions.h b/src/core/hle/service/am/service/applet_common_functions.h
index 376f85acf7..623efdb7fc 100644
--- a/src/core/hle/service/am/service/applet_common_functions.h
+++ b/src/core/hle/service/am/service/applet_common_functions.h
@@ -20,6 +20,7 @@ private:
     Result GetHomeButtonDoubleClickEnabled(Out<bool> out_home_button_double_click_enabled);
     Result SetCpuBoostRequestPriority(s32 priority);
     Result GetCurrentApplicationId(Out<u64> out_application_id);
+    Result Unknown350(Out<u16> out_unknown);
 
     const std::shared_ptr<Applet> applet;
 };
diff --git a/src/core/hle/service/am/service/application_functions.cpp b/src/core/hle/service/am/service/application_functions.cpp
index 560244c714..b736e2821b 100644
--- a/src/core/hle/service/am/service/application_functions.cpp
+++ b/src/core/hle/service/am/service/application_functions.cpp
@@ -1,3 +1,6 @@
+// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
+
 // SPDX-FileCopyrightText: Copyright 2024 yuzu Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later
 
@@ -85,6 +88,7 @@ IApplicationFunctions::IApplicationFunctions(Core::System& system_, std::shared_
         {181, nullptr, "UpgradeLaunchRequiredVersion"},
         {190, nullptr, "SendServerMaintenanceOverlayNotification"},
         {200, nullptr, "GetLastApplicationExitReason"},
+        {210, D<&IApplicationFunctions::GetUnknownEvent210>, "Unknown210"},
         {500, nullptr, "StartContinuousRecordingFlushForDebug"},
         {1000, nullptr, "CreateMovieMaker"},
         {1001, D<&IApplicationFunctions::PrepareForJit>, "PrepareForJit"},
@@ -487,6 +491,13 @@ Result IApplicationFunctions::GetHealthWarningDisappearedSystemEvent(
     R_SUCCEED();
 }
 
+Result IApplicationFunctions::GetUnknownEvent210(
+    OutCopyHandle<Kernel::KReadableEvent> out_event) {
+    LOG_DEBUG(Service_AM, "called");
+    *out_event = m_applet->unknown_event.GetHandle();
+    R_SUCCEED();
+}
+
 Result IApplicationFunctions::PrepareForJit() {
     LOG_WARNING(Service_AM, "(STUBBED) called");
 
diff --git a/src/core/hle/service/am/service/application_functions.h b/src/core/hle/service/am/service/application_functions.h
index 10025a152b..35b3e9505d 100644
--- a/src/core/hle/service/am/service/application_functions.h
+++ b/src/core/hle/service/am/service/application_functions.h
@@ -1,3 +1,6 @@
+// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
+
 // SPDX-FileCopyrightText: Copyright 2024 yuzu Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later
 
@@ -76,6 +79,7 @@ private:
     Result TryPopFromFriendInvitationStorageChannel(Out<SharedPointer<IStorage>> out_storage);
     Result GetNotificationStorageChannelEvent(OutCopyHandle<Kernel::KReadableEvent> out_event);
     Result GetHealthWarningDisappearedSystemEvent(OutCopyHandle<Kernel::KReadableEvent> out_event);
+    Result GetUnknownEvent210(OutCopyHandle<Kernel::KReadableEvent> out_event);
     Result PrepareForJit();
 
     const std::shared_ptr<Applet> m_applet;
diff --git a/src/core/hle/service/am/service/library_applet_creator.cpp b/src/core/hle/service/am/service/library_applet_creator.cpp
index 413388d40a..54790838e0 100644
--- a/src/core/hle/service/am/service/library_applet_creator.cpp
+++ b/src/core/hle/service/am/service/library_applet_creator.cpp
@@ -113,9 +113,11 @@ std::shared_ptr<ILibraryAppletAccessor> CreateGuestApplet(Core::System& system,
         Firmware1700 = 17,
         Firmware1800 = 18,
         Firmware1900 = 19,
+        Firmware2000 = 20,
+        Firmware2100 = 21,
     };
 
-    auto process = CreateProcess(system, program_id, Firmware1400, Firmware1900);
+    auto process = CreateProcess(system, program_id, Firmware1400, Firmware2100);
     if (!process) {
         // Couldn't initialize the guest process
         return {};
diff --git a/src/core/hle/service/ns/application_manager_interface.cpp b/src/core/hle/service/ns/application_manager_interface.cpp
index f1ddba8231..517ec75743 100644
--- a/src/core/hle/service/ns/application_manager_interface.cpp
+++ b/src/core/hle/service/ns/application_manager_interface.cpp
@@ -306,6 +306,9 @@ IApplicationManagerInterface::IApplicationManagerInterface(Core::System& system_
         {3013, nullptr, "IsGameCardEnabled"},
         {3014, nullptr, "IsLocalContentShareEnabled"},
         {3050, nullptr, "ListAssignELicenseTaskResult"},
+        {4022, D<&IApplicationManagerInterface::Unknown4022>, "Unknown4022"},
+        {4023, D<&IApplicationManagerInterface::Unknown4023>, "Unknown4023"},
+        {4088, D<&IApplicationManagerInterface::Unknown4022>, "Unknown4088"},
         {9999, nullptr, "GetApplicationCertificate"},
     };
     // clang-format on
@@ -523,4 +526,17 @@ Result IApplicationManagerInterface::GetApplicationTerminateResult(Out<Result> o
     R_SUCCEED();
 }
 
+Result IApplicationManagerInterface::Unknown4022(
+    OutCopyHandle<Kernel::KReadableEvent> out_event) {
+    LOG_WARNING(Service_NS, "(STUBBED) called");
+    *out_event = gamecard_update_detection_event.GetHandle();
+    R_SUCCEED();
+}
+
+Result IApplicationManagerInterface::Unknown4023(Out<u64> out_result) {
+    LOG_WARNING(Service_NS, "(STUBBED) called.");
+    *out_result = 0;
+    R_SUCCEED();
+}
+
 } // namespace Service::NS
diff --git a/src/core/hle/service/ns/application_manager_interface.h b/src/core/hle/service/ns/application_manager_interface.h
index 2def50bd5c..251f93ee06 100644
--- a/src/core/hle/service/ns/application_manager_interface.h
+++ b/src/core/hle/service/ns/application_manager_interface.h
@@ -53,6 +53,8 @@ public:
                                         u64 application_id);
     Result CheckApplicationLaunchVersion(u64 application_id);
     Result GetApplicationTerminateResult(Out<Result> out_result, u64 application_id);
+    Result Unknown4022(OutCopyHandle<Kernel::KReadableEvent> out_event);
+    Result Unknown4023(Out<u64> out_result);
 
 private:
     KernelHelpers::ServiceContext service_context;
diff --git a/src/core/hle/service/pctl/parental_control_service.cpp b/src/core/hle/service/pctl/parental_control_service.cpp
index 1d990e66d7..82c65ac1fd 100644
--- a/src/core/hle/service/pctl/parental_control_service.cpp
+++ b/src/core/hle/service/pctl/parental_control_service.cpp
@@ -80,11 +80,12 @@ IParentalControlService::IParentalControlService(Core::System& system_, Capabili
         {1451, D<&IParentalControlService::StartPlayTimer>, "StartPlayTimer"},
         {1452, D<&IParentalControlService::StopPlayTimer>, "StopPlayTimer"},
         {1453, D<&IParentalControlService::IsPlayTimerEnabled>, "IsPlayTimerEnabled"},
-        {1454, nullptr, "GetPlayTimerRemainingTime"},
+        {1454, D<&IParentalControlService::GetPlayTimerRemainingTime>, "GetPlayTimerRemainingTime"},
         {1455, D<&IParentalControlService::IsRestrictedByPlayTimer>, "IsRestrictedByPlayTimer"},
         {1456, D<&IParentalControlService::GetPlayTimerSettingsOld>, "GetPlayTimerSettingsOld"},
         {1457, D<&IParentalControlService::GetPlayTimerEventToRequestSuspension>, "GetPlayTimerEventToRequestSuspension"},
         {1458, D<&IParentalControlService::IsPlayTimerAlarmDisabled>, "IsPlayTimerAlarmDisabled"},
+        {1459, D<&IParentalControlService::GetPlayTimerRemainingTimeDisplayInfo>, "GetPlayTimerRemainingTimeDisplayInfo"},
         {1471, nullptr, "NotifyWrongPinCodeInputManyTimes"},
         {1472, nullptr, "CancelNetworkRequest"},
         {1473, D<&IParentalControlService::GetUnlinkedEvent>, "GetUnlinkedEvent"},
@@ -378,6 +379,12 @@ Result IParentalControlService::IsPlayTimerEnabled(Out<bool> out_is_play_timer_e
     R_SUCCEED();
 }
 
+Result IParentalControlService::GetPlayTimerRemainingTime(Out<s32> out_remaining_time) {
+    LOG_WARNING(Service_PCTL, "(STUBBED) called");
+    *out_remaining_time = std::numeric_limits<s32>::max();
+    R_SUCCEED();
+}
+
 Result IParentalControlService::IsRestrictedByPlayTimer(Out<bool> out_is_restricted_by_play_timer) {
     *out_is_restricted_by_play_timer = false;
     LOG_WARNING(Service_PCTL, "(STUBBED) called, restricted={}", *out_is_restricted_by_play_timer);
@@ -412,6 +419,11 @@ Result IParentalControlService::IsPlayTimerAlarmDisabled(Out<bool> out_play_time
     R_SUCCEED();
 }
 
+Result IParentalControlService::GetPlayTimerRemainingTimeDisplayInfo(/* Out 0x18 */) {
+    LOG_INFO(Service_PCTL, "called");
+    R_SUCCEED();
+}
+
 Result IParentalControlService::GetUnlinkedEvent(OutCopyHandle<Kernel::KReadableEvent> out_event) {
     LOG_INFO(Service_PCTL, "called");
     *out_event = unlinked_event.GetHandle();
diff --git a/src/core/hle/service/pctl/parental_control_service.h b/src/core/hle/service/pctl/parental_control_service.h
index 1b1884c4de..9d143fe2e2 100644
--- a/src/core/hle/service/pctl/parental_control_service.h
+++ b/src/core/hle/service/pctl/parental_control_service.h
@@ -49,10 +49,12 @@ private:
     Result StartPlayTimer();
     Result StopPlayTimer();
     Result IsPlayTimerEnabled(Out<bool> out_is_play_timer_enabled);
+    Result GetPlayTimerRemainingTime(Out<s32> out_remaining_time);
     Result IsRestrictedByPlayTimer(Out<bool> out_is_restricted_by_play_timer);
     Result GetPlayTimerSettingsOld(Out<PlayTimerSettings> out_play_timer_settings);
     Result GetPlayTimerEventToRequestSuspension(OutCopyHandle<Kernel::KReadableEvent> out_event);
     Result IsPlayTimerAlarmDisabled(Out<bool> out_play_timer_alarm_disabled);
+    Result GetPlayTimerRemainingTimeDisplayInfo();
     Result GetUnlinkedEvent(OutCopyHandle<Kernel::KReadableEvent> out_event);
     Result GetStereoVisionRestriction(Out<bool> out_stereo_vision_restriction);
     Result SetStereoVisionRestriction(bool stereo_vision_restriction);
diff --git a/src/yuzu/applets/qt_web_browser.cpp b/src/yuzu/applets/qt_web_browser.cpp
index 8245c12ba2..a287ea16df 100644
--- a/src/yuzu/applets/qt_web_browser.cpp
+++ b/src/yuzu/applets/qt_web_browser.cpp
@@ -17,15 +17,16 @@
 #include "yuzu/applets/qt_web_browser_scripts.h"
 #endif
 
+#include "yuzu/applets/qt_web_browser.h"
+#include "yuzu/main.h"
+
+#ifdef YUZU_USE_QT_WEB_ENGINE
+
 #include "common/fs/path_util.h"
 #include "core/core.h"
 #include "input_common/drivers/keyboard.h"
-#include "yuzu/applets/qt_web_browser.h"
-#include "yuzu/main.h"
 #include "yuzu/util/url_request_interceptor.h"
 
-#ifdef YUZU_USE_QT_WEB_ENGINE
-
 namespace {
 
 constexpr int HIDButtonToKey(Core::HID::NpadButton button) {
diff --git a/src/yuzu/configuration/shared_translation.cpp b/src/yuzu/configuration/shared_translation.cpp
index fca4c94893..1137145659 100644
--- a/src/yuzu/configuration/shared_translation.cpp
+++ b/src/yuzu/configuration/shared_translation.cpp
@@ -409,6 +409,12 @@ std::unique_ptr<TranslationMap> InitializeTranslations(QWidget* parent)
               "their resolution, details and supported controllers and depending on this setting.\n"
               "Setting to Handheld can help improve performance for low end systems."));
     INSERT(Settings, current_user, QString(), QString());
+    INSERT(Settings, disable_nca_verification, tr("Disable NCA Verification"),
+           tr("Disables integrity verification of NCA content archives."
+              "\nThis may improve loading speed but risks data corruption or invalid files going "
+              "undetected.\n"
+              "Is necessary to make games and updates work that needs firmware 20+."));
+    INSERT(Settings, hide_nca_verification_popup, QString(), QString());
 
     // Controls
 
diff --git a/src/yuzu/main.cpp b/src/yuzu/main.cpp
index 4c6b176c56..4604a7b904 100644
--- a/src/yuzu/main.cpp
+++ b/src/yuzu/main.cpp
@@ -2036,6 +2036,10 @@ bool GMainWindow::LoadROM(const QString& filename, Service::AM::FrontendAppletPa
         }
     }
 
+    if (!OnCheckNcaVerification()) {
+        return false;
+    }
+
     /** Exec */
     const Core::SystemResultStatus result{
         system->Load(*render_window, filename.toStdString(), params)};
@@ -5265,6 +5269,41 @@ void GMainWindow::OnCheckFirmwareDecryption() {
     UpdateMenuState();
 }
 
+bool GMainWindow::OnCheckNcaVerification() {
+    if (!Settings::values.disable_nca_verification.GetValue())
+        return true;
+
+    const bool currently_hidden = Settings::values.hide_nca_verification_popup.GetValue();
+    LOG_INFO(Frontend, "NCA Verification is disabled. Popup State={}", currently_hidden);
+    if (currently_hidden)
+        return true;
+
+    QMessageBox msgbox(this);
+    msgbox.setWindowTitle(tr("NCA Verification Disabled"));
+    msgbox.setText(tr("NCA Verification is disabled.\n"
+                      "This is required to run new games and updates.\n"
+                      "Running without verification can cause instability or crashes if NCA files "
+                      "are corrupt, modified, or tampered.\n"
+                      "If unsure, re-enable verification in Eden's Settings and use firmware "
+                      "version 19.0.1 or below."));
+    msgbox.setIcon(QMessageBox::Warning);
+    msgbox.setStandardButtons(QMessageBox::Ok | QMessageBox::Cancel);
+    msgbox.setDefaultButton(QMessageBox::Ok);
+
+    QCheckBox* cb = new QCheckBox(tr("Don't show again"), &msgbox);
+    cb->setChecked(currently_hidden);
+    msgbox.setCheckBox(cb);
+
+    int result = msgbox.exec();
+
+    const bool hide = cb->isChecked();
+    if (hide != currently_hidden) {
+        Settings::values.hide_nca_verification_popup.SetValue(hide);
+    }
+
+    return result == static_cast<int>(QMessageBox::Ok);
+}
+
 bool GMainWindow::CheckFirmwarePresence() {
     return FirmwareManager::CheckFirmwarePresence(*system.get());
 }
@@ -5285,7 +5324,7 @@ void GMainWindow::SetFirmwareVersion() {
     const std::string display_version(firmware_data.display_version.data());
     const std::string display_title(firmware_data.display_title.data());
 
-    LOG_INFO(Frontend, "Installed firmware: {}", display_title);
+    LOG_INFO(Frontend, "Installed firmware: {}", display_version);
 
     firmware_label->setText(QString::fromStdString(display_version));
     firmware_label->setToolTip(QString::fromStdString(display_title));
diff --git a/src/yuzu/main.h b/src/yuzu/main.h
index b1c5669a41..7857788fcf 100644
--- a/src/yuzu/main.h
+++ b/src/yuzu/main.h
@@ -485,6 +485,9 @@ private:
                             const std::filesystem::path& command, const std::string& arguments,
                             const std::string& categories, const std::string& keywords,
                             const std::string& name);
+
+    bool OnCheckNcaVerification();
+
     /**
      * Mimic the behavior of QMessageBox::question but link controller navigation to the dialog
      * The only difference is that it returns a boolean.

From 37e0b80766f03ae3eb052f2eaa95506b34419bbe Mon Sep 17 00:00:00 2001
From: Gamer64 <gamer64@eden-emu.dev>
Date: Sun, 7 Sep 2025 19:20:45 +0200
Subject: [PATCH 25/38] [hle] Added missing error codes and increased audio
 renderer revision (#390)

Co-authored-by: Jarrod Norwell <official.antique@gmail.com>

Fixes Animal Well

Co-authored-by: Gamer64 <76565986+Gamer64ytb@users.noreply.github.com>
Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/390
Reviewed-by: MaranBr <maranbr@eden-emu.dev>
Reviewed-by: crueter <crueter@eden-emu.dev>
Co-authored-by: Gamer64 <gamer64@eden-emu.dev>
Co-committed-by: Gamer64 <gamer64@eden-emu.dev>
---
 src/audio_core/common/feature_support.h |  6 +++++-
 src/core/hle/result.h                   | 16 ++++++++++++++++
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/src/audio_core/common/feature_support.h b/src/audio_core/common/feature_support.h
index cd83df3832..39d50746b8 100644
--- a/src/audio_core/common/feature_support.h
+++ b/src/audio_core/common/feature_support.h
@@ -1,3 +1,6 @@
+// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
+
 // SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later
 
@@ -13,7 +16,7 @@
 #include "common/polyfill_ranges.h"
 
 namespace AudioCore {
-constexpr u32 CurrentRevision = 15;
+constexpr u32 CurrentRevision = 16;
 
 enum class SupportTags {
     CommandProcessingTimeEstimatorVersion4,
@@ -54,6 +57,7 @@ constexpr u32 GetRevisionNum(u32 user_revision) {
         user_revision -= Common::MakeMagic('R', 'E', 'V', '0');
         user_revision >>= 24;
     }
+
     return user_revision;
 };
 
diff --git a/src/core/hle/result.h b/src/core/hle/result.h
index 316370266d..495e6e32c2 100644
--- a/src/core/hle/result.h
+++ b/src/core/hle/result.h
@@ -1,3 +1,6 @@
+// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
+
 // SPDX-FileCopyrightText: 2014 Citra Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later
 
@@ -24,6 +27,7 @@ enum class ErrorModule : u32 {
     HTCS = 4,
     NCM = 5,
     DD = 6,
+    OSDBG = 7,
     LR = 8,
     Loader = 9,
     CMIF = 10,
@@ -51,6 +55,7 @@ enum class ErrorModule : u32 {
     Util = 33,
     TIPC = 35,
     ANIF = 37,
+    CRT = 39,
     ETHC = 100,
     I2C = 101,
     GPIO = 102,
@@ -106,6 +111,7 @@ enum class ErrorModule : u32 {
     Audio = 153,
     NPNS = 154,
     NPNSHTTPSTREAM = 155,
+    IDLE = 156,
     ARP = 157,
     SWKBD = 158,
     BOOT = 159,
@@ -115,6 +121,7 @@ enum class ErrorModule : u32 {
     Fatal = 163,
     NIMShop = 164,
     SPSM = 165,
+    AOC = 166,
     BGTC = 167,
     UserlandCrash = 168,
     SASBUS = 169,
@@ -176,13 +183,22 @@ enum class ErrorModule : u32 {
     DP2HDMI = 244,
     Cradle = 245,
     SProfile = 246,
+    Icm42607p = 248,
     NDRM = 250,
+    Fst2 = 251,
+    Nex = 306,
+    NPLN = 321,
     TSPM = 499,
     DevMenu = 500,
+    Nverpt = 520,
+    Am_StuckMonitor = 521,
+    Pia = 618,
+    Eagle = 623,
     GeneralWebApplet = 800,
     WifiWebAuthApplet = 809,
     WhitelistedApplet = 810,
     ShopN = 811,
+    Coral = 815
 };
 
 /// Encapsulates a Horizon OS error code, allowing it to be separated into its constituent fields.

From 10dd003d0fe69d8ce015113a30327d96f164b9fa Mon Sep 17 00:00:00 2001
From: lizzie <lizzie@eden-emu.dev>
Date: Mon, 8 Sep 2025 00:49:46 +0200
Subject: [PATCH 26/38] [dynarmic, cmake] allow LTO build for dynarmic (#252)

Signed-off-by: lizzie <lizzie@eden-emu.dev>
Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/252
Reviewed-by: crueter <crueter@eden-emu.dev>
Co-authored-by: lizzie <lizzie@eden-emu.dev>
Co-committed-by: lizzie <lizzie@eden-emu.dev>
---
 .ci/linux/build.sh                       | 1 +
 .ci/windows/build.sh                     | 1 +
 src/android/app/build.gradle.kts         | 3 ++-
 src/dynarmic/CMakeLists.txt              | 1 +
 src/dynarmic/src/dynarmic/CMakeLists.txt | 4 ++++
 5 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/.ci/linux/build.sh b/.ci/linux/build.sh
index 7c8bed1279..8e3a452809 100755
--- a/.ci/linux/build.sh
+++ b/.ci/linux/build.sh
@@ -104,6 +104,7 @@ cmake .. -G Ninja \
     -DYUZU_USE_QT_WEB_ENGINE=$WEBENGINE \
     -DYUZU_USE_FASTER_LD=ON \
     -DYUZU_ENABLE_LTO=ON \
+    -DDYNARMIC_ENABLE_LTO=ON \
     "${EXTRA_CMAKE_FLAGS[@]}"
 
 ninja -j${NPROC}
diff --git a/.ci/windows/build.sh b/.ci/windows/build.sh
index 7504630a57..681f327793 100644
--- a/.ci/windows/build.sh
+++ b/.ci/windows/build.sh
@@ -52,6 +52,7 @@ cmake .. -G Ninja \
     -DYUZU_USE_QT_MULTIMEDIA=$MULTIMEDIA \
     -DYUZU_USE_QT_WEB_ENGINE=$WEBENGINE \
     -DYUZU_ENABLE_LTO=ON \
+    -DDYNARMIC_ENABLE_LTO=ON \
     "${EXTRA_CMAKE_FLAGS[@]}"
 
 ninja
diff --git a/src/android/app/build.gradle.kts b/src/android/app/build.gradle.kts
index 3f1a7c102b..c76b5e7162 100644
--- a/src/android/app/build.gradle.kts
+++ b/src/android/app/build.gradle.kts
@@ -179,7 +179,8 @@ android {
                     "-DCMAKE_EXPORT_COMPILE_COMMANDS=ON",
                     "-DBUILD_TESTING=OFF",
                     "-DYUZU_TESTS=OFF",
-                    "-DDYNARMIC_TESTS=OFF"
+                    "-DDYNARMIC_TESTS=OFF",
+                    "-DDYNARMIC_ENABLE_LTO=ON"
                 )
 
                 abiFilters("arm64-v8a")
diff --git a/src/dynarmic/CMakeLists.txt b/src/dynarmic/CMakeLists.txt
index 5c28435f72..d505d16553 100644
--- a/src/dynarmic/CMakeLists.txt
+++ b/src/dynarmic/CMakeLists.txt
@@ -23,6 +23,7 @@ option(DYNARMIC_USE_PRECOMPILED_HEADERS "Use precompiled headers" ON)
 option(DYNARMIC_INSTALL "Install dynarmic headers and CMake files" OFF)
 option(DYNARMIC_USE_BUNDLED_EXTERNALS "Use all bundled externals (useful when e.g. cross-compiling)" OFF)
 option(DYNARMIC_WARNINGS_AS_ERRORS "Warnings as errors" ${MASTER_PROJECT})
+option(DYNARMIC_ENABLE_LTO "Enable LTO" OFF)
 if (NOT DEFINED DYNARMIC_FRONTENDS)
     set(DYNARMIC_FRONTENDS "A32;A64" CACHE STRING "Selects which frontends to enable")
 endif()
diff --git a/src/dynarmic/src/dynarmic/CMakeLists.txt b/src/dynarmic/src/dynarmic/CMakeLists.txt
index e060989f82..7ec92206f9 100644
--- a/src/dynarmic/src/dynarmic/CMakeLists.txt
+++ b/src/dynarmic/src/dynarmic/CMakeLists.txt
@@ -440,6 +440,10 @@ if (CMAKE_SYSTEM_NAME STREQUAL "Windows")
 endif()
 target_compile_definitions(dynarmic PRIVATE FMT_USE_USER_DEFINED_LITERALS=1)
 
+if (DYNARMIC_ENABLE_LTO)
+    set_property(TARGET dynarmic PROPERTY INTERPROCEDURAL_OPTIMIZATION TRUE)
+endif()
+
 if (DYNARMIC_USE_PRECOMPILED_HEADERS)
     set(PRECOMPILED_HEADERS "$<$<COMPILE_LANGUAGE:CXX>:${CMAKE_CURRENT_SOURCE_DIR}/ir/ir_emitter.h>")
     if ("x86_64" IN_LIST ARCHITECTURE)

From 43c41e4db5596e12ce63b002c8d1543c3e26ccca Mon Sep 17 00:00:00 2001
From: lizzie <lizzie@eden-emu.dev>
Date: Mon, 8 Sep 2025 00:54:48 +0200
Subject: [PATCH 27/38] [compat] openbsd port fixes (#273)

Signed-off-by: lizzie <lizzie@eden-emu.dev>
Co-authored-by: crueter <crueter@eden-emu.dev>
Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/273
Reviewed-by: crueter <crueter@eden-emu.dev>
Co-authored-by: lizzie <lizzie@eden-emu.dev>
Co-committed-by: lizzie <lizzie@eden-emu.dev>
---
 CMakeLists.txt                                   |  4 +++-
 README.md                                        |  1 +
 docs/Development.md                              |  1 +
 docs/build/OpenBSD.md                            | 10 ++++++++++
 externals/CMakeLists.txt                         |  2 +-
 src/core/debugger/debugger.cpp                   |  2 +-
 src/dynarmic/CMakeLists.txt                      | 16 ++++++++++++++--
 src/dynarmic/externals/CMakeLists.txt            |  7 -------
 .../src/dynarmic/backend/x64/block_of_code.cpp   |  8 +++++++-
 .../src/dynarmic/common/spin_lock_x64.cpp        |  2 +-
 10 files changed, 39 insertions(+), 14 deletions(-)
 create mode 100644 docs/build/OpenBSD.md

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9abca561f3..03f97eb7e5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -50,7 +50,9 @@ CMAKE_DEPENDENT_OPTION(ENABLE_SDL2 "Enable the SDL2 frontend" ON "NOT ANDROID" O
 
 set(EXT_DEFAULT ON)
 
-if (PLATFORM_FREEBSD)
+# See https://github.com/llvm/llvm-project/issues/123946
+# OpenBSD va_list doesn't play nice with precompiled headers
+if (PLATFORM_FREEBSD OR PLATFORM_OPENBSD)
     set(EXT_DEFAULT OFF)
 endif()
 
diff --git a/README.md b/README.md
index 70f2c81296..e1f0b50b37 100644
--- a/README.md
+++ b/README.md
@@ -63,6 +63,7 @@ If you would like to contribute, we are open to new developers and pull requests
 * **Solaris**: [Solaris Building Guide](./docs/build/Solaris.md)
 * **FreeBSD**: [FreeBSD Building Guide](./docs/build/FreeBSD.md)
 * **macOS**: [macOS Building Guide](./docs/build/macOS.md)
+* **OpenBSD**: [OpenBSD Building Guide](./docs/build/OpenBSD.md)
 
 ## Download
 
diff --git a/docs/Development.md b/docs/Development.md
index e60384e8ab..e4816cd1ec 100644
--- a/docs/Development.md
+++ b/docs/Development.md
@@ -6,6 +6,7 @@
 * **Solaris**: [Solaris Building Guide](./build/Solaris.md)
 * **FreeBSD**: [FreeBSD Building Guide](./build/FreeBSD.md)
 * **macOS**: [macOS Building Guide](./build/macOS.md)
+* **OpenBSD**: [OpenBSD Building Guide](./build/OpenBSD.md)
 
 # CPM
 
diff --git a/docs/build/OpenBSD.md b/docs/build/OpenBSD.md
new file mode 100644
index 0000000000..6a55fd269d
--- /dev/null
+++ b/docs/build/OpenBSD.md
@@ -0,0 +1,10 @@
+# Building for OpenBSD
+
+```sh
+pkg_add -u
+pkg_add cmake nasm git boost unzip--iconv autoconf-2.72p0 bash ffmpeg glslang gmake llvm-19.1.7p3 qt6 jq
+git clone --recursive https://git.eden-emu.dev/eden-emu/eden
+cmake -DCMAKE_C_COMPILER=clang-19 -DCMAKE_CXX_COMPILER=clang++-19 -DDYNARMIC_USE_PRECOMPILED_HEADERS=OFF -DCMAKE_BUILD_TYPE=Debug -DENABLE_QT=OFF -DENABLE_OPENSSL=OFF -DENABLE_WEB_SERVICE=OFF -B /usr/obj/eden
+```
+
+- Modify `externals/ffmpeg/CMakeFiles/ffmpeg-build/build.make` to use `-j$(nproc)` instead of just `-j`.
diff --git a/externals/CMakeLists.txt b/externals/CMakeLists.txt
index e917e4e7d8..25886021e2 100644
--- a/externals/CMakeLists.txt
+++ b/externals/CMakeLists.txt
@@ -33,7 +33,7 @@ endif()
 
 # Xbyak (also used by Dynarmic, so needs to be added first)
 if (ARCHITECTURE_x86 OR ARCHITECTURE_x86_64)
-    if (PLATFORM_SUN)
+    if (PLATFORM_SUN OR PLATFORM_OPENBSD)
         AddJsonPackage(xbyak_sun)
     else()
         AddJsonPackage(xbyak)
diff --git a/src/core/debugger/debugger.cpp b/src/core/debugger/debugger.cpp
index 7fe22fdce2..460e0d19b4 100644
--- a/src/core/debugger/debugger.cpp
+++ b/src/core/debugger/debugger.cpp
@@ -7,7 +7,7 @@
 #include <boost/asio.hpp>
 #include <boost/version.hpp>
 
-#if BOOST_VERSION > 108300 && (!defined(_WINDOWS) && !defined(ANDROID)) || defined(YUZU_BOOST_v1)
+#if BOOST_VERSION > 108400 && (!defined(_WINDOWS) && !defined(ANDROID)) || defined(YUZU_BOOST_v1)
 #define USE_BOOST_v1
 #endif
 
diff --git a/src/dynarmic/CMakeLists.txt b/src/dynarmic/CMakeLists.txt
index d505d16553..842eb91a88 100644
--- a/src/dynarmic/CMakeLists.txt
+++ b/src/dynarmic/CMakeLists.txt
@@ -14,12 +14,24 @@ endif()
 
 # Dynarmic project options
 option(DYNARMIC_ENABLE_CPU_FEATURE_DETECTION "Turning this off causes dynarmic to assume the host CPU doesn't support anything later than SSE3" ON)
-option(DYNARMIC_ENABLE_NO_EXECUTE_SUPPORT "Enables support for systems that require W^X" OFF)
+
+if (PLATFORM_OPENBSD)
+    option(DYNARMIC_ENABLE_NO_EXECUTE_SUPPORT "Enables support for systems that require W^X" ON)
+else()
+    option(DYNARMIC_ENABLE_NO_EXECUTE_SUPPORT "Enables support for systems that require W^X" OFF)
+endif()
+
 option(DYNARMIC_FATAL_ERRORS "Errors are fatal" OFF)
 option(DYNARMIC_IGNORE_ASSERTS "Ignore asserts" OFF)
 option(DYNARMIC_TESTS_USE_UNICORN "Enable fuzzing tests against unicorn" OFF)
 option(DYNARMIC_USE_LLVM "Support disassembly of jitted x86_64 code using LLVM" OFF)
-option(DYNARMIC_USE_PRECOMPILED_HEADERS "Use precompiled headers" ON)
+
+if (PLATFORM_OPENBSD)
+    option(DYNARMIC_USE_PRECOMPILED_HEADERS "Use precompiled headers" OFF)
+else()
+    option(DYNARMIC_USE_PRECOMPILED_HEADERS "Use precompiled headers" ON)
+endif()
+
 option(DYNARMIC_INSTALL "Install dynarmic headers and CMake files" OFF)
 option(DYNARMIC_USE_BUNDLED_EXTERNALS "Use all bundled externals (useful when e.g. cross-compiling)" OFF)
 option(DYNARMIC_WARNINGS_AS_ERRORS "Warnings as errors" ${MASTER_PROJECT})
diff --git a/src/dynarmic/externals/CMakeLists.txt b/src/dynarmic/externals/CMakeLists.txt
index ba70797a84..26f9290ed8 100644
--- a/src/dynarmic/externals/CMakeLists.txt
+++ b/src/dynarmic/externals/CMakeLists.txt
@@ -60,13 +60,6 @@ AddJsonPackage(
 #     endif()
 # endif()
 
-# unordered_dense
-
-# AddJsonPackage(
-#     NAME unordered-dense
-#     BUNDLED_PACKAGE ${DYNARMIC_USE_BUNDLED_EXTERNALS}
-# )
-
 # xbyak
 # uncomment if in an independent repo
 
diff --git a/src/dynarmic/src/dynarmic/backend/x64/block_of_code.cpp b/src/dynarmic/src/dynarmic/backend/x64/block_of_code.cpp
index 5a33ac7727..d5d5f089ff 100644
--- a/src/dynarmic/src/dynarmic/backend/x64/block_of_code.cpp
+++ b/src/dynarmic/src/dynarmic/backend/x64/block_of_code.cpp
@@ -225,8 +225,14 @@ bool IsUnderRosetta() {
 
 }  // anonymous namespace
 
+#ifdef DYNARMIC_ENABLE_NO_EXECUTE_SUPPORT
+static const auto default_cg_mode = Xbyak::DontSetProtectRWE;
+#else
+static const auto default_cg_mode = nullptr; //Allow RWE
+#endif
+
 BlockOfCode::BlockOfCode(RunCodeCallbacks cb, JitStateInfo jsi, size_t total_code_size, std::function<void(BlockOfCode&)> rcp)
-        : Xbyak::CodeGenerator(total_code_size, nullptr, &s_allocator)
+        : Xbyak::CodeGenerator(total_code_size, default_cg_mode, &s_allocator)
         , cb(std::move(cb))
         , jsi(jsi)
         , constant_pool(*this, CONSTANT_POOL_SIZE)
diff --git a/src/dynarmic/src/dynarmic/common/spin_lock_x64.cpp b/src/dynarmic/src/dynarmic/common/spin_lock_x64.cpp
index 474c2f8404..7c0ba8a890 100644
--- a/src/dynarmic/src/dynarmic/common/spin_lock_x64.cpp
+++ b/src/dynarmic/src/dynarmic/common/spin_lock_x64.cpp
@@ -37,7 +37,7 @@ namespace {
 struct SpinLockImpl {
     void Initialize();
 
-    Xbyak::CodeGenerator code;
+    Xbyak::CodeGenerator code = Xbyak::CodeGenerator(4096, Xbyak::DontSetProtectRWE);
 
     void (*lock)(volatile int*);
     void (*unlock)(volatile int*);

From 2f82b63e6ad4f26544e74f86224f4e240933d602 Mon Sep 17 00:00:00 2001
From: lizzie <lizzie@eden-emu.dev>
Date: Mon, 8 Sep 2025 00:57:08 +0200
Subject: [PATCH 28/38] [user] prioritize 'user' directory if it exists
 (without needing a portable build) + docs (#338)

Signed-off-by: lizzie <lizzie@eden-emu.dev>
Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/338
Reviewed-by: crueter <crueter@eden-emu.dev>
Co-authored-by: lizzie <lizzie@eden-emu.dev>
Co-committed-by: lizzie <lizzie@eden-emu.dev>
---
 CMakeLists.txt              |  2 --
 docs/User.md                | 11 +++++++++++
 src/common/CMakeLists.txt   |  4 ----
 src/common/fs/fs_paths.h    |  4 ++--
 src/common/fs/path_util.cpp | 30 +++++++++++-------------------
 5 files changed, 24 insertions(+), 27 deletions(-)
 create mode 100644 docs/User.md

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 03f97eb7e5..5e3a45d8c5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -117,8 +117,6 @@ option(YUZU_ENABLE_LTO "Enable link-time optimization" OFF)
 
 option(YUZU_DOWNLOAD_TIME_ZONE_DATA "Always download time zone binaries" ON)
 
-option(YUZU_ENABLE_PORTABLE "Allow yuzu to enable portable mode if a user folder is found in the CWD" ON)
-
 CMAKE_DEPENDENT_OPTION(YUZU_USE_FASTER_LD "Check if a faster linker is available" ON "NOT WIN32" OFF)
 
 CMAKE_DEPENDENT_OPTION(USE_SYSTEM_MOLTENVK "Use the system MoltenVK lib (instead of the bundled one)" OFF "APPLE" OFF)
diff --git a/docs/User.md b/docs/User.md
new file mode 100644
index 0000000000..cfc81063f8
--- /dev/null
+++ b/docs/User.md
@@ -0,0 +1,11 @@
+# User configuration
+
+## Configuration directories
+
+Eden will store configuration in the following directories:
+
+- **Windows**: under `%AppData%` (the user's `AppData\Roaming` folder).
+- **Android**: Data is stored internally.
+- **Linux, macOS, FreeBSD, Solaris, OpenBSD**: `$XDG_DATA_HOME`, `$XDG_CACHE_HOME`, `$XDG_CONFIG_HOME`.
+
+If a `user` directory is present in the current working directory (on Windows: in the directory containing the executable), it overrides all of the global directories above and the emulator uses it instead.
diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt
index 1aa433db32..9b898837bc 100644
--- a/src/common/CMakeLists.txt
+++ b/src/common/CMakeLists.txt
@@ -162,10 +162,6 @@ add_library(
   zstd_compression.h
 )
 
-if(YUZU_ENABLE_PORTABLE)
-  add_compile_definitions(YUZU_ENABLE_PORTABLE)
-endif()
-
 if(WIN32)
   target_sources(common PRIVATE windows/timer_resolution.cpp
                                 windows/timer_resolution.h)
diff --git a/src/common/fs/fs_paths.h b/src/common/fs/fs_paths.h
index 40891380c9..5cdf9be39d 100644
--- a/src/common/fs/fs_paths.h
+++ b/src/common/fs/fs_paths.h
@@ -12,7 +12,6 @@
 #define PORTABLE_DIR "user"
 
 // Sub-directories contained within a yuzu data directory
-
 #define AMIIBO_DIR "amiibo"
 #define CACHE_DIR "cache"
 #define CONFIG_DIR "config"
@@ -28,11 +27,12 @@
 #define SHADER_DIR "shader"
 #define TAS_DIR "tas"
 #define ICONS_DIR "icons"
+
+// Compatibility with other emulators
 #define CITRON_DIR "citron"
 #define SUDACHI_DIR "sudachi"
 #define YUZU_DIR "yuzu"
 #define SUYU_DIR "suyu"
 
 // yuzu-specific files
-
 #define LOG_FILE "eden_log.txt"
diff --git a/src/common/fs/path_util.cpp b/src/common/fs/path_util.cpp
index fa1403225e..a2f5cb92ff 100644
--- a/src/common/fs/path_util.cpp
+++ b/src/common/fs/path_util.cpp
@@ -101,61 +101,53 @@ public:
         legacy_paths.insert_or_assign(legacy_path, new_path);
     }
 
+    /// On desktop platforms a portable "user" directory takes priority over the global
+    /// data directories: Windows looks for it in the executable's directory, all other
+    /// platforms look for it in the current working directory. It is only used if it
+    /// exists and is a directory; otherwise the AppData / XDG locations are used.
+    /// On Android the caller-supplied eden_path is used as-is and must not be empty.
     void Reinitialize(fs::path eden_path = {}) {
         fs::path eden_path_cache;
         fs::path eden_path_config;
-
 #ifdef _WIN32
-#ifdef YUZU_ENABLE_PORTABLE
+        // User directory takes priority over global %AppData% directory
         eden_path = GetExeDirectory() / PORTABLE_DIR;
-#endif
-        if (!IsDir(eden_path)) {
+        if (!Exists(eden_path) || !IsDir(eden_path)) {
             eden_path = GetAppDataRoamingDirectory() / EDEN_DIR;
         }
-
         eden_path_cache = eden_path / CACHE_DIR;
         eden_path_config = eden_path / CONFIG_DIR;
-
 #define LEGACY_PATH(titleName, upperName) GenerateLegacyPath(LegacyPath::titleName##Dir, GetAppDataRoamingDirectory() / upperName##_DIR); \
         GenerateLegacyPath(LegacyPath::titleName##ConfigDir, GetAppDataRoamingDirectory() / upperName##_DIR / CONFIG_DIR); \
         GenerateLegacyPath(LegacyPath::titleName##CacheDir, GetAppDataRoamingDirectory() / upperName##_DIR / CACHE_DIR);
-
         LEGACY_PATH(Citron, CITRON)
         LEGACY_PATH(Sudachi, SUDACHI)
         LEGACY_PATH(Yuzu, YUZU)
         LEGACY_PATH(Suyu, SUYU)
 #undef LEGACY_PATH
-
 #elif ANDROID
         ASSERT(!eden_path.empty());
         eden_path_cache = eden_path / CACHE_DIR;
         eden_path_config = eden_path / CONFIG_DIR;
 #else
-#ifdef YUZU_ENABLE_PORTABLE
         eden_path = GetCurrentDir() / PORTABLE_DIR;
-#endif
-        if (Exists(eden_path) && IsDir(eden_path)) {
-            eden_path_cache = eden_path / CACHE_DIR;
-            eden_path_config = eden_path / CONFIG_DIR;
-        } else {
+        if (!Exists(eden_path) || !IsDir(eden_path)) {
             eden_path = GetDataDirectory("XDG_DATA_HOME") / EDEN_DIR;
             eden_path_cache = GetDataDirectory("XDG_CACHE_HOME") / EDEN_DIR;
             eden_path_config = GetDataDirectory("XDG_CONFIG_HOME") / EDEN_DIR;
+        } else {
+            eden_path_cache = eden_path / CACHE_DIR;
+            eden_path_config = eden_path / CONFIG_DIR;
         }
-
 #define LEGACY_PATH(titleName, upperName) GenerateLegacyPath(LegacyPath::titleName##Dir, GetDataDirectory("XDG_DATA_HOME") / upperName##_DIR); \
         GenerateLegacyPath(LegacyPath::titleName##ConfigDir, GetDataDirectory("XDG_CONFIG_HOME") / upperName##_DIR); \
         GenerateLegacyPath(LegacyPath::titleName##CacheDir, GetDataDirectory("XDG_CACHE_HOME") / upperName##_DIR);
-
         LEGACY_PATH(Citron, CITRON)
         LEGACY_PATH(Sudachi, SUDACHI)
         LEGACY_PATH(Yuzu, YUZU)
         LEGACY_PATH(Suyu, SUYU)
-
 #undef LEGACY_PATH
-
 #endif
-
         GenerateEdenPath(EdenPath::EdenDir, eden_path);
         GenerateEdenPath(EdenPath::AmiiboDir, eden_path / AMIIBO_DIR);
         GenerateEdenPath(EdenPath::CacheDir, eden_path_cache);

From ecc99ce9ab54e4a14dc5f67aac932316a332b6a1 Mon Sep 17 00:00:00 2001
From: crueter <crueter@eden-emu.dev>
Date: Mon, 8 Sep 2025 02:37:55 +0200
Subject: [PATCH 29/38] [dynarmic] spinlock: (re-)allow RWE on
 execute-supported targets (#393)

regr. #273

Signed-off-by: crueter <crueter@eden-emu.dev>

Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/393
---
 src/dynarmic/src/dynarmic/common/spin_lock_x64.cpp | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/dynarmic/src/dynarmic/common/spin_lock_x64.cpp b/src/dynarmic/src/dynarmic/common/spin_lock_x64.cpp
index 7c0ba8a890..c949ed7de8 100644
--- a/src/dynarmic/src/dynarmic/common/spin_lock_x64.cpp
+++ b/src/dynarmic/src/dynarmic/common/spin_lock_x64.cpp
@@ -11,6 +11,12 @@
 #include "dynarmic/backend/x64/hostloc.h"
 #include "dynarmic/common/spin_lock.h"
 
+#ifdef DYNARMIC_ENABLE_NO_EXECUTE_SUPPORT
+static const auto default_cg_mode = Xbyak::DontSetProtectRWE;
+#else
+static const auto default_cg_mode = nullptr; //Allow RWE
+#endif
+
 namespace Dynarmic {
 
 void EmitSpinLockLock(Xbyak::CodeGenerator& code, Xbyak::Reg64 ptr, Xbyak::Reg32 tmp) {
@@ -37,7 +43,7 @@ namespace {
 struct SpinLockImpl {
     void Initialize();
 
-    Xbyak::CodeGenerator code = Xbyak::CodeGenerator(4096, Xbyak::DontSetProtectRWE);
+    Xbyak::CodeGenerator code = Xbyak::CodeGenerator(4096, default_cg_mode);
 
     void (*lock)(volatile int*);
     void (*unlock)(volatile int*);

From 428f136a754096e34c7bfec902684e2546a43ca4 Mon Sep 17 00:00:00 2001
From: crueter <crueter@crueter.xyz>
Date: Mon, 8 Sep 2025 19:21:38 +0200
Subject: [PATCH 30/38] [cmake] CPMUtil: formatting, git_host, new repos, more
 system deps, libusb (#392)

I promise I'm a UI developer

- mbedtls can now be used as a system package
- zycore can now be used as a system package
- cleaned up dynarmic externals
- fixed libusb incorrectly showing as bundled
- add version/tag formatting to JSON
- add custom GIT_HOST option for packages
- moved some of my repos to my new git
- slightly better version identification
- combined VUL/VH since they are codependent (using my combo vendor)
- fix cpmfile inclusion
- remove libusb submodule

This PR succeeds #383 since it includes it

Co-authored-by: SDK Chan <sdkchan@eden-emu.dev>
Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/392
Co-authored-by: crueter <crueter@crueter.xyz>
Co-committed-by: crueter <crueter@crueter.xyz>
---
 .ci/license-header.sh                       |  79 +++-
 .ci/license/header-hash.txt                 |   2 +
 .gitmodules                                 |   6 -
 CMakeLists.txt                              |  88 +----
 CMakeModules/CPMUtil.cmake                  |  66 +++-
 CMakeModules/Findmbedtls.cmake              |  17 +
 cpmfile.json                                |   4 +-
 docs/CPM.md                                 |  12 +-
 docs/build/Android.md                       |  84 ++---
 docs/build/FreeBSD.md                       | 164 ++++-----
 docs/build/Linux.md                         | 276 +++++++-------
 docs/build/Solaris.md                       | 100 ++---
 docs/build/Windows.md                       | 386 ++++++++++----------
 docs/build/macOS.md                         | 183 ++++------
 externals/CMakeLists.txt                    |  27 +-
 externals/cpmfile.json                      |  18 +-
 externals/ffmpeg/CMakeLists.txt             |   2 -
 externals/libusb/CMakeLists.txt             |  66 ++--
 externals/libusb/cpmfile.json               |   8 +
 externals/libusb/libusb                     |   1 -
 externals/nx_tzdb/CMakeLists.txt            |   2 -
 externals/nx_tzdb/cpmfile.json              |   5 +-
 src/CMakeLists.txt                          |  14 +-
 src/android/app/src/main/jni/CMakeLists.txt |   2 +-
 src/audio_core/CMakeLists.txt               |  12 +-
 src/core/CMakeLists.txt                     |   9 +-
 src/core/crypto/key_manager.cpp             |   7 +
 src/dedicated_room/CMakeLists.txt           |   5 +-
 src/dynarmic/CMakeLists.txt                 |  30 +-
 src/dynarmic/externals/CMakeLists.txt       |  50 +--
 src/dynarmic/externals/cpmfile.json         |   7 +-
 src/dynarmic/src/dynarmic/CMakeLists.txt    |   2 +-
 src/network/CMakeLists.txt                  |   5 +-
 src/video_core/CMakeLists.txt               |   6 +-
 src/yuzu/CMakeLists.txt                     |  30 +-
 src/yuzu/externals/CMakeLists.txt           |   2 -
 src/yuzu_cmd/CMakeLists.txt                 |   8 +-
 tools/cpm-fetch-all.sh                      |   3 +-
 tools/cpm-fetch.sh                          |  37 +-
 39 files changed, 921 insertions(+), 904 deletions(-)
 create mode 100644 .ci/license/header-hash.txt
 delete mode 100644 .gitmodules
 create mode 100644 CMakeModules/Findmbedtls.cmake
 create mode 100644 externals/libusb/cpmfile.json
 delete mode 160000 externals/libusb/libusb

diff --git a/.ci/license-header.sh b/.ci/license-header.sh
index d14d5adf42..fecffaa7d3 100755
--- a/.ci/license-header.sh
+++ b/.ci/license-header.sh
@@ -1,6 +1,7 @@
 #!/bin/sh -e
 
 HEADER="$(cat "$PWD/.ci/license/header.txt")"
+HEADER_HASH="$(cat "$PWD/.ci/license/header-hash.txt")"
 
 echo "Getting branch changes"
 
@@ -13,41 +14,86 @@ FILES=`git diff-tree --no-commit-id --name-only ${RANGE} -r`
 
 echo "Done"
 
+check_header() {
+    CONTENT="`head -n3 < $1`"
+    case "$CONTENT" in
+        "$HEADER"*) ;;
+        *) BAD_FILES="$BAD_FILES $1" ;;
+    esac
+}
+
+check_cmake_header() {
+    CONTENT="`head -n3 < $1`"
+
+    case "$CONTENT" in
+        "$HEADER_HASH"*) ;;
+        *)
+            BAD_CMAKE="$BAD_CMAKE $1" ;;
+    esac
+}
 for file in $FILES; do
     [ -f "$file" ] || continue
 
+    if [ `basename -- "$file"` = "CMakeLists.txt" ]; then
+        check_cmake_header "$file"
+        continue
+    fi
+
     EXTENSION="${file##*.}"
     case "$EXTENSION" in
         kts|kt|cpp|h)
-            CONTENT="`cat $file`"
-            case "$CONTENT" in
-                "$HEADER"*) ;;
-                *) BAD_FILES="$BAD_FILES $file" ;;
-            esac
+            check_header "$file"
+            ;;
+        cmake)
+            check_cmake_header "$file"
             ;;
     esac
 done
 
-if [ "$BAD_FILES" = "" ]; then
+if [ "$BAD_FILES" = "" ] && [ "$BAD_CMAKE" = "" ]; then
     echo
     echo "All good."
 
     exit
 fi
 
-echo "The following files have incorrect license headers:"
-echo
+if [ "$BAD_FILES" != "" ]; then
+    echo "The following source files have incorrect license headers:"
+    echo
 
-for file in $BAD_FILES; do echo $file; done
+    for file in $BAD_FILES; do echo $file; done
 
-cat << EOF
+    cat << EOF
 
-The following license header should be added to the start of all offending files:
+The following license header should be added to the start of all offending SOURCE files:
 
 === BEGIN ===
 $HEADER
 ===  END  ===
 
+EOF
+
+fi
+
+if [ "$BAD_CMAKE" != "" ]; then
+    echo "The following CMake files have incorrect license headers:"
+    echo
+
+    for file in $BAD_CMAKE; do echo $file; done
+
+    cat << EOF
+
+The following license header should be added to the start of all offending CMake files:
+
+=== BEGIN ===
+$HEADER_HASH
+===  END  ===
+
+EOF
+
+fi
+
+cat << EOF
 If some of the code in this PR is not being contributed by the original author,
 the files which have been exclusively changed by that code can be ignored.
 If this happens, this PR requirement can be bypassed once all other files are addressed.
@@ -70,6 +116,17 @@ if [ "$FIX" = "true" ]; then
         git add $file
     done
 
+    for file in $BAD_CMAKE; do
+        cat $file > $file.bak
+
+        cat .ci/license/header-hash.txt > $file
+        echo >> $file
+        cat $file.bak >> $file
+
+        rm $file.bak
+
+        git add $file
+    done
     echo "License headers fixed."
 
     if [ "$COMMIT" = "true" ]; then
diff --git a/.ci/license/header-hash.txt b/.ci/license/header-hash.txt
new file mode 100644
index 0000000000..91bc195e23
--- /dev/null
+++ b/.ci/license/header-hash.txt
@@ -0,0 +1,2 @@
+# SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
+# SPDX-License-Identifier: GPL-3.0-or-later
diff --git a/.gitmodules b/.gitmodules
deleted file mode 100644
index 94ac4d33f3..0000000000
--- a/.gitmodules
+++ /dev/null
@@ -1,6 +0,0 @@
-# SPDX-FileCopyrightText: 2014 Citra Emulator Project
-# SPDX-License-Identifier: GPL-2.0-or-later
-
-[submodule "libusb"]
-	path = externals/libusb/libusb
-	url = https://github.com/libusb/libusb.git
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5e3a45d8c5..9e23f8f87f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -48,12 +48,10 @@ endif()
 # On Linux system SDL2 is likely to be lacking HIDAPI support which have drawbacks but is needed for SDL motion
 CMAKE_DEPENDENT_OPTION(ENABLE_SDL2 "Enable the SDL2 frontend" ON "NOT ANDROID" OFF)
 
-set(EXT_DEFAULT ON)
+set(EXT_DEFAULT OFF)
 
-# See https://github.com/llvm/llvm-project/issues/123946
-# OpenBSD va_list doesn't play nice with precompiled headers
-if (PLATFORM_FREEBSD OR PLATFORM_OPENBSD)
-    set(EXT_DEFAULT OFF)
+if (MSVC OR ANDROID)
+    set(EXT_DEFAULT ON)
 endif()
 
 CMAKE_DEPENDENT_OPTION(YUZU_USE_EXTERNAL_SDL2 "Compile external SDL2" ${EXT_DEFAULT} "ENABLE_SDL2;NOT MSVC" OFF)
@@ -69,14 +67,13 @@ option(ENABLE_QT_UPDATE_CHECKER "Enable update checker for the Qt frontend" OFF)
 
 CMAKE_DEPENDENT_OPTION(YUZU_USE_BUNDLED_QT "Download bundled Qt binaries" "${MSVC}" "ENABLE_QT" OFF)
 
-option(YUZU_USE_CPM "Use CPM to fetch Eden dependencies if needed" ON)
+option(YUZU_USE_CPM "Use CPM to fetch system dependencies (fmt, boost, etc) if needed. Externals will still be fetched." ${EXT_DEFAULT})
 
 option(ENABLE_WEB_SERVICE "Enable web services (telemetry, etc.)" ON)
 option(ENABLE_WIFI_SCAN "Enable WiFi scanning" OFF)
 
 option(YUZU_USE_BUNDLED_FFMPEG "Download/Build bundled FFmpeg" ${EXT_DEFAULT})
-option(YUZU_USE_EXTERNAL_VULKAN_HEADERS "Use Vulkan-Headers from externals" ${EXT_DEFAULT})
-option(YUZU_USE_EXTERNAL_VULKAN_UTILITY_LIBRARIES "Use Vulkan-Utility-Libraries from externals" ${EXT_DEFAULT})
+option(YUZU_USE_EXTERNAL_VULKAN_UTILITY_LIBRARIES "Use Vulkan Utility Headers from externals" ${EXT_DEFAULT})
 option(YUZU_USE_EXTERNAL_VULKAN_SPIRV_TOOLS "Use SPIRV-Tools from externals" ${EXT_DEFAULT})
 
 option(YUZU_USE_QT_MULTIMEDIA "Use QtMultimedia for Camera" OFF)
@@ -95,10 +92,12 @@ option(YUZU_TESTS "Compile tests" "${BUILD_TESTING}")
 
 option(YUZU_USE_PRECOMPILED_HEADERS "Use precompiled headers" ${EXT_DEFAULT})
 
+# TODO(crueter): CI this?
 option(YUZU_DOWNLOAD_ANDROID_VVL "Download validation layer binary for android" ON)
 
 option(FORCE_DOWNLOAD_WIN_BUNDLES "Forcefully download bundled Windows dependencies (useful for CI)" OFF)
 
+# TODO(crueter): Cleanup, each dep that has a bundled option should allow to choose between bundled, external, system
 if (YUZU_USE_CPM AND ENABLE_SDL2)
     option(YUZU_USE_BUNDLED_SDL2 "Download bundled SDL2 build" "${MSVC}")
 endif()
@@ -107,12 +106,10 @@ CMAKE_DEPENDENT_OPTION(YUZU_ROOM "Enable dedicated room functionality" ON "NOT A
 
 CMAKE_DEPENDENT_OPTION(YUZU_ROOM_STANDALONE "Enable standalone room executable" ON "YUZU_ROOM" OFF)
 
-CMAKE_DEPENDENT_OPTION(YUZU_CMD "Compile the eden-cli executable" ON "NOT ANDROID" OFF)
+CMAKE_DEPENDENT_OPTION(YUZU_CMD "Compile the eden-cli executable" ON "ENABLE_SDL2;NOT ANDROID" OFF)
 
 CMAKE_DEPENDENT_OPTION(YUZU_CRASH_DUMPS "Compile crash dump (Minidump) support" OFF "WIN32 OR LINUX" OFF)
 
-option(YUZU_CHECK_SUBMODULES "Check if submodules are present" ${EXT_DEFAULT})
-
 option(YUZU_ENABLE_LTO "Enable link-time optimization" OFF)
 
 option(YUZU_DOWNLOAD_TIME_ZONE_DATA "Always download time zone binaries" ON)
@@ -194,53 +191,6 @@ if(EXISTS ${PROJECT_SOURCE_DIR}/hooks/pre-commit AND NOT EXISTS ${PROJECT_SOURCE
     endif()
 endif()
 
-# Sanity check : Check that all submodules are present
-# =======================================================================
-
-function(check_submodules_present)
-    file(READ "${PROJECT_SOURCE_DIR}/.gitmodules" gitmodules)
-    string(REGEX MATCHALL "path *= *[^ \t\r\n]*" gitmodules ${gitmodules})
-    foreach(module ${gitmodules})
-        string(REGEX REPLACE "path *= *" "" module ${module})
-
-        file(GLOB RESULT "${PROJECT_SOURCE_DIR}/${module}/*")
-        list(LENGTH RESULT RES_LEN)
-        if(RES_LEN EQUAL 0)
-            message(FATAL_ERROR "Git submodule ${module} not found. "
-                "Please run: \ngit submodule update --init --recursive")
-        endif()
-        if (EXISTS "${PROJECT_SOURCE_DIR}/${module}/.git")
-            set(SUBMODULE_DIR "${PROJECT_SOURCE_DIR}/${module}")
-
-            execute_process(
-                COMMAND git rev-parse --short=10 HEAD
-                WORKING_DIRECTORY ${SUBMODULE_DIR}
-                OUTPUT_VARIABLE SUBMODULE_SHA
-            )
-
-            # would probably be better to do string parsing, but whatever
-            execute_process(
-                COMMAND git remote get-url origin
-                WORKING_DIRECTORY ${SUBMODULE_DIR}
-                OUTPUT_VARIABLE SUBMODULE_URL
-            )
-
-            string(REGEX REPLACE "\n|\r" "" SUBMODULE_SHA ${SUBMODULE_SHA})
-            string(REGEX REPLACE "\n|\r|\\.git" "" SUBMODULE_URL ${SUBMODULE_URL})
-
-            get_filename_component(SUBMODULE_NAME ${SUBMODULE_DIR} NAME)
-
-            set_property(GLOBAL APPEND PROPERTY CPM_PACKAGE_NAMES ${SUBMODULE_NAME})
-            set_property(GLOBAL APPEND PROPERTY CPM_PACKAGE_SHAS ${SUBMODULE_SHA})
-            set_property(GLOBAL APPEND PROPERTY CPM_PACKAGE_URLS ${SUBMODULE_URL})
-        endif()
-    endforeach()
-endfunction()
-
-if(EXISTS ${PROJECT_SOURCE_DIR}/.gitmodules AND YUZU_CHECK_SUBMODULES)
-    check_submodules_present()
-endif()
-
 configure_file(${PROJECT_SOURCE_DIR}/dist/compatibility_list/compatibility_list.qrc
     ${PROJECT_BINARY_DIR}/dist/compatibility_list/compatibility_list.qrc
     COPYONLY)
@@ -277,7 +227,7 @@ function(detect_architecture symbol arch)
         if (ARCHITECTURE_${arch})
             set(ARCHITECTURE "${arch}" PARENT_SCOPE)
             set(ARCHITECTURE_${arch} 1 PARENT_SCOPE)
-            add_definitions(-DARCHITECTURE_${arch}=1)
+            add_compile_definitions(ARCHITECTURE_${arch}=1)
         endif()
     endif()
 endfunction()
@@ -299,7 +249,7 @@ endif()
 if (NOT DEFINED ARCHITECTURE)
     set(ARCHITECTURE "GENERIC")
     set(ARCHITECTURE_GENERIC 1)
-    add_definitions(-DARCHITECTURE_GENERIC=1)
+    add_compile_definitions(ARCHITECTURE_GENERIC=1)
 endif()
 
 message(STATUS "Target architecture: ${ARCHITECTURE}")
@@ -311,16 +261,16 @@ if (MSVC AND ARCHITECTURE_x86)
 endif()
 
 if (UNIX)
-    add_definitions(-DYUZU_UNIX=1)
+    add_compile_definitions(YUZU_UNIX=1)
 endif()
 
 if (ARCHITECTURE_arm64 AND (ANDROID OR PLATFORM_LINUX))
     set(HAS_NCE 1)
-    add_definitions(-DHAS_NCE=1)
+    add_compile_definitions(HAS_NCE=1)
 endif()
 
 if (YUZU_ROOM)
-    add_definitions(-DYUZU_ROOM)
+    add_compile_definitions(YUZU_ROOM)
 endif()
 
 # Build/optimization presets
@@ -489,14 +439,6 @@ if(NOT TARGET Boost::headers)
     AddJsonPackage(boost_headers)
 endif()
 
-if (ENABLE_LIBUSB)
-    if (PLATFORM_FREEBSD)
-        find_package(libusb MODULE)
-    else()
-        find_package(libusb 1.0.24 MODULE)
-    endif()
-endif()
-
 # DiscordRPC
 if (USE_DISCORD_PRESENCE)
     AddJsonPackage(discord-rpc)
@@ -601,8 +543,8 @@ endfunction()
 add_subdirectory(externals)
 
 # pass targets from externals
-find_package(VulkanHeaders)
 find_package(VulkanUtilityLibraries)
+find_package(libusb)
 find_package(VulkanMemoryAllocator)
 find_package(SPIRV-Tools)
 
@@ -736,7 +678,7 @@ if (APPLE)
     list(APPEND PLATFORM_LIBRARIES ${ICONV_LIBRARY})
 elseif (WIN32)
     # Target Windows 10
-    add_definitions(-D_WIN32_WINNT=0x0A00 -DWINVER=0x0A00)
+    add_compile_definitions(_WIN32_WINNT=0x0A00 WINVER=0x0A00)
     set(PLATFORM_LIBRARIES winmm ws2_32 iphlpapi)
     if (MINGW)
         # PSAPI is the Process Status API
diff --git a/CMakeModules/CPMUtil.cmake b/CMakeModules/CPMUtil.cmake
index 9daada47ad..db9cce4c66 100644
--- a/CMakeModules/CPMUtil.cmake
+++ b/CMakeModules/CPMUtil.cmake
@@ -11,10 +11,11 @@
 # Future crueter: Wow this was a lie and a half, at this point I might as well make my own CPN
 # haha just kidding... unless?
 
+# TODO(crueter): Remember to get more than 6 hours of sleep whenever making giant cmake changes
 if (MSVC OR ANDROID)
-    set(BUNDLED_DEFAULT OFF)
-else()
     set(BUNDLED_DEFAULT ON)
+else()
+    set(BUNDLED_DEFAULT OFF)
 endif()
 
 option(CPMUTIL_FORCE_BUNDLED
@@ -26,8 +27,7 @@ option(CPMUTIL_FORCE_SYSTEM
 cmake_minimum_required(VERSION 3.22)
 include(CPM)
 
-# TODO(crueter): Better solution for separate cpmfiles e.g. per-directory
-set(CPMUTIL_JSON_FILE "${CMAKE_CURRENT_SOURCE_DIR}/cpmfile.json" CACHE STRING "Location of cpmfile.json")
+set(CPMUTIL_JSON_FILE "${CMAKE_CURRENT_SOURCE_DIR}/cpmfile.json")
 
 if (EXISTS ${CPMUTIL_JSON_FILE})
     file(READ ${CPMUTIL_JSON_FILE} CPMFILE_CONTENT)
@@ -148,11 +148,32 @@ function(AddJsonPackage)
     get_json_element("${object}" tag tag "")
     get_json_element("${object}" artifact artifact "")
     get_json_element("${object}" git_version git_version "")
+    get_json_element("${object}" git_host git_host "")
     get_json_element("${object}" source_subdir source_subdir "")
     get_json_element("${object}" bundled bundled "unset")
     get_json_element("${object}" find_args find_args "")
     get_json_element("${object}" raw_patches patches "")
 
+    # okay here comes the fun part: REPLACEMENTS!
+    # first: tag gets %VERSION% replaced if applicable, with either git_version (preferred) or version
+    # second: artifact gets %VERSION% and %TAG% replaced accordingly (same rules for VERSION)
+
+    if (git_version)
+        set(version_replace ${git_version})
+    else()
+        set(version_replace ${version})
+    endif()
+
+    # TODO(crueter): fmt module for cmake
+    if (tag)
+        string(REPLACE "%VERSION%" "${version_replace}" tag ${tag})
+    endif()
+
+    if (artifact)
+        string(REPLACE "%VERSION%" "${version_replace}" artifact ${artifact})
+        string(REPLACE "%TAG%" "${tag}" artifact ${artifact})
+    endif()
+
     # format patchdir
     if (raw_patches)
         math(EXPR range "${raw_patches_LENGTH} - 1")
@@ -201,6 +222,8 @@ function(AddJsonPackage)
         SOURCE_SUBDIR "${source_subdir}"
 
         GIT_VERSION ${git_version}
+        GIT_HOST ${git_host}
+
         ARTIFACT ${artifact}
         TAG ${tag}
     )
@@ -240,6 +263,7 @@ function(AddPackage)
         NAME
         VERSION
         GIT_VERSION
+        GIT_HOST
 
         REPO
         TAG
@@ -272,11 +296,17 @@ function(AddPackage)
     option(${PKG_ARGS_NAME}_FORCE_SYSTEM "Force the system package for ${PKG_ARGS_NAME}")
     option(${PKG_ARGS_NAME}_FORCE_BUNDLED "Force the bundled package for ${PKG_ARGS_NAME}")
 
+    if (NOT DEFINED PKG_ARGS_GIT_HOST)
+        set(git_host github.com)
+    else()
+        set(git_host ${PKG_ARGS_GIT_HOST})
+    endif()
+
     if (DEFINED PKG_ARGS_URL)
         set(pkg_url ${PKG_ARGS_URL})
 
         if (DEFINED PKG_ARGS_REPO)
-            set(pkg_git_url https://github.com/${PKG_ARGS_REPO})
+            set(pkg_git_url https://${git_host}/${PKG_ARGS_REPO})
         else()
             if (DEFINED PKG_ARGS_GIT_URL)
                 set(pkg_git_url ${PKG_ARGS_GIT_URL})
@@ -285,7 +315,7 @@ function(AddPackage)
             endif()
         endif()
     elseif (DEFINED PKG_ARGS_REPO)
-        set(pkg_git_url https://github.com/${PKG_ARGS_REPO})
+        set(pkg_git_url https://${git_host}/${PKG_ARGS_REPO})
 
         if (DEFINED PKG_ARGS_TAG)
             set(pkg_key ${PKG_ARGS_TAG})
@@ -316,25 +346,23 @@ function(AddPackage)
 
     cpm_utils_message(STATUS ${PKG_ARGS_NAME} "Download URL is ${pkg_url}")
 
-    if (DEFINED PKG_ARGS_GIT_VERSION)
-        set(git_version ${PKG_ARGS_GIT_VERSION})
-    elseif(DEFINED PKG_ARGS_VERSION)
-        set(git_version ${PKG_ARGS_VERSION})
-    endif()
-
     if (NOT DEFINED PKG_ARGS_KEY)
         if (DEFINED PKG_ARGS_SHA)
             string(SUBSTRING ${PKG_ARGS_SHA} 0 4 pkg_key)
             cpm_utils_message(DEBUG ${PKG_ARGS_NAME}
                         "No custom key defined, using ${pkg_key} from sha")
-        elseif (DEFINED git_version)
-            set(pkg_key ${git_version})
+        elseif(DEFINED PKG_ARGS_GIT_VERSION)
+            set(pkg_key ${PKG_ARGS_GIT_VERSION})
             cpm_utils_message(DEBUG ${PKG_ARGS_NAME}
                         "No custom key defined, using ${pkg_key}")
         elseif (DEFINED PKG_ARGS_TAG)
             set(pkg_key ${PKG_ARGS_TAG})
             cpm_utils_message(DEBUG ${PKG_ARGS_NAME}
                         "No custom key defined, using ${pkg_key}")
+        elseif (DEFINED PKG_ARGS_VERSION)
+            set(pkg_key ${PKG_ARGS_VERSION})
+            cpm_utils_message(DEBUG ${PKG_ARGS_NAME}
+                        "No custom key defined, using ${pkg_key}")
         else()
             cpm_utils_message(WARNING ${PKG_ARGS_NAME}
                         "Could not determine cache key, using CPM defaults")
@@ -445,12 +473,15 @@ function(AddPackage)
         if (DEFINED PKG_ARGS_SHA)
             set_property(GLOBAL APPEND PROPERTY CPM_PACKAGE_SHAS
                          ${PKG_ARGS_SHA})
-        elseif(DEFINED git_version)
-            set_property(GLOBAL APPEND PROPERTY CPM_PACKAGE_SHAS
-                         ${git_version})
+        elseif (DEFINED PKG_ARGS_GIT_VERSION)
+            set_property(GLOBAL APPEND PROPERTY CPM_PACKAGE_SHAS
+                         ${PKG_ARGS_GIT_VERSION})
         elseif (DEFINED PKG_ARGS_TAG)
             set_property(GLOBAL APPEND PROPERTY CPM_PACKAGE_SHAS
                          ${PKG_ARGS_TAG})
+        elseif(DEFINED PKG_ARGS_VERSION)
+            set_property(GLOBAL APPEND PROPERTY CPM_PACKAGE_SHAS
+                         ${PKG_ARGS_VERSION})
         else()
             cpm_utils_message(WARNING ${PKG_ARGS_NAME}
                         "Package has no specified sha, tag, or version")
@@ -495,6 +526,7 @@ function(add_ci_package key)
     set(ARTIFACT_DIR ${${ARTIFACT_PACKAGE}_SOURCE_DIR} PARENT_SCOPE)
 endfunction()
 
+# TODO(crueter): we could do an AddMultiArchPackage, multiplatformpackage?
 # name is the artifact name, package is for find_package override
 function(AddCIPackage)
     set(oneValueArgs
diff --git a/CMakeModules/Findmbedtls.cmake b/CMakeModules/Findmbedtls.cmake
new file mode 100644
index 0000000000..f5ebf1abdc
--- /dev/null
+++ b/CMakeModules/Findmbedtls.cmake
@@ -0,0 +1,17 @@
+# SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
+# SPDX-License-Identifier: GPL-3.0-or-later
+
+include(FindPackageHandleStandardArgs)
+
+find_package(PkgConfig QUIET)
+pkg_search_module(mbedtls QUIET IMPORTED_TARGET mbedtls)
+find_package_handle_standard_args(mbedtls
+    REQUIRED_VARS mbedtls_LINK_LIBRARIES
+    VERSION_VAR mbedtls_VERSION
+)
+
+pkg_search_module(mbedcrypto QUIET IMPORTED_TARGET mbedcrypto)
+find_package_handle_standard_args(mbedcrypto
+    REQUIRED_VARS mbedcrypto_LINK_LIBRARIES
+    VERSION_VAR mbedcrypto_VERSION NAME_MISMATCHED # name intentionally differs from the calling mbedtls find module
+)
diff --git a/cpmfile.json b/cpmfile.json
index 495382fed0..c720b69e89 100644
--- a/cpmfile.json
+++ b/cpmfile.json
@@ -10,8 +10,8 @@
     "boost": {
         "package": "Boost",
         "repo": "boostorg/boost",
-        "tag": "boost-1.88.0",
-        "artifact": "boost-1.88.0-cmake.7z",
+        "tag": "boost-%VERSION%",
+        "artifact": "%TAG%-cmake.7z",
         "hash": "e5b049e5b61964480ca816395f63f95621e66cb9bcf616a8b10e441e0e69f129e22443acb11e77bc1e8170f8e4171b9b7719891efc43699782bfcd4b3a365f01",
         "git_version": "1.88.0",
         "version": "1.57"
diff --git a/docs/CPM.md b/docs/CPM.md
index f90002891c..2afcdaf164 100644
--- a/docs/CPM.md
+++ b/docs/CPM.md
@@ -23,7 +23,7 @@ CPMUtil is a wrapper around CPM that aims to reduce boilerplate and add useful u
 
 - `NAME` (required): The package name (must be the same as the `find_package` name if applicable)
 - `VERSION`: The minimum version of this package that can be used on the system
-- `GIT_VERSION`: The version found within git, only used for identification
+- `GIT_VERSION`: The "version" found within git
 - `URL`: The URL to fetch.
 - `REPO`: The GitHub repo to use (`owner/repo`).
   * Only GitHub is supported for now, though other platforms will see support at some point
@@ -71,8 +71,9 @@ Hashing strategies, descending order of precedence:
 - `KEY`: Custom cache key to use (stored as `.cache/cpm/${packagename_lower}/${key}`)
   * Default is based on, in descending order of precedence:
     - First 4 characters of the sha
-    - `GIT_VERSION`, or `VERSION` if not specified
+    - `GIT_VERSION`
     - Tag
+    - `VERSION`
     - Otherwise, CPM defaults will be used. This is not recommended as it doesn't produce reproducible caches
 - `DOWNLOAD_ONLY`: Whether or not to configure the downloaded package via CMake
   * Useful to turn `OFF` if the project doesn't use CMake
@@ -232,12 +233,9 @@ In order: OpenSSL CI, Boost (tag + artifact), discord-rpc (sha + options + patch
 To include CPMUtil:
 
 ```cmake
-set(CPMUTIL_JSON_FILE ${CMAKE_CURRENT_SOURCE_DIR}/cpmfile.json)
 include(CPMUtil)
 ```
 
-You may omit the first line if you are not utilizing cpmfile.
-
 ## Prefetching
 
 - To prefetch a CPM dependency (requires cpmfile):
@@ -245,8 +243,8 @@ You may omit the first line if you are not utilizing cpmfile.
 - To prefetch all CPM dependencies:
   * `tools/cpm-fetch-all.sh`
 
-Currently, `cpm-fetch.sh` defines the following directories for cpmfiles:
+Currently, `cpm-fetch.sh` defines the following directories for cpmfiles (max depth of 2, so subdirs are caught as well):
 
-`externals src/yuzu/externals externals/ffmpeg src/dynarmic/externals externals/nx_tzdb`
+`externals src/yuzu src/dynarmic .`
 
 Whenever you add a new cpmfile, update the script accordingly
\ No newline at end of file
diff --git a/docs/build/Android.md b/docs/build/Android.md
index 4bb1c868b6..0538d351ea 100644
--- a/docs/build/Android.md
+++ b/docs/build/Android.md
@@ -1,42 +1,42 @@
-# Note: These build instructions are a work-in-progress.
-
-## Dependencies
-* [Android Studio](https://developer.android.com/studio)
-* [NDK 25.2.9519653 and CMake 3.22.1](https://developer.android.com/studio/projects/install-ndk#default-version)
-* [Git](https://git-scm.com/download)
-
-### WINDOWS ONLY - Additional Dependencies
-  * **[Visual Studio 2022 Community](https://visualstudio.microsoft.com/downloads/)** - **Make sure to select "Desktop development with C++" support in the installer. Make sure to update to the latest version if already installed.**
-  * **[Vulkan SDK](https://vulkan.lunarg.com/sdk/home#windows)** - **Make sure to select Latest SDK.**
-    - A convenience script to install the latest SDK is provided in `.ci\windows\install-vulkan-sdk.ps1`.
-
-## Cloning Eden with Git
-```
-git clone --recursive https://git.eden-emu.dev/eden-emu/eden.git
-```
-Eden by default will be cloned into -
-* `C:\Users\<user-name>\eden` on Windows
-* `~/eden` on Linux
-* And wherever on macOS
-
-## Building
-1. Start Android Studio, on the startup dialog select `Open`.
-2. Navigate to the `eden/src/android` directory and click on `OK`.
-3. In `Build > Select Build Variant`, select `release` or `relWithDebInfo` as the "Active build variant".
-4. Build the project with `Build > Make Project` or run it on an Android device with `Run > Run 'app'`.
-
-## Building with Terminal
-1. Download the SDK and NDK from Android Studio.
-2. Navigate to SDK and NDK paths.
-3. Then set ANDROID_SDK_ROOT and ANDROID_NDK_ROOT in terminal via
-`export ANDROID_SDK_ROOT=path/to/sdk`
-`export ANDROID_NDK_ROOT=path/to/ndk`.
-4. Navigate to `eden/src/android`.
-5. Then Build with `./gradlew assemblerelWithDebInfo`.
-6. To build the optimised build use `./gradlew assembleGenshinSpoofRelWithDebInfo`.
-
-### Script
-A convenience script for building is provided in `.ci/android/build.sh`. The built APK can be put into an `artifacts` directory via `.ci/android/package.sh`. On Windows, these must be done in the Git Bash or MinGW terminal.
-
-### Additional Resources
-https://developer.android.com/studio/intro
+# Note: These build instructions are a work-in-progress.
+
+## Dependencies
+* [Android Studio](https://developer.android.com/studio)
+* [NDK 25.2.9519653 and CMake 3.22.1](https://developer.android.com/studio/projects/install-ndk#default-version)
+* [Git](https://git-scm.com/download)
+
+### WINDOWS ONLY - Additional Dependencies
+  * **[Visual Studio 2022 Community](https://visualstudio.microsoft.com/downloads/)** - **Make sure to select "Desktop development with C++" support in the installer. Make sure to update to the latest version if already installed.**
+  * **[Vulkan SDK](https://vulkan.lunarg.com/sdk/home#windows)** - **Make sure to select Latest SDK.**
+    - A convenience script to install the latest SDK is provided in `.ci\windows\install-vulkan-sdk.ps1`.
+
+## Cloning Eden with Git
+```
+git clone --recursive https://git.eden-emu.dev/eden-emu/eden.git
+```
+Eden by default will be cloned into -
+* `C:\Users\<user-name>\eden` on Windows
+* `~/eden` on Linux
+* And wherever on macOS
+
+## Building
+1. Start Android Studio, on the startup dialog select `Open`.
+2. Navigate to the `eden/src/android` directory and click on `OK`.
+3. In `Build > Select Build Variant`, select `release` or `relWithDebInfo` as the "Active build variant".
+4. Build the project with `Build > Make Project` or run it on an Android device with `Run > Run 'app'`.
+
+## Building with Terminal
+1. Download the SDK and NDK from Android Studio.
+2. Navigate to SDK and NDK paths.
+3. Then set ANDROID_SDK_ROOT and ANDROID_NDK_ROOT in terminal via
+`export ANDROID_SDK_ROOT=path/to/sdk`
+`export ANDROID_NDK_ROOT=path/to/ndk`.
+4. Navigate to `eden/src/android`.
+5. Then Build with `./gradlew assemblerelWithDebInfo`.
+6. To build the optimised build use `./gradlew assembleGenshinSpoofRelWithDebInfo`.
+
+### Script
+A convenience script for building is provided in `.ci/android/build.sh`. The built APK can be put into an `artifacts` directory via `.ci/android/package.sh`. On Windows, these must be done in the Git Bash or MinGW terminal.
+
+### Additional Resources
+https://developer.android.com/studio/intro
diff --git a/docs/build/FreeBSD.md b/docs/build/FreeBSD.md
index 475378125c..97eef8f9d8 100644
--- a/docs/build/FreeBSD.md
+++ b/docs/build/FreeBSD.md
@@ -1,85 +1,81 @@
-## One word of caution before proceeding.
-
-This is not the usual or preferred way to build programs on FreeBSD.
-As of writing there is no official fresh port available for Eden, but it is in the works.
-After it is available you can find a link to the eden-emu fresh port here and on Escary's github repo.
-See this build as an AppImage alternative for FreeBSD.
-
-## Dependencies.
-Before we start we need some dependencies.
-These dependencies are generally needed to build Eden on FreeBSD.
-
-```
-devel/cmake
-devel/sdl20
-devel/boost-libs
-devel/catch2
-devel/libfmt
-devel/nlohmann-json
-devel/ninja
-devel/nasm
-devel/autoconf
-devel/pkgconf
-devel/qt6-base
-
-multimedia/ffnvcodec-headers
-multimedia/ffmpeg
-
-audio/opus
-
-archivers/liblz4
-
-lang/gcc12
-
-graphics/glslang
-graphics/vulkan-utility-libraries
-```
-
-If using FreeBSD 12 or prior, use `devel/pkg-config` instead.
-
----
-
-### Build preparations:
-Run the following command to clone eden with git:
-```sh
-git clone --recursive https://git.eden-emu.dev/eden-emu/eden
-```
-You usually want to add the `--recursive` parameter as it also takes care of the external dependencies for you.
-
-Now change into the eden directory and create a build directory there:
-```sh
-cd eden
-mkdir build
-```
-
-Change into that build directory:
-```sh
-cd build
-```
-
-#### 1. Building in Release Mode (usually preferred and the most performant choice):
-```sh
-cmake .. -GNinja -DYUZU_TESTS=OFF
-```
-
-#### 2. Building in Release Mode with debugging symbols (useful if you want to debug errors for a eventual fix):
-```sh
-cmake .. -GNinja -DCMAKE_BUILD_TYPE=RelWithDebInfo -DYUZU_TESTS=ON
-```
-
-Build the emulator locally:
-```sh
-ninja
-```
-
-Optional: If you wish to install eden globally onto your system issue the following command:
-```sh
-sudo ninja install
-```
-OR
-```sh
-doas -- ninja install
-```
-
-## OpenSSL
+Eden is not currently available as a port on FreeBSD, though it is in the works. For now, the recommended method of usage is to compile it yourself. Check back often, as the build process frequently changes.
+
+## Dependencies.
+Eden needs the following dependencies:
+
+```
+devel/cmake
+devel/sdl20
+devel/boost-libs
+devel/catch2
+devel/libfmt
+devel/nlohmann-json
+devel/ninja
+devel/nasm
+devel/autoconf
+devel/pkgconf
+devel/qt6-base
+
+net/enet
+
+multimedia/ffnvcodec-headers
+multimedia/ffmpeg
+
+audio/opus
+
+archivers/liblz4
+
+lang/gcc12
+
+graphics/glslang
+graphics/vulkan-utility-libraries
+```
+
+If using FreeBSD 12 or prior, use `devel/pkg-config` instead.
+
+---
+
+### Build preparations:
+Run the following command to clone eden with git:
+```sh
+git clone --recursive https://git.eden-emu.dev/eden-emu/eden
+```
+You usually want to add the `--recursive` parameter as it also takes care of the external dependencies for you.
+
+Now change into the eden directory and create a build directory there:
+```sh
+cd eden
+mkdir build
+```
+
+Change into that build directory:
+```sh
+cd build
+```
+
+#### 1. Building in Release Mode (usually preferred and the most performant choice):
+```sh
+cmake .. -GNinja -DYUZU_TESTS=OFF
+```
+
+#### 2. Building in Release Mode with debugging symbols (useful if you want to debug errors for an eventual fix):
+```sh
+cmake .. -GNinja -DCMAKE_BUILD_TYPE=RelWithDebInfo -DYUZU_TESTS=ON
+```
+
+Build the emulator locally:
+```sh
+ninja
+```
+
+Optional: If you wish to install eden globally onto your system issue the following command:
+```sh
+sudo ninja install
+```
+OR
+```sh
+doas -- ninja install
+```
+
+## OpenSSL
 The available OpenSSL port (3.0.17) is out-of-date, and using a bundled static library instead is recommended; to do so, add `-DYUZU_USE_CPM=ON` to your CMake configure command.
\ No newline at end of file
diff --git a/docs/build/Linux.md b/docs/build/Linux.md
index be58b451fa..a6b7b2dda7 100644
--- a/docs/build/Linux.md
+++ b/docs/build/Linux.md
@@ -1,138 +1,138 @@
-### Dependencies
-
-You'll need to download and install the following to build Eden:
-
-  * [GCC](https://gcc.gnu.org/) v11+ (for C++20 support) & misc
-  * If GCC 12 is installed, [Clang](https://clang.llvm.org/) v14+ is required for compiling
-  * [CMake](https://www.cmake.org/) 3.22+
-
-The following are handled by Eden's externals:
-
-  * [FFmpeg](https://ffmpeg.org/)
-  * [SDL2](https://www.libsdl.org/download-2.0.php) 2.0.18+
-  * [opus](https://opus-codec.org/downloads/) 1.3+
-  
-All other dependencies will be downloaded and built by [CPM](https://github.com/cpm-cmake/CPM.cmake/) if `YUZU_USE_CPM` is on, but will always use system dependencies if available:
-
-  * [Boost](https://www.boost.org/users/download/) 1.79.0+
-  * [Catch2](https://github.com/catchorg/Catch2) 2.13.7 - 2.13.9
-  * [fmt](https://fmt.dev/) 8.0.1+
-  * [lz4](http://www.lz4.org) 1.8+
-  * [nlohmann_json](https://github.com/nlohmann/json) 3.8+
-  * [OpenSSL](https://www.openssl.org/source/) 1.1.1+
-  * [ZLIB](https://www.zlib.net/) 1.2+
-  * [zstd](https://facebook.github.io/zstd/) 1.5+
-  * [enet](http://enet.bespin.org/) 1.3+
-  * [cubeb](https://github.com/mozilla/cubeb)
-  * [SimpleIni](https://github.com/brofield/simpleini)
-
-Certain other dependencies (httplib, jwt, sirit, etc.) will be fetched by CPM regardless. System packages *can* be used for these libraries but this is generally not recommended.
-
-Dependencies are listed here as commands that can be copied/pasted. Of course, they should be inspected before being run.
-
-- Arch / Manjaro:
-  - `sudo pacman -Syu --needed base-devel boost catch2 cmake enet ffmpeg fmt git glslang libzip lz4 mbedtls ninja nlohmann-json openssl opus qt6-base qt6-multimedia sdl2 zlib zstd zip unzip`
-  - Building with QT Web Engine requires `qt6-webengine` as well.
-  - Proper wayland support requires `qt6-wayland`
-  - GCC 11 or later is required.
-  
-- Ubuntu / Linux Mint / Debian:
-  - `sudo apt-get install autoconf cmake g++ gcc git glslang-tools libasound2 libboost-context-dev libglu1-mesa-dev libhidapi-dev libpulse-dev libtool libudev-dev libxcb-icccm4 libxcb-image0 libxcb-keysyms1 libxcb-render-util0 libxcb-xinerama0 libxcb-xkb1 libxext-dev libxkbcommon-x11-0 mesa-common-dev nasm ninja-build qt6-base-private-dev libmbedtls-dev catch2 libfmt-dev liblz4-dev nlohmann-json3-dev libzstd-dev libssl-dev libavfilter-dev libavcodec-dev libswscale-dev pkg-config zlib1g-dev libva-dev libvdpau-dev`
-  - Ubuntu 22.04, Linux Mint 20, or Debian 12 or later is required.
-  - Users need to manually specify building with QT Web Engine enabled.  This is done using the parameter `-DYUZU_USE_QT_WEB_ENGINE=ON` when running CMake.
-  - Users need to manually disable building SDL2 from externals if they intend to use the version provided by their system by adding the parameters `-DYUZU_USE_EXTERNAL_SDL2=OFF`
-
-```sh
-git submodule update --init --recursive
-cmake .. -GNinja -DCMAKE_C_COMPILER=gcc-11 -DCMAKE_CXX_COMPILER=g++-11
-```
-
-- Fedora:
-  - `sudo dnf install autoconf ccache cmake fmt-devel gcc{,-c++} glslang hidapi-devel json-devel libtool libusb1-devel libzstd-devel lz4-devel nasm ninja-build openssl-devel pulseaudio-libs-devel qt6-linguist qt6-qtbase{-private,}-devel qt6-qtwebengine-devel qt6-qtmultimedia-devel speexdsp-devel wayland-devel zlib-devel ffmpeg-devel libXext-devel`
-  - Fedora 32 or later is required.
-  - Due to GCC 12, Fedora 36 or later users need to install `clang`, and configure CMake to use it via `-DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang`
-  - CMake arguments to force system libraries:
-    - SDL2: `-DYUZU_USE_BUNDLED_SDL2=OFF -DYUZU_USE_EXTERNAL_SDL2=OFF`
-    - FFmpeg: `-DYUZU_USE_EXTERNAL_FFMPEG=OFF`
-  - [RPM Fusion](https://rpmfusion.org/) (free) is required to install `ffmpeg-devel`
-
-### Cloning Eden with Git
-
-**Master:**
-
-```bash
-git clone --recursive https://git.eden-emu.dev/eden-emu/eden
-cd eden
-```
-
-The `--recursive` option automatically clones the required Git submodules.
-
-### Building Eden in Release Mode (Optimised)
-
-If you need to run ctests, you can disable `-DYUZU_TESTS=OFF` and install Catch2.
-
-```bash
-mkdir build && cd build
-cmake .. -GNinja -DYUZU_TESTS=OFF
-ninja
-sudo ninja install 
-```
-You may also want to include support for Discord Rich Presence by adding `-DUSE_DISCORD_PRESENCE=ON` after `cmake ..`
-
-`-DYUZU_USE_EXTERNAL_VULKAN_SPIRV_TOOLS=OFF` might be needed if ninja command failed with `undefined reference to symbol 'spvOptimizerOptionsCreate`, reason currently unknown
-
-Optionally, you can use `cmake-gui ..` to adjust various options (e.g. disable the Qt GUI).
-
-### Building Eden in Debug Mode (Slow)
-
-```bash
-mkdir build && cd build
-cmake .. -GNinja -DCMAKE_BUILD_TYPE=Debug -DYUZU_TESTS=OFF
-ninja
-```
-
-### Building with debug symbols
-
-```bash
-mkdir build && cd build
-cmake .. -GNinja -DCMAKE_BUILD_TYPE=RelWithDebInfo -DYUZU -DYUZU_TESTS=OFF
-ninja
-```
-
-### Building with Scripts
-A convenience script for building is provided in `.ci/linux/build.sh`. You must provide an arch target for optimization, e.g. `.ci/linux/build.sh amd64`. Valid targets:
-- `legacy`: x86_64 generic, only needed for CPUs older than 2013 or so
-- `amd64`: x86_64-v3, for CPUs newer than 2013 or so
-- `steamdeck` / `zen2`: For Steam Deck or Zen >= 2 AMD CPUs (untested on Intel)
-- `rog-ally` / `allyx` / `zen4`: For ROG Ally X or Zen >= 4 AMD CPUs (untested on Intel)
-- `aarch64`: For armv8-a CPUs, older than mid-2021 or so
-- `armv9`: For armv9-a CPUs, newer than mid-2021 or so
-- `native`: Optimize to your native host architecture
-
-Extra flags to pass to CMake should be passed after the arch target.
-
-Additional environment variables can be used to control building:
-- `NPROC`: Number of threads to use for compilation (defaults to all)
-- `TARGET`: Set to `appimage` to disable standalone `eden-cli` and `eden-room` executables
-- `BUILD_TYPE`: Sets the build type to use. Defaults to `Release`
-
-The following environment variables are boolean flags. Set to `true` to enable or `false` to disable:
-- `DEVEL` (default FALSE): Disable Qt update checker
-- `USE_WEBENGINE` (default FALSE): Enable Qt WebEngine
-- `USE_MULTIMEDIA` (default TRUE): Enable Qt Multimedia
-
-After building, an AppImage can be packaged via `.ci/linux/package.sh`. This script takes the same arch targets as the build script. If the build was created in a different directory, you can specify its path relative to the source directory, e.g. `.ci/linux/package.sh amd64 build-appimage`. Additionally, set the `DEVEL` environment variable to `true` to change the app name to `Eden Nightly`.
-
-### Running without installing
-
-After building, the binaries `eden` and `eden-cmd` (depending on your build options) will end up in `build/bin/`.
-
-```bash
-# SDL
-cd build/bin/
-./eden-cmd
-
-# Qt
-cd build/bin/
-./eden
-```
+### Dependencies
+
+You'll need to download and install the following to build Eden:
+
+  * [GCC](https://gcc.gnu.org/) v11+ (for C++20 support) & misc
+  * If GCC 12 is installed, [Clang](https://clang.llvm.org/) v14+ is required for compiling
+  * [CMake](https://www.cmake.org/) 3.22+
+
+The following are handled by Eden's externals:
+
+  * [FFmpeg](https://ffmpeg.org/)
+  * [SDL2](https://www.libsdl.org/download-2.0.php) 2.0.18+
+  * [opus](https://opus-codec.org/downloads/) 1.3+
+  
+All other dependencies will be downloaded and built by [CPM](https://github.com/cpm-cmake/CPM.cmake/) if `YUZU_USE_CPM` is on, but will always use system dependencies if available:
+
+  * [Boost](https://www.boost.org/users/download/) 1.79.0+
+  * [Catch2](https://github.com/catchorg/Catch2) 2.13.7 - 2.13.9
+  * [fmt](https://fmt.dev/) 8.0.1+
+  * [lz4](http://www.lz4.org) 1.8+
+  * [nlohmann_json](https://github.com/nlohmann/json) 3.8+
+  * [OpenSSL](https://www.openssl.org/source/) 1.1.1+
+  * [ZLIB](https://www.zlib.net/) 1.2+
+  * [zstd](https://facebook.github.io/zstd/) 1.5+
+  * [enet](http://enet.bespin.org/) 1.3+
+  * [cubeb](https://github.com/mozilla/cubeb)
+  * [SimpleIni](https://github.com/brofield/simpleini)
+
+Certain other dependencies (httplib, jwt, sirit, etc.) will be fetched by CPM regardless. System packages *can* be used for these libraries but this is generally not recommended.
+
+Dependencies are listed here as commands that can be copied/pasted. Of course, they should be inspected before being run.
+
+- Arch / Manjaro:
+  - `sudo pacman -Syu --needed base-devel boost catch2 cmake enet ffmpeg fmt git glslang libzip lz4 mbedtls ninja nlohmann-json openssl opus qt6-base qt6-multimedia sdl2 zlib zstd zip unzip`
+  - Building with QT Web Engine requires `qt6-webengine` as well.
+  - Proper wayland support requires `qt6-wayland`
+  - GCC 11 or later is required.
+  
+- Ubuntu / Linux Mint / Debian:
+  - `sudo apt-get install autoconf cmake g++ gcc git glslang-tools libasound2 libboost-context-dev libglu1-mesa-dev libhidapi-dev libpulse-dev libtool libudev-dev libxcb-icccm4 libxcb-image0 libxcb-keysyms1 libxcb-render-util0 libxcb-xinerama0 libxcb-xkb1 libxext-dev libxkbcommon-x11-0 mesa-common-dev nasm ninja-build qt6-base-private-dev libmbedtls-dev catch2 libfmt-dev liblz4-dev nlohmann-json3-dev libzstd-dev libssl-dev libavfilter-dev libavcodec-dev libswscale-dev pkg-config zlib1g-dev libva-dev libvdpau-dev`
+  - Ubuntu 22.04, Linux Mint 20, or Debian 12 or later is required.
+  - Users need to manually specify building with QT Web Engine enabled.  This is done using the parameter `-DYUZU_USE_QT_WEB_ENGINE=ON` when running CMake.
+  - Users need to manually disable building SDL2 from externals if they intend to use the version provided by their system by adding the parameters `-DYUZU_USE_EXTERNAL_SDL2=OFF`
+
+```sh
+git submodule update --init --recursive
+cmake .. -GNinja -DCMAKE_C_COMPILER=gcc-11 -DCMAKE_CXX_COMPILER=g++-11
+```
+
+- Fedora:
+  - `sudo dnf install autoconf ccache cmake fmt-devel gcc{,-c++} glslang hidapi-devel json-devel libtool libusb1-devel libzstd-devel lz4-devel nasm ninja-build openssl-devel pulseaudio-libs-devel qt6-linguist qt6-qtbase{-private,}-devel qt6-qtwebengine-devel qt6-qtmultimedia-devel speexdsp-devel wayland-devel zlib-devel ffmpeg-devel libXext-devel`
+  - Fedora 32 or later is required.
+  - Due to GCC 12, Fedora 36 or later users need to install `clang`, and configure CMake to use it via `-DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang`
+  - CMake arguments to force system libraries:
+    - SDL2: `-DYUZU_USE_BUNDLED_SDL2=OFF -DYUZU_USE_EXTERNAL_SDL2=OFF`
+    - FFmpeg: `-DYUZU_USE_EXTERNAL_FFMPEG=OFF`
+  - [RPM Fusion](https://rpmfusion.org/) (free) is required to install `ffmpeg-devel`
+
+### Cloning Eden with Git
+
+**Master:**
+
+```bash
+git clone --recursive https://git.eden-emu.dev/eden-emu/eden
+cd eden
+```
+
+The `--recursive` option automatically clones the required Git submodules.
+
+### Building Eden in Release Mode (Optimised)
+
+If you need to run ctests, omit `-DYUZU_TESTS=OFF` (or set `-DYUZU_TESTS=ON`) and install Catch2.
+
+```bash
+mkdir build && cd build
+cmake .. -GNinja -DYUZU_TESTS=OFF
+ninja
+sudo ninja install 
+```
+You may also want to include support for Discord Rich Presence by adding `-DUSE_DISCORD_PRESENCE=ON` after `cmake ..`
+
+`-DYUZU_USE_EXTERNAL_VULKAN_SPIRV_TOOLS=OFF` might be needed if the `ninja` command fails with `undefined reference to symbol 'spvOptimizerOptionsCreate'`; the reason is currently unknown.
+
+Optionally, you can use `cmake-gui ..` to adjust various options (e.g. disable the Qt GUI).
+
+### Building Eden in Debug Mode (Slow)
+
+```bash
+mkdir build && cd build
+cmake .. -GNinja -DCMAKE_BUILD_TYPE=Debug -DYUZU_TESTS=OFF
+ninja
+```
+
+### Building with debug symbols
+
+```bash
+mkdir build && cd build
+cmake .. -GNinja -DCMAKE_BUILD_TYPE=RelWithDebInfo -DYUZU_TESTS=OFF
+ninja
+```
+
+### Building with Scripts
+A convenience script for building is provided in `.ci/linux/build.sh`. You must provide an arch target for optimization, e.g. `.ci/linux/build.sh amd64`. Valid targets:
+- `legacy`: x86_64 generic, only needed for CPUs older than 2013 or so
+- `amd64`: x86_64-v3, for CPUs newer than 2013 or so
+- `steamdeck` / `zen2`: For Steam Deck or Zen >= 2 AMD CPUs (untested on Intel)
+- `rog-ally` / `allyx` / `zen4`: For ROG Ally X or Zen >= 4 AMD CPUs (untested on Intel)
+- `aarch64`: For armv8-a CPUs, older than mid-2021 or so
+- `armv9`: For armv9-a CPUs, newer than mid-2021 or so
+- `native`: Optimize to your native host architecture
+
+Extra flags to pass to CMake should be passed after the arch target.
+
+Additional environment variables can be used to control building:
+- `NPROC`: Number of threads to use for compilation (defaults to all)
+- `TARGET`: Set to `appimage` to disable standalone `eden-cli` and `eden-room` executables
+- `BUILD_TYPE`: Sets the build type to use. Defaults to `Release`
+
+The following environment variables are boolean flags. Set to `true` to enable or `false` to disable:
+- `DEVEL` (default FALSE): Disable Qt update checker
+- `USE_WEBENGINE` (default FALSE): Enable Qt WebEngine
+- `USE_MULTIMEDIA` (default TRUE): Enable Qt Multimedia
+
+After building, an AppImage can be packaged via `.ci/linux/package.sh`. This script takes the same arch targets as the build script. If the build was created in a different directory, you can specify its path relative to the source directory, e.g. `.ci/linux/package.sh amd64 build-appimage`. Additionally, set the `DEVEL` environment variable to `true` to change the app name to `Eden Nightly`.
+
+### Running without installing
+
+After building, the binaries `eden` and `eden-cmd` (depending on your build options) will end up in `build/bin/`.
+
+```bash
+# SDL
+cd build/bin/
+./eden-cmd
+
+# Qt
+cd build/bin/
+./eden
+```
diff --git a/docs/build/Solaris.md b/docs/build/Solaris.md
index d4cfdbb6a9..f7174c2869 100644
--- a/docs/build/Solaris.md
+++ b/docs/build/Solaris.md
@@ -1,51 +1,51 @@
-# Building for Solaris
-
-## Dependencies.  
-Always consult [the OpenIndiana package list](https://pkg.openindiana.org/hipster/en/index.shtml) to cross-verify availability.
-
-Run the usual update + install of essential toolings: `sudo pkg update && sudo pkg install git cmake`.
-
-- **gcc**: `sudo pkg install developer/gcc-14`.
-- **clang**: Version 20 is broken, use `sudo pkg install developer/clang-19`.
-
-Then install the libraies: `sudo pkg install qt6 boost glslang libzip library/lz4 nlohmann-json openssl opus sdl2 zlib compress/zstd unzip pkg-config nasm autoconf mesa library/libdrm header-drm developer/fmt`.
-
-### Building
-
-Clone eden with git `git clone --recursive https://git.eden-emu.dev/eden-emu/eden`
-
-```sh
-# Needed for some dependencies that call cc directly (tz)
-echo '#!/bin/sh' >cc
-echo 'gcc $@' >>cc
-chmod +x cc
-export PATH="$PATH:$PWD"
-```
-
-Patch for FFmpeg:
-```sh
-sed -i 's/ make / gmake /' externals/ffmpeg/CMakeFiles/ffmpeg-build.dir/build.make
-```
-
-- **Configure**: `cmake -B build -DYUZU_USE_CPM=ON -DCMAKE_CXX_FLAGS="-I/usr/include/SDL2" -DCMAKE_C_FLAGS="-I/usr/include/SDL2"`.
-- **Build**: `cmake --build build`.
-- **Installing**: `sudo cmake --install build`.
-
-### Running
-
-Default Mesa is a bit outdated, the following environment variables should be set for a smoother experience:
-```sh
-export MESA_GL_VERSION_OVERRIDE=4.6
-export MESA_GLSL_VERSION_OVERRIDE=460
-export MESA_EXTENSION_MAX_YEAR=2025
-export MESA_DEBUG=1
-export MESA_VK_VERSION_OVERRIDE=1.3
-# Only if nvidia/intel drm drivers cause crashes, will severely hinder performance
-export LIBGL_ALWAYS_SOFTWARE=1
-```
-
-### Notes
-
-- Modify the generated ffmpeg.make (in build dir) if using multiple threads (base system `make` doesn't use `-j4`, so change for `gmake`).
-- If using OpenIndiana, due to a bug in SDL2 cmake configuration; Audio driver defaults to SunOS `<sys/audioio.h>`, which does not exist on OpenIndiana.
+# Building for Solaris
+
+## Dependencies.  
+Always consult [the OpenIndiana package list](https://pkg.openindiana.org/hipster/en/index.shtml) to cross-verify availability.
+
+Run the usual update + install of essential toolings: `sudo pkg update && sudo pkg install git cmake`.
+
+- **gcc**: `sudo pkg install developer/gcc-14`.
+- **clang**: Version 20 is broken, use `sudo pkg install developer/clang-19`.
+
+Then install the libraries: `sudo pkg install qt6 boost glslang libzip library/lz4 nlohmann-json openssl opus sdl2 zlib compress/zstd unzip pkg-config nasm autoconf mesa library/libdrm header-drm developer/fmt`.
+
+### Building
+
+Clone eden with git: `git clone --recursive https://git.eden-emu.dev/eden-emu/eden`
+
+```sh
+# Needed for some dependencies that call cc directly (tz)
+echo '#!/bin/sh' >cc
+echo 'gcc $@' >>cc
+chmod +x cc
+export PATH="$PATH:$PWD"
+```
+
+Patch for FFmpeg:
+```sh
+sed -i 's/ make / gmake /' externals/ffmpeg/CMakeFiles/ffmpeg-build.dir/build.make
+```
+
+- **Configure**: `cmake -B build -DYUZU_USE_CPM=ON -DCMAKE_CXX_FLAGS="-I/usr/include/SDL2" -DCMAKE_C_FLAGS="-I/usr/include/SDL2"`.
+- **Build**: `cmake --build build`.
+- **Installing**: `sudo cmake --install build`.
+
+### Running
+
+Default Mesa is a bit outdated, the following environment variables should be set for a smoother experience:
+```sh
+export MESA_GL_VERSION_OVERRIDE=4.6
+export MESA_GLSL_VERSION_OVERRIDE=460
+export MESA_EXTENSION_MAX_YEAR=2025
+export MESA_DEBUG=1
+export MESA_VK_VERSION_OVERRIDE=1.3
+# Only if nvidia/intel drm drivers cause crashes, will severely hinder performance
+export LIBGL_ALWAYS_SOFTWARE=1
+```
+
+### Notes
+
+- If building with multiple threads, modify the generated ffmpeg.make (in the build dir) to call `gmake` instead of `make`, since the base system `make` doesn't use `-j4`.
+- If using OpenIndiana: due to a bug in SDL2's CMake configuration, the audio driver defaults to SunOS `<sys/audioio.h>`, which does not exist on OpenIndiana.
 - System OpenSSL generally does not work. Instead, use `-DYUZU_USE_CPM=ON` to use a bundled static OpenSSL, or build a system dependency from source.
\ No newline at end of file
diff --git a/docs/build/Windows.md b/docs/build/Windows.md
index 3b8c459073..c1792983aa 100644
--- a/docs/build/Windows.md
+++ b/docs/build/Windows.md
@@ -1,193 +1,193 @@
-# THIS GUIDE IS INTENDED FOR DEVELOPERS ONLY, SUPPORT WILL ONLY BE GIVEN IF YOU'RE A DEVELOPER.
-
-## Method I: MSVC Build for Windows
-
-### Minimal Dependencies
-
-On Windows, all library dependencies are automatically included within the `externals` folder, or can be downloaded on-demand. To build Eden, you need to install:
-
-  * **[Visual Studio 2022 Community](https://visualstudio.microsoft.com/downloads/)** - **Make sure to select C++ support in the installer. Make sure to update to the latest version if already installed.**
-  * **[CMake](https://cmake.org/download/)** - Used to generate Visual Studio project files. Does not matter if either 32-bit or 64-bit version is installed.
-  * **[Vulkan SDK](https://vulkan.lunarg.com/sdk/home#windows)** - **Make sure to select Latest SDK.**
-    - A convenience script to install the latest SDK is provided in `.ci\windows\install-vulkan-sdk.ps1`.
-
-  ![2](https://i.imgur.com/giDwuTm.png)
-
-  * **Git** - We recommend [Git for Windows](https://gitforwindows.org).
-
-  ![3](https://i.imgur.com/UeSzkBw.png)
-
-  * While installing Git Bash, you should tell it to include Git in your system path. (Choose the "Git from the command line and also from 3rd-party software" option.) If you missed that, don't worry, you'll just have to manually tell CMake where your git.exe is, since it's used to include version info into the built executable.
-
-  ![4](https://i.imgur.com/x0rRs1t.png)
-
-### Cloning Eden with Git
-
-**Master:**
-  ```cmd
-  git clone --recursive https://git.eden-emu.dev/eden-emu/eden
-  cd eden
-  ```
-
-  ![9](https://i.imgur.com/CcxIAht.png)
-
-* *(Note: eden by default downloads to `C:\Users\<user-name>\eden` (Master)
-
-### Building
-
-* Open the CMake GUI application and point it to the `eden` (Master) 
-
-  ![10](https://i.imgur.com/qOslIWv.png)
-
-* For the build directory, use a `/build` subdirectory inside the source directory or some other directory of your choice. (Tell CMake to create it.)
-
-* Click the "Configure" button and choose `Visual Studio 17 2022`, with `x64` for the optional platform.
-
-  ![12](https://i.imgur.com/DKiREaK.png)
-
-  * *(Note: If you used GitHub's own app to clone, run `git submodule update --init --recursive` to get the remaining dependencies)*
-
-  * *(You may also want to disable `YUZU_TESTS` in this case since Catch2 is not yet supported with this.)*
-
-  ![13](https://user-images.githubusercontent.com/22451773/180585999-07316d6e-9751-4d11-b957-1cf57cd7cd58.png)
-
-* Click "Generate" to create the project files.
-
-  ![15](https://i.imgur.com/5LKg92k.png)
-
-* Open the solution file `yuzu.sln` in Visual Studio 2022, which is located in the build folder.
-
-  ![16](https://i.imgur.com/208yMml.png)
-
-* Depending if you want a graphical user interface or not (`eden` has the graphical user interface, while `eden-cmd` doesn't), select `eden` or `eden-cmd` in the Solution Explorer, right-click and `Set as StartUp Project`.
-
-  ![17](https://i.imgur.com/nPMajnn.png)  ![18](https://i.imgur.com/BDMLzRZ.png)
-
-* Select the appropriate build type, Debug for debug purposes or Release for performance (in case of doubt choose Release).
-
-  ![19](https://i.imgur.com/qxg4roC.png)
-
-* Right-click the project you want to build and press Build in the submenu or press F5.
-
-  ![20](https://i.imgur.com/CkQgOFW.png)
-
-## Method II: MinGW-w64 Build with MSYS2
-
-### Prerequisites to install
-
-* [MSYS2](https://www.msys2.org)
-* [Vulkan SDK](https://vulkan.lunarg.com/sdk/home#windows) - **Make sure to select Latest SDK.**
-* Make sure to follow the instructions and update to the latest version by running `pacman -Syu` as many times as needed.
-
-### Install eden dependencies for MinGW-w64
-
-* Open the `MSYS2 MinGW 64-bit` (mingw64.exe) shell
-* Download and install all dependencies using: `pacman -Syu git make mingw-w64-x86_64-SDL2 mingw-w64-x86_64-cmake mingw-w64-x86_64-python-pip mingw-w64-x86_64-qt6 mingw-w64-x86_64-toolchain autoconf libtool automake-wrapper`
-* Add MinGW binaries to the PATH: `echo 'PATH=/mingw64/bin:$PATH' >> ~/.bashrc`
-* Add glslangValidator to the PATH: `echo 'PATH=$(readlink -e /c/VulkanSDK/*/Bin/):$PATH' >> ~/.bashrc`
-
-### Clone the eden repository with Git
-
-  ```bash
-  git clone --recursive https://git.eden-emu.dev/eden-emu/eden
-  cd eden
-  ```
-
-### Run the following commands to build eden (dynamically linked build)
-
-```bash
-mkdir build && cd build
-cmake -G "MSYS Makefiles" -DYUZU_TESTS=OFF ..
-make -j$(nproc)
-# test eden out with
-./bin/eden.exe
-```
-
-* *(Note: This build is not a static build meaning that you need to include all of the DLLs with the .exe in order to use it!)*
-
-e.g.
-```Bash
-cp externals/ffmpeg-*/bin/*.dll bin/
-```
-
-Bonus Note: Running programs from inside `MSYS2 MinGW x64` shell has a different %PATH% than directly from explorer. This different %PATH% has the locations of the other DLLs required.
-![image](https://user-images.githubusercontent.com/190571/165000848-005e8428-8a82-41b1-bb4d-4ce7797cdac8.png)
-
-
-### Building without Qt (Optional)
-
-Doesn't require the rather large Qt dependency, but you will lack a GUI frontend:
-
-  * Pass the `-DENABLE_QT=no` flag to cmake
-
-## Method III: CLion Environment Setup
-
-### Minimal Dependencies
-
-To build eden, you need to install the following:
-
-* [CLion](https://www.jetbrains.com/clion/) - This IDE is not free; for a free alternative, check Method I
-* [Vulkan SDK](https://vulkan.lunarg.com/sdk/home#windows) - Make sure to select the Latest SDK.
-
-### Cloning eden with CLion
-
-* Clone the Repository:
-
-![1](https://user-images.githubusercontent.com/42481638/216899046-0d41d7d6-8e4d-4ed2-9587-b57088af5214.png)
-![2](https://user-images.githubusercontent.com/42481638/216899061-b2ea274a-e88c-40ae-bf0b-4450b46e9fea.png)
-![3](https://user-images.githubusercontent.com/42481638/216899076-0e5988c4-d431-4284-a5ff-9ecff973db76.png)
-
-
-
-### Building & Setup
-
-* Once Cloned, You will be taken to a prompt like the image below:
-
-![4](https://user-images.githubusercontent.com/42481638/216899092-3fe4cec6-a540-44e3-9e1e-3de9c2fffc2f.png)
-
-* Set the settings to the image below:
-* Change `Build type: Release`
-* Change `Name: Release`
-* Change `Toolchain Visual Studio`
-* Change `Generator: Let CMake decide`
-* Change `Build directory: build`
-
-![5](https://user-images.githubusercontent.com/42481638/216899164-6cee8482-3d59-428f-b1bc-e6dc793c9b20.png)
-
-* Click OK; now Clion will build a directory and index your code to allow for IntelliSense. Please be patient.
-* Once this process has been completed (No loading bar bottom right), you can now build eden
-* In the top right, click on the drop-down menu, select all configurations, then select eden
-
-![6](https://user-images.githubusercontent.com/42481638/216899226-975048e9-bc6d-4ec1-bc2d-bd8a1e15ed04.png)
-
-* Now run by clicking the play button or pressing Shift+F10, and eden will auto-launch once built.
-
-![7](https://user-images.githubusercontent.com/42481638/216899275-d514ec6a-e563-470e-81e2-3e04f0429b68.png)
-
-## Building from the command line with MSVC
-
-```cmd
-git clone --recursive https://git.eden-emu.dev/eden-emu/eden
-cd eden
-mkdir build
-cd build
-cmake .. -G "Visual Studio 17 2022" -A x64
-cmake --build . --config Release
-```
-
-### Building with Scripts
-A convenience script for building is provided in `.ci/windows/build.sh`. You must run this with Bash, e.g. Git Bash or MinGW TTY. To use this script, you must have windeployqt installed (usually bundled with Qt) and set the `WINDEPLOYQT` environment variable to its canonical Bash location, e.g. `WINDEPLOYQT="/c/Qt/6.9.1/msvc2022_64/bin/windeployqt6.exe" .ci/windows/build.sh`.
-
-Extra CMake flags should be placed in the arguments of the script.
-
-Additional environment variables can be used to control building:
-- `BUILD_TYPE`: Sets the build type to use. Defaults to `Release`
-
-The following environment variables are boolean flags. Set to `true` to enable or `false` to disable:
-- `DEVEL` (default FALSE): Disable Qt update checker
-- `USE_WEBENGINE` (default FALSE): Enable Qt WebEngine
-- `USE_MULTIMEDIA` (default TRUE): Enable Qt Multimedia
-- `BUNDLE_QT` (default FALSE): Use bundled Qt
-  * Note that using system Qt requires you to include the Qt CMake directory in `CMAKE_PREFIX_PATH`, e.g. `.ci/windows/build.sh -DCMAKE_PREFIX_PATH=C:/Qt/6.9.0/msvc2022_64/lib/cmake/Qt6`
-
-After building, a zip can be packaged via `.ci/windows/package.sh`. Note that you must have 7-zip installed and in your PATH. The resulting zip will be placed into `artifacts` in the source directory.
+# THIS GUIDE IS INTENDED FOR DEVELOPERS ONLY, SUPPORT WILL ONLY BE GIVEN IF YOU'RE A DEVELOPER.
+
+## Method I: MSVC Build for Windows
+
+### Minimal Dependencies
+
+On Windows, all library dependencies are automatically included within the `externals` folder, or can be downloaded on-demand. To build Eden, you need to install:
+
+  * **[Visual Studio 2022 Community](https://visualstudio.microsoft.com/downloads/)** - **Make sure to select C++ support in the installer. Make sure to update to the latest version if already installed.**
+  * **[CMake](https://cmake.org/download/)** - Used to generate Visual Studio project files. Does not matter if either 32-bit or 64-bit version is installed.
+  * **[Vulkan SDK](https://vulkan.lunarg.com/sdk/home#windows)** - **Make sure to select Latest SDK.**
+    - A convenience script to install the latest SDK is provided in `.ci\windows\install-vulkan-sdk.ps1`.
+
+  ![2](https://i.imgur.com/giDwuTm.png)
+
+  * **Git** - We recommend [Git for Windows](https://gitforwindows.org).
+
+  ![3](https://i.imgur.com/UeSzkBw.png)
+
+  * While installing Git Bash, you should tell it to include Git in your system path. (Choose the "Git from the command line and also from 3rd-party software" option.) If you missed that, don't worry, you'll just have to manually tell CMake where your git.exe is, since it's used to include version info into the built executable.
+
+  ![4](https://i.imgur.com/x0rRs1t.png)
+
+### Cloning Eden with Git
+
+**Master:**
+  ```cmd
+  git clone --recursive https://git.eden-emu.dev/eden-emu/eden
+  cd eden
+  ```
+
+  ![9](https://i.imgur.com/CcxIAht.png)
+
+* *(Note: eden by default downloads to `C:\Users\<user-name>\eden` (Master))*
+
+### Building
+
+* Open the CMake GUI application and point it to the `eden` (Master) source directory.
+
+  ![10](https://i.imgur.com/qOslIWv.png)
+
+* For the build directory, use a `/build` subdirectory inside the source directory or some other directory of your choice. (Tell CMake to create it.)
+
+* Click the "Configure" button and choose `Visual Studio 17 2022`, with `x64` for the optional platform.
+
+  ![12](https://i.imgur.com/DKiREaK.png)
+
+  * *(Note: If you used GitHub's own app to clone, run `git submodule update --init --recursive` to get the remaining dependencies)*
+
+  * *(You may also want to disable `YUZU_TESTS` in this case since Catch2 is not yet supported with this.)*
+
+  ![13](https://user-images.githubusercontent.com/22451773/180585999-07316d6e-9751-4d11-b957-1cf57cd7cd58.png)
+
+* Click "Generate" to create the project files.
+
+  ![15](https://i.imgur.com/5LKg92k.png)
+
+* Open the solution file `yuzu.sln` in Visual Studio 2022, which is located in the build folder.
+
+  ![16](https://i.imgur.com/208yMml.png)
+
+* Depending on whether you want a graphical user interface or not (`eden` has the graphical user interface, while `eden-cmd` doesn't), select `eden` or `eden-cmd` in the Solution Explorer, right-click and `Set as StartUp Project`.
+
+  ![17](https://i.imgur.com/nPMajnn.png)  ![18](https://i.imgur.com/BDMLzRZ.png)
+
+* Select the appropriate build type, Debug for debug purposes or Release for performance (in case of doubt choose Release).
+
+  ![19](https://i.imgur.com/qxg4roC.png)
+
+* Right-click the project you want to build and press Build in the submenu or press F5.
+
+  ![20](https://i.imgur.com/CkQgOFW.png)
+
+## Method II: MinGW-w64 Build with MSYS2
+
+### Prerequisites to install
+
+* [MSYS2](https://www.msys2.org)
+* [Vulkan SDK](https://vulkan.lunarg.com/sdk/home#windows) - **Make sure to select Latest SDK.**
+* Make sure to follow the instructions and update to the latest version by running `pacman -Syu` as many times as needed.
+
+### Install eden dependencies for MinGW-w64
+
+* Open the `MSYS2 MinGW 64-bit` (mingw64.exe) shell
+* Download and install all dependencies using: `pacman -Syu git make mingw-w64-x86_64-SDL2 mingw-w64-x86_64-cmake mingw-w64-x86_64-python-pip mingw-w64-x86_64-qt6 mingw-w64-x86_64-toolchain autoconf libtool automake-wrapper`
+* Add MinGW binaries to the PATH: `echo 'PATH=/mingw64/bin:$PATH' >> ~/.bashrc`
+* Add glslangValidator to the PATH: `echo 'PATH=$(readlink -e /c/VulkanSDK/*/Bin/):$PATH' >> ~/.bashrc`
+
+### Clone the eden repository with Git
+
+  ```bash
+  git clone --recursive https://git.eden-emu.dev/eden-emu/eden
+  cd eden
+  ```
+
+### Run the following commands to build eden (dynamically linked build)
+
+```bash
+mkdir build && cd build
+cmake -G "MSYS Makefiles" -DYUZU_TESTS=OFF ..
+make -j$(nproc)
+# test eden out with
+./bin/eden.exe
+```
+
+* *(Note: This build is not a static build meaning that you need to include all of the DLLs with the .exe in order to use it!)*
+
+e.g.
+```Bash
+cp externals/ffmpeg-*/bin/*.dll bin/
+```
+
+Bonus Note: Programs run from inside the `MSYS2 MinGW x64` shell see a different %PATH% than programs launched directly from Explorer. The shell's %PATH% includes the locations of the other required DLLs.
+![image](https://user-images.githubusercontent.com/190571/165000848-005e8428-8a82-41b1-bb4d-4ce7797cdac8.png)
+
+
+### Building without Qt (Optional)
+
+Doesn't require the rather large Qt dependency, but you will lack a GUI frontend:
+
+  * Pass the `-DENABLE_QT=no` flag to cmake
+
+## Method III: CLion Environment Setup
+
+### Minimal Dependencies
+
+To build eden, you need to install the following:
+
+* [CLion](https://www.jetbrains.com/clion/) - This IDE is not free; for a free alternative, check Method I
+* [Vulkan SDK](https://vulkan.lunarg.com/sdk/home#windows) - Make sure to select the Latest SDK.
+
+### Cloning eden with CLion
+
+* Clone the Repository:
+
+![1](https://user-images.githubusercontent.com/42481638/216899046-0d41d7d6-8e4d-4ed2-9587-b57088af5214.png)
+![2](https://user-images.githubusercontent.com/42481638/216899061-b2ea274a-e88c-40ae-bf0b-4450b46e9fea.png)
+![3](https://user-images.githubusercontent.com/42481638/216899076-0e5988c4-d431-4284-a5ff-9ecff973db76.png)
+
+
+
+### Building & Setup
+
+* Once cloned, you will be taken to a prompt like the image below:
+
+![4](https://user-images.githubusercontent.com/42481638/216899092-3fe4cec6-a540-44e3-9e1e-3de9c2fffc2f.png)
+
+* Configure the settings to match the image below:
+* Change `Build type: Release`
+* Change `Name: Release`
+* Change `Toolchain: Visual Studio`
+* Change `Generator: Let CMake decide`
+* Change `Build directory: build`
+
+![5](https://user-images.githubusercontent.com/42481638/216899164-6cee8482-3d59-428f-b1bc-e6dc793c9b20.png)
+
+* Click OK; CLion will now create the build directory and index your code to allow for IntelliSense. Please be patient.
+* Once this process has been completed (No loading bar bottom right), you can now build eden
+* In the top right, click on the drop-down menu, select all configurations, then select eden
+
+![6](https://user-images.githubusercontent.com/42481638/216899226-975048e9-bc6d-4ec1-bc2d-bd8a1e15ed04.png)
+
+* Now run by clicking the play button or pressing Shift+F10, and eden will auto-launch once built.
+
+![7](https://user-images.githubusercontent.com/42481638/216899275-d514ec6a-e563-470e-81e2-3e04f0429b68.png)
+
+## Building from the command line with MSVC
+
+```cmd
+git clone --recursive https://git.eden-emu.dev/eden-emu/eden
+cd eden
+mkdir build
+cd build
+cmake .. -G "Visual Studio 17 2022" -A x64
+cmake --build . --config Release
+```
+
+### Building with Scripts
+A convenience script for building is provided in `.ci/windows/build.sh`. You must run this with Bash, e.g. Git Bash or MinGW TTY. To use this script, you must have windeployqt installed (usually bundled with Qt) and set the `WINDEPLOYQT` environment variable to its canonical Bash location, e.g. `WINDEPLOYQT="/c/Qt/6.9.1/msvc2022_64/bin/windeployqt6.exe" .ci/windows/build.sh`.
+
+Extra CMake flags should be placed in the arguments of the script.
+
+Additional environment variables can be used to control building:
+- `BUILD_TYPE`: Sets the build type to use. Defaults to `Release`
+
+The following environment variables are boolean flags. Set to `true` to enable or `false` to disable:
+- `DEVEL` (default FALSE): Disable Qt update checker
+- `USE_WEBENGINE` (default FALSE): Enable Qt WebEngine
+- `USE_MULTIMEDIA` (default TRUE): Enable Qt Multimedia
+- `BUNDLE_QT` (default FALSE): Use bundled Qt
+  * Note that using system Qt requires you to include the Qt CMake directory in `CMAKE_PREFIX_PATH`, e.g. `.ci/windows/build.sh -DCMAKE_PREFIX_PATH=C:/Qt/6.9.0/msvc2022_64/lib/cmake/Qt6`
+
+After building, a zip can be packaged via `.ci/windows/package.sh`. Note that you must have 7-zip installed and in your PATH. The resulting zip will be placed into `artifacts` in the source directory.
diff --git a/docs/build/macOS.md b/docs/build/macOS.md
index 6cb62273cb..fd1873b849 100644
--- a/docs/build/macOS.md
+++ b/docs/build/macOS.md
@@ -1,105 +1,78 @@
-Please note this article is intended for development, and eden on macOS is not currently ready for regular use.
-
-This article was written for developers. eden support for macOS is not ready for casual use.
-
-## Method I: ninja
----
-If you are compiling on Intel Mac or are using a Rosetta Homebrew installation, you must replace all references of `/opt/homebrew` to `/usr/local`.
-
-Install dependencies from Homebrew:
-```sh
-brew install autoconf automake boost ccache ffmpeg fmt glslang hidapi libtool libusb lz4 ninja nlohmann-json openssl pkg-config qt@6 sdl2 speexdsp zlib zlib zstd cmake Catch2 molten-vk vulkan-loader
-```
-
-Clone the repo
-```sh
-git clone --recursive https://git.eden-emu.dev/eden-emu/eden
-
-cd eden
-```
-
-Build for release
-```sh
-mkdir build && cd build
-
-export Qt6_DIR="/opt/homebrew/opt/qt@6/lib/cmake"
-
-export LIBVULKAN_PATH=/opt/homebrew/lib/libvulkan.dylib
-
-cmake .. -GNinja -DCMAKE_BUILD_TYPE=RelWithDebInfo -DYUZU_USE_BUNDLED_VCPKG=OFF -DYUZU_TESTS=OFF -DENABLE_WEB_SERVICE=ON -DENABLE_LIBUSB=OFF -DCLANG_FORMAT=ON -DSDL2_DISABLE_INSTALL=ON -DSDL_ALTIVEC=ON
-
-ninja
-```
-
-You may also want to include support for Discord Rich Presence by adding `-DUSE_DISCORD_PRESENCE=ON` after `cmake ..`
-
-Build with debug symbols (vcpkg is not currently used due to broken boost-context library):
-```sh
-mkdir build && cd build
-export Qt6_DIR="/opt/homebrew/opt/qt@6/lib/cmake"
-cmake .. -GNinja -DCMAKE_BUILD_TYPE=RelWithDebInfo -DYUZU_USE_BUNDLED_VCPKG=OFF -DYUZU_TESTS=OFF -DENABLE_WEB_SERVICE=OFF -DENABLE_LIBUSB=OFF
-ninja
-```
-
-Run the output:
-```
-bin/eden.app/Contents/MacOS/eden
-```
-
-## Method II: Xcode
-
----
-If you are compiling on Intel Mac or are using a Rosetta Homebrew installation, you must replace all references of `/opt/homebrew` to `/usr/local`.
-
-Install dependencies from Homebrew:
-```sh
-brew install autoconf automake boost ccache ffmpeg fmt glslang hidapi libtool libusb lz4 ninja nlohmann-json openssl pkg-config qt@6 sdl2 speexdsp zlib zlib zstd cmake Catch2 molten-vk vulkan-loader
-```
-
-Clone the repo
-```sh
-git clone --recursive https://git.eden-emu.dev/eden-emu/eden
-
-cd eden
-```
-
-Build for release
-```sh
-mkdir build && cd build
-
-export Qt6_DIR="/opt/homebrew/opt/qt@6/lib/cmake"
-
-export LIBVULKAN_PATH=/opt/homebrew/lib/libvulkan.dylib
-
-cmake .. -GXcode -DCMAKE_BUILD_TYPE=RelWithDebInfo -DYUZU_USE_BUNDLED_VCPKG=OFF -DYUZU_TESTS=OFF -DENABLE_WEB_SERVICE=ON -DENABLE_LIBUSB=OFF -DCLANG_FORMAT=ON -DSDL2_DISABLE_INSTALL=ON -DSDL_ALTIVEC=ON
-
-xcodebuild build -project eden.xcodeproj -scheme "eden" -configuration "RelWithDebInfo"
-```
-
-You may also want to include support for Discord Rich Presence by adding `-DUSE_DISCORD_PRESENCE=ON` after `cmake ..`
-
-Build with debug symbols (vcpkg is not currently used due to broken boost-context library):
-```sh
-mkdir build && cd build
-export Qt6_DIR="/opt/homebrew/opt/qt@6/lib/cmake"
-cmake .. -GNinja -DCMAKE_BUILD_TYPE=RelWithDebInfo -DYUZU_USE_BUNDLED_VCPKG=OFF -DYUZU_TESTS=OFF -DENABLE_WEB_SERVICE=OFF -DENABLE_LIBUSB=OFF
-ninja
-```
-
-Run the output:
-```
-bin/eden.app/Contents/MacOS/eden
-```
-
----
-
-To run with MoltenVK, install additional dependencies:
-```sh
-brew install molten-vk vulkan-loader
-```
-
-Run with Vulkan loader path:
-```sh
-export LIBVULKAN_PATH=/opt/homebrew/lib/libvulkan.dylib
-bin/eden.app/Contents/MacOS/eden
-```
\ No newline at end of file
+Please note this article is intended for development, and Eden on macOS is not currently ready for regular use.
+
+This article was written for developers. Eden support for macOS is not ready for casual use.
+
+## Dependencies
+Install dependencies from Homebrew:
+```sh
+brew install autoconf automake boost ffmpeg fmt glslang hidapi libtool libusb lz4 ninja nlohmann-json openssl pkg-config qt@6 sdl2 speexdsp zlib zstd cmake Catch2 molten-vk vulkan-loader spirv-tools
+```
+
+If you are compiling on Intel Mac, or are using a Rosetta Homebrew installation, you must replace all references of `/opt/homebrew` with `/usr/local`.
+
+Now, clone the repo:
+```sh
+git clone --recursive https://git.eden-emu.dev/eden-emu/eden
+cd eden
+```
+
+## Method I: ninja
+
+---
+Build for release
+```sh
+export Qt6_DIR="/opt/homebrew/opt/qt@6/lib/cmake"
+export LIBVULKAN_PATH=/opt/homebrew/lib/libvulkan.dylib
+cmake -B build -GNinja -DCMAKE_BUILD_TYPE=RelWithDebInfo -DYUZU_TESTS=OFF -DENABLE_WEB_SERVICE=ON -DENABLE_LIBUSB=OFF -DCLANG_FORMAT=ON -DSDL2_DISABLE_INSTALL=ON -DSDL_ALTIVEC=ON
+cd build && ninja
+```
+
+You may also want to include support for Discord Rich Presence by adding `-DUSE_DISCORD_PRESENCE=ON` to the `cmake` command. Build with debug symbols:
+```sh
+export Qt6_DIR="/opt/homebrew/opt/qt@6/lib/cmake"
+cmake -B build -GNinja -DCMAKE_BUILD_TYPE=RelWithDebInfo -DYUZU_TESTS=OFF -DENABLE_WEB_SERVICE=OFF -DENABLE_LIBUSB=OFF
+cd build && ninja
+```
+
+Run the output:
+```
+bin/eden.app/Contents/MacOS/eden
+```
+
+## Method II: Xcode
+
+---
+Build for release
+```sh
+export Qt6_DIR="/opt/homebrew/opt/qt@6/lib/cmake"
+export LIBVULKAN_PATH=/opt/homebrew/lib/libvulkan.dylib
+# Only needed if you get errors about Xcode 15.0 (adjust the path to your Xcode.app)
+sudo /usr/bin/xcode-select --switch /Users/admin/Downloads/Xcode.app
+cmake -B build -GXcode -DCMAKE_BUILD_TYPE=RelWithDebInfo -DYUZU_TESTS=OFF -DENABLE_WEB_SERVICE=ON -DENABLE_LIBUSB=OFF -DCLANG_FORMAT=ON -DSDL2_DISABLE_INSTALL=ON -DSDL_ALTIVEC=ON
+xcodebuild build -project yuzu.xcodeproj -scheme "yuzu" -configuration "RelWithDebInfo"
+```
+
+Build with debug symbols:
+```sh
+export Qt6_DIR="/opt/homebrew/opt/qt@6/lib/cmake"
+cmake -B build -GNinja -DCMAKE_BUILD_TYPE=RelWithDebInfo -DYUZU_TESTS=OFF -DENABLE_WEB_SERVICE=OFF -DENABLE_LIBUSB=OFF
+cd build && ninja
+```
+
+Run the output:
+```
+bin/eden.app/Contents/MacOS/eden
+```
+
+---
+
+To run with MoltenVK, install additional dependencies:
+```sh
+brew install molten-vk vulkan-loader
+```
+
+Run with Vulkan loader path:
+```sh
+export LIBVULKAN_PATH=/opt/homebrew/lib/libvulkan.dylib
+bin/eden.app/Contents/MacOS/eden
+```
diff --git a/externals/CMakeLists.txt b/externals/CMakeLists.txt
index 25886021e2..6f64c79f5d 100644
--- a/externals/CMakeLists.txt
+++ b/externals/CMakeLists.txt
@@ -1,3 +1,6 @@
+# SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
+# SPDX-License-Identifier: GPL-3.0-or-later
+
 # SPDX-FileCopyrightText: 2025 Eden Emulator Project
 # SPDX-License-Identifier: GPL-3.0-or-later
 
@@ -7,8 +10,6 @@
 # TODO(crueter): A lot of this should be moved to the root.
 # otherwise we have to do weird shenanigans with library linking and stuff
 
-# Explicitly include CPMUtil here since we have a separate cpmfile for externals
-set(CPMUTIL_JSON_FILE ${CMAKE_CURRENT_SOURCE_DIR}/cpmfile.json)
 include(CPMUtil)
 
 # Explicitly declare this option here to propagate to the oaknut CPM call
@@ -67,7 +68,7 @@ if (mbedtls_ADDED)
 endif()
 
 # libusb
-if (ENABLE_LIBUSB AND NOT TARGET libusb::usb)
+if (ENABLE_LIBUSB)
     add_subdirectory(libusb)
 endif()
 
@@ -107,21 +108,17 @@ if (YUZU_USE_BUNDLED_FFMPEG)
     set(FFmpeg_INCLUDE_DIR "${FFmpeg_INCLUDE_DIR}" PARENT_SCOPE)
 endif()
 
-# Vulkan-Headers
-
-# TODO(crueter): Vk1.4 impl
-
+# VulkanUtilityHeaders - pulls in headers and utility libs
 AddJsonPackage(
-    NAME vulkan-headers
-    BUNDLED_PACKAGE ${YUZU_USE_EXTERNAL_VULKAN_HEADERS}
-)
-
-# Vulkan-Utility-Libraries
-AddJsonPackage(
-    NAME vulkan-utility-libraries
+    NAME vulkan-utility-headers
     BUNDLED_PACKAGE ${YUZU_USE_EXTERNAL_VULKAN_UTILITY_LIBRARIES}
 )
 
+# Hack: if the VulkanUtilityLibraries package was not added above, require Vulkan headers via find_package
+if (NOT VulkanUtilityLibraries_ADDED)
+    find_package(VulkanHeaders 1.3.274 REQUIRED)
+endif()
+
 # SPIRV Tools
 AddJsonPackage(
     NAME spirv-tools
@@ -239,7 +236,7 @@ if (YUZU_CRASH_DUMPS AND NOT TARGET libbreakpad_client)
         file(GLOB_RECURSE LIBBREAKPAD_CLIENT_SOURCES ${breakpad_SOURCE_DIR}/src/client/mac/*.cc ${breakpad_SOURCE_DIR}/src/common/mac/*.cc)
         list(APPEND LIBBREAKPAD_CLIENT_SOURCES ${breakpad_SOURCE_DIR}/src/common/mac/MachIPC.mm)
     else()
-        target_compile_definitions(libbreakpad_client PUBLIC -DHAVE_A_OUT_H)
+        target_compile_definitions(libbreakpad_client PUBLIC HAVE_A_OUT_H)
         file(GLOB_RECURSE LIBBREAKPAD_CLIENT_SOURCES ${breakpad_SOURCE_DIR}/src/client/linux/*.cc ${breakpad_SOURCE_DIR}/src/common/linux/*.cc)
     endif()
     list(APPEND LIBBREAKPAD_CLIENT_SOURCES ${LIBBREAKPAD_COMMON_SOURCES})
diff --git a/externals/cpmfile.json b/externals/cpmfile.json
index 4bc4a97ca4..f8ca528951 100644
--- a/externals/cpmfile.json
+++ b/externals/cpmfile.json
@@ -3,6 +3,7 @@
         "repo": "Mbed-TLS/mbedtls",
         "sha": "8c88150ca1",
         "hash": "769ad1e94c570671071e1f2a5c0f1027e0bf6bcdd1a80ea8ac970f2c86bc45ce4e31aa88d6d8110fc1bed1de81c48bc624df1b38a26f8b340a44e109d784a966",
+        "find_args": "MODULE",
         "patches": [
             "0001-cmake-version.patch"
         ]
@@ -42,18 +43,13 @@
             "0002-missing-decl.patch"
         ]
     },
-    "vulkan-headers": {
-        "package": "VulkanHeaders",
-        "version": "1.3.274",
-        "repo": "KhronosGroup/Vulkan-Headers",
-        "sha": "89268a6d17",
-        "hash": "3ab349f74298ba72cafb8561015690c0674d428a09fb91ccd3cd3daca83650d190d46d33fd97b0a8fd4223fe6df2bcabae89136fbbf7c0bfeb8776f9448304c8"
-    },
-    "vulkan-utility-libraries": {
+    "vulkan-utility-headers": {
         "package": "VulkanUtilityLibraries",
-        "repo": "KhronosGroup/Vulkan-Utility-Libraries",
-        "sha": "df2e358152",
-        "hash": "3e468c3d9ff93f6d418d71e5527abe0a12c8c7ab5b0b52278bbbee4d02bb87e99073906729b727e0147242b7e3fd5dedf68b803f1878cb4c0e4f730bc2238d79"
+        "repo": "scripts/VulkanUtilityHeaders",
+        "tag": "1.4.326",
+        "artifact": "VulkanUtilityHeaders.tar.zst",
+        "git_host": "git.crueter.xyz",
+        "hash": "5924629755cb1605c4aa4eee20ef7957a9dd8d61e4df548be656d98054f2730c4109693c1bd35811f401f4705d2ccff9fc849be32b0d8480bc3f73541a5e0964"
     },
     "vulkan-memory-allocator": {
         "package": "VulkanMemoryAllocator",
diff --git a/externals/ffmpeg/CMakeLists.txt b/externals/ffmpeg/CMakeLists.txt
index ff35c8dc2c..8908aa234f 100644
--- a/externals/ffmpeg/CMakeLists.txt
+++ b/externals/ffmpeg/CMakeLists.txt
@@ -1,8 +1,6 @@
 # SPDX-FileCopyrightText: 2021 yuzu Emulator Project
 # SPDX-License-Identifier: GPL-2.0-or-later
 
-# Explicitly include CPMUtil here since we have a separate cpmfile for ffmpeg
-set(CPMUTIL_JSON_FILE ${CMAKE_CURRENT_SOURCE_DIR}/cpmfile.json)
 include(CPMUtil)
 
 if (NOT WIN32 AND NOT ANDROID)
diff --git a/externals/libusb/CMakeLists.txt b/externals/libusb/CMakeLists.txt
index ec7724e874..0a20ca94b8 100644
--- a/externals/libusb/CMakeLists.txt
+++ b/externals/libusb/CMakeLists.txt
@@ -1,7 +1,15 @@
 # SPDX-FileCopyrightText: 2020 yuzu Emulator Project
 # SPDX-License-Identifier: GPL-2.0-or-later
 
-if (MINGW OR (${CMAKE_SYSTEM_NAME} MATCHES "Linux") OR APPLE)
+include(CPMUtil)
+
+AddJsonPackage(libusb)
+
+if (NOT libusb_ADDED)
+    return()
+endif()
+
+if (MINGW OR PLATFORM_LINUX OR APPLE)
     set(LIBUSB_FOUND ON CACHE BOOL "libusb is present" FORCE)
     set(LIBUSB_VERSION "1.0.24" CACHE STRING "libusb version string" FORCE)
 
@@ -19,8 +27,8 @@ if (MINGW OR (${CMAKE_SYSTEM_NAME} MATCHES "Linux") OR APPLE)
         message(FATAL_ERROR "Required program `libtoolize` not found.")
     endif()
 
-    set(LIBUSB_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/libusb")
-    set(LIBUSB_SRC_DIR "${CMAKE_CURRENT_SOURCE_DIR}/libusb")
+    set(LIBUSB_PREFIX "${libusb_BINARY_DIR}")
+    set(LIBUSB_SRC_DIR "${libusb_SOURCE_DIR}")
 
     # Workarounds for MSYS/MinGW
     if (MSYS)
@@ -118,27 +126,27 @@ else() # MINGW OR (${CMAKE_SYSTEM_NAME} MATCHES "Linux")
     endif()
 
     add_library(usb
-        libusb/libusb/core.c
-        libusb/libusb/core.c
-        libusb/libusb/descriptor.c
-        libusb/libusb/hotplug.c
-        libusb/libusb/io.c
-        libusb/libusb/strerror.c
-        libusb/libusb/sync.c
+        ${libusb_SOURCE_DIR}/libusb/core.c
+        ${libusb_SOURCE_DIR}/libusb/core.c
+        ${libusb_SOURCE_DIR}/libusb/descriptor.c
+        ${libusb_SOURCE_DIR}/libusb/hotplug.c
+        ${libusb_SOURCE_DIR}/libusb/io.c
+        ${libusb_SOURCE_DIR}/libusb/strerror.c
+        ${libusb_SOURCE_DIR}/libusb/sync.c
     )
     set_target_properties(usb PROPERTIES VERSION 1.0.24)
     if(WIN32)
         target_include_directories(usb
             BEFORE
             PUBLIC
-              libusb/libusb
+              ${libusb_SOURCE_DIR}/libusb
 
             PRIVATE
               "${CMAKE_CURRENT_BINARY_DIR}"
         )
 
         if (NOT MINGW)
-            target_include_directories(usb BEFORE PRIVATE libusb/msvc)
+            target_include_directories(usb BEFORE PRIVATE ${libusb_SOURCE_DIR}/msvc)
         endif()
 
     else()
@@ -148,7 +156,7 @@ else() # MINGW OR (${CMAKE_SYSTEM_NAME} MATCHES "Linux")
             BEFORE
 
             PUBLIC
-              libusb/libusb
+              ${libusb_SOURCE_DIR}/libusb
 
             PRIVATE
               "${CMAKE_CURRENT_BINARY_DIR}"
@@ -157,15 +165,15 @@ else() # MINGW OR (${CMAKE_SYSTEM_NAME} MATCHES "Linux")
 
     if(WIN32 OR CYGWIN)
         target_sources(usb PRIVATE
-          libusb/libusb/os/threads_windows.c
-          libusb/libusb/os/windows_winusb.c
-          libusb/libusb/os/windows_usbdk.c
-          libusb/libusb/os/windows_common.c
+          ${libusb_SOURCE_DIR}/libusb/os/threads_windows.c
+          ${libusb_SOURCE_DIR}/libusb/os/windows_winusb.c
+          ${libusb_SOURCE_DIR}/libusb/os/windows_usbdk.c
+          ${libusb_SOURCE_DIR}/libusb/os/windows_common.c
         )
         set(OS_WINDOWS TRUE)
     elseif(APPLE)
         target_sources(usb PRIVATE
-            libusb/libusb/os/darwin_usb.c
+            ${libusb_SOURCE_DIR}/libusb/os/darwin_usb.c
         )
         find_library(COREFOUNDATION_LIBRARY CoreFoundation)
         find_library(IOKIT_LIBRARY IOKit)
@@ -178,20 +186,20 @@ else() # MINGW OR (${CMAKE_SYSTEM_NAME} MATCHES "Linux")
         set(OS_DARWIN TRUE)
     elseif(ANDROID)
         target_sources(usb PRIVATE
-            libusb/libusb/os/linux_usbfs.c
-            libusb/libusb/os/linux_netlink.c
+            ${libusb_SOURCE_DIR}/libusb/os/linux_usbfs.c
+            ${libusb_SOURCE_DIR}/libusb/os/linux_netlink.c
         )
         find_library(LOG_LIBRARY log)
         target_link_libraries(usb PRIVATE ${LOG_LIBRARY})
         set(OS_LINUX TRUE)
     elseif(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
         target_sources(usb PRIVATE
-            libusb/libusb/os/linux_usbfs.c
+            ${libusb_SOURCE_DIR}/libusb/os/linux_usbfs.c
         )
         find_package(Libudev)
         if(LIBUDEV_FOUND)
             target_sources(usb PRIVATE
-                libusb/libusb/os/linux_udev.c
+                ${libusb_SOURCE_DIR}/libusb/os/linux_udev.c
             )
             target_link_libraries(usb PRIVATE "${LIBUDEV_LIBRARIES}")
             target_include_directories(usb PRIVATE "${LIBUDEV_INCLUDE_DIR}")
@@ -199,26 +207,26 @@ else() # MINGW OR (${CMAKE_SYSTEM_NAME} MATCHES "Linux")
             set(USE_UDEV TRUE)
         else()
             target_sources(usb PRIVATE
-                libusb/libusb/os/linux_netlink.c
+                ${libusb_SOURCE_DIR}/libusb/os/linux_netlink.c
             )
         endif()
         set(OS_LINUX TRUE)
     elseif(${CMAKE_SYSTEM_NAME} MATCHES "NetBSD")
         target_sources(usb PRIVATE
-            libusb/libusb/os/netbsd_usb.c
+            ${libusb_SOURCE_DIR}/libusb/os/netbsd_usb.c
         )
         set(OS_NETBSD TRUE)
     elseif(${CMAKE_SYSTEM_NAME} MATCHES "OpenBSD")
         target_sources(usb PRIVATE
-            libusb/libusb/os/openbsd_usb.c
+            ${libusb_SOURCE_DIR}/libusb/os/openbsd_usb.c
         )
         set(OS_OPENBSD TRUE)
     endif()
 
     if(UNIX)
         target_sources(usb PRIVATE
-            libusb/libusb/os/events_posix.c
-            libusb/libusb/os/threads_posix.c
+            ${libusb_SOURCE_DIR}/libusb/os/events_posix.c
+            ${libusb_SOURCE_DIR}/libusb/os/threads_posix.c
         )
         find_package(Threads REQUIRED)
         if(THREADS_HAVE_PTHREAD_ARG)
@@ -230,8 +238,8 @@ else() # MINGW OR (${CMAKE_SYSTEM_NAME} MATCHES "Linux")
         set(THREADS_POSIX TRUE)
     elseif(WIN32)
         target_sources(usb PRIVATE
-            libusb/libusb/os/events_windows.c
-            libusb/libusb/os/threads_windows.c
+            ${libusb_SOURCE_DIR}/libusb/os/events_windows.c
+            ${libusb_SOURCE_DIR}/libusb/os/threads_windows.c
         )
     endif()
 
diff --git a/externals/libusb/cpmfile.json b/externals/libusb/cpmfile.json
new file mode 100644
index 0000000000..0bfa0d7a86
--- /dev/null
+++ b/externals/libusb/cpmfile.json
@@ -0,0 +1,8 @@
+{
+	"libusb": {
+		"repo": "libusb/libusb",
+		"sha": "c060e9ce30",
+		"hash": "44647357ba1179020cfa6674d809fc35cf6f89bff1c57252fe3a610110f5013ad678fc6eb5918e751d4384c30e2fe678868dbffc5f85736157e546cb9d10accc",
+		"find_args": "MODULE"
+	}
+}
\ No newline at end of file
diff --git a/externals/libusb/libusb b/externals/libusb/libusb
deleted file mode 160000
index c060e9ce30..0000000000
--- a/externals/libusb/libusb
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit c060e9ce30ac2e3ffb49d94209c4dae77b6642f7
diff --git a/externals/nx_tzdb/CMakeLists.txt b/externals/nx_tzdb/CMakeLists.txt
index 35d3e6d2a8..242e1e1fcf 100644
--- a/externals/nx_tzdb/CMakeLists.txt
+++ b/externals/nx_tzdb/CMakeLists.txt
@@ -4,8 +4,6 @@
 # SPDX-FileCopyrightText: 2023 yuzu Emulator Project
 # SPDX-License-Identifier: GPL-2.0-or-later
 
-# Explicitly include CPMUtil here since we have a separate cpmfile for nx_tzdb
-set(CPMUTIL_JSON_FILE ${CMAKE_CURRENT_SOURCE_DIR}/cpmfile.json)
 include(CPMUtil)
 
 set(NX_TZDB_INCLUDE_DIR "${CMAKE_CURRENT_BINARY_DIR}/include")
diff --git a/externals/nx_tzdb/cpmfile.json b/externals/nx_tzdb/cpmfile.json
index fc7dd77628..feb9daf7da 100644
--- a/externals/nx_tzdb/cpmfile.json
+++ b/externals/nx_tzdb/cpmfile.json
@@ -1,7 +1,10 @@
 {
     "tzdb": {
         "package": "nx_tzdb",
-        "url": "https://github.com/crueter/tzdb_to_nx/releases/download/250725/250725.zip",
+        "repo": "misc/tzdb_to_nx",
+        "git_host": "git.crueter.xyz",
+        "artifact": "%VERSION%.zip",
+        "tag": "%VERSION%",
         "hash": "8f60b4b29f285e39c0443f3d5572a73780f3dbfcfd5b35004451fadad77f3a215b2e2aa8d0fffe7e348e2a7b0660882b35228b6178dda8804a14ce44509fd2ca",
         "version": "250725"
     }
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index bd1285b2bc..b1fbab6a59 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -22,16 +22,16 @@ if (MSVC)
     set(CMAKE_CONFIGURATION_TYPES Debug Release CACHE STRING "" FORCE)
 
     # Silence "deprecation" warnings
-    add_definitions(-D_CRT_SECURE_NO_WARNINGS -D_CRT_NONSTDC_NO_DEPRECATE -D_SCL_SECURE_NO_WARNINGS)
+    add_compile_definitions(_CRT_SECURE_NO_WARNINGS _CRT_NONSTDC_NO_DEPRECATE _SCL_SECURE_NO_WARNINGS)
 
     # Avoid windows.h junk
-    add_definitions(-DNOMINMAX)
+    add_compile_definitions(NOMINMAX)
 
     # Avoid windows.h from including some usually unused libs like winsocks.h, since this might cause some redefinition errors.
-    add_definitions(-DWIN32_LEAN_AND_MEAN)
+    add_compile_definitions(WIN32_LEAN_AND_MEAN)
 
     # Ensure that projects are built with Unicode support.
-    add_definitions(-DUNICODE -D_UNICODE)
+    add_compile_definitions(UNICODE _UNICODE)
 
     # /W4                 - Level 4 warnings
     # /MP                 - Multi-threaded compilation
@@ -169,15 +169,15 @@ else()
     # glibc, which may default to 32 bits. glibc allows this to be configured
     # by setting _FILE_OFFSET_BITS.
     if(CMAKE_SYSTEM_NAME STREQUAL "Linux" OR MINGW)
-        add_definitions(-D_FILE_OFFSET_BITS=64)
+        add_compile_definitions(_FILE_OFFSET_BITS=64)
     endif()
 
     if (MINGW)
-        add_definitions(-DMINGW_HAS_SECURE_API)
+        add_compile_definitions(MINGW_HAS_SECURE_API)
         add_compile_options("-msse4.1")
 
         if (MINGW_STATIC_BUILD)
-            add_definitions(-DQT_STATICPLUGIN)
+            add_compile_definitions(QT_STATICPLUGIN)
             add_compile_options("-static")
         endif()
     endif()
diff --git a/src/android/app/src/main/jni/CMakeLists.txt b/src/android/app/src/main/jni/CMakeLists.txt
index 9dbee1fcef..9ad00d26ee 100644
--- a/src/android/app/src/main/jni/CMakeLists.txt
+++ b/src/android/app/src/main/jni/CMakeLists.txt
@@ -17,7 +17,7 @@ add_library(yuzu-android SHARED
 
 set_property(TARGET yuzu-android PROPERTY IMPORTED_LOCATION ${FFmpeg_LIBRARY_DIR})
 
-target_link_libraries(yuzu-android PRIVATE audio_core common core input_common frontend_common Vulkan::Headers GPUOpen::VulkanMemoryAllocator)
+target_link_libraries(yuzu-android PRIVATE audio_core common core input_common frontend_common video_core)
 target_link_libraries(yuzu-android PRIVATE android camera2ndk EGL glad jnigraphics log)
 if (ARCHITECTURE_arm64)
     target_link_libraries(yuzu-android PRIVATE adrenotools)
diff --git a/src/audio_core/CMakeLists.txt b/src/audio_core/CMakeLists.txt
index e040ec756d..389b1044e9 100644
--- a/src/audio_core/CMakeLists.txt
+++ b/src/audio_core/CMakeLists.txt
@@ -1,3 +1,6 @@
+# SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
+# SPDX-License-Identifier: GPL-3.0-or-later
+
 # SPDX-FileCopyrightText: 2018 yuzu Emulator Project
 # SPDX-License-Identifier: GPL-2.0-or-later
 
@@ -229,9 +232,10 @@ endif()
 target_include_directories(audio_core PRIVATE ${OPUS_INCLUDE_DIRS})
 target_link_libraries(audio_core PUBLIC common core opus)
 
-if (ARCHITECTURE_x86_64 OR ARCHITECTURE_arm64)
-    target_link_libraries(audio_core PRIVATE dynarmic::dynarmic)
-endif()
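+# TODO(review): dynarmic link disabled; it is unclear why audio_core linked it - confirm nothing needs it, then remove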
+# if (ARCHITECTURE_x86_64 OR ARCHITECTURE_arm64)
+#     target_link_libraries(audio_core PRIVATE dynarmic::dynarmic)
+# endif()
 
 if (ENABLE_CUBEB)
     target_sources(audio_core PRIVATE
@@ -240,7 +244,7 @@ if (ENABLE_CUBEB)
     )
 
     target_link_libraries(audio_core PRIVATE cubeb)
-    target_compile_definitions(audio_core PRIVATE -DHAVE_CUBEB=1)
+    target_compile_definitions(audio_core PRIVATE HAVE_CUBEB=1)
 endif()
 
 if (ENABLE_SDL2)
diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
index 33990d61a5..1979d427b5 100644
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -1,3 +1,6 @@
+# SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
+# SPDX-License-Identifier: GPL-3.0-or-later
+
 # SPDX-FileCopyrightText: 2018 yuzu Emulator Project
 # SPDX-License-Identifier: GPL-2.0-or-later
 
@@ -1155,7 +1158,7 @@ add_library(core STATIC
 
 if (ENABLE_WIFI_SCAN)
     # find_package(libiw REQUIRED)
-    target_compile_definitions(core PRIVATE -DENABLE_WIFI_SCAN)
+    target_compile_definitions(core PRIVATE ENABLE_WIFI_SCAN)
     target_link_libraries(core PRIVATE iw)
 endif()
 
@@ -1196,13 +1199,13 @@ else()
     target_link_libraries(core PUBLIC Boost::headers)
 endif()
 
-target_link_libraries(core PRIVATE fmt::fmt nlohmann_json::nlohmann_json mbedtls RenderDoc::API)
+target_link_libraries(core PRIVATE fmt::fmt nlohmann_json::nlohmann_json RenderDoc::API mbedtls)
 if (MINGW)
     target_link_libraries(core PRIVATE ${MSWSOCK_LIBRARY})
 endif()
 
 if (ENABLE_WEB_SERVICE)
-    target_compile_definitions(core PUBLIC -DENABLE_WEB_SERVICE)
+    target_compile_definitions(core PUBLIC ENABLE_WEB_SERVICE)
     target_link_libraries(core PUBLIC web_service)
 endif()
 
diff --git a/src/core/crypto/key_manager.cpp b/src/core/crypto/key_manager.cpp
index 74b1ca04b1..04b75d5e8f 100644
--- a/src/core/crypto/key_manager.cpp
+++ b/src/core/crypto/key_manager.cpp
@@ -1,3 +1,6 @@
+// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
+
 // SPDX-FileCopyrightText: Copyright 2018 yuzu Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later
 
@@ -31,6 +34,10 @@
 #include "core/hle/service/filesystem/filesystem.h"
 #include "core/loader/loader.h"
 
+#ifndef MBEDTLS_CMAC_C
+#error mbedtls was compiled without CMAC support. Check your USE flags (Gentoo) or contact your package maintainer.
+#endif
+
 namespace Core::Crypto {
 namespace {
 
diff --git a/src/dedicated_room/CMakeLists.txt b/src/dedicated_room/CMakeLists.txt
index 9391a71b6c..e5934c941a 100644
--- a/src/dedicated_room/CMakeLists.txt
+++ b/src/dedicated_room/CMakeLists.txt
@@ -1,3 +1,6 @@
+# SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
+# SPDX-License-Identifier: GPL-3.0-or-later
+
 # SPDX-FileCopyrightText: 2017 Citra Emulator Project
 # SPDX-License-Identifier: GPL-2.0-or-later
 
@@ -13,7 +16,7 @@ add_library(yuzu-room STATIC EXCLUDE_FROM_ALL
 
 target_link_libraries(yuzu-room PRIVATE common network)
 if (ENABLE_WEB_SERVICE)
-    target_compile_definitions(yuzu-room PRIVATE -DENABLE_WEB_SERVICE)
+    target_compile_definitions(yuzu-room PRIVATE ENABLE_WEB_SERVICE)
     target_link_libraries(yuzu-room PRIVATE web_service)
 endif()
 
diff --git a/src/dynarmic/CMakeLists.txt b/src/dynarmic/CMakeLists.txt
index 842eb91a88..0065b1cf7f 100644
--- a/src/dynarmic/CMakeLists.txt
+++ b/src/dynarmic/CMakeLists.txt
@@ -1,3 +1,6 @@
+# SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
+# SPDX-License-Identifier: GPL-3.0-or-later
+
 cmake_minimum_required(VERSION 3.12)
 project(dynarmic LANGUAGES C CXX ASM VERSION 6.7.0)
 
@@ -147,28 +150,26 @@ else()
     endif()
 endif()
 
-# Forced use of individual bundled libraries for non-REQUIRED library is possible with e.g. cmake -DCMAKE_DISABLE_FIND_PACKAGE_fmt=ON ...
-
-if (DYNARMIC_USE_BUNDLED_EXTERNALS)
-    set(CMAKE_DISABLE_FIND_PACKAGE_biscuit ON)
-    set(CMAKE_DISABLE_FIND_PACKAGE_fmt ON)
-    set(CMAKE_DISABLE_FIND_PACKAGE_mcl ON)
-    set(CMAKE_DISABLE_FIND_PACKAGE_oaknut ON)
-    set(CMAKE_DISABLE_FIND_PACKAGE_unordered_dense ON)
-    set(CMAKE_DISABLE_FIND_PACKAGE_xbyak ON)
-    set(CMAKE_DISABLE_FIND_PACKAGE_Zydis ON)
-    set(CMAKE_DISABLE_FIND_PACKAGE_Zycore ON)
-endif()
-
 find_package(Boost 1.57 REQUIRED)
 find_package(fmt 9 CONFIG)
 
+# Pull in externals CMakeLists for libs where available
+add_subdirectory(externals)
+
+find_package(mcl 0.1.12 REQUIRED)
+
 if ("arm64" IN_LIST ARCHITECTURE OR DYNARMIC_TESTS)
     find_package(oaknut 2.0.1 CONFIG)
 endif()
 
+if ("riscv" IN_LIST ARCHITECTURE)
+    find_package(biscuit 0.9.1 REQUIRED)
+endif()
+
 if ("x86_64" IN_LIST ARCHITECTURE)
     find_package(xbyak 7 CONFIG)
+    find_package(zycore REQUIRED)
+    find_package(zydis 4 REQUIRED)
 endif()
 
 if (DYNARMIC_USE_LLVM)
@@ -183,9 +184,6 @@ if (DYNARMIC_TESTS)
     endif()
 endif()
 
-# Pull in externals CMakeLists for libs where available
-add_subdirectory(externals)
-
 # Dynarmic project files
 add_subdirectory(src/dynarmic)
 if (DYNARMIC_TESTS)
diff --git a/src/dynarmic/externals/CMakeLists.txt b/src/dynarmic/externals/CMakeLists.txt
index 26f9290ed8..ea666ddc52 100644
--- a/src/dynarmic/externals/CMakeLists.txt
+++ b/src/dynarmic/externals/CMakeLists.txt
@@ -1,5 +1,6 @@
-# Explicitly include CPMUtil here since we have a separate cpmfile for dynarmic
-set(CPMUTIL_JSON_FILE ${CMAKE_CURRENT_SOURCE_DIR}/cpmfile.json)
+# SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
+# SPDX-License-Identifier: GPL-3.0-or-later
+
 include(CPMUtil)
 
 # Always build externals as static libraries, even when dynarmic is built as shared
@@ -20,62 +21,25 @@ set(BUILD_TESTING OFF)
 # biscuit
 
 if ("riscv" IN_LIST ARCHITECTURE)
-    add_subdirectory(biscuit)
-
     AddJsonPackage(
         NAME biscuit
         BUNDLED_PACKAGE ${DYNARMIC_USE_BUNDLED_EXTERNALS}
     )
 endif()
 
-# catch
-
-# if (NOT TARGET Catch2::Catch2WithMain)
-#     if (DYNARMIC_TESTS)
-#         find_package(Catch2 3.0.1 REQUIRED)
-#     endif()
-# endif()
-
-# fmt
-
-# if (NOT TARGET fmt::fmt)
-#     # fmtlib formatting library
-#     set(FMT_INSTALL ON)
-#     add_subdirectory(fmt)
-# endif()
-
 # mcl
 AddJsonPackage(
     NAME mcl
     BUNDLED_PACKAGE ${DYNARMIC_USE_BUNDLED_EXTERNALS}
 )
 
-# oaknut
-
-# if (NOT TARGET merry::oaknut)
-#     if ("arm64" IN_LIST ARCHITECTURE)
-#         add_subdirectory(oaknut)
-#     elseif (DYNARMIC_TESTS)
-#         add_subdirectory(oaknut EXCLUDE_FROM_ALL)
-#     endif()
-# endif()
-
-# xbyak
-# uncomment if in an independent repo
-
-# if (NOT TARGET xbyak::xbyak)
-#     if ("x86_64" IN_LIST ARCHITECTURE)
-#         add_subdirectory(xbyak)
-#     endif()
-# endif()
-
-# zydis
-
 # TODO(crueter): maybe it's just Gentoo but zydis system package really sucks
 if ("x86_64" IN_LIST ARCHITECTURE)
     set(CMAKE_DISABLE_FIND_PACKAGE_Doxygen ON)
-    # TODO(crueter): system zycore doesn't work with zydis
-    AddJsonPackage(zycore)
+    AddJsonPackage(
+        NAME zycore
+        BUNDLED_PACKAGE ${DYNARMIC_USE_BUNDLED_EXTERNALS}
+    )
 
     AddJsonPackage(
         NAME zydis
diff --git a/src/dynarmic/externals/cpmfile.json b/src/dynarmic/externals/cpmfile.json
index e9406cbe81..718163baf5 100644
--- a/src/dynarmic/externals/cpmfile.json
+++ b/src/dynarmic/externals/cpmfile.json
@@ -15,14 +15,13 @@
         ]
     },
     "zycore": {
-        "package": "Zycore",
+        "package": "zycore",
         "repo": "zyantific/zycore-c",
         "sha": "75a36c45ae",
-        "hash": "15aa399f39713e042c4345bc3175c82f14dca849fde2a21d4f591f62c43e227b70d868d8bb86beb5f4eb68b1d6bd3792cdd638acf89009e787e3d10ee7401924",
-        "bundled": true
+        "hash": "15aa399f39713e042c4345bc3175c82f14dca849fde2a21d4f591f62c43e227b70d868d8bb86beb5f4eb68b1d6bd3792cdd638acf89009e787e3d10ee7401924"
     },
     "zydis": {
-        "package": "Zydis",
+        "package": "zydis",
         "version": "4",
         "repo": "zyantific/zydis",
         "sha": "c2d2bab025",
diff --git a/src/dynarmic/src/dynarmic/CMakeLists.txt b/src/dynarmic/src/dynarmic/CMakeLists.txt
index 7ec92206f9..efae44d917 100644
--- a/src/dynarmic/src/dynarmic/CMakeLists.txt
+++ b/src/dynarmic/src/dynarmic/CMakeLists.txt
@@ -164,7 +164,7 @@ if ("x86_64" IN_LIST ARCHITECTURE)
     target_link_libraries(dynarmic
         PRIVATE
             xbyak::xbyak
-            Zydis
+            Zydis::Zydis
     )
 
     target_architecture_specific_sources(dynarmic "x86_64"
diff --git a/src/network/CMakeLists.txt b/src/network/CMakeLists.txt
index d0787b0936..1487033b22 100644
--- a/src/network/CMakeLists.txt
+++ b/src/network/CMakeLists.txt
@@ -1,3 +1,6 @@
+# SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
+# SPDX-License-Identifier: GPL-3.0-or-later
+
 # SPDX-FileCopyrightText: 2022 yuzu Emulator Project
 # SPDX-License-Identifier: GPL-3.0-or-later
 
@@ -21,7 +24,7 @@ create_target_directory_groups(network)
 
 target_link_libraries(network PRIVATE common enet Boost::headers)
 if (ENABLE_WEB_SERVICE)
-    target_compile_definitions(network PRIVATE -DENABLE_WEB_SERVICE)
+    target_compile_definitions(network PRIVATE ENABLE_WEB_SERVICE)
     target_link_libraries(network PRIVATE web_service)
 endif()
 
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index 8131d42aae..e0f7f82fbe 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -332,8 +332,10 @@ target_link_options(video_core PRIVATE ${FFmpeg_LDFLAGS})
 add_dependencies(video_core host_shaders)
 target_include_directories(video_core PRIVATE ${HOST_SHADERS_INCLUDE})
 
-target_link_libraries(video_core PRIVATE sirit Vulkan::Headers Vulkan::UtilityHeaders)
-target_link_libraries(video_core PUBLIC GPUOpen::VulkanMemoryAllocator)
+target_link_libraries(video_core PRIVATE sirit)
+
+# Header-only stuff needed by all dependent targets
+target_link_libraries(video_core PUBLIC Vulkan::UtilityHeaders GPUOpen::VulkanMemoryAllocator)
 
 if (ENABLE_NSIGHT_AFTERMATH)
     if (NOT DEFINED ENV{NSIGHT_AFTERMATH_SDK})
diff --git a/src/yuzu/CMakeLists.txt b/src/yuzu/CMakeLists.txt
index 0ce8f3b898..38b7b0eec7 100644
--- a/src/yuzu/CMakeLists.txt
+++ b/src/yuzu/CMakeLists.txt
@@ -401,7 +401,6 @@ target_link_libraries(yuzu PRIVATE nlohmann_json::nlohmann_json)
 target_link_libraries(yuzu PRIVATE Boost::headers glad Qt6::Widgets)
 target_link_libraries(yuzu PRIVATE ${PLATFORM_LIBRARIES} Threads::Threads)
 
-target_link_libraries(yuzu PRIVATE Vulkan::Headers)
 if (NOT WIN32)
     target_include_directories(yuzu PRIVATE ${Qt6Gui_PRIVATE_INCLUDE_DIRS})
 endif()
@@ -416,24 +415,24 @@ endif()
 target_compile_definitions(yuzu PRIVATE
     # Use QStringBuilder for string concatenation to reduce
     # the overall number of temporary strings created.
-    -DQT_USE_QSTRINGBUILDER
+    QT_USE_QSTRINGBUILDER
 
     # Disable implicit conversions from/to C strings
-    -DQT_NO_CAST_FROM_ASCII
-    -DQT_NO_CAST_TO_ASCII
+    QT_NO_CAST_FROM_ASCII
+    QT_NO_CAST_TO_ASCII
 
     # Disable implicit type narrowing in signal/slot connect() calls.
-    -DQT_NO_NARROWING_CONVERSIONS_IN_CONNECT
+    QT_NO_NARROWING_CONVERSIONS_IN_CONNECT
 
     # Disable unsafe overloads of QProcess' start() function.
-    -DQT_NO_PROCESS_COMBINED_ARGUMENT_START
+    QT_NO_PROCESS_COMBINED_ARGUMENT_START
 
     # Disable implicit QString->QUrl conversions to enforce use of proper resolving functions.
-    -DQT_NO_URL_CAST_FROM_STRING
+    QT_NO_URL_CAST_FROM_STRING
 )
 
 if (YUZU_ENABLE_COMPATIBILITY_REPORTING)
-    target_compile_definitions(yuzu PRIVATE -DYUZU_ENABLE_COMPATIBILITY_REPORTING)
+    target_compile_definitions(yuzu PRIVATE YUZU_ENABLE_COMPATIBILITY_REPORTING)
 endif()
 
 if (USE_DISCORD_PRESENCE)
@@ -441,22 +440,22 @@ if (USE_DISCORD_PRESENCE)
         discord_impl.cpp
         discord_impl.h
     )
-    target_link_libraries(yuzu PRIVATE DiscordRPC::discord-rpc httplib::httplib Qt${QT_MAJOR_VERSION}::Network)
-    target_compile_definitions(yuzu PRIVATE -DUSE_DISCORD_PRESENCE)
+    target_link_libraries(yuzu PRIVATE DiscordRPC::discord-rpc httplib::httplib Qt6::Network)
+    target_compile_definitions(yuzu PRIVATE USE_DISCORD_PRESENCE)
 endif()
 
 if (ENABLE_WEB_SERVICE)
-    target_compile_definitions(yuzu PRIVATE -DENABLE_WEB_SERVICE)
+    target_compile_definitions(yuzu PRIVATE ENABLE_WEB_SERVICE)
 endif()
 
 if (YUZU_USE_QT_MULTIMEDIA)
-    target_link_libraries(yuzu PRIVATE Qt${QT_MAJOR_VERSION}::Multimedia)
-    target_compile_definitions(yuzu PRIVATE -DYUZU_USE_QT_MULTIMEDIA)
+    target_link_libraries(yuzu PRIVATE Qt6::Multimedia)
+    target_compile_definitions(yuzu PRIVATE YUZU_USE_QT_MULTIMEDIA)
 endif ()
 
 if (YUZU_USE_QT_WEB_ENGINE)
-    target_link_libraries(yuzu PRIVATE Qt${QT_MAJOR_VERSION}::WebEngineCore Qt${QT_MAJOR_VERSION}::WebEngineWidgets)
-    target_compile_definitions(yuzu PRIVATE -DYUZU_USE_QT_WEB_ENGINE)
+    target_link_libraries(yuzu PRIVATE Qt6::WebEngineCore Qt6::WebEngineWidgets)
+    target_compile_definitions(yuzu PRIVATE YUZU_USE_QT_WEB_ENGINE)
 endif ()
 
 if(UNIX AND NOT APPLE)
@@ -468,6 +467,7 @@ if (WIN32 AND NOT YUZU_USE_BUNDLED_QT AND QT_VERSION VERSION_GREATER_EQUAL 6)
     add_custom_command(TARGET yuzu POST_BUILD COMMAND ${WINDEPLOYQT_EXECUTABLE} "${YUZU_EXE_DIR}/eden.exe" --dir "${YUZU_EXE_DIR}" --libdir "${YUZU_EXE_DIR}" --plugindir "${YUZU_EXE_DIR}/plugins" --no-compiler-runtime --no-opengl-sw --no-system-d3d-compiler --no-translations --verbose 0)
 endif()
 
+# TODO(crueter): this can be done with system qt in a better way
 if (YUZU_USE_BUNDLED_QT)
     include(CopyYuzuQt6Deps)
     copy_yuzu_Qt6_deps(yuzu)
diff --git a/src/yuzu/externals/CMakeLists.txt b/src/yuzu/externals/CMakeLists.txt
index 7de41f6dfd..50594a741f 100644
--- a/src/yuzu/externals/CMakeLists.txt
+++ b/src/yuzu/externals/CMakeLists.txt
@@ -1,8 +1,6 @@
 # SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
 # SPDX-License-Identifier: GPL-3.0-or-later
 
-# Explicitly include CPMUtil here since we have a separate cpmfile for Qt externals
-set(CPMUTIL_JSON_FILE ${CMAKE_CURRENT_SOURCE_DIR}/cpmfile.json)
 include(CPMUtil)
 
 # Disable tests/tools in all externals supporting the standard option name
diff --git a/src/yuzu_cmd/CMakeLists.txt b/src/yuzu_cmd/CMakeLists.txt
index a7cf6d204c..a60650bc19 100644
--- a/src/yuzu_cmd/CMakeLists.txt
+++ b/src/yuzu_cmd/CMakeLists.txt
@@ -1,3 +1,6 @@
+# SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
+# SPDX-License-Identifier: GPL-3.0-or-later
+
 # SPDX-FileCopyrightText: 2018 yuzu Emulator Project
 # SPDX-License-Identifier: GPL-2.0-or-later
 
@@ -28,7 +31,7 @@ add_executable(yuzu-cmd
     yuzu.rc
 )
 
-target_link_libraries(yuzu-cmd PRIVATE common core input_common frontend_common)
+target_link_libraries(yuzu-cmd PRIVATE common core input_common frontend_common video_core)
 target_link_libraries(yuzu-cmd PRIVATE glad)
 if (MSVC)
     target_link_libraries(yuzu-cmd PRIVATE getopt)
@@ -38,8 +41,7 @@ target_link_libraries(yuzu-cmd PRIVATE ${PLATFORM_LIBRARIES} Threads::Threads)
 create_resource("../../dist/yuzu.bmp" "yuzu_cmd/yuzu_icon.h" "yuzu_icon")
 target_include_directories(yuzu-cmd PRIVATE ${RESOURCES_DIR})
 
-target_link_libraries(yuzu-cmd PRIVATE SDL2::SDL2 Vulkan::Headers)
-target_link_libraries(yuzu-cmd PRIVATE GPUOpen::VulkanMemoryAllocator)
+target_link_libraries(yuzu-cmd PRIVATE SDL2::SDL2)
 
 if(UNIX AND NOT APPLE)
     install(TARGETS yuzu-cmd)
diff --git a/tools/cpm-fetch-all.sh b/tools/cpm-fetch-all.sh
index 38f7b1f941..eac0f861a4 100755
--- a/tools/cpm-fetch-all.sh
+++ b/tools/cpm-fetch-all.sh
@@ -6,5 +6,6 @@
 # SPDX-FileCopyrightText: 2025 crueter
 # SPDX-License-Identifier: GPL-3.0-or-later
 
-LIBS=$(find . externals externals/nx_tzdb src/yuzu/externals externals/ffmpeg src/dynarmic/externals -maxdepth 1 -name cpmfile.json -exec jq -j 'keys_unsorted | join(" ")' {} \; -printf " ")
+# Start from the root cpmfile.json itself rather than "." so -maxdepth 2 cannot list externals/cpmfile.json twice.
+LIBS=$(find cpmfile.json externals src/yuzu/externals src/dynarmic -maxdepth 2 -name cpmfile.json -exec jq -j 'keys_unsorted | join(" ")' {} \; -printf " ")
 tools/cpm-fetch.sh $LIBS
\ No newline at end of file
diff --git a/tools/cpm-fetch.sh b/tools/cpm-fetch.sh
index 1c2ce007d2..648bbae1c8 100755
--- a/tools/cpm-fetch.sh
+++ b/tools/cpm-fetch.sh
@@ -84,7 +84,7 @@ ci_package() {
 
   for platform in windows-amd64 windows-arm64 android solaris freebsd linux linux-aarch64; do
     FILENAME="${NAME}-${platform}-${VERSION}.${EXT}"
-    DOWNLOAD="https://github.com/${REPO}/releases/download/v${VERSION}/${FILENAME}"
+    DOWNLOAD="https://$GIT_HOST/${REPO}/releases/download/v${VERSION}/${FILENAME}"
     PACKAGE_NAME="$PACKAGE"
     KEY=$platform
 
@@ -122,14 +122,32 @@ do
   URL=$(jq -r ".url" <<< "$JSON")
   REPO=$(jq -r ".repo" <<< "$JSON")
   SHA=$(jq -r ".sha" <<< "$JSON")
+  GIT_HOST=$(jq -r ".git_host" <<< "$JSON")
+
+  [ "$GIT_HOST" == null ] && GIT_HOST=github.com
+
+  VERSION=$(jq -r ".version" <<< "$JSON")
+  GIT_VERSION=$(jq -r ".git_version" <<< "$JSON")
+
+  if [ "$GIT_VERSION" != null ]; then
+    VERSION_REPLACE="$GIT_VERSION"
+  else
+    VERSION_REPLACE="$VERSION"
+  fi
+
+  TAG=$(jq -r ".tag" <<< "$JSON")
+  # Expand the %VERSION% / %TAG% placeholders; '|' is the sed delimiter so values containing '/' stay intact.
+  TAG=$(sed "s|%VERSION%|$VERSION_REPLACE|" <<< "$TAG")
+
+  ARTIFACT=$(jq -r ".artifact" <<< "$JSON")
+  ARTIFACT=$(sed "s|%VERSION%|$VERSION_REPLACE|" <<< "$ARTIFACT")
+  ARTIFACT=$(sed "s|%TAG%|$TAG|" <<< "$ARTIFACT")
 
   if [ "$URL" != "null" ]; then
     DOWNLOAD="$URL"
   elif [ "$REPO" != "null" ]; then
-    GIT_URL="https://github.com/$REPO"
+    GIT_URL="https://$GIT_HOST/$REPO"
 
-    TAG=$(jq -r ".tag" <<< "$JSON")
-    ARTIFACT=$(jq -r ".artifact" <<< "$JSON")
     BRANCH=$(jq -r ".branch" <<< "$JSON")
 
     if [ "$TAG" != "null" ]; then
@@ -155,24 +173,21 @@ do
   # key parsing
   KEY=$(jq -r ".key" <<< "$JSON")
 
-  if [ "$KEY" == null ]; then
-    VERSION=$(jq -r ".version" <<< "$JSON")
-    GIT_VERSION=$(jq -r ".git_version" <<< "$JSON")
-    
+  if [ "$KEY" == null ]; then    
     if [ "$SHA" != null ]; then
       KEY=$(cut -c1-4 - <<< "$SHA")
     elif [ "$GIT_VERSION" != null ]; then
       KEY="$GIT_VERSION"
+    elif [ "$TAG" != null ]; then
+      KEY="$TAG"
     elif [ "$VERSION" != null ]; then
       KEY="$VERSION"
     else
-      echo "No valid key could be determined for $package. Must define one of: key, sha, version, git_version"
+      echo "No valid key could be determined for $package. Must define one of: key, sha, tag, version, git_version"
       continue
     fi
   fi
 
-  echo $KEY
-
   echo "Downloading regular package $package, with key $KEY, from $DOWNLOAD"
 
   # hash parsing

From 9d2681ecc9565681db623fb71799e76381998512 Mon Sep 17 00:00:00 2001
From: lizzie <lizzie@eden-emu.dev>
Date: Tue, 9 Sep 2025 20:47:49 +0200
Subject: [PATCH 31/38] [cmake] enable clang-cl and WoA builds (#348)

Compilation and CMake fixes for both Windows on ARM and clang-cl, meaning Windows can now be built on both MSVC and clang on both amd64 and aarch64.

Compiling on clang is *dramatically* faster so this should be useful for CI.

Co-authored-by: crueter <crueter@eden-emu.dev>
Co-authored-by: crueter <crueter@crueter.xyz>
Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/348
Reviewed-by: CamilleLaVey <camillelavey99@gmail.com>
Reviewed-by: crueter <crueter@eden-emu.dev>
Co-authored-by: lizzie <lizzie@eden-emu.dev>
Co-committed-by: lizzie <lizzie@eden-emu.dev>
---
 .ci/windows/build.sh                          |  67 +++----
 .patch/boost/0001-clang-cl.patch              |  13 ++
 .patch/boost/0002-use-marmasm.patch           |  11 ++
 .patch/boost/0003-armasm-options.patch        |  14 ++
 .patch/cpp-jwt/0001-no-install.patch          |  47 -----
 .patch/cpp-jwt/0002-missing-decl.patch        |  13 --
 .patch/discord-rpc/0001-cmake-version.patch   |  10 -
 .patch/discord-rpc/0002-no-clang-format.patch |  40 ----
 .patch/discord-rpc/0003-fix-cpp17.patch       |  31 ---
 .patch/unordered-dense/0001-cmake.patch       |  22 ---
 CMakeLists.txt                                | 179 ++++++++++++------
 CMakeModules/DownloadExternals.cmake          |  13 +-
 CMakeModules/GenerateSCMRev.cmake             |   2 +
 CMakeModules/WindowsCopyFiles.cmake           |  33 ++--
 cpmfile.json                                  |  36 ++--
 externals/CMakeLists.txt                      |  23 ++-
 externals/cpmfile.json                        |  31 +--
 externals/sse2neon/sse2neon.h                 |  22 +--
 src/CMakeLists.txt                            |  39 ++--
 .../apps/audio_renderer/audio_renderer.cpp    |   2 +-
 src/audio_core/common/common.h                |   4 +-
 src/audio_core/device/audio_buffers.h         |   4 +-
 src/audio_core/renderer/audio_device.cpp      |   4 +-
 .../renderer/behavior/behavior_info.cpp       |   2 +-
 .../renderer/command/command_buffer.cpp       |   2 +-
 .../renderer/command/command_generator.h      |   6 +-
 .../renderer/command/data_source/decode.cpp   |  18 +-
 .../renderer/command/effect/aux_.cpp          |   4 +-
 .../renderer/command/effect/biquad_filter.cpp |   8 +-
 .../renderer/command/effect/capture.cpp       |   2 +-
 .../renderer/command/effect/i3dl2_reverb.cpp  |   6 +-
 .../renderer/command/effect/light_limiter.cpp |   8 +-
 .../renderer/command/effect/reverb.cpp        |   8 +-
 .../command/mix/depop_for_mix_buffers.cpp     |   2 +-
 .../renderer/command/resample/upsample.cpp    |   2 +-
 .../renderer/command/sink/circular_buffer.cpp |   4 +-
 .../renderer/command/sink/device.cpp          |   4 +-
 src/audio_core/renderer/mix/mix_context.cpp   |   2 +-
 src/audio_core/renderer/sink/sink_info_base.h |   4 +-
 .../renderer/splitter/splitter_context.cpp    |   2 +-
 src/audio_core/renderer/system.cpp            |   2 +-
 src/audio_core/sink/cubeb_sink.cpp            |   6 +-
 src/audio_core/sink/sink_stream.cpp           |   8 +-
 src/common/CMakeLists.txt                     |   4 +-
 src/common/free_region_manager.h              |   4 +-
 src/common/fs/path_util.cpp                   |   6 +-
 src/common/heap_tracker.cpp                   |   7 +-
 src/common/host_memory.cpp                    |  12 +-
 src/common/logging/log.h                      |   2 +-
 src/common/math_util.h                        |   8 +-
 src/common/overflow.h                         |   4 +-
 src/common/range_map.h                        |   4 +-
 src/common/range_sets.inc                     |   2 +-
 src/common/ring_buffer.h                      |  12 +-
 src/common/scm_rev.cpp.in                     |  76 ++------
 src/common/scm_rev.h                          |  10 +-
 src/common/settings.h                         |   8 +-
 src/common/settings_setting.h                 |   6 +-
 src/common/slot_vector.h                      |   2 +-
 src/common/thread.cpp                         |   4 +-
 src/common/tiny_mt.h                          |   2 +-
 src/common/uint128.h                          |   6 +-
 src/common/x64/cpu_wait.cpp                   |   2 +-
 src/core/CMakeLists.txt                       |   1 +
 src/core/arm/debug.cpp                        |   4 +-
 src/core/arm/dynarmic/dynarmic_cp15.cpp       |   4 +
 src/core/arm/nce/interpreter_visitor.h        |   5 +
 src/core/arm/nce/patcher.cpp                  |   2 +
 src/core/core.cpp                             |   2 +-
 src/core/crypto/xts_encryption_layer.cpp      |   4 +-
 src/core/debugger/gdbstub.cpp                 |   4 +-
 src/core/debugger/gdbstub_arch.cpp            |   2 +-
 src/core/device_memory_manager.inc            |   2 +-
 src/core/file_sys/fs_path_utility.h           |   2 +-
 src/core/file_sys/fsa/fs_i_directory.h        |   2 +-
 src/core/file_sys/fsa/fs_i_file.h             |   2 +-
 ...ystem_aes_ctr_counter_extended_storage.cpp |   2 +-
 .../fssystem/fssystem_aes_ctr_storage.cpp     |   2 +-
 .../fssystem/fssystem_aes_xts_storage.cpp     |   6 +-
 ...system_alignment_matching_storage_impl.cpp |   4 +-
 .../fssystem_crypto_configuration.cpp         |   4 +-
 ...ssystem_integrity_verification_storage.cpp |   2 +-
 .../file_sys/fssystem/fssystem_nca_header.cpp |   2 +-
 .../fssystem/fssystem_pooled_buffer.cpp       |   2 +-
 .../fssystem/fssystem_sparse_storage.h        |   4 +-
 src/core/file_sys/nca_metadata.cpp            |   2 +-
 src/core/file_sys/registered_cache.cpp        |   2 +-
 src/core/file_sys/romfs.cpp                   |   2 +-
 src/core/file_sys/vfs/vfs.cpp                 |   4 +-
 src/core/file_sys/vfs/vfs_static.h            |   4 +-
 src/core/file_sys/vfs/vfs_vector.cpp          |   4 +-
 src/core/file_sys/vfs/vfs_vector.h            |   2 +-
 src/core/frontend/emu_window.cpp              |   8 +-
 src/core/frontend/framebuffer_layout.cpp      |   2 +-
 .../board/nintendo/nx/k_memory_layout.cpp     |   2 +-
 .../board/nintendo/nx/k_system_control.cpp    |   6 +-
 src/core/hle/kernel/k_dynamic_page_manager.h  |   4 +-
 src/core/hle/kernel/k_handle_table.h          |   2 +-
 src/core/hle/kernel/k_hardware_timer.cpp      |   8 +-
 src/core/hle/kernel/k_hardware_timer.h        |   2 +-
 .../hle/kernel/k_light_server_session.cpp     |   2 +-
 src/core/hle/kernel/k_light_server_session.h  |   2 +-
 src/core/hle/kernel/k_memory_block.h          |   2 +-
 src/core/hle/kernel/k_memory_layout.cpp       |   4 +-
 src/core/hle/kernel/k_memory_manager.cpp      |   6 +-
 src/core/hle/kernel/k_memory_manager.h        |   6 +-
 src/core/hle/kernel/k_memory_region.h         |   2 +-
 src/core/hle/kernel/k_page_bitmap.h           |   2 +-
 src/core/hle/kernel/k_page_heap.h             |   2 +-
 src/core/hle/kernel/k_page_table_base.cpp     |  14 +-
 src/core/hle/kernel/k_process.h               |   2 +-
 src/core/hle/kernel/k_resource_limit.cpp      |   2 +-
 src/core/hle/kernel/k_slab_heap.h             |   2 +-
 src/core/hle/kernel/k_thread.cpp              |   2 +-
 src/core/hle/kernel/kernel.cpp                |   2 +-
 .../hle/kernel/svc/svc_address_arbiter.cpp    |   4 +-
 .../hle/kernel/svc/svc_condition_variable.cpp |   4 +-
 src/core/hle/kernel/svc/svc_ipc.cpp           |   4 +-
 src/core/hle/kernel/svc/svc_process.cpp       |   2 +-
 src/core/hle/kernel/svc/svc_thread.cpp        |   6 +-
 src/core/hle/service/acc/acc.cpp              |   2 +-
 .../service/am/frontend/applet_cabinet.cpp    |   2 +-
 .../service/am/frontend/applet_controller.cpp |   2 +-
 .../am/service/application_accessor.cpp       |   2 +-
 .../am/service/application_functions.cpp      |   4 +-
 .../service/library_applet_self_accessor.cpp  |   2 +-
 src/core/hle/service/bcat/bcat_service.cpp    |   2 +-
 .../bcat/delivery_cache_directory_service.cpp |   4 +-
 .../bcat/delivery_cache_storage_service.cpp   |   2 +-
 src/core/hle/service/cmif_serialization.h     |   2 +-
 src/core/hle/service/es/es.cpp                |   4 +-
 .../fsp/fs_i_save_data_info_reader.cpp        |   2 +-
 src/core/hle/service/glue/notif.cpp           |   4 +-
 src/core/hle/service/glue/time/manager.cpp    |   6 +-
 src/core/hle/service/hid/hid_debug_server.cpp |   2 +-
 src/core/hle/service/jit/jit_context.cpp      |   2 +-
 src/core/hle/service/ldn/ldn_types.h          |   2 +-
 src/core/hle/service/lm/lm.cpp                |   2 +-
 src/core/hle/service/nfc/common/device.cpp    |   2 +-
 src/core/hle/service/nifm/nifm.cpp            |   4 +-
 .../ns/application_manager_interface.cpp      |   4 +-
 .../service/ns/platform_service_manager.cpp   |   2 +-
 .../service/nvdrv/devices/nvhost_as_gpu.cpp   |   2 +-
 .../hle/service/nvdrv/devices/nvhost_as_gpu.h |   4 +-
 .../nvdrv/devices/nvhost_nvdec_common.cpp     |   4 +-
 .../nvnflinger/buffer_queue_consumer.cpp      |   2 +-
 .../service/nvnflinger/buffer_queue_core.cpp  |   2 +-
 .../nvnflinger/buffer_queue_producer.cpp      |   4 +-
 .../service/nvnflinger/hardware_composer.cpp  |   2 +-
 src/core/hle/service/psc/time/common.h        |  10 +-
 .../psc/time/power_state_request_manager.cpp  |   2 +-
 src/core/hle/service/set/settings_server.cpp  |  10 +-
 .../service/set/system_settings_server.cpp    |   4 +-
 src/core/hle/service/sm/sm_controller.cpp     |   8 +-
 src/core/hle/service/sockets/bsd.cpp          |   4 +-
 src/core/hle/service/spl/spl_module.cpp       |   2 +-
 src/core/hle/service/ssl/ssl.cpp              |   2 +-
 .../vi/application_display_service.cpp        |   4 +-
 src/core/internal_network/network.cpp         |   8 +-
 .../internal_network/network_interface.cpp    |   2 +-
 src/core/internal_network/socket_proxy.cpp    |   6 +-
 src/core/loader/nca.cpp                       |   2 +-
 src/core/memory.cpp                           |   8 +-
 src/core/tools/renderdoc.cpp                  |  13 +-
 src/dynarmic/CMakeLists.txt                   |   6 +-
 .../src/dynarmic/backend/x64/emit_x64.cpp     |   2 +-
 .../backend/x64/emit_x64_saturation.cpp       |   4 +-
 .../dynarmic/backend/x64/emit_x64_vector.cpp  |  34 ++--
 .../x64/emit_x64_vector_floating_point.cpp    |   2 +-
 .../src/dynarmic/backend/x64/reg_alloc.cpp    |   8 +-
 .../src/dynarmic/backend/x64/reg_alloc.h      |   6 +-
 .../src/dynarmic/frontend/A32/a32_types.h     |   1 +
 src/hid_core/frontend/emulated_controller.cpp |   4 +-
 src/hid_core/hidbus/ringcon.cpp               |   2 +-
 .../irsensor/image_transfer_processor.cpp     |   2 +-
 .../abstract_battery_handler.cpp              |   2 +-
 .../abstract_button_handler.cpp               |   2 +-
 .../abstract_ir_sensor_handler.cpp            |   2 +-
 .../abstracted_pad/abstract_led_handler.cpp   |   2 +-
 .../abstracted_pad/abstract_mcu_handler.cpp   |   2 +-
 .../abstracted_pad/abstract_nfc_handler.cpp   |   2 +-
 .../resources/abstracted_pad/abstract_pad.cpp |   2 +-
 .../abstracted_pad/abstract_pad_holder.cpp    |   2 +-
 .../abstracted_pad/abstract_palma_handler.cpp |   2 +-
 .../abstract_properties_handler.cpp           |   2 +-
 .../abstract_sixaxis_handler.cpp              |   2 +-
 .../abstract_vibration_handler.cpp            |   2 +-
 src/hid_core/resources/applet_resource.cpp    |   2 +-
 src/hid_core/resources/npad/npad.cpp          |   2 +-
 src/hid_core/resources/npad/npad_data.cpp     |   2 +-
 src/hid_core/resources/npad/npad_resource.cpp |   2 +-
 src/hid_core/resources/palma/palma.cpp        |   2 +-
 .../touch_screen/gesture_handler.cpp          |   2 +-
 .../touch_screen/touch_screen_resource.cpp    |   8 +-
 src/input_common/drivers/mouse.cpp            |   4 +-
 src/input_common/drivers/sdl_driver.cpp       |   4 +-
 src/input_common/drivers/udp_client.cpp       |   4 +-
 .../helpers/joycon_protocol/calibration.cpp   |   4 +-
 .../helpers/joycon_protocol/nfc.cpp           |  12 +-
 .../helpers/joycon_protocol/rumble.cpp        |   2 +-
 src/network/room.cpp                          |   2 +-
 .../backend/glasm/reg_alloc.cpp               |   2 +-
 .../backend/glsl/emit_glsl_integer.cpp        |   2 +-
 .../backend/glsl/glsl_emit_context.cpp        |   6 +-
 .../backend/glsl/var_alloc.cpp                |   2 +-
 .../spirv/emit_spirv_context_get_set.cpp      |   2 +-
 .../backend/spirv/emit_spirv_integer.cpp      |   2 +-
 .../backend/spirv/spirv_emit_context.cpp      |   2 +-
 .../frontend/maxwell/decode.cpp               |   4 +-
 .../floating_point_conversion_integer.cpp     |  24 +--
 .../integer_floating_point_conversion.cpp     |   4 +-
 .../frontend/maxwell/translate_program.cpp    |   2 +-
 .../ir_opt/collect_shader_info_pass.cpp       |   2 +-
 .../ir_opt/constant_propagation_pass.cpp      |   2 +-
 src/shader_recompiler/ir_opt/texture_pass.cpp |   4 +-
 src/video_core/CMakeLists.txt                 |   2 +-
 src/video_core/buffer_cache/buffer_cache.h    |  28 +--
 .../buffer_cache/memory_tracker_base.h        |   6 +-
 src/video_core/buffer_cache/word_manager.h    |  12 +-
 src/video_core/control/channel_state_cache.h  |   2 +-
 src/video_core/engines/engine_interface.h     |   2 +-
 src/video_core/engines/engine_upload.cpp      |   4 +-
 src/video_core/engines/maxwell_3d.cpp         |  12 +-
 src/video_core/engines/maxwell_3d.h           |   6 +-
 src/video_core/engines/maxwell_dma.cpp        |   6 +-
 .../engines/sw_blitter/converter.cpp          |   2 +-
 src/video_core/host1x/codecs/h264.cpp         |   2 +-
 src/video_core/host1x/codecs/vp9.cpp          |   4 +-
 src/video_core/host1x/vic.cpp                 |  38 ++--
 src/video_core/macro/macro_hle.cpp            |   2 +-
 src/video_core/memory_manager.cpp             |   4 +-
 src/video_core/memory_manager.h               |   2 +-
 src/video_core/renderer_opengl/blit_image.cpp |   4 +-
 .../renderer_opengl/gl_buffer_cache.cpp       |   2 +-
 .../renderer_opengl/gl_buffer_cache.h         |   2 +-
 .../renderer_opengl/gl_rasterizer.cpp         |   2 +-
 .../renderer_opengl/gl_shader_cache.cpp       |   2 +-
 .../gl_staging_buffer_pool.cpp                |   6 +-
 .../renderer_opengl/gl_state_tracker.h        |   2 +-
 .../renderer_opengl/gl_texture_cache.cpp      |  18 +-
 src/video_core/renderer_vulkan/blit_image.cpp |   4 +-
 .../renderer_vulkan/vk_buffer_cache.cpp       |   4 +-
 .../renderer_vulkan/vk_graphics_pipeline.cpp  |   2 +-
 .../renderer_vulkan/vk_present_manager.cpp    |   4 +-
 .../renderer_vulkan/vk_query_cache.cpp        |  20 +-
 .../renderer_vulkan/vk_rasterizer.cpp         |  16 +-
 .../vk_staging_buffer_pool.cpp                |  14 +-
 .../renderer_vulkan/vk_state_tracker.h        |   2 +-
 .../renderer_vulkan/vk_swapchain.cpp          |  16 +-
 .../renderer_vulkan/vk_texture_cache.cpp      |  50 ++---
 src/video_core/shader_environment.cpp         |   6 +-
 src/video_core/shader_environment.h           |   4 +-
 src/video_core/texture_cache/decode_bc.cpp    |   4 +-
 src/video_core/texture_cache/image_base.cpp   |  10 +-
 .../texture_cache/image_view_base.cpp         |   6 +-
 .../texture_cache/image_view_info.cpp         |   2 +-
 src/video_core/texture_cache/texture_cache.h  |  24 +--
 .../texture_cache/texture_cache_base.h        |   2 +-
 src/video_core/texture_cache/util.cpp         |  14 +-
 src/video_core/textures/astc.cpp              |  10 +-
 src/video_core/textures/decoders.cpp          |   8 +-
 src/video_core/textures/texture.cpp           |   2 +-
 src/video_core/textures/workers.cpp           |   2 +-
 src/video_core/transform_feedback.cpp         |   4 +-
 .../vulkan_common/vulkan_device.cpp           |   4 +-
 .../vulkan_common/vulkan_memory_allocator.cpp |   4 +-
 src/video_core/vulkan_common/vulkan_wrapper.h |   4 +-
 src/yuzu/CMakeLists.txt                       |   2 +-
 src/yuzu/about_dialog.cpp                     |   9 +-
 src/yuzu/bootmanager.cpp                      |   4 +-
 .../configure_touch_from_button.cpp           |   4 +-
 src/yuzu/game_list.cpp                        |   2 +-
 src/yuzu/main.cpp                             |  29 +--
 src/yuzu/play_time_manager.cpp                |   2 +-
 src/yuzu/util/util.cpp                        |   2 +-
 tools/cpm-fetch.sh                            |  12 +-
 276 files changed, 973 insertions(+), 1010 deletions(-)
 create mode 100644 .patch/boost/0001-clang-cl.patch
 create mode 100644 .patch/boost/0002-use-marmasm.patch
 create mode 100644 .patch/boost/0003-armasm-options.patch
 delete mode 100644 .patch/cpp-jwt/0001-no-install.patch
 delete mode 100644 .patch/cpp-jwt/0002-missing-decl.patch
 delete mode 100644 .patch/discord-rpc/0001-cmake-version.patch
 delete mode 100644 .patch/discord-rpc/0002-no-clang-format.patch
 delete mode 100644 .patch/discord-rpc/0003-fix-cpp17.patch
 delete mode 100644 .patch/unordered-dense/0001-cmake.patch

diff --git a/.ci/windows/build.sh b/.ci/windows/build.sh
index 681f327793..a0ab69a440 100644
--- a/.ci/windows/build.sh
+++ b/.ci/windows/build.sh
@@ -1,59 +1,45 @@
-#!/bin/bash -e
+#!/bin/bash -ex
 
-# SPDX-FileCopyrightText: 2025 eden Emulator Project
+# SPDX-FileCopyrightText: 2025 Eden Emulator Project
 # SPDX-License-Identifier: GPL-3.0-or-later
 
-if [ "$DEVEL" != "true" ]; then
-    export EXTRA_CMAKE_FLAGS=("${EXTRA_CMAKE_FLAGS[@]}" -DENABLE_QT_UPDATE_CHECKER=ON)
+if [ "$COMPILER" == "clang" ]
+then
+    EXTRA_CMAKE_FLAGS+=(
+        -DCMAKE_CXX_COMPILER=clang-cl
+        -DCMAKE_C_COMPILER=clang-cl
+        -DCMAKE_CXX_FLAGS="-O3"
+        -DCMAKE_C_FLAGS="-O3"
+    )
+
+    BUILD_TYPE="RelWithDebInfo"
 fi
 
-if [ "$CCACHE" = "true" ]; then
-    export EXTRA_CMAKE_FLAGS=("${EXTRA_CMAKE_FLAGS[@]}" -DUSE_CCACHE=ON)
-fi
+[ -z "$WINDEPLOYQT" ] && { echo "WINDEPLOYQT environment variable required."; exit 1; }
 
-if [ "$BUNDLE_QT" = "true" ]; then
-    export EXTRA_CMAKE_FLAGS=("${EXTRA_CMAKE_FLAGS[@]}" -DYUZU_USE_BUNDLED_QT=ON)
-else
-    export EXTRA_CMAKE_FLAGS=("${EXTRA_CMAKE_FLAGS[@]}" -DYUZU_USE_BUNDLED_QT=OFF)
-fi
-
-if [ -z "$BUILD_TYPE" ]; then
-    export BUILD_TYPE="Release"
-fi
-
-if [ "$WINDEPLOYQT" == "" ]; then
-    echo "You must supply the WINDEPLOYQT environment variable."
-    exit 1
-fi
-
-if [ "$USE_WEBENGINE" = "true" ]; then
-    WEBENGINE=ON
-else
-    WEBENGINE=OFF
-fi
-
-if [ "$USE_MULTIMEDIA" = "false" ]; then
-    MULTIMEDIA=OFF
-else
-    MULTIMEDIA=ON
-fi
-
-export EXTRA_CMAKE_FLAGS=("${EXTRA_CMAKE_FLAGS[@]}" $@)
+echo "${EXTRA_CMAKE_FLAGS[@]}"
 
 mkdir -p build && cd build
 cmake .. -G Ninja \
-    -DCMAKE_BUILD_TYPE="$BUILD_TYPE" \
-    -DENABLE_QT_TRANSLATION=ON \
+    -DCMAKE_BUILD_TYPE="${BUILD_TYPE:-Release}" \
+	-DENABLE_QT_TRANSLATION=ON \
     -DUSE_DISCORD_PRESENCE=ON \
     -DYUZU_USE_BUNDLED_SDL2=ON \
+    -DBUILD_TESTING=OFF \
     -DYUZU_TESTS=OFF \
+    -DDYNARMIC_TESTS=OFF \
     -DYUZU_CMD=OFF \
     -DYUZU_ROOM_STANDALONE=OFF \
-    -DYUZU_USE_QT_MULTIMEDIA=$MULTIMEDIA \
-    -DYUZU_USE_QT_WEB_ENGINE=$WEBENGINE \
+    -DYUZU_USE_QT_MULTIMEDIA=${USE_MULTIMEDIA:-false} \
+    -DYUZU_USE_QT_WEB_ENGINE=${USE_WEBENGINE:-false} \
     -DYUZU_ENABLE_LTO=ON \
+	-DCMAKE_EXE_LINKER_FLAGS=" /LTCG" \
     -DDYNARMIC_ENABLE_LTO=ON \
-    "${EXTRA_CMAKE_FLAGS[@]}"
+    -DYUZU_USE_BUNDLED_QT=${BUNDLE_QT:-false} \
+    -DUSE_CCACHE=${CCACHE:-false} \
+    -DENABLE_QT_UPDATE_CHECKER=$([ "$DEVEL" != "true" ] && echo ON || echo OFF) \
+    "${EXTRA_CMAKE_FLAGS[@]}" \
+    "$@"
 
 ninja
 
@@ -62,4 +48,5 @@ rm -f bin/*.pdb
 set -e
 
 $WINDEPLOYQT --release --no-compiler-runtime --no-opengl-sw --no-system-dxc-compiler --no-system-d3d-compiler --dir pkg bin/eden.exe
+
 cp bin/* pkg
diff --git a/.patch/boost/0001-clang-cl.patch b/.patch/boost/0001-clang-cl.patch
new file mode 100644
index 0000000000..cdabc712cb
--- /dev/null
+++ b/.patch/boost/0001-clang-cl.patch
@@ -0,0 +1,13 @@
+diff --git a/libs/cobalt/include/boost/cobalt/concepts.hpp b/libs/cobalt/include/boost/cobalt/concepts.hpp
+index d49f2ec..a9bdb80 100644
+--- a/libs/cobalt/include/boost/cobalt/concepts.hpp
++++ b/libs/cobalt/include/boost/cobalt/concepts.hpp
+@@ -62,7 +62,7 @@ struct enable_awaitables
+ template <typename T>
+ concept with_get_executor = requires (T& t)
+ {
+-  {t.get_executor()} -> asio::execution::executor;
++  t.get_executor();
+ };
+ 
+ 
diff --git a/.patch/boost/0002-use-marmasm.patch b/.patch/boost/0002-use-marmasm.patch
new file mode 100644
index 0000000000..10f490b878
--- /dev/null
+++ b/.patch/boost/0002-use-marmasm.patch
@@ -0,0 +1,11 @@
+--- a/libs/context/CMakeLists.txt	2025-09-08 00:42:31.303651800 -0400
++++ b/libs/context/CMakeLists.txt	2025-09-08 00:42:40.592184300 -0400
+@@ -146,7 +146,7 @@
+       set(ASM_LANGUAGE ASM)
+     endif()
+   elseif(BOOST_CONTEXT_ASSEMBLER STREQUAL armasm)
+-    set(ASM_LANGUAGE ASM_ARMASM)
++    set(ASM_LANGUAGE ASM_MARMASM)
+   else()
+     set(ASM_LANGUAGE ASM_MASM)
+   endif()
diff --git a/.patch/boost/0003-armasm-options.patch b/.patch/boost/0003-armasm-options.patch
new file mode 100644
index 0000000000..3869f95f6f
--- /dev/null
+++ b/.patch/boost/0003-armasm-options.patch
@@ -0,0 +1,14 @@
+diff --git a/libs/context/CMakeLists.txt b/libs/context/CMakeLists.txt
+index 8210f65..0e59dd7 100644
+--- a/libs/context/CMakeLists.txt
++++ b/libs/context/CMakeLists.txt
+@@ -186,7 +186,8 @@ if(BOOST_CONTEXT_IMPLEMENTATION STREQUAL "fcontext")
+       set_property(SOURCE ${ASM_SOURCES} APPEND PROPERTY COMPILE_OPTIONS "/safeseh")
+     endif()
+ 
+-  else() # masm
++  # armasm doesn't support most of these options
++  elseif(NOT BOOST_CONTEXT_ASSEMBLER STREQUAL armasm) # masm
+     if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+       set_property(SOURCE ${ASM_SOURCES} APPEND PROPERTY COMPILE_OPTIONS "-x" "assembler-with-cpp")
+     elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
diff --git a/.patch/cpp-jwt/0001-no-install.patch b/.patch/cpp-jwt/0001-no-install.patch
deleted file mode 100644
index b5be557a53..0000000000
--- a/.patch/cpp-jwt/0001-no-install.patch
+++ /dev/null
@@ -1,47 +0,0 @@
-diff --git a/CMakeLists.txt b/CMakeLists.txt
-index 8c1761f..52c4ca4 100644
---- a/CMakeLists.txt
-+++ b/CMakeLists.txt
-@@ -69,42 +69,3 @@ endif()
- if(CPP_JWT_BUILD_EXAMPLES)
-   add_subdirectory(examples)
- endif()
--
--# ##############################################################################
--# INSTALL
--# ##############################################################################
--
--include(GNUInstallDirs)
--include(CMakePackageConfigHelpers)
--set(CPP_JWT_CONFIG_INSTALL_DIR ${CMAKE_INSTALL_DATADIR}/cmake/${PROJECT_NAME})
--
--install(TARGETS ${PROJECT_NAME} EXPORT ${PROJECT_NAME}Targets)
--install(
--  EXPORT ${PROJECT_NAME}Targets
--  DESTINATION ${CPP_JWT_CONFIG_INSTALL_DIR}
--  NAMESPACE ${PROJECT_NAME}::
--  COMPONENT dev)
--configure_package_config_file(cmake/Config.cmake.in ${PROJECT_NAME}Config.cmake
--                              INSTALL_DESTINATION ${CPP_JWT_CONFIG_INSTALL_DIR}
--                              NO_SET_AND_CHECK_MACRO)
--write_basic_package_version_file(${PROJECT_NAME}ConfigVersion.cmake
--                                 COMPATIBILITY SameMajorVersion
--                                 ARCH_INDEPENDENT)
--install(
--  FILES ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config.cmake
--        ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}ConfigVersion.cmake
--  DESTINATION ${CPP_JWT_CONFIG_INSTALL_DIR}
--  COMPONENT dev)
--
--if(NOT CPP_JWT_USE_VENDORED_NLOHMANN_JSON)
--  set(CPP_JWT_VENDORED_NLOHMANN_JSON_INSTALL_PATTERN PATTERN "json" EXCLUDE)
--endif()
--install(
--  DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include/jwt/
--  DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/jwt
--  COMPONENT dev
--  FILES_MATCHING
--  PATTERN "*.hpp"
--  PATTERN "*.ipp"
--  PATTERN "test" EXCLUDE
--  ${CPP_JWT_VENDORED_NLOHMANN_JSON_INSTALL_PATTERN})
diff --git a/.patch/cpp-jwt/0002-missing-decl.patch b/.patch/cpp-jwt/0002-missing-decl.patch
deleted file mode 100644
index cd5175dbe0..0000000000
--- a/.patch/cpp-jwt/0002-missing-decl.patch
+++ /dev/null
@@ -1,13 +0,0 @@
-diff --git a/include/jwt/algorithm.hpp b/include/jwt/algorithm.hpp
-index 0e3b843..1156e6a 100644
---- a/include/jwt/algorithm.hpp
-+++ b/include/jwt/algorithm.hpp
-@@ -64,6 +64,8 @@ using verify_func_t = verify_result_t (*) (const jwt::string_view key,
-                                            const jwt::string_view head,
-                                            const jwt::string_view jwt_sign);
- 
-+verify_result_t is_secret_a_public_key(const jwt::string_view secret);
-+
- namespace algo {
- 
- //Me: TODO: All these can be done using code generaion.
diff --git a/.patch/discord-rpc/0001-cmake-version.patch b/.patch/discord-rpc/0001-cmake-version.patch
deleted file mode 100644
index 6a1609fadf..0000000000
--- a/.patch/discord-rpc/0001-cmake-version.patch
+++ /dev/null
@@ -1,10 +0,0 @@
-diff --git a/CMakeLists.txt b/CMakeLists.txt
-index 5dad9e9..760a1b2 100644
---- a/CMakeLists.txt
-+++ b/CMakeLists.txt
-@@ -1,4 +1,4 @@
--cmake_minimum_required (VERSION 3.2.0)
-+cmake_minimum_required (VERSION 3.10)
- project (DiscordRPC)
- 
- include(GNUInstallDirs)
diff --git a/.patch/discord-rpc/0002-no-clang-format.patch b/.patch/discord-rpc/0002-no-clang-format.patch
deleted file mode 100644
index 4b1e37c29f..0000000000
--- a/.patch/discord-rpc/0002-no-clang-format.patch
+++ /dev/null
@@ -1,40 +0,0 @@
-diff --git a/CMakeLists.txt b/CMakeLists.txt
-index 760a1b2..540d643 100644
---- a/CMakeLists.txt
-+++ b/CMakeLists.txt
-@@ -12,20 +12,6 @@ file(GLOB_RECURSE ALL_SOURCE_FILES
-     src/*.cpp src/*.h src/*.c
- )
- 
--# Set CLANG_FORMAT_SUFFIX if you are using custom clang-format, e.g. clang-format-5.0
--find_program(CLANG_FORMAT_CMD clang-format${CLANG_FORMAT_SUFFIX})
--
--if (CLANG_FORMAT_CMD)
--    add_custom_target(
--        clangformat
--        COMMAND ${CLANG_FORMAT_CMD}
--        -i -style=file -fallback-style=none
--        ${ALL_SOURCE_FILES}
--        DEPENDS
--        ${ALL_SOURCE_FILES}
--    )
--endif(CLANG_FORMAT_CMD)
--
- # thirdparty stuff
- execute_process(
-     COMMAND mkdir ${CMAKE_CURRENT_SOURCE_DIR}/thirdparty
-diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
-index 290d761..cd2cc92 100644
---- a/src/CMakeLists.txt
-+++ b/src/CMakeLists.txt
-@@ -120,10 +120,6 @@ if (${BUILD_SHARED_LIBS})
-     target_compile_definitions(discord-rpc PRIVATE -DDISCORD_BUILDING_SDK)
- endif(${BUILD_SHARED_LIBS})
- 
--if (CLANG_FORMAT_CMD)
--    add_dependencies(discord-rpc clangformat)
--endif(CLANG_FORMAT_CMD)
--
- # install
- 
- install(
diff --git a/.patch/discord-rpc/0003-fix-cpp17.patch b/.patch/discord-rpc/0003-fix-cpp17.patch
deleted file mode 100644
index 35b725d307..0000000000
--- a/.patch/discord-rpc/0003-fix-cpp17.patch
+++ /dev/null
@@ -1,31 +0,0 @@
-diff --git a/CMakeLists.txt b/CMakeLists.txt
-index 540d643..5d12f3d 100644
---- a/CMakeLists.txt
-+++ b/CMakeLists.txt
-@@ -17,12 +17,14 @@ execute_process(
-     COMMAND mkdir ${CMAKE_CURRENT_SOURCE_DIR}/thirdparty
-     ERROR_QUIET
- )
-+# new commit that fixes c++17
-+set(RAPIDJSON_SHA 3b2441b87f99ab65f37b141a7b548ebadb607b96)
- 
--find_file(RAPIDJSONTEST NAMES rapidjson rapidjson-1.1.0 PATHS ${CMAKE_CURRENT_SOURCE_DIR}/thirdparty CMAKE_FIND_ROOT_PATH_BOTH)
-+find_file(RAPIDJSONTEST NAMES rapidjson rapidjson-${RAPIDJSON_SHA} PATHS ${CMAKE_CURRENT_SOURCE_DIR}/thirdparty CMAKE_FIND_ROOT_PATH_BOTH)
- if (NOT RAPIDJSONTEST)
-     message("no rapidjson, download")
--    set(RJ_TAR_FILE ${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/v1.1.0.tar.gz)
--    file(DOWNLOAD https://github.com/miloyip/rapidjson/archive/v1.1.0.tar.gz ${RJ_TAR_FILE})
-+    set(RJ_TAR_FILE ${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/${RAPIDJSON_SHA}.tar.gz)
-+    file(DOWNLOAD https://github.com/miloyip/rapidjson/archive/${RAPIDJSON_SHA}.tar.gz ${RJ_TAR_FILE})
-     execute_process(
-         COMMAND ${CMAKE_COMMAND} -E tar xzf ${RJ_TAR_FILE}
-         WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/thirdparty
-@@ -30,7 +32,7 @@ if (NOT RAPIDJSONTEST)
-     file(REMOVE ${RJ_TAR_FILE})
- endif(NOT RAPIDJSONTEST)
- 
--find_file(RAPIDJSON NAMES rapidjson rapidjson-1.1.0 PATHS ${CMAKE_CURRENT_SOURCE_DIR}/thirdparty CMAKE_FIND_ROOT_PATH_BOTH)
-+find_file(RAPIDJSON NAMES rapidjson rapidjson-${RAPIDJSON_SHA} PATHS ${CMAKE_CURRENT_SOURCE_DIR}/thirdparty CMAKE_FIND_ROOT_PATH_BOTH)
- 
- add_library(rapidjson STATIC IMPORTED ${RAPIDJSON})
- 
diff --git a/.patch/unordered-dense/0001-cmake.patch b/.patch/unordered-dense/0001-cmake.patch
deleted file mode 100644
index 39e7794b1f..0000000000
--- a/.patch/unordered-dense/0001-cmake.patch
+++ /dev/null
@@ -1,22 +0,0 @@
-From e59d30b7b12e1d04cc2fc9c6219e35bda447c17e Mon Sep 17 00:00:00 2001
-From: Lizzie <159065448+Lizzie841@users.noreply.github.com>
-Date: Fri, 16 May 2025 04:12:13 +0100
-Subject: [PATCH] Update CMakeLists.txt
-
----
- CMakeLists.txt | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/CMakeLists.txt b/CMakeLists.txt
-index b5f4c4f..c5c6f31 100644
---- a/CMakeLists.txt
-+++ b/CMakeLists.txt
-@@ -24,7 +24,7 @@ target_include_directories(
- 
- target_compile_features(unordered_dense INTERFACE cxx_std_17)
- 
--if(_unordered_dense_is_toplevel_project)
-+if(_unordered_dense_is_toplevel_project OR UNORDERED_DENSE_INSTALL)
-     # locations are provided by GNUInstallDirs
-     install(
-         TARGETS unordered_dense
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9e23f8f87f..144e77684e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -15,6 +15,21 @@ elseif (${CMAKE_SYSTEM_NAME} STREQUAL "Linux")
     set(PLATFORM_LINUX ON)
 endif()
 
+if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+    set(CXX_CLANG ON)
+    if (MSVC)
+        set(CXX_CLANG_CL ON)
+    endif()
+elseif (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+    set(CXX_GCC ON)
+elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
+    set(CXX_CL ON)
+elseif (CMAKE_CXX_COMPILER_ID STREQUAL "IntelLLVM")
+    set(CXX_ICC ON)
+elseif (CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang")
+    set(CXX_APPLE ON)
+endif()
+
 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/CMakeModules")
 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/externals/cmake-modules")
 if (PLATFORM_SUN)
@@ -29,6 +44,77 @@ if (PLATFORM_SUN)
     endif()
 endif()
 
+# Detect current compilation architecture and create standard definitions
+# =======================================================================
+
+include(CheckSymbolExists)
+function(detect_architecture symbol arch)
+    if (NOT DEFINED ARCHITECTURE)
+        set(CMAKE_REQUIRED_QUIET 1)
+        check_symbol_exists("${symbol}" "" ARCHITECTURE_${arch})
+        unset(CMAKE_REQUIRED_QUIET)
+
+        # The output variable needs to be unique across invocations otherwise
+        # CMake's crazy scope rules will keep it defined
+        if (ARCHITECTURE_${arch})
+            set(ARCHITECTURE "${arch}" PARENT_SCOPE)
+            set(ARCHITECTURE_${arch} 1 PARENT_SCOPE)
+            add_definitions(-DARCHITECTURE_${arch}=1)
+        endif()
+    endif()
+endfunction()
+
+if (NOT ENABLE_GENERIC)
+    if (MSVC)
+        detect_architecture("_M_AMD64" x86_64)
+        detect_architecture("_M_IX86" x86)
+        detect_architecture("_M_ARM" arm)
+        detect_architecture("_M_ARM64" arm64)
+    else()
+        detect_architecture("__x86_64__" x86_64)
+        detect_architecture("__i386__" x86)
+        detect_architecture("__arm__" arm)
+        detect_architecture("__aarch64__" arm64)
+    endif()
+endif()
+
+if (NOT DEFINED ARCHITECTURE)
+    set(ARCHITECTURE "GENERIC")
+    set(ARCHITECTURE_GENERIC 1)
+    add_definitions(-DARCHITECTURE_GENERIC=1)
+endif()
+
+message(STATUS "Target architecture: ${ARCHITECTURE}")
+
+if (MSVC AND ARCHITECTURE_x86)
+    message(FATAL_ERROR "Attempting to build with the x86 environment is not supported. \
+        This can typically happen if you used the Developer Command Prompt from the start menu;\
+        instead, run vcvars64.bat directly, located at C:/Program Files/Microsoft Visual Studio/2022/Community/VC/Auxiliary/Build/vcvars64.bat")
+endif()
+
+if (CXX_CLANG_CL)
+    add_compile_options(
+        # clang-cl prints literally 10000+ warnings without this
+        $<$<COMPILE_LANGUAGE:C,CXX>:-Wno-unused-command-line-argument>
+        $<$<COMPILE_LANGUAGE:C,CXX>:-Wno-unsafe-buffer-usage>
+        $<$<COMPILE_LANGUAGE:C,CXX>:-Wno-unused-value>
+        $<$<COMPILE_LANGUAGE:C,CXX>:-Wno-extra-semi-stmt>
+        $<$<COMPILE_LANGUAGE:C,CXX>:-Wno-sign-conversion>
+        $<$<COMPILE_LANGUAGE:C,CXX>:-Wno-reserved-identifier>
+        $<$<COMPILE_LANGUAGE:C,CXX>:-Wno-deprecated-declarations>
+        $<$<COMPILE_LANGUAGE:C,CXX>:-Wno-cast-function-type-mismatch>
+        $<$<COMPILE_LANGUAGE:C,CXX>:/EHsc> # thanks microsoft
+    )
+
+    if (ARCHITECTURE_x86_64)
+        add_compile_options(
+            # Required CPU features for amd64
+            $<$<COMPILE_LANGUAGE:C,CXX>:-msse4.1>
+            $<$<COMPILE_LANGUAGE:C,CXX>:-mcx16>
+        )
+    endif()
+endif()
+
 set(CPM_SOURCE_CACHE ${CMAKE_SOURCE_DIR}/.cache/cpm)
 
 include(DownloadExternals)
@@ -36,7 +122,7 @@ include(CMakeDependentOption)
 include(CTest)
 
 # Disable Warnings as Errors for MSVC
-if (MSVC)
+if (CXX_CL)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /W3 /WX-")
 endif()
 
@@ -58,7 +144,7 @@ CMAKE_DEPENDENT_OPTION(YUZU_USE_EXTERNAL_SDL2 "Compile external SDL2" ${EXT_DEFA
 
 cmake_dependent_option(ENABLE_LIBUSB "Enable the use of LibUSB" ON "NOT ANDROID" OFF)
 
-option(ENABLE_OPENGL "Enable OpenGL" ON)
+cmake_dependent_option(ENABLE_OPENGL "Enable OpenGL" ON "NOT WIN32 OR NOT ARCHITECTURE_arm64" OFF)
 mark_as_advanced(FORCE ENABLE_OPENGL)
 
 option(ENABLE_QT "Enable the Qt frontend" ON)
@@ -212,54 +298,6 @@ if (NOT EXISTS ${PROJECT_BINARY_DIR}/dist/compatibility_list/compatibility_list.
     file(WRITE ${PROJECT_BINARY_DIR}/dist/compatibility_list/compatibility_list.json "")
 endif()
 
-# Detect current compilation architecture and create standard definitions
-# =======================================================================
-
-include(CheckSymbolExists)
-function(detect_architecture symbol arch)
-    if (NOT DEFINED ARCHITECTURE)
-        set(CMAKE_REQUIRED_QUIET 1)
-        check_symbol_exists("${symbol}" "" ARCHITECTURE_${arch})
-        unset(CMAKE_REQUIRED_QUIET)
-
-        # The output variable needs to be unique across invocations otherwise
-        # CMake's crazy scope rules will keep it defined
-        if (ARCHITECTURE_${arch})
-            set(ARCHITECTURE "${arch}" PARENT_SCOPE)
-            set(ARCHITECTURE_${arch} 1 PARENT_SCOPE)
-            add_compile_definitions(ARCHITECTURE_${arch}=1)
-        endif()
-    endif()
-endfunction()
-
-if (NOT ENABLE_GENERIC)
-    if (MSVC)
-        detect_architecture("_M_AMD64" x86_64)
-        detect_architecture("_M_IX86" x86)
-        detect_architecture("_M_ARM" arm)
-        detect_architecture("_M_ARM64" arm64)
-    else()
-        detect_architecture("__x86_64__" x86_64)
-        detect_architecture("__i386__" x86)
-        detect_architecture("__arm__" arm)
-        detect_architecture("__aarch64__" arm64)
-    endif()
-endif()
-
-if (NOT DEFINED ARCHITECTURE)
-    set(ARCHITECTURE "GENERIC")
-    set(ARCHITECTURE_GENERIC 1)
-    add_compile_definitions(ARCHITECTURE_GENERIC=1)
-endif()
-
-message(STATUS "Target architecture: ${ARCHITECTURE}")
-
-if (MSVC AND ARCHITECTURE_x86)
-    message(FATAL_ERROR "Attempting to build with the x86 environment is not supported. \
-        This can typically happen if you used the Developer Command Prompt from the start menu;\
-        instead, run vcvars64.bat directly, located at C:/Program Files/Microsoft Visual Studio/2022/Community/VC/Auxiliary/Build/vcvars64.bat")
-endif()
-
 if (UNIX)
     add_compile_definitions(YUZU_UNIX=1)
 endif()
@@ -274,7 +312,7 @@ if (YUZU_ROOM)
 endif()
 
 # Build/optimization presets
-if (PLATFORM_LINUX)
+if (PLATFORM_LINUX OR CXX_CLANG)
     if (ARCHITECTURE_x86_64)
         set(YUZU_BUILD_PRESET "custom" CACHE STRING "Build preset to use. One of: custom, generic, v3, zen2, zen4, native")
         if (${YUZU_BUILD_PRESET} STREQUAL "generic")
@@ -341,6 +379,7 @@ if (YUZU_USE_CPM)
 
     # boost
     set(BOOST_INCLUDE_LIBRARIES algorithm icl pool container heap asio headers process filesystem crc variant)
+
     AddJsonPackage(boost)
 
     # really annoying thing where boost::headers doesn't work with cpm
@@ -350,13 +389,10 @@ if (YUZU_USE_CPM)
     if (Boost_ADDED)
         if (MSVC OR ANDROID)
             add_compile_definitions(YUZU_BOOST_v1)
-        else()
-            message(WARNING "Using bundled Boost on a non-MSVC or Android system is not recommended. You are strongly encouraged to install Boost through your system's package manager.")
         endif()
 
-        if (NOT MSVC)
+        if (NOT MSVC OR CXX_CLANG)
             # boost sucks
-            # Solaris (and probably other NIXes) need explicit pthread definition
             if (PLATFORM_SUN)
                 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthreads")
                 set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -pthreads")
@@ -409,6 +445,14 @@ if (YUZU_USE_CPM)
 
     # Opus
     AddJsonPackage(opus)
+
+    if (Opus_ADDED)
+        if (MSVC AND CXX_CLANG)
+            target_compile_options(opus PRIVATE
+                -Wno-implicit-function-declaration
+            )
+        endif()
+    endif()
 else()
     # Enforce the search mode of non-required packages for better and shorter failure messages
     find_package(fmt 8 REQUIRED)
@@ -441,6 +485,10 @@ endif()
 
 # DiscordRPC
 if (USE_DISCORD_PRESENCE)
+    if (ARCHITECTURE_arm64)
+        add_compile_definitions(RAPIDJSON_ENDIAN=RAPIDJSON_LITTLEENDIAN)
+    endif()
+
     AddJsonPackage(discord-rpc)
 
     target_include_directories(discord-rpc INTERFACE ${discord-rpc_SOURCE_DIR}/include)
@@ -748,6 +796,27 @@ if (MSVC AND CMAKE_GENERATOR STREQUAL "Ninja")
     )
 endif()
 
+# Adjustments for clang-cl
+if (MSVC AND CXX_CLANG)
+    if (ARCHITECTURE_x86_64)
+        set(FILE_ARCH x86_64)
+    elseif (ARCHITECTURE_arm64)
+        set(FILE_ARCH aarch64)
+    else()
+        message(FATAL_ERROR "clang-cl: Unsupported architecture ${ARCHITECTURE}")
+    endif()
+
+    AddJsonPackage(llvm-mingw)
+    set(LIB_PATH "${llvm-mingw_SOURCE_DIR}/libclang_rt.builtins-${FILE_ARCH}.a")
+
+    add_library(llvm-mingw-runtime STATIC IMPORTED)
+    set_target_properties(llvm-mingw-runtime PROPERTIES
+        IMPORTED_LOCATION "${LIB_PATH}"
+    )
+
+    link_libraries(llvm-mingw-runtime)
+endif()
+
 if (YUZU_USE_FASTER_LD AND CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
     # We will assume that if the compiler is GCC, it will attempt to use ld.bfd by default.
     # Try to pick a faster linker.
diff --git a/CMakeModules/DownloadExternals.cmake b/CMakeModules/DownloadExternals.cmake
index de45d15d2a..88fa183061 100644
--- a/CMakeModules/DownloadExternals.cmake
+++ b/CMakeModules/DownloadExternals.cmake
@@ -11,10 +11,17 @@ function(download_bundled_external remote_path lib_name cpm_key prefix_var versi
     set(package_repo "no_platform")
     set(package_extension "no_platform")
 
+    # TODO(crueter): Need to convert ffmpeg to a CI.
     if (WIN32 OR FORCE_WIN_ARCHIVES)
-        set(CACHE_KEY "windows")
-        set(package_repo "ext-windows-bin/raw/master/")
-        set(package_extension ".7z")
+        if (ARCHITECTURE_arm64)
+            set(CACHE_KEY "windows")
+            set(package_repo "ext-windows-arm64-bin/raw/master/")
+            set(package_extension ".zip")
+        elseif(ARCHITECTURE_x86_64)
+            set(CACHE_KEY "windows")
+            set(package_repo "ext-windows-bin/raw/master/")
+            set(package_extension ".7z")
+        endif()
     elseif (${CMAKE_SYSTEM_NAME} STREQUAL "Linux")
         set(CACHE_KEY "linux")
         set(package_repo "ext-linux-bin/raw/master/")
diff --git a/CMakeModules/GenerateSCMRev.cmake b/CMakeModules/GenerateSCMRev.cmake
index 3b8e996751..bcb5dc466a 100644
--- a/CMakeModules/GenerateSCMRev.cmake
+++ b/CMakeModules/GenerateSCMRev.cmake
@@ -35,4 +35,6 @@ set(REPO_NAME "Eden")
 set(BUILD_ID ${GIT_BRANCH})
 set(BUILD_FULLNAME "${REPO_NAME} ${BUILD_VERSION} ")
 
+set(CXX_COMPILER "${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}")
+
 configure_file(scm_rev.cpp.in scm_rev.cpp @ONLY)
diff --git a/CMakeModules/WindowsCopyFiles.cmake b/CMakeModules/WindowsCopyFiles.cmake
index 08b598365d..8d37bd5c2c 100644
--- a/CMakeModules/WindowsCopyFiles.cmake
+++ b/CMakeModules/WindowsCopyFiles.cmake
@@ -12,16 +12,25 @@ set(__windows_copy_files YES)
 
 # Any number of files to copy from SOURCE_DIR to DEST_DIR can be specified after DEST_DIR.
 # This copying happens post-build.
-function(windows_copy_files TARGET SOURCE_DIR DEST_DIR)
-    # windows commandline expects the / to be \ so switch them
-    string(REPLACE "/" "\\\\" SOURCE_DIR ${SOURCE_DIR})
-    string(REPLACE "/" "\\\\" DEST_DIR ${DEST_DIR})
+if (CMAKE_HOST_SYSTEM_NAME STREQUAL "Windows")
+    function(windows_copy_files TARGET SOURCE_DIR DEST_DIR)
+        # windows commandline expects the / to be \ so switch them
+        string(REPLACE "/" "\\\\" SOURCE_DIR ${SOURCE_DIR})
+        string(REPLACE "/" "\\\\" DEST_DIR ${DEST_DIR})
 
-    # /NJH /NJS /NDL /NFL /NC /NS /NP - Silence any output
-    # cmake adds an extra check for command success which doesn't work too well with robocopy
-    # so trick it into thinking the command was successful with the || cmd /c "exit /b 0"
-    add_custom_command(TARGET ${TARGET} POST_BUILD
-        COMMAND ${CMAKE_COMMAND} -E make_directory ${DEST_DIR}
-        COMMAND robocopy ${SOURCE_DIR} ${DEST_DIR} ${ARGN} /NJH /NJS /NDL /NFL /NC /NS /NP || cmd /c "exit /b 0"
-    )
-endfunction()
+        # /NJH /NJS /NDL /NFL /NC /NS /NP - Silence any output
+        # cmake adds an extra check for command success which doesn't work too well with robocopy
+        # so trick it into thinking the command was successful with the || cmd /c "exit /b 0"
+        add_custom_command(TARGET ${TARGET} POST_BUILD
+            COMMAND ${CMAKE_COMMAND} -E make_directory ${DEST_DIR}
+            COMMAND robocopy ${SOURCE_DIR} ${DEST_DIR} ${ARGN} /NJH /NJS /NDL /NFL /NC /NS /NP || cmd /c "exit /b 0"
+        )
+    endfunction()
+else()
+    function(windows_copy_files TARGET SOURCE_DIR DEST_DIR)
+        add_custom_command(TARGET ${TARGET} POST_BUILD
+            COMMAND ${CMAKE_COMMAND} -E make_directory ${DEST_DIR}
+            COMMAND cp -ra ${SOURCE_DIR}/. ${DEST_DIR}
+        )
+    endfunction()
+endif()
diff --git a/cpmfile.json b/cpmfile.json
index c720b69e89..e071e0a8b8 100644
--- a/cpmfile.json
+++ b/cpmfile.json
@@ -11,10 +11,15 @@
         "package": "Boost",
         "repo": "boostorg/boost",
         "tag": "boost-%VERSION%",
-        "artifact": "%TAG%-cmake.7z",
-        "hash": "e5b049e5b61964480ca816395f63f95621e66cb9bcf616a8b10e441e0e69f129e22443acb11e77bc1e8170f8e4171b9b7719891efc43699782bfcd4b3a365f01",
-        "git_version": "1.88.0",
-        "version": "1.57"
+        "artifact": "%TAG%-cmake.tar.xz",
+        "hash": "4fb7f6fde92762305aad8754d7643cd918dd1f3f67e104e9ab385b18c73178d72a17321354eb203b790b6702f2cf6d725a5d6e2dfbc63b1e35f9eb59fb42ece9",
+        "git_version": "1.89.0",
+        "version": "1.57",
+        "patches": [
+            "0001-clang-cl.patch",
+            "0002-use-marmasm.patch",
+            "0003-armasm-options.patch"
+        ]
     },
     "fmt": {
         "repo": "fmtlib/fmt",
@@ -77,16 +82,13 @@
     },
     "opus": {
         "package": "Opus",
-        "repo": "xiph/opus",
-        "sha": "5ded705cf4",
-        "hash": "0dc89e58ddda1f3bc6a7037963994770c5806c10e66f5cc55c59286fc76d0544fe4eca7626772b888fd719f434bc8a92f792bdb350c807968b2ac14cfc04b203",
+        "repo": "crueter/opus",
+        "sha": "ab19c44fad",
+        "hash": "79d0d015b19e74ce6076197fc32b86fe91d724a0b5a79e86adfc4bdcb946ece384e252adbbf742b74d03040913b70bb0e9556eafa59ef20e42d2f3f4d6f2859a",
         "version": "1.3",
         "find_args": "MODULE",
         "options": [
-            "OPUS_BUILD_TESTING OFF",
-            "OPUS_BUILD_PROGRAMS OFF",
-            "OPUS_INSTALL_PKG_CONFIG_MODULE OFF",
-            "OPUS_INSTALL_CMAKE_CONFIG_MODULE OFF"
+            "OPUS_PRESUME_NEON ON"
         ]
     },
     "cubeb": {
@@ -103,8 +105,8 @@
     },
     "boost_headers": {
         "repo": "boostorg/headers",
-        "sha": "0456900fad",
-        "hash": "50cd75dcdfc5f082225cdace058f47b4fb114a47585f7aee1d22236a910a80b667186254c214fa2fcebac67ae6d37ba4b6e695e1faea8affd6fd42a03cf996e3",
+        "sha": "95930ca8f5",
+        "hash": "d1dece16f3b209109de02123c537bfe1adf07a62b16c166367e7e5d62e0f7c323bf804c89b3192dd6871bc58a9d879d25a1cc3f7b9da0e497cf266f165816e2a",
         "bundled": true
     },
     "discord-rpc": {
@@ -143,5 +145,13 @@
         "version": "2.32.8",
         "min_version": "2.26.4",
         "cmake_filename": "sdl2"
+    },
+    "llvm-mingw": {
+        "repo": "misc/llvm-mingw",
+        "git_host": "git.crueter.xyz",
+        "tag": "20250828",
+        "version": "20250828",
+        "artifact": "clang-rt-builtins.tar.zst",
+        "hash": "d902392caf94e84f223766e2cc51ca5fab6cae36ab8dc6ef9ef6a683ab1c483bfcfe291ef0bd38ab16a4ecc4078344fa8af72da2f225ab4c378dee23f6186181"
     }
 }
diff --git a/externals/CMakeLists.txt b/externals/CMakeLists.txt
index 6f64c79f5d..9f89cfc1f5 100644
--- a/externals/CMakeLists.txt
+++ b/externals/CMakeLists.txt
@@ -63,7 +63,14 @@ if (mbedtls_ADDED)
     if (NOT MSVC)
         target_compile_options(mbedcrypto PRIVATE
             -Wno-unused-but-set-variable
-            -Wno-string-concatenation)
+            -Wno-string-concatenation
+        )
+    elseif(CXX_CLANG)
+        foreach(TARGET mbedtls mbedcrypto mbedx509)
+            target_compile_options(${TARGET} PRIVATE
+                -w
+            )
+        endforeach()
     endif()
 endif()
 
@@ -84,6 +91,8 @@ if(MSVC AND USE_CCACHE AND sirit_ADDED)
     list(FILTER _opts EXCLUDE REGEX "/Zi")
     list(APPEND _opts "/Z7")
     set_target_properties(sirit PROPERTIES COMPILE_OPTIONS "${_opts}")
+elseif(MSVC AND CXX_CLANG)
+    target_compile_options(sirit PRIVATE -Wno-error=unused-command-line-argument)
 endif()
 
 # httplib
@@ -136,8 +145,16 @@ add_subdirectory(nx_tzdb)
 # VMA
 AddJsonPackage(vulkan-memory-allocator)
 
-if (VulkanMemoryAllocator_ADDED AND MSVC)
-    target_compile_options(VulkanMemoryAllocator INTERFACE /wd4189)
+if (VulkanMemoryAllocator_ADDED)
+    if (CXX_CLANG)
+        target_compile_options(VulkanMemoryAllocator INTERFACE
+            -Wno-unused-variable
+        )
+    elseif(MSVC)
+        target_compile_options(VulkanMemoryAllocator INTERFACE
+            /wd4189
+        )
+    endif()
 endif()
 
 if (NOT TARGET LLVM::Demangle)
diff --git a/externals/cpmfile.json b/externals/cpmfile.json
index f8ca528951..57258f771b 100644
--- a/externals/cpmfile.json
+++ b/externals/cpmfile.json
@@ -1,12 +1,9 @@
 {
     "mbedtls": {
-        "repo": "Mbed-TLS/mbedtls",
-        "sha": "8c88150ca1",
-        "hash": "769ad1e94c570671071e1f2a5c0f1027e0bf6bcdd1a80ea8ac970f2c86bc45ce4e31aa88d6d8110fc1bed1de81c48bc624df1b38a26f8b340a44e109d784a966",
-        "find_args": "MODULE",
-        "patches": [
-            "0001-cmake-version.patch"
-        ]
+        "repo": "eden-emulator/mbedtls",
+        "sha": "ce4f81f4a9",
+        "hash": "f2e7f887651b28745e508149214d409fd7cfdb92cb94b4146b47ff1e0fc09e47143f203ac18e34c2c1814b5bd031d04c74828676c0d4342920a2ddb7fd35e9a5",
+        "find_args": "MODULE"
     },
     "spirv-headers": {
         "package": "SPIRV-Headers",
@@ -29,18 +26,12 @@
     },
     "cpp-jwt": {
         "version": "1.4",
-        "repo": "arun11299/cpp-jwt",
-        "sha": "a54fa08a3b",
-        "hash": "a90f7e594ada0c7e49d5ff9211c71097534e7742a8e44bf0851b0362642a7271d53f5d83d04eeaae2bad17ef3f35e09e6818434d8eaefa038f3d1f7359d0969a",
+        "repo": "crueter/cpp-jwt",
+        "sha": "9eaea6328f",
+        "hash": "e237d92c59ebbf0dc8ac0bae3bc80340e1e9cf430e1c1c9638443001118e16de2b3e9036ac4b98105427667b0386d97831415170b68c432438dcad9ef8052de7",
         "find_args": "CONFIG",
         "options": [
-            "CPP_JWT_BUILD_EXAMPLES OFF",
-            "CPP_JWT_BUILD_TESTS OFF",
             "CPP_JWT_USE_VENDORED_NLOHMANN_JSON OFF"
-        ],
-        "patches": [
-            "0001-no-install.patch",
-            "0002-missing-decl.patch"
         ]
     },
     "vulkan-utility-headers": {
@@ -107,12 +98,6 @@
         "repo": "martinus/unordered_dense",
         "sha": "73f3cbb237",
         "hash": "c08c03063938339d61392b687562909c1a92615b6ef39ec8df19ea472aa6b6478e70d7d5e33d4a27b5d23f7806daf57fe1bacb8124c8a945c918c7663a9e8532",
-        "find_args": "CONFIG",
-        "options": [
-            "UNORDERED_DENSE_INSTALL OFF"
-        ],
-        "patches": [
-            "0001-cmake.patch"
-        ]
+        "find_args": "CONFIG"
     }
 }
diff --git a/externals/sse2neon/sse2neon.h b/externals/sse2neon/sse2neon.h
index 66b93c1c74..67ad0ae6f8 100755
--- a/externals/sse2neon/sse2neon.h
+++ b/externals/sse2neon/sse2neon.h
@@ -183,7 +183,7 @@
     }
 
 /* Compiler barrier */
-#if defined(_MSC_VER)
+#if defined(_MSC_VER) && !defined(__clang__)
 #define SSE2NEON_BARRIER() _ReadWriteBarrier()
 #else
 #define SSE2NEON_BARRIER()                     \
@@ -859,7 +859,7 @@ FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
 {
     poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0);
     poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0);
-#if defined(_MSC_VER)
+#if defined(_MSC_VER) && !defined(__clang__)
     __n64 a1 = {a}, b1 = {b};
     return vreinterpretq_u64_p128(vmull_p64(a1, b1));
 #else
@@ -1770,7 +1770,7 @@ FORCE_INLINE void _mm_free(void *addr)
 FORCE_INLINE uint64_t _sse2neon_get_fpcr(void)
 {
     uint64_t value;
-#if defined(_MSC_VER)
+#if defined(_MSC_VER) && !defined(__clang__)
     value = _ReadStatusReg(ARM64_FPCR);
 #else
     __asm__ __volatile__("mrs %0, FPCR" : "=r"(value)); /* read */
@@ -1780,7 +1780,7 @@ FORCE_INLINE uint64_t _sse2neon_get_fpcr(void)
 
 FORCE_INLINE void _sse2neon_set_fpcr(uint64_t value)
 {
-#if defined(_MSC_VER)
+#if defined(_MSC_VER) && !defined(__clang__)
     _WriteStatusReg(ARM64_FPCR, value);
 #else
     __asm__ __volatile__("msr FPCR, %0" ::"r"(value));  /* write */
@@ -2249,7 +2249,7 @@ FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)
 FORCE_INLINE void _mm_prefetch(char const *p, int i)
 {
     (void) i;
-#if defined(_MSC_VER)
+#if defined(_MSC_VER) && !defined(__clang__)
     switch (i) {
     case _MM_HINT_NTA:
         __prefetch2(p, 1);
@@ -4820,7 +4820,7 @@ FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_pause
 FORCE_INLINE void _mm_pause(void)
 {
-#if defined(_MSC_VER)
+#if defined(_MSC_VER) && !defined(__clang__)
     __isb(_ARM64_BARRIER_SY);
 #else
     __asm__ __volatile__("isb\n");
@@ -5716,7 +5716,7 @@ FORCE_INLINE __m128d _mm_undefined_pd(void)
 #pragma GCC diagnostic ignored "-Wuninitialized"
 #endif
     __m128d a;
-#if defined(_MSC_VER)
+#if defined(_MSC_VER) && !defined(__clang__)
     a = _mm_setzero_pd();
 #endif
     return a;
@@ -8130,7 +8130,7 @@ FORCE_INLINE int _sse2neon_sido_negative(int res, int lb, int imm8, int bound)
 
 FORCE_INLINE int _sse2neon_clz(unsigned int x)
 {
-#ifdef _MSC_VER
+#if defined(_MSC_VER) && !defined(__clang__)
     unsigned long cnt = 0;
     if (_BitScanReverse(&cnt, x))
         return 31 - cnt;
@@ -8142,7 +8142,7 @@ FORCE_INLINE int _sse2neon_clz(unsigned int x)
 
 FORCE_INLINE int _sse2neon_ctz(unsigned int x)
 {
-#ifdef _MSC_VER
+#if defined(_MSC_VER) && !defined(__clang__)
     unsigned long cnt = 0;
     if (_BitScanForward(&cnt, x))
         return cnt;
@@ -9058,7 +9058,7 @@ FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
     // AESE does ShiftRows and SubBytes on A
     uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0));
 
-#ifndef _MSC_VER
+#if !defined(_MSC_VER) || defined(__clang__)
     uint8x16_t dest = {
         // Undo ShiftRows step from AESE and extract X1 and X3
         u8[0x4], u8[0x1], u8[0xE], u8[0xB],  // SubBytes(X1)
@@ -9245,7 +9245,7 @@ FORCE_INLINE uint64_t _rdtsc(void)
      * bits wide and it is attributed with the flag 'cap_user_time_short'
      * is true.
      */
-#if defined(_MSC_VER)
+#if defined(_MSC_VER) && !defined(__clang__)
     val = _ReadStatusReg(ARM64_SYSREG(3, 3, 14, 0, 2));
 #else
     __asm__ __volatile__("mrs %0, cntvct_el0" : "=r"(val));
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index b1fbab6a59..eb66e55964 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -18,7 +18,7 @@ set_property(DIRECTORY APPEND PROPERTY
     COMPILE_DEFINITIONS $<$<CONFIG:Debug>:_DEBUG> $<$<NOT:$<CONFIG:Debug>>:NDEBUG>)
 
 # Set compilation flags
-if (MSVC)
+if (MSVC AND NOT CXX_CLANG)
     set(CMAKE_CONFIGURATION_TYPES Debug Release CACHE STRING "" FORCE)
 
     # Silence "deprecation" warnings
@@ -69,10 +69,6 @@ if (MSVC)
         /external:anglebrackets # Treats all headers included by #include <header>, where the header file is enclosed in angle brackets (< >), as external headers
         /external:W0            # Sets the default warning level to 0 for external headers, effectively disabling warnings for them.
 
-        # Warnings
-        /W4
-        /WX-
-
         /we4062 # Enumerator 'identifier' in a switch of enum 'enumeration' is not handled
         /we4189 # 'identifier': local variable is initialized but not referenced
         /we4265 # 'class': class has virtual functions, but destructor is not virtual
@@ -97,6 +93,14 @@ if (MSVC)
         /wd4702 # unreachable code (when used with LTO)
     )
 
+    if (NOT CXX_CLANG)
+        add_compile_options(
+            # Warnings
+            /W4
+            /WX-
+        )
+    endif()
+
     if (USE_CCACHE OR YUZU_USE_PRECOMPILED_HEADERS)
         # when caching, we need to use /Z7 to downgrade debug info to use an older but more cacheable format
         # Precompiled headers are deleted if not using /Z7. See https://github.com/nanoant/CMakePCHCompiler/issues/21
@@ -118,9 +122,13 @@ if (MSVC)
     set(CMAKE_EXE_LINKER_FLAGS_DEBUG   "/DEBUG /MANIFEST:NO" CACHE STRING "" FORCE)
     set(CMAKE_EXE_LINKER_FLAGS_RELEASE "/DEBUG /MANIFEST:NO /INCREMENTAL:NO /OPT:REF,ICF" CACHE STRING "" FORCE)
 else()
-    add_compile_options(
-        -fwrapv
+    if (NOT MSVC)
+        add_compile_options(
+            -fwrapv
+        )
+    endif()
 
+    add_compile_options(
         -Werror=all
         -Werror=extra
         -Werror=missing-declarations
@@ -133,14 +141,19 @@ else()
         -Wno-missing-field-initializers
     )
 
-    if (CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES IntelLLVM) # Clang or AppleClang
+    if (CXX_CLANG OR CXX_ICC) # Clang, AppleClang, or Intel compiler
+        if (NOT MSVC)
+            add_compile_options(
+                -Werror=shadow-uncaptured-local
+                -Werror=implicit-fallthrough
+                -Werror=type-limits
+            )
+        endif()
+
         add_compile_options(
             -Wno-braced-scalar-init
             -Wno-unused-private-field
             -Wno-nullability-completeness
-            -Werror=shadow-uncaptured-local
-            -Werror=implicit-fallthrough
-            -Werror=type-limits
         )
     endif()
 
@@ -148,12 +161,12 @@ else()
         add_compile_options("-mcx16")
     endif()
 
-    if (APPLE AND CMAKE_CXX_COMPILER_ID STREQUAL Clang)
+    if (APPLE AND CXX_CLANG)
         add_compile_options("-stdlib=libc++")
     endif()
 
     # GCC bugs
-    if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL "11" AND CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+    if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL "11" AND CXX_GCC)
         # These diagnostics would be great if they worked, but are just completely broken
         # and produce bogus errors on external libraries like fmt.
         add_compile_options(
diff --git a/src/audio_core/adsp/apps/audio_renderer/audio_renderer.cpp b/src/audio_core/adsp/apps/audio_renderer/audio_renderer.cpp
index d799f3f06f..b874f87ae6 100644
--- a/src/audio_core/adsp/apps/audio_renderer/audio_renderer.cpp
+++ b/src/audio_core/adsp/apps/audio_renderer/audio_renderer.cpp
@@ -193,7 +193,7 @@ void AudioRenderer::Main(std::stop_token stop_token) {
                         }
                     }
 
-                    max_time = std::min(command_buffer.time_limit, max_time);
+                    max_time = (std::min)(command_buffer.time_limit, max_time);
                     command_list_processor.SetProcessTimeMax(max_time);
 
                     if (index == 0) {
diff --git a/src/audio_core/common/common.h b/src/audio_core/common/common.h
index 6abd9be45e..eaf0c6470f 100644
--- a/src/audio_core/common/common.h
+++ b/src/audio_core/common/common.h
@@ -73,9 +73,9 @@ constexpr s32 HighestVoicePriority = 0;
 constexpr u32 BufferAlignment = 0x40;
 constexpr u32 WorkbufferAlignment = 0x1000;
 constexpr s32 FinalMixId = 0;
-constexpr s32 InvalidDistanceFromFinalMix = std::numeric_limits<s32>::min();
+constexpr s32 InvalidDistanceFromFinalMix = (std::numeric_limits<s32>::min)();
 constexpr s32 UnusedSplitterId = -1;
-constexpr s32 UnusedMixId = std::numeric_limits<s32>::max();
+constexpr s32 UnusedMixId = (std::numeric_limits<s32>::max)();
 constexpr u32 InvalidNodeId = 0xF0000000;
 constexpr s32 InvalidProcessOrder = -1;
 constexpr u32 MaxBiquadFilters = 2;
diff --git a/src/audio_core/device/audio_buffers.h b/src/audio_core/device/audio_buffers.h
index 9e84a9c059..9016246fbf 100644
--- a/src/audio_core/device/audio_buffers.h
+++ b/src/audio_core/device/audio_buffers.h
@@ -51,7 +51,7 @@ public:
      */
     void RegisterBuffers(boost::container::static_vector<AudioBuffer, N>& out_buffers) {
         std::scoped_lock l{lock};
-        const s32 to_register{std::min(std::min(appended_count, BufferAppendLimit),
+        const s32 to_register{(std::min)((std::min)(appended_count, BufferAppendLimit),
                                        BufferAppendLimit - registered_count)};
 
         for (s32 i = 0; i < to_register; i++) {
@@ -175,7 +175,7 @@ public:
         }
 
         size_t buffers_to_flush{
-            std::min(static_cast<u32>(registered_count + appended_count), max_buffers)};
+            (std::min)(static_cast<u32>(registered_count + appended_count), max_buffers)};
         if (buffers_to_flush == 0) {
             return 0;
         }
diff --git a/src/audio_core/renderer/audio_device.cpp b/src/audio_core/renderer/audio_device.cpp
index 5be5594f6f..387d23b0b4 100644
--- a/src/audio_core/renderer/audio_device.cpp
+++ b/src/audio_core/renderer/audio_device.cpp
@@ -45,7 +45,7 @@ u32 AudioDevice::ListAudioDeviceName(std::span<AudioDeviceName> out_buffer) cons
         names = device_names;
     }
 
-    const u32 out_count{static_cast<u32>(std::min(out_buffer.size(), names.size()))};
+    const u32 out_count{static_cast<u32>((std::min)(out_buffer.size(), names.size()))};
     for (u32 i = 0; i < out_count; i++) {
         out_buffer[i] = names[i];
     }
@@ -53,7 +53,7 @@ u32 AudioDevice::ListAudioDeviceName(std::span<AudioDeviceName> out_buffer) cons
 }
 
 u32 AudioDevice::ListAudioOutputDeviceName(std::span<AudioDeviceName> out_buffer) const {
-    const u32 out_count{static_cast<u32>(std::min(out_buffer.size(), output_device_names.size()))};
+    const u32 out_count{static_cast<u32>((std::min)(out_buffer.size(), output_device_names.size()))};
 
     for (u32 i = 0; i < out_count; i++) {
         out_buffer[i] = output_device_names[i];
diff --git a/src/audio_core/renderer/behavior/behavior_info.cpp b/src/audio_core/renderer/behavior/behavior_info.cpp
index 0585390426..f139e63ffb 100644
--- a/src/audio_core/renderer/behavior/behavior_info.cpp
+++ b/src/audio_core/renderer/behavior/behavior_info.cpp
@@ -43,7 +43,7 @@ void BehaviorInfo::AppendError(const ErrorInfo& error) {
 }
 
 void BehaviorInfo::CopyErrorInfo(std::span<ErrorInfo> out_errors, u32& out_count) const {
-    out_count = std::min(error_count, MaxErrors);
+    out_count = (std::min)(error_count, MaxErrors);
 
     for (size_t i = 0; i < MaxErrors; i++) {
         if (i < out_count) {
diff --git a/src/audio_core/renderer/command/command_buffer.cpp b/src/audio_core/renderer/command/command_buffer.cpp
index 67d43e69aa..f9e8575691 100644
--- a/src/audio_core/renderer/command/command_buffer.cpp
+++ b/src/audio_core/renderer/command/command_buffer.cpp
@@ -464,7 +464,7 @@ void CommandBuffer::GenerateDeviceSinkCommand(const s32 node_id, const s16 buffe
     s16 max_input{0};
     for (u32 i = 0; i < parameter.input_count; i++) {
         cmd.inputs[i] = buffer_offset + parameter.inputs[i];
-        max_input = std::max(max_input, cmd.inputs[i]);
+        max_input = (std::max)(max_input, cmd.inputs[i]);
     }
 
     if (state.upsampler_info != nullptr) {
diff --git a/src/audio_core/renderer/command/command_generator.h b/src/audio_core/renderer/command/command_generator.h
index 38ee2a64ee..497cfa92f2 100644
--- a/src/audio_core/renderer/command/command_generator.h
+++ b/src/audio_core/renderer/command/command_generator.h
@@ -56,11 +56,11 @@ public:
         // Voices
         u64 voice_size{0};
         if (behavior.IsWaveBufferVer2Supported()) {
-            voice_size = std::max(std::max(sizeof(AdpcmDataSourceVersion2Command),
+            voice_size = (std::max)((std::max)(sizeof(AdpcmDataSourceVersion2Command),
                                            sizeof(PcmInt16DataSourceVersion2Command)),
                                   sizeof(PcmFloatDataSourceVersion2Command));
         } else {
-            voice_size = std::max(std::max(sizeof(AdpcmDataSourceVersion1Command),
+            voice_size = (std::max)((std::max)(sizeof(AdpcmDataSourceVersion1Command),
                                            sizeof(PcmInt16DataSourceVersion1Command)),
                                   sizeof(PcmFloatDataSourceVersion1Command));
         }
@@ -82,7 +82,7 @@ public:
 
         // Sinks
         size +=
-            params.sinks * std::max(sizeof(DeviceSinkCommand), sizeof(CircularBufferSinkCommand));
+            params.sinks * (std::max)(sizeof(DeviceSinkCommand), sizeof(CircularBufferSinkCommand));
 
         // Performance
         size += (params.effects + params.voices + params.sinks + params.sub_mixes + 1 +
diff --git a/src/audio_core/renderer/command/data_source/decode.cpp b/src/audio_core/renderer/command/data_source/decode.cpp
index 905613a5a8..b42ecb961f 100644
--- a/src/audio_core/renderer/command/data_source/decode.cpp
+++ b/src/audio_core/renderer/command/data_source/decode.cpp
@@ -29,8 +29,8 @@ constexpr std::array<u8, 3> PitchBySrcQuality = {4, 8, 4};
 template <typename T>
 static u32 DecodePcm(Core::Memory::Memory& memory, std::span<s16> out_buffer,
                      const DecodeArg& req) {
-    constexpr s32 min{std::numeric_limits<s16>::min()};
-    constexpr s32 max{std::numeric_limits<s16>::max()};
+    constexpr s32 min{(std::numeric_limits<s16>::min)()};
+    constexpr s32 max{(std::numeric_limits<s16>::max)()};
 
     if (req.buffer == 0 || req.buffer_size == 0) {
         return 0;
@@ -41,7 +41,7 @@ static u32 DecodePcm(Core::Memory::Memory& memory, std::span<s16> out_buffer,
     }
 
     auto samples_to_decode{
-        std::min(req.samples_to_read, req.end_offset - req.start_offset - req.offset)};
+        (std::min)(req.samples_to_read, req.end_offset - req.start_offset - req.offset)};
     u32 channel_count{static_cast<u32>(req.channel_count)};
 
     switch (req.channel_count) {
@@ -55,7 +55,7 @@ static u32 DecodePcm(Core::Memory::Memory& memory, std::span<s16> out_buffer,
         if constexpr (std::is_floating_point_v<T>) {
             for (u32 i = 0; i < samples_to_decode; i++) {
                 auto sample{static_cast<s32>(samples[i * channel_count + req.target_channel] *
-                                             std::numeric_limits<s16>::max())};
+                                             (std::numeric_limits<s16>::max)())};
                 out_buffer[i] = static_cast<s16>(std::clamp(sample, min, max));
             }
         } else {
@@ -79,7 +79,7 @@ static u32 DecodePcm(Core::Memory::Memory& memory, std::span<s16> out_buffer,
         if constexpr (std::is_floating_point_v<T>) {
             for (u32 i = 0; i < samples_to_decode; i++) {
                 auto sample{static_cast<s32>(samples[i * channel_count + req.target_channel] *
-                                             std::numeric_limits<s16>::max())};
+                                             (std::numeric_limits<s16>::max)())};
                 out_buffer[i] = static_cast<s16>(std::clamp(sample, min, max));
             }
         } else {
@@ -125,7 +125,7 @@ static u32 DecodeAdpcm(Core::Memory::Memory& memory, std::span<s16> out_buffer,
     }
 
     auto start_pos{req.start_offset + req.offset};
-    auto samples_to_process{std::min(req.end_offset - start_pos, req.samples_to_read)};
+    auto samples_to_process{(std::min)(req.end_offset - start_pos, req.samples_to_read)};
     if (samples_to_process == 0) {
         return 0;
     }
@@ -139,7 +139,7 @@ static u32 DecodeAdpcm(Core::Memory::Memory& memory, std::span<s16> out_buffer,
         position_in_frame += 2;
     }
 
-    const auto size{std::max((samples_to_process / 8U) * SamplesPerFrame, 8U)};
+    const auto size{(std::max)((samples_to_process / 8U) * SamplesPerFrame, 8U)};
     Core::Memory::CpuGuestMemory<u8, Core::Memory::GuestMemoryFlags::UnsafeRead> wavebuffer(
         memory, req.buffer + position_in_frame / 2, size);
 
@@ -260,7 +260,7 @@ void DecodeFromWaveBuffers(Core::Memory::Memory& memory, const DecodeFromWaveBuf
     auto max_remaining_sample_count{
         ((Common::FixedPoint<17, 15>(TempBufferSize) - fraction) / sample_rate_ratio)
             .to_uint_floor()};
-    max_remaining_sample_count = std::min(max_remaining_sample_count, remaining_sample_count);
+    max_remaining_sample_count = (std::min)(max_remaining_sample_count, remaining_sample_count);
 
     auto wavebuffers_consumed{voice_state.wave_buffers_consumed};
     auto wavebuffer_index{voice_state.wave_buffer_index};
@@ -273,7 +273,7 @@ void DecodeFromWaveBuffers(Core::Memory::Memory& memory, const DecodeFromWaveBuf
     std::array<s16, TempBufferSize> temp_buffer{};
 
     while (remaining_sample_count > 0) {
-        const auto samples_to_write{std::min(remaining_sample_count, max_remaining_sample_count)};
+        const auto samples_to_write{(std::min)(remaining_sample_count, max_remaining_sample_count)};
         const auto samples_to_read{
             (fraction + samples_to_write * sample_rate_ratio).to_uint_floor()};
 
diff --git a/src/audio_core/renderer/command/effect/aux_.cpp b/src/audio_core/renderer/command/effect/aux_.cpp
index 74d9c229f3..cb23007a66 100644
--- a/src/audio_core/renderer/command/effect/aux_.cpp
+++ b/src/audio_core/renderer/command/effect/aux_.cpp
@@ -86,7 +86,7 @@ static u32 WriteAuxBufferDsp(Core::Memory::Memory& memory, CpuAddr send_info_,
     u32 write_count{write_count_};
     u32 read_pos{0};
     while (write_count > 0) {
-        u32 to_write{std::min(count_max - target_write_offset, write_count)};
+        u32 to_write{(std::min)(count_max - target_write_offset, write_count)};
         if (to_write > 0) {
             const auto write_addr = send_buffer + target_write_offset * sizeof(s32);
             memory.WriteBlockUnsafe(write_addr, &input[read_pos], to_write * sizeof(s32));
@@ -157,7 +157,7 @@ static u32 ReadAuxBufferDsp(Core::Memory::Memory& memory, CpuAddr return_info_,
     u32 read_count{read_count_};
     u32 write_pos{0};
     while (read_count > 0) {
-        u32 to_read{std::min(count_max - target_read_offset, read_count)};
+        u32 to_read{(std::min)(count_max - target_read_offset, read_count)};
         if (to_read > 0) {
             const auto read_addr = return_buffer + target_read_offset * sizeof(s32);
             memory.ReadBlockUnsafe(read_addr, &output[write_pos], to_read * sizeof(s32));
diff --git a/src/audio_core/renderer/command/effect/biquad_filter.cpp b/src/audio_core/renderer/command/effect/biquad_filter.cpp
index 3392e7747d..dbc7085d16 100644
--- a/src/audio_core/renderer/command/effect/biquad_filter.cpp
+++ b/src/audio_core/renderer/command/effect/biquad_filter.cpp
@@ -20,8 +20,8 @@ namespace AudioCore::Renderer {
 void ApplyBiquadFilterFloat(std::span<s32> output, std::span<const s32> input,
                             std::array<s16, 3>& b_, std::array<s16, 2>& a_,
                             VoiceState::BiquadFilterState& state, const u32 sample_count) {
-    constexpr f64 min{std::numeric_limits<s32>::min()};
-    constexpr f64 max{std::numeric_limits<s32>::max()};
+    constexpr f64 min{(std::numeric_limits<s32>::min)()};
+    constexpr f64 max{(std::numeric_limits<s32>::max)()};
     std::array<f64, 3> b{Common::FixedPoint<50, 14>::from_base(b_[0]).to_double(),
                          Common::FixedPoint<50, 14>::from_base(b_[1]).to_double(),
                          Common::FixedPoint<50, 14>::from_base(b_[2]).to_double()};
@@ -61,8 +61,8 @@ void ApplyBiquadFilterFloat(std::span<s32> output, std::span<const s32> input,
 static void ApplyBiquadFilterInt(std::span<s32> output, std::span<const s32> input,
                                  std::array<s16, 3>& b, std::array<s16, 2>& a,
                                  VoiceState::BiquadFilterState& state, const u32 sample_count) {
-    constexpr s64 min{std::numeric_limits<s32>::min()};
-    constexpr s64 max{std::numeric_limits<s32>::max()};
+    constexpr s64 min{(std::numeric_limits<s32>::min)()};
+    constexpr s64 max{(std::numeric_limits<s32>::max)()};
 
     for (u32 i = 0; i < sample_count; i++) {
         const s64 in_sample{input[i]};
diff --git a/src/audio_core/renderer/command/effect/capture.cpp b/src/audio_core/renderer/command/effect/capture.cpp
index f235ce027f..95bc88e464 100644
--- a/src/audio_core/renderer/command/effect/capture.cpp
+++ b/src/audio_core/renderer/command/effect/capture.cpp
@@ -79,7 +79,7 @@ static u32 WriteAuxBufferDsp(Core::Memory::Memory& memory, const CpuAddr send_in
     u32 write_count{write_count_};
     u32 write_pos{0};
     while (write_count > 0) {
-        u32 to_write{std::min(count_max - target_write_offset, write_count)};
+        u32 to_write{(std::min)(count_max - target_write_offset, write_count)};
 
         if (to_write > 0) {
             memory.WriteBlockUnsafe(send_buffer + target_write_offset * sizeof(s32),
diff --git a/src/audio_core/renderer/command/effect/i3dl2_reverb.cpp b/src/audio_core/renderer/command/effect/i3dl2_reverb.cpp
index ecfdfabc61..14e77b3cb2 100644
--- a/src/audio_core/renderer/command/effect/i3dl2_reverb.cpp
+++ b/src/audio_core/renderer/command/effect/i3dl2_reverb.cpp
@@ -76,9 +76,9 @@ static void UpdateI3dl2ReverbEffectParameter(const I3dl2ReverbInfo::ParameterVer
 
     state.dry_gain = params.dry_gain;
     Common::FixedPoint<50, 14> early_gain{
-        std::min(params.room_gain + params.reflection_gain, 5000.0f) / 2000.0f};
+        (std::min)(params.room_gain + params.reflection_gain, 5000.0f) / 2000.0f};
     state.early_gain = pow_10(early_gain.to_float());
-    Common::FixedPoint<50, 14> late_gain{std::min(params.room_gain + params.reverb_gain, 5000.0f) /
+    Common::FixedPoint<50, 14> late_gain{(std::min)(params.room_gain + params.reverb_gain, 5000.0f) /
                                          2000.0f};
     state.late_gain = pow_10(late_gain.to_float());
 
@@ -94,7 +94,7 @@ static void UpdateI3dl2ReverbEffectParameter(const I3dl2ReverbInfo::ParameterVer
         const Common::FixedPoint<50, 14> c{
             std::sqrt(std::pow(b.to_float(), 2.0f) + (std::pow(a.to_float(), 2.0f) * -4.0f))};
 
-        state.lowpass_1 = std::min(((b - c) / (a * 2.0f)).to_float(), 0.99723f);
+        state.lowpass_1 = (std::min)(((b - c) / (a * 2.0f)).to_float(), 0.99723f);
         state.lowpass_2 = 1.0f - state.lowpass_1;
     }
 
diff --git a/src/audio_core/renderer/command/effect/light_limiter.cpp b/src/audio_core/renderer/command/effect/light_limiter.cpp
index 63aa06f5c3..3488dd37b5 100644
--- a/src/audio_core/renderer/command/effect/light_limiter.cpp
+++ b/src/audio_core/renderer/command/effect/light_limiter.cpp
@@ -50,8 +50,8 @@ static void ApplyLightLimiterEffect(const LightLimiterInfo::ParameterVersion2& p
                                     std::span<std::span<const s32>> inputs,
                                     std::span<std::span<s32>> outputs, const u32 sample_count,
                                     LightLimiterInfo::StatisticsInternal* statistics) {
-    constexpr s64 min{std::numeric_limits<s32>::min()};
-    constexpr s64 max{std::numeric_limits<s32>::max()};
+    constexpr s64 min{(std::numeric_limits<s32>::min)()};
+    constexpr s64 max{(std::numeric_limits<s32>::max)()};
 
     const auto recip_estimate = [](f64 a) -> f64 {
         s32 q, s;
@@ -117,9 +117,9 @@ static void ApplyLightLimiterEffect(const LightLimiterInfo::ParameterVersion2& p
 
                 if (statistics) {
                     statistics->channel_max_sample[channel] =
-                        std::max(statistics->channel_max_sample[channel], abs_sample.to_float());
+                        (std::max)(statistics->channel_max_sample[channel], abs_sample.to_float());
                     statistics->channel_compression_gain_min[channel] =
-                        std::min(statistics->channel_compression_gain_min[channel],
+                        (std::min)(statistics->channel_compression_gain_min[channel],
                                  state.compression_gain[channel].to_float());
                 }
             }
diff --git a/src/audio_core/renderer/command/effect/reverb.cpp b/src/audio_core/renderer/command/effect/reverb.cpp
index 7f152a9629..67b893305a 100644
--- a/src/audio_core/renderer/command/effect/reverb.cpp
+++ b/src/audio_core/renderer/command/effect/reverb.cpp
@@ -94,7 +94,7 @@ static void UpdateReverbEffectParameter(const ReverbInfo::ParameterVersion2& par
     for (u32 i = 0; i < ReverbInfo::MaxDelayTaps; i++) {
         auto early_delay{
             ((pre_delay_time + EarlyDelayTimes[params.early_mode][i]) * sample_rate).to_int()};
-        early_delay = std::min(early_delay, state.pre_delay_line.sample_count_max);
+        early_delay = (std::min)(early_delay, state.pre_delay_line.sample_count_max);
         state.early_delay_times[i] = early_delay + 1;
         state.early_gains[i] = Common::FixedPoint<50, 14>::from_base(params.early_gain) *
                                EarlyDelayGains[params.early_mode][i];
@@ -107,7 +107,7 @@ static void UpdateReverbEffectParameter(const ReverbInfo::ParameterVersion2& par
 
     auto pre_time{
         ((pre_delay_time + EarlyDelayTimes[params.early_mode][10]) * sample_rate).to_int()};
-    state.pre_delay_time = std::min(pre_time, state.pre_delay_line.sample_count_max);
+    state.pre_delay_time = (std::min)(pre_time, state.pre_delay_line.sample_count_max);
 
     if (!unk_initialized) {
         unk_value = cos((1280.0f / sample_rate).to_float());
@@ -117,13 +117,13 @@ static void UpdateReverbEffectParameter(const ReverbInfo::ParameterVersion2& par
     for (u32 i = 0; i < ReverbInfo::MaxDelayLines; i++) {
         const auto fdn_delay{(FdnDelayTimes[params.late_mode][i] * sample_rate).to_int()};
         state.fdn_delay_lines[i].sample_count =
-            std::min(fdn_delay, state.fdn_delay_lines[i].sample_count_max);
+            (std::min)(fdn_delay, state.fdn_delay_lines[i].sample_count_max);
         state.fdn_delay_lines[i].buffer_end =
             &state.fdn_delay_lines[i].buffer[state.fdn_delay_lines[i].sample_count - 1];
 
         const auto decay_delay{(DecayDelayTimes[params.late_mode][i] * sample_rate).to_int()};
         state.decay_delay_lines[i].sample_count =
-            std::min(decay_delay, state.decay_delay_lines[i].sample_count_max);
+            (std::min)(decay_delay, state.decay_delay_lines[i].sample_count_max);
         state.decay_delay_lines[i].buffer_end =
             &state.decay_delay_lines[i].buffer[state.decay_delay_lines[i].sample_count - 1];
 
diff --git a/src/audio_core/renderer/command/mix/depop_for_mix_buffers.cpp b/src/audio_core/renderer/command/mix/depop_for_mix_buffers.cpp
index caedb56b79..f80fb92631 100644
--- a/src/audio_core/renderer/command/mix/depop_for_mix_buffers.cpp
+++ b/src/audio_core/renderer/command/mix/depop_for_mix_buffers.cpp
@@ -43,7 +43,7 @@ void DepopForMixBuffersCommand::Dump(
 }
 
 void DepopForMixBuffersCommand::Process(const AudioRenderer::CommandListProcessor& processor) {
-    auto end_index{std::min(processor.buffer_count, input + count)};
+    auto end_index{(std::min)(processor.buffer_count, input + count)};
     std::span<s32> depop_buff{reinterpret_cast<s32*>(depop_buffer), end_index};
 
     for (u32 index = input; index < end_index; index++) {
diff --git a/src/audio_core/renderer/command/resample/upsample.cpp b/src/audio_core/renderer/command/resample/upsample.cpp
index 691d70390f..add975504c 100644
--- a/src/audio_core/renderer/command/resample/upsample.cpp
+++ b/src/audio_core/renderer/command/resample/upsample.cpp
@@ -215,7 +215,7 @@ auto UpsampleCommand::Dump([[maybe_unused]] const AudioRenderer::CommandListProc
 
 void UpsampleCommand::Process(const AudioRenderer::CommandListProcessor& processor) {
     const auto info{reinterpret_cast<UpsamplerInfo*>(upsampler_info)};
-    const auto input_count{std::min(info->input_count, buffer_count)};
+    const auto input_count{(std::min)(info->input_count, buffer_count)};
     const std::span<const s16> inputs_{reinterpret_cast<const s16*>(inputs), input_count};
 
     for (u32 i = 0; i < input_count; i++) {
diff --git a/src/audio_core/renderer/command/sink/circular_buffer.cpp b/src/audio_core/renderer/command/sink/circular_buffer.cpp
index e056d15a65..8ef48a26df 100644
--- a/src/audio_core/renderer/command/sink/circular_buffer.cpp
+++ b/src/audio_core/renderer/command/sink/circular_buffer.cpp
@@ -21,8 +21,8 @@ void CircularBufferSinkCommand::Dump(
 }
 
 void CircularBufferSinkCommand::Process(const AudioRenderer::CommandListProcessor& processor) {
-    constexpr s32 min{std::numeric_limits<s16>::min()};
-    constexpr s32 max{std::numeric_limits<s16>::max()};
+    constexpr s32 min{(std::numeric_limits<s16>::min)()};
+    constexpr s32 max{(std::numeric_limits<s16>::max)()};
 
     std::array<s16, TargetSampleCount * MaxChannels> output{};
     for (u32 channel = 0; channel < input_count; channel++) {
diff --git a/src/audio_core/renderer/command/sink/device.cpp b/src/audio_core/renderer/command/sink/device.cpp
index 3480ed475c..86a257363b 100644
--- a/src/audio_core/renderer/command/sink/device.cpp
+++ b/src/audio_core/renderer/command/sink/device.cpp
@@ -20,8 +20,8 @@ void DeviceSinkCommand::Dump([[maybe_unused]] const AudioRenderer::CommandListPr
 }
 
 void DeviceSinkCommand::Process(const AudioRenderer::CommandListProcessor& processor) {
-    constexpr s32 min = std::numeric_limits<s16>::min();
-    constexpr s32 max = std::numeric_limits<s16>::max();
+    constexpr s32 min = (std::numeric_limits<s16>::min)();
+    constexpr s32 max = (std::numeric_limits<s16>::max)();
 
     auto stream{processor.GetOutputSinkStream()};
     stream->SetSystemChannels(input_count);
diff --git a/src/audio_core/renderer/mix/mix_context.cpp b/src/audio_core/renderer/mix/mix_context.cpp
index c712610bbd..1103af910b 100644
--- a/src/audio_core/renderer/mix/mix_context.cpp
+++ b/src/audio_core/renderer/mix/mix_context.cpp
@@ -126,7 +126,7 @@ bool MixContext::TSortInfo(const SplitterContext& splitter_context) {
     }
 
     auto sorted_results{node_states.GetSortedResuls()};
-    const auto result_size{std::min(count, static_cast<s32>(sorted_results.second))};
+    const auto result_size{(std::min)(count, static_cast<s32>(sorted_results.second))};
     for (s32 i = 0; i < result_size; i++) {
         sorted_mix_infos[i] = &mix_infos[sorted_results.first[i]];
     }
diff --git a/src/audio_core/renderer/sink/sink_info_base.h b/src/audio_core/renderer/sink/sink_info_base.h
index e10d1cb382..2a7fd81f68 100644
--- a/src/audio_core/renderer/sink/sink_info_base.h
+++ b/src/audio_core/renderer/sink/sink_info_base.h
@@ -168,9 +168,9 @@ protected:
     /// Node id for this sink
     u32 node_id{};
     /// State buffer for this sink
-    std::array<u8, std::max(sizeof(DeviceState), sizeof(CircularBufferState))> state{};
+    std::array<u8, (std::max)(sizeof(DeviceState), sizeof(CircularBufferState))> state{};
     /// Parameter buffer for this sink
-    std::array<u8, std::max(sizeof(DeviceInParameter), sizeof(CircularBufferInParameter))>
+    std::array<u8, (std::max)(sizeof(DeviceInParameter), sizeof(CircularBufferInParameter))>
         parameter{};
 };
 
diff --git a/src/audio_core/renderer/splitter/splitter_context.cpp b/src/audio_core/renderer/splitter/splitter_context.cpp
index d0f3b60c29..583cbaf735 100644
--- a/src/audio_core/renderer/splitter/splitter_context.cpp
+++ b/src/audio_core/renderer/splitter/splitter_context.cpp
@@ -170,7 +170,7 @@ void SplitterContext::RecomposeDestination(SplitterInfo& out_info,
 
     auto dest_count{info_header->destination_count};
     if (!splitter_bug_fixed) {
-        dest_count = std::min(dest_count, GetDestCountPerInfoForCompat());
+        dest_count = (std::min)(dest_count, GetDestCountPerInfoForCompat());
     }
 
     if (dest_count == 0) {
diff --git a/src/audio_core/renderer/system.cpp b/src/audio_core/renderer/system.cpp
index c30d68426c..c4a2768b93 100644
--- a/src/audio_core/renderer/system.cpp
+++ b/src/audio_core/renderer/system.cpp
@@ -718,7 +718,7 @@ u64 System::GenerateCommand(std::span<u8> in_command_buffer,
 
         const auto estimated_time{start_estimated_time - end_estimated_time};
 
-        const auto time_limit{static_cast<u32>(std::max(dsp_time_limit + estimated_time, 0.0f))};
+        const auto time_limit{static_cast<u32>((std::max)(dsp_time_limit + estimated_time, 0.0f))};
         num_voices_dropped =
             DropVoices(command_buffer, static_cast<u32>(start_estimated_time), time_limit);
     }
diff --git a/src/audio_core/sink/cubeb_sink.cpp b/src/audio_core/sink/cubeb_sink.cpp
index a33162b806..a3a7a89ba4 100644
--- a/src/audio_core/sink/cubeb_sink.cpp
+++ b/src/audio_core/sink/cubeb_sink.cpp
@@ -73,7 +73,7 @@ public:
             minimum_latency = TargetSampleCount * 2;
         }
 
-        minimum_latency = std::max(minimum_latency, TargetSampleCount * 2);
+        minimum_latency = (std::max)(minimum_latency, TargetSampleCount * 2);
 
         LOG_INFO(Service_Audio,
                  "Opening cubeb stream {} type {} with: rate {} channels {} (system channels {}) "
@@ -372,7 +372,7 @@ u32 GetCubebLatency() {
         LOG_CRITICAL(Audio_Sink, "Error getting minimum latency, error: {}", latency_error);
         latency = TargetSampleCount * 2;
     }
-    latency = std::max(latency, TargetSampleCount * 2);
+    latency = (std::max)(latency, TargetSampleCount * 2);
     cubeb_destroy(ctx);
     return latency;
 }
@@ -426,7 +426,7 @@ bool IsCubebSuitable() {
         LOG_ERROR(Audio_Sink, "Cubeb could not get min latency, it is not suitable.");
         return false;
     }
-    latency = std::max(latency, TargetSampleCount * 2);
+    latency = (std::max)(latency, TargetSampleCount * 2);
 
     // Test opening a device with standard parameters
     cubeb_devid output_device{0};
diff --git a/src/audio_core/sink/sink_stream.cpp b/src/audio_core/sink/sink_stream.cpp
index c0078e6ddc..4d7f0c1d5d 100644
--- a/src/audio_core/sink/sink_stream.cpp
+++ b/src/audio_core/sink/sink_stream.cpp
@@ -31,8 +31,8 @@ void SinkStream::AppendBuffer(SinkBuffer& buffer, std::span<s16> samples) {
         return;
     }
 
-    constexpr s32 min{std::numeric_limits<s16>::min()};
-    constexpr s32 max{std::numeric_limits<s16>::max()};
+    constexpr s32 min{(std::numeric_limits<s16>::min)()};
+    constexpr s32 max{(std::numeric_limits<s16>::max)()};
 
     auto yuzu_volume{Settings::Volume()};
     if (yuzu_volume > 1.0f) {
@@ -123,8 +123,8 @@ void SinkStream::AppendBuffer(SinkBuffer& buffer, std::span<s16> samples) {
 }
 
 std::vector<s16> SinkStream::ReleaseBuffer(u64 num_samples) {
-    constexpr s32 min = std::numeric_limits<s16>::min();
-    constexpr s32 max = std::numeric_limits<s16>::max();
+    constexpr s32 min = (std::numeric_limits<s16>::min)();
+    constexpr s32 max = (std::numeric_limits<s16>::max)();
 
     auto samples{samples_buffer.Pop(num_samples)};
 
diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt
index 9b898837bc..665143900a 100644
--- a/src/common/CMakeLists.txt
+++ b/src/common/CMakeLists.txt
@@ -237,7 +237,7 @@ else()
   )
 
   # Get around GCC failing with intrinsics in Debug
-  if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_BUILD_TYPE MATCHES "Debug")
+  if(CXX_GCC AND CMAKE_BUILD_TYPE MATCHES "Debug")
     set_property(
       SOURCE stb.cpp
       APPEND
@@ -245,7 +245,7 @@ else()
   endif()
 endif()
 
-if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+if(CXX_CLANG)
   target_compile_options(common PRIVATE -fsized-deallocation
                                         -Werror=unreachable-code-aggressive)
   target_compile_definitions(
diff --git a/src/common/free_region_manager.h b/src/common/free_region_manager.h
index 2e590d6094..39d52f866c 100644
--- a/src/common/free_region_manager.h
+++ b/src/common/free_region_manager.h
@@ -27,8 +27,8 @@ public:
 
         // If we are, join with them, ensuring we stay in bounds.
         if (it != m_free_regions.end()) {
-            start_address = std::min(start_address, it->lower());
-            end_address = std::max(end_address, it->upper());
+            start_address = (std::min)(start_address, it->lower());
+            end_address = (std::max)(end_address, it->upper());
         }
 
         // Free the relevant region.
diff --git a/src/common/fs/path_util.cpp b/src/common/fs/path_util.cpp
index a2f5cb92ff..318f311891 100644
--- a/src/common/fs/path_util.cpp
+++ b/src/common/fs/path_util.cpp
@@ -484,9 +484,9 @@ std::string GetParentPath(std::string_view path) {
     std::size_t name_index;
 
     if (name_bck_index == std::string_view::npos || name_fwd_index == std::string_view::npos) {
-        name_index = std::min(name_bck_index, name_fwd_index);
+        name_index = (std::min)(name_bck_index, name_fwd_index);
     } else {
-        name_index = std::max(name_bck_index, name_fwd_index);
+        name_index = (std::max)(name_bck_index, name_fwd_index);
     }
 
     return std::string(path.substr(0, name_index));
@@ -506,7 +506,7 @@ std::string_view GetPathWithoutTop(std::string_view path) {
 
     const auto name_bck_index = path.find('\\');
     const auto name_fwd_index = path.find('/');
-    return path.substr(std::min(name_bck_index, name_fwd_index) + 1);
+    return path.substr((std::min)(name_bck_index, name_fwd_index) + 1);
 }
 
 } // namespace Common::FS
diff --git a/src/common/heap_tracker.cpp b/src/common/heap_tracker.cpp
index c875683f0f..7cce54976e 100644
--- a/src/common/heap_tracker.cpp
+++ b/src/common/heap_tracker.cpp
@@ -144,8 +144,7 @@ void HeapTracker::Protect(size_t virtual_offset, size_t size, MemoryPermission p
         }
 
         // Clamp to end.
-        next = std::min(next, end);
-
+        next = (std::min)(next, end);
         // Reprotect, if we need to.
         if (should_protect) {
             m_buffer.Protect(cur, next - cur, perm);
@@ -211,8 +210,8 @@ void HeapTracker::RebuildSeparateHeapAddressSpace() {
     // Despite being worse in theory, this has proven to be better in practice than more
     // regularly dumping a smaller amount, because it significantly reduces average case
     // lock contention.
-    const size_t desired_count = std::min(m_resident_map_count, m_max_resident_map_count) / 2;
+    const size_t desired_count = (std::min)(m_resident_map_count, m_max_resident_map_count) / 2;
     const size_t evict_count = m_resident_map_count - desired_count;
     auto it = m_resident_mappings.begin();
 
     for (size_t i = 0; i < evict_count && it != m_resident_mappings.end(); i++) {
diff --git a/src/common/host_memory.cpp b/src/common/host_memory.cpp
index 15a198e216..1b7532b6b9 100644
--- a/src/common/host_memory.cpp
+++ b/src/common/host_memory.cpp
@@ -199,8 +199,8 @@ public:
         std::scoped_lock lock{placeholder_mutex};
         auto [it, end] = placeholders.equal_range({virtual_offset, virtual_end});
         while (it != end) {
-            const size_t offset = std::max(it->lower(), virtual_offset);
-            const size_t protect_length = std::min(it->upper(), virtual_end) - offset;
+            const size_t offset = (std::max)(it->lower(), virtual_offset);
+            const size_t protect_length = (std::min)(it->upper(), virtual_end) - offset;
             DWORD old_flags{};
             if (!VirtualProtect(virtual_base + offset, protect_length, new_flags, &old_flags)) {
                 LOG_CRITICAL(HW_Memory, "Failed to change virtual memory protect rules");
@@ -266,8 +266,8 @@ private:
         }
         const size_t placeholder_begin = it->lower();
         const size_t placeholder_end = it->upper();
-        const size_t unmap_begin = std::max(virtual_offset, placeholder_begin);
-        const size_t unmap_end = std::min(virtual_offset + length, placeholder_end);
+        const size_t unmap_begin = (std::max)(virtual_offset, placeholder_begin);
+        const size_t unmap_end = (std::min)(virtual_offset + length, placeholder_end);
         ASSERT(unmap_begin >= placeholder_begin && unmap_begin < placeholder_end);
         ASSERT(unmap_end <= placeholder_end && unmap_end > placeholder_begin);
 
@@ -655,8 +655,8 @@ private:
             *virtual_offset = 0;
             *length = 0;
         } else {
-            *virtual_offset = std::max(intended_start, address_space_start);
-            *length = std::min(intended_end, address_space_end) - *virtual_offset;
+            *virtual_offset = (std::max)(intended_start, address_space_start);
+            *length = (std::min)(intended_end, address_space_end) - *virtual_offset;
         }
     }
 
diff --git a/src/common/logging/log.h b/src/common/logging/log.h
index 6da9e5231a..252c83aa2c 100644
--- a/src/common/logging/log.h
+++ b/src/common/logging/log.h
@@ -18,7 +18,7 @@ constexpr const char* TrimSourcePath(std::string_view source) {
     const auto rfind = [source](const std::string_view match) {
         return source.rfind(match) == source.npos ? 0 : (source.rfind(match) + match.size());
     };
-    auto idx = std::max({rfind("src/"), rfind("src\\"), rfind("../"), rfind("..\\")});
+    auto idx = (std::max)({rfind("src/"), rfind("src\\"), rfind("../"), rfind("..\\")});
     return source.data() + idx;
 }
 
diff --git a/src/common/math_util.h b/src/common/math_util.h
index 1f5928c15f..f52a0a35ae 100644
--- a/src/common/math_util.h
+++ b/src/common/math_util.h
@@ -85,10 +85,10 @@ struct Rectangle {
     }
 
     [[nodiscard]] constexpr bool Intersect(const Rectangle<T>& with, Rectangle<T>* result) const {
-        result->left = std::max(left, with.left);
-        result->top = std::max(top, with.top);
-        result->right = std::min(right, with.right);
-        result->bottom = std::min(bottom, with.bottom);
+        result->left = (std::max)(left, with.left);
+        result->top = (std::max)(top, with.top);
+        result->right = (std::min)(right, with.right);
+        result->bottom = (std::min)(bottom, with.bottom);
         return !result->IsEmpty();
     }
 };
diff --git a/src/common/overflow.h b/src/common/overflow.h
index e184ead953..d39fa24041 100644
--- a/src/common/overflow.h
+++ b/src/common/overflow.h
@@ -25,9 +25,9 @@ template <typename T>
 inline bool CanAddWithoutOverflow(T lhs, T rhs) {
 #ifdef _MSC_VER
     if (lhs >= 0 && rhs >= 0) {
-        return WrappingAdd(lhs, rhs) >= std::max(lhs, rhs);
+        return WrappingAdd(lhs, rhs) >= (std::max)(lhs, rhs);
     } else if (lhs < 0 && rhs < 0) {
-        return WrappingAdd(lhs, rhs) <= std::min(lhs, rhs);
+        return WrappingAdd(lhs, rhs) <= (std::min)(lhs, rhs);
     } else {
         return true;
     }
diff --git a/src/common/range_map.h b/src/common/range_map.h
index ab73993e3b..e9cb50825b 100644
--- a/src/common/range_map.h
+++ b/src/common/range_map.h
@@ -18,7 +18,7 @@ private:
 
 public:
     explicit RangeMap(ValueT null_value_) : null_value{null_value_} {
-        container.emplace(std::numeric_limits<KeyT>::min(), null_value);
+        container.emplace((std::numeric_limits<KeyT>::min)(), null_value);
     };
     ~RangeMap() = default;
 
@@ -66,7 +66,7 @@ private:
         }
         const auto it_end = std::next(it);
         if (it_end == container.end()) {
-            return std::numeric_limits<KeyT>::max() - address;
+            return (std::numeric_limits<KeyT>::max)() - address;
         }
         return it_end->first - address;
     }
diff --git a/src/common/range_sets.inc b/src/common/range_sets.inc
index b83eceb7b0..3edd8c8a43 100644
--- a/src/common/range_sets.inc
+++ b/src/common/range_sets.inc
@@ -274,7 +274,7 @@ void OverlapRangeSet<AddressType>::Subtract(AddressType base_address, size_t siz
 
 template <typename AddressType>
 void OverlapRangeSet<AddressType>::DeleteAll(AddressType base_address, size_t size) {
-    m_impl->template Subtract<false>(base_address, size, std::numeric_limits<s32>::max(),
+    m_impl->template Subtract<false>(base_address, size, (std::numeric_limits<s32>::max)(),
                                      [](AddressType, AddressType) {});
 }
 
diff --git a/src/common/ring_buffer.h b/src/common/ring_buffer.h
index 86de96b43e..e97854f514 100644
--- a/src/common/ring_buffer.h
+++ b/src/common/ring_buffer.h
@@ -29,7 +29,7 @@ class RingBuffer {
     // T must be safely memcpy-able and have a trivial default constructor.
     static_assert(std::is_trivial_v<T>);
     // Ensure capacity is sensible.
-    static_assert(capacity < std::numeric_limits<std::size_t>::max() / 2);
+    static_assert(capacity < (std::numeric_limits<std::size_t>::max)() / 2);
     static_assert((capacity & (capacity - 1)) == 0, "capacity must be a power of two");
     // Ensure lock-free.
     static_assert(std::atomic_size_t::is_always_lock_free);
@@ -43,9 +43,9 @@ public:
         std::lock_guard lock(rb_mutex);
 
         const std::size_t slots_free = capacity + read_index - write_index;
-        const std::size_t push_count = std::min(slot_count, slots_free);
+        const std::size_t push_count = (std::min)(slot_count, slots_free);
         const std::size_t pos = write_index % capacity;
-        const std::size_t first_copy = std::min(capacity - pos, push_count);
+        const std::size_t first_copy = (std::min)(capacity - pos, push_count);
         const std::size_t second_copy = push_count - first_copy;
 
         const char* in = static_cast<const char*>(new_slots);
@@ -69,9 +69,9 @@ public:
         std::lock_guard lock(rb_mutex);
 
         const std::size_t slots_filled = write_index - read_index;
-        const std::size_t pop_count = std::min(slots_filled, max_slots);
+        const std::size_t pop_count = (std::min)(slots_filled, max_slots);
         const std::size_t pos = read_index % capacity;
-        const std::size_t first_copy = std::min(capacity - pos, pop_count);
+        const std::size_t first_copy = (std::min)(capacity - pos, pop_count);
         const std::size_t second_copy = pop_count - first_copy;
 
         char* out = static_cast<char*>(output);
@@ -84,7 +84,7 @@ public:
     }
 
     std::vector<T> Pop(std::size_t max_slots = ~std::size_t(0)) {
-        std::vector<T> out(std::min(max_slots, capacity));
+        std::vector<T> out((std::min)(max_slots, capacity));
         const std::size_t count = Pop(out.data(), out.size());
         out.resize(count);
         return out;
diff --git a/src/common/scm_rev.cpp.in b/src/common/scm_rev.cpp.in
index b6bff72867..a157d03878 100644
--- a/src/common/scm_rev.cpp.in
+++ b/src/common/scm_rev.cpp.in
@@ -1,12 +1,11 @@
+// SPDX-FileCopyrightText: 2025 Eden Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
+
 // SPDX-FileCopyrightText: 2014 Citra Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later
 
 #include "common/scm_rev.h"
 
-#include <fstream>
-#include <string>
-#include <fmt/ranges.h>
-
 #define GIT_REV "@GIT_REV@"
 #define GIT_BRANCH "@GIT_BRANCH@"
 #define GIT_DESC "@GIT_DESC@"
@@ -18,64 +17,21 @@
 #define TITLE_BAR_FORMAT_IDLE "@TITLE_BAR_FORMAT_IDLE@"
 #define TITLE_BAR_FORMAT_RUNNING "@TITLE_BAR_FORMAT_RUNNING@"
 #define IS_DEV_BUILD @IS_DEV_BUILD@
+#define COMPILER_ID "@CXX_COMPILER@"
 
 namespace Common {
 
-const char* g_scm_rev;
-const char* g_scm_branch;
-const char* g_scm_desc;
-const char g_build_name[] = BUILD_NAME;
-const char g_build_date[] = BUILD_DATE;
-const char g_build_fullname[] = BUILD_FULLNAME;
-const char g_build_version[] = BUILD_VERSION;
-const char g_build_id[] = BUILD_ID;
-const char g_title_bar_format_idle[] = TITLE_BAR_FORMAT_IDLE;
-const char g_title_bar_format_running[] = TITLE_BAR_FORMAT_RUNNING;
-const bool g_is_dev_build = IS_DEV_BUILD;
+constexpr const char g_scm_rev[] = GIT_REV;
+constexpr const char g_scm_branch[] = GIT_BRANCH;
+constexpr const char g_scm_desc[] = GIT_DESC;
+constexpr const char g_build_name[] = BUILD_NAME;
+constexpr const char g_build_date[] = BUILD_DATE;
+constexpr const char g_build_fullname[] = BUILD_FULLNAME;
+constexpr const char g_build_version[] = BUILD_VERSION;
+constexpr const char g_build_id[] = BUILD_ID;
+constexpr const char g_title_bar_format_idle[] = TITLE_BAR_FORMAT_IDLE;
+constexpr const char g_title_bar_format_running[] = TITLE_BAR_FORMAT_RUNNING;
+constexpr const bool g_is_dev_build = IS_DEV_BUILD;
+constexpr const char g_compiler_id[] = COMPILER_ID;
 
-/// Anonymizes SCM data
-/// This is quite weak. But better than nothing.
-class scm_encrypt {
-    std::string m_scm_rev, m_scm_branch, m_scm_desc;
-
-public:
-    scm_encrypt() {
-        // Get a key that is easy to obtain when asking the person directly but (usually) hard to
-        // guess
-        std::string key;
-#ifdef __linux__
-        if (!std::getline(std::ifstream("/proc/sys/kernel/hostname"), key))
-            key = "linux_error_key";
-#else
-        // Not a good fallback, but better than nothing I guess?
-        key = g_build_date;
-#endif
-        // Copy strings in place
-        m_scm_rev = GIT_REV;
-        m_scm_branch = GIT_BRANCH;
-        m_scm_desc = GIT_DESC;
-        // XOR each string with key
-        auto key_it = key.begin();
-        for (auto& string : {&m_scm_rev, &m_scm_branch, &m_scm_desc}) {
-            for (auto& c : *string) {
-                c ^= *key_it;
-                if (++key_it == key.end())
-                    key_it = key.begin();
-            }
-        }
-        // Make each string human-readable
-        for (auto& string : {&m_scm_rev, &m_scm_branch, &m_scm_desc}) {
-            const std::string original = *string;
-            string->clear();
-            for (const auto c : original) {
-                string->append(fmt::format("{:x}", unsigned(c)));
-            }
-            string->pop_back();
-        }
-        // Set pointers
-        g_scm_rev = m_scm_rev.c_str();
-        g_scm_branch = m_scm_branch.c_str();
-        g_scm_desc = m_scm_desc.c_str();
-    }
-} scm_encrypt_instance;
 } // namespace Common
diff --git a/src/common/scm_rev.h b/src/common/scm_rev.h
index ee1997950a..84356ad64a 100644
--- a/src/common/scm_rev.h
+++ b/src/common/scm_rev.h
@@ -1,3 +1,6 @@
+// SPDX-FileCopyrightText: 2025 Eden Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
+
 // SPDX-FileCopyrightText: 2014 Citra Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later
 
@@ -5,9 +8,9 @@
 
 namespace Common {
 
-extern const char* g_scm_rev;
-extern const char* g_scm_branch;
-extern const char* g_scm_desc;
+extern const char g_scm_rev[];
+extern const char g_scm_branch[];
+extern const char g_scm_desc[];
 extern const char g_build_name[];
 extern const char g_build_date[];
 extern const char g_build_fullname[];
@@ -17,5 +20,6 @@ extern const char g_title_bar_format_idle[];
 extern const char g_title_bar_format_running[];
 extern const char g_shader_cache_version[];
 extern const bool g_is_dev_build;
+extern const char g_compiler_id[];
 
 } // namespace Common
diff --git a/src/common/settings.h b/src/common/settings.h
index 047dfc800a..9d448a2b38 100644
--- a/src/common/settings.h
+++ b/src/common/settings.h
@@ -37,14 +37,14 @@ struct ResolutionScalingInfo {
         if (value == 0) {
             return 0;
         }
-        return std::max((value * static_cast<s32>(up_scale)) >> static_cast<s32>(down_shift), 1);
+        return (std::max)((value * static_cast<s32>(up_scale)) >> static_cast<s32>(down_shift), 1);
     }
 
     u32 ScaleUp(u32 value) const {
         if (value == 0U) {
             return 0U;
         }
-        return std::max((value * up_scale) >> down_shift, 1U);
+        return (std::max)((value * up_scale) >> down_shift, 1U);
     }
 };
 
@@ -612,8 +612,8 @@ struct Values {
                                       false,   true, &custom_rtc_enabled};
     SwitchableSetting<s64, true> custom_rtc_offset{linkage,
                                                    0,
-                                                   std::numeric_limits<int>::min(),
-                                                   std::numeric_limits<int>::max(),
+                                                   (std::numeric_limits<int>::min)(),
+                                                   (std::numeric_limits<int>::max)(),
                                                    "custom_rtc_offset",
                                                    Category::System,
                                                    Specialization::Countable,
diff --git a/src/common/settings_setting.h b/src/common/settings_setting.h
index 0b18ca5ecc..ce7a3e91a6 100644
--- a/src/common/settings_setting.h
+++ b/src/common/settings_setting.h
@@ -223,7 +223,7 @@ public:
         if constexpr (std::is_enum_v<Type>) {
             return EnumMetadata<Type>::Index();
         } else {
-            return std::numeric_limits<u32>::max();
+            return (std::numeric_limits<u32>::max)();
         }
     }
 
@@ -237,14 +237,14 @@ public:
 
     [[nodiscard]] std::string MinVal() const override final {
         if constexpr (std::is_arithmetic_v<Type> && !ranged) {
-            return this->ToString(std::numeric_limits<Type>::min());
+            return this->ToString((std::numeric_limits<Type>::min)());
         } else {
             return this->ToString(minimum);
         }
     }
     [[nodiscard]] std::string MaxVal() const override final {
         if constexpr (std::is_arithmetic_v<Type> && !ranged) {
-            return this->ToString(std::numeric_limits<Type>::max());
+            return this->ToString((std::numeric_limits<Type>::max)());
         } else {
             return this->ToString(maximum);
         }
diff --git a/src/common/slot_vector.h b/src/common/slot_vector.h
index 34ff7de941..8db4bba30b 100644
--- a/src/common/slot_vector.h
+++ b/src/common/slot_vector.h
@@ -17,7 +17,7 @@
 namespace Common {
 
 struct SlotId {
-    static constexpr u32 INVALID_INDEX = std::numeric_limits<u32>::max();
+    static constexpr u32 INVALID_INDEX = (std::numeric_limits<u32>::max)();
 
     constexpr auto operator<=>(const SlotId&) const noexcept = default;
 
diff --git a/src/common/thread.cpp b/src/common/thread.cpp
index 34cc1527bf..62a3115d5a 100644
--- a/src/common/thread.cpp
+++ b/src/common/thread.cpp
@@ -66,7 +66,7 @@ void SetCurrentThreadPriority(ThreadPriority new_priority) {
     const auto scheduling_type = SCHED_OTHER;
     s32 max_prio = sched_get_priority_max(scheduling_type);
     s32 min_prio = sched_get_priority_min(scheduling_type);
-    u32 level = std::max(static_cast<u32>(new_priority) + 1, 4U);
+    u32 level = (std::max)(static_cast<u32>(new_priority) + 1, 4U);
 
     struct sched_param params;
     if (max_prio > min_prio) {
@@ -101,7 +101,7 @@ void SetCurrentThreadName(const char* name) {
 #elif defined(__linux__)
     // Linux limits thread names to 15 characters and will outright reject any
     // attempt to set a longer name with ERANGE.
-    std::string truncated(name, std::min(strlen(name), static_cast<size_t>(15)));
+    std::string truncated(name, (std::min)(strlen(name), static_cast<size_t>(15)));
     if (int e = pthread_setname_np(pthread_self(), truncated.c_str())) {
         errno = e;
         LOG_ERROR(Common, "Failed to set thread name to '{}': {}", truncated, GetLastErrorMsg());
diff --git a/src/common/tiny_mt.h b/src/common/tiny_mt.h
index 5d5ebf158c..a757591c9b 100644
--- a/src/common/tiny_mt.h
+++ b/src/common/tiny_mt.h
@@ -124,7 +124,7 @@ public:
         this->state.data[3] = ParamTmat;
 
         {
-            const int num_init_iterations = std::max(seed_count + 1, MinimumInitIterations) - 1;
+            const int num_init_iterations = (std::max)(seed_count + 1, MinimumInitIterations) - 1;
 
             GenerateInitialValuePlus(&this->state, 0, seed_count);
 
diff --git a/src/common/uint128.h b/src/common/uint128.h
index f450a6db99..56433096fe 100644
--- a/src/common/uint128.h
+++ b/src/common/uint128.h
@@ -20,7 +20,7 @@ namespace Common {
 
 // This function multiplies 2 u64 values and divides it by a u64 value.
 [[nodiscard]] static inline u64 MultiplyAndDivide64(u64 a, u64 b, u64 d) {
-#ifdef _MSC_VER
+#if defined(_MSC_VER) && !defined(__clang__)
     u128 r{};
     r[0] = _umul128(a, b, &r[1]);
     u64 remainder;
@@ -41,7 +41,7 @@ namespace Common {
 // This function multiplies 2 u64 values and produces a u128 value;
 [[nodiscard]] static inline u128 Multiply64Into128(u64 a, u64 b) {
     u128 result;
-#ifdef _MSC_VER
+#if defined(_MSC_VER) && !defined(__clang__)
     result[0] = _umul128(a, b, &result[1]);
 #else
     unsigned __int128 tmp = a;
@@ -65,7 +65,7 @@ namespace Common {
 #endif
 #else
     // This one is bit more inaccurate.
-    return MultiplyAndDivide64(std::numeric_limits<u64>::max(), numerator, divisor);
+    return MultiplyAndDivide64((std::numeric_limits<u64>::max)(), numerator, divisor);
 #endif
 }
 
diff --git a/src/common/x64/cpu_wait.cpp b/src/common/x64/cpu_wait.cpp
index 41d385f598..b578d75ece 100644
--- a/src/common/x64/cpu_wait.cpp
+++ b/src/common/x64/cpu_wait.cpp
@@ -24,7 +24,7 @@ constexpr auto PauseCycles = 100'000U;
 
 } // Anonymous namespace
 
-#ifdef _MSC_VER
+#if defined(_MSC_VER) && !defined(__clang__)
 __forceinline static void TPAUSE() {
     static constexpr auto RequestC02State = 0U;
     _tpause(RequestC02State, FencedRDTSC() + PauseCycles);
diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
index 1979d427b5..0be60b55c6 100644
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -1187,6 +1187,7 @@ else()
         -Wno-cast-function-type
 
         $<$<CXX_COMPILER_ID:Clang>:-fsized-deallocation>
+        $<$<CXX_COMPILER_ID:Clang>:-Wno-cast-function-type-mismatch>
     )
 endif()
 
diff --git a/src/core/arm/debug.cpp b/src/core/arm/debug.cpp
index 854509463b..20f1ea00df 100644
--- a/src/core/arm/debug.cpp
+++ b/src/core/arm/debug.cpp
@@ -283,9 +283,9 @@ Loader::AppLoader::Modules FindModules(Kernel::KProcess* process) {
                     // Ignore leading directories.
                     char* path_pointer = module_path.path.data();
                     char* path_end =
-                        path_pointer + std::min(PathLengthMax, module_path.path_length);
+                        path_pointer + (std::min)(PathLengthMax, module_path.path_length);
 
-                    for (s32 i = 0; i < std::min(PathLengthMax, module_path.path_length) &&
+                    for (s32 i = 0; i < (std::min)(PathLengthMax, module_path.path_length) &&
                                     module_path.path[i] != '\0';
                          i++) {
                         if (module_path.path[i] == '/' || module_path.path[i] == '\\') {
diff --git a/src/core/arm/dynarmic/dynarmic_cp15.cpp b/src/core/arm/dynarmic/dynarmic_cp15.cpp
index c663adda19..0d5e5912ae 100644
--- a/src/core/arm/dynarmic/dynarmic_cp15.cpp
+++ b/src/core/arm/dynarmic/dynarmic_cp15.cpp
@@ -58,6 +58,8 @@ CallbackOrAccessOneWord DynarmicCP15::CompileSendOneWord(bool two, unsigned opc1
                     _mm_lfence();
 #elif defined(ARCHITECTURE_x86_64)
                     asm volatile("mfence\n\tlfence\n\t" : : : "memory");
+#elif defined(_MSC_VER) && defined(ARCHITECTURE_arm64)
+                    __dsb(_ARM64_BARRIER_SY); // MSVC has no inline asm; same as "dsb sy"
 #elif defined(ARCHITECTURE_arm64)
                     asm volatile("dsb sy\n\t" : : : "memory");
 #else
@@ -75,6 +77,8 @@ CallbackOrAccessOneWord DynarmicCP15::CompileSendOneWord(bool two, unsigned opc1
                     _mm_mfence();
 #elif defined(ARCHITECTURE_x86_64)
                     asm volatile("mfence\n\t" : : : "memory");
+#elif defined(_MSC_VER) && defined(ARCHITECTURE_arm64)
+                    __dmb(_ARM64_BARRIER_SY); // MSVC has no inline asm; same as "dmb sy"
 #elif defined(ARCHITECTURE_arm64)
                     asm volatile("dmb sy\n\t" : : : "memory");
 #else
diff --git a/src/core/arm/nce/interpreter_visitor.h b/src/core/arm/nce/interpreter_visitor.h
index f90d876abb..9dfbdb2fe9 100644
--- a/src/core/arm/nce/interpreter_visitor.h
+++ b/src/core/arm/nce/interpreter_visitor.h
@@ -4,9 +4,14 @@
 
 #pragma once
 
+#include <atomic>
+#include <span>
 #include <signal.h>
 #include <unistd.h>
 
+#include "common/logging/log.h"
 #include "core/arm/nce/visitor_base.h"
+#include "core/hle/kernel/k_thread.h"
+#include "core/memory.h"
 
 namespace Core {
diff --git a/src/core/arm/nce/patcher.cpp b/src/core/arm/nce/patcher.cpp
index b8387ce7cb..9321258ae9 100644
--- a/src/core/arm/nce/patcher.cpp
+++ b/src/core/arm/nce/patcher.cpp
@@ -11,6 +11,8 @@
 #include "core/core.h"
 #include "core/core_timing.h"
+#include "core/hle/kernel/k_thread.h"
 #include "core/hle/kernel/svc.h"
+#include "core/memory.h"
 
 namespace Core::NCE {
 
diff --git a/src/core/core.cpp b/src/core/core.cpp
index c2852e66f0..7315f35e0c 100644
--- a/src/core/core.cpp
+++ b/src/core/core.cpp
@@ -185,7 +185,7 @@ struct System::Impl {
 
         Service::PSC::Time::LocationName name{};
         auto new_name = Settings::GetTimeZoneString(Settings::values.time_zone_index.GetValue());
-        std::memcpy(name.data(), new_name.data(), std::min(name.size(), new_name.size()));
+        std::memcpy(name.data(), new_name.data(), (std::min)(name.size(), new_name.size()));
 
         timezone_service->SetDeviceLocationName(name);
 
diff --git a/src/core/crypto/xts_encryption_layer.cpp b/src/core/crypto/xts_encryption_layer.cpp
index b60303412b..34e58463de 100644
--- a/src/core/crypto/xts_encryption_layer.cpp
+++ b/src/core/crypto/xts_encryption_layer.cpp
@@ -34,8 +34,8 @@ std::size_t XTSEncryptionLayer::Read(u8* data, std::size_t length, std::size_t o
             buffer.resize(XTS_SECTOR_SIZE);
         cipher.XTSTranscode(buffer.data(), buffer.size(), buffer.data(), offset / XTS_SECTOR_SIZE,
                             XTS_SECTOR_SIZE, Op::Decrypt);
-        std::memcpy(data, buffer.data(), std::min(buffer.size(), length));
-        return std::min(buffer.size(), length);
+        std::memcpy(data, buffer.data(), (std::min)(buffer.size(), length));
+        return (std::min)(buffer.size(), length);
     }
 
     // offset does not fall on block boundary (0x4000)
diff --git a/src/core/debugger/gdbstub.cpp b/src/core/debugger/gdbstub.cpp
index fcb5787147..5c3c045b3c 100644
--- a/src/core/debugger/gdbstub.cpp
+++ b/src/core/debugger/gdbstub.cpp
@@ -664,7 +664,7 @@ void GDBStub::HandleRcmd(const std::vector<u8>& command) {
 
             if (svc_mem_info.state != Kernel::Svc::MemoryState::Inaccessible ||
                 svc_mem_info.base_address + svc_mem_info.size - 1 !=
-                    std::numeric_limits<u64>::max()) {
+                    (std::numeric_limits<u64>::max)()) {
                 const char* state = GetMemoryStateName(svc_mem_info.state);
                 const char* perm = GetMemoryPermissionString(svc_mem_info);
                 const char l = True(svc_mem_info.attribute & MemoryAttribute::Locked) ? 'L' : '-';
@@ -710,7 +710,7 @@ std::vector<char>::const_iterator GDBStub::CommandEnd() const {
     const auto end{std::find(current_command.begin(), current_command.end(), GDB_STUB_END)};
 
     // Require the checksum to be present
-    return std::min(end + 2, current_command.end());
+    return current_command.end() - end >= 2 ? end + 2 : current_command.end();
 }
 
 std::optional<std::string> GDBStub::DetachCommand() {
diff --git a/src/core/debugger/gdbstub_arch.cpp b/src/core/debugger/gdbstub_arch.cpp
index 452f565bec..ee7108376a 100644
--- a/src/core/debugger/gdbstub_arch.cpp
+++ b/src/core/debugger/gdbstub_arch.cpp
@@ -12,7 +12,7 @@ static T HexToValue(std::string_view hex) {
     static_assert(std::is_trivially_copyable_v<T>);
     T value{};
     const auto mem{Common::HexStringToVector(hex, false)};
-    std::memcpy(&value, mem.data(), std::min(mem.size(), sizeof(T)));
+    std::memcpy(&value, mem.data(), (std::min)(mem.size(), sizeof(T)));
     return value;
 }
 
diff --git a/src/core/device_memory_manager.inc b/src/core/device_memory_manager.inc
index f104d495bb..52dff5df9a 100644
--- a/src/core/device_memory_manager.inc
+++ b/src/core/device_memory_manager.inc
@@ -388,7 +388,7 @@ void DeviceMemoryManager<Traits>::WalkBlock(DAddr addr, std::size_t size, auto o
     while (remaining_size) {
         const size_t next_pages = static_cast<std::size_t>(continuity_tracker[page_index]);
         const std::size_t copy_amount =
-            std::min((next_pages << Memory::YUZU_PAGEBITS) - page_offset, remaining_size);
+            (std::min)((next_pages << Memory::YUZU_PAGEBITS) - page_offset, remaining_size);
         const auto current_vaddr =
             static_cast<u64>((page_index << Memory::YUZU_PAGEBITS) + page_offset);
         SCOPE_EXIT{
diff --git a/src/core/file_sys/fs_path_utility.h b/src/core/file_sys/fs_path_utility.h
index cdfd8c7729..3af23b0bba 100644
--- a/src/core/file_sys/fs_path_utility.h
+++ b/src/core/file_sys/fs_path_utility.h
@@ -683,7 +683,7 @@ public:
         const auto max_mount_len =
             out_mount_name_buffer_size == 0
                 ? MountNameLengthMax + 1
-                : std::min(MountNameLengthMax + 1, out_mount_name_buffer_size);
+                : (std::min)(MountNameLengthMax + 1, out_mount_name_buffer_size);
 
         // Parse the path until we see a drive separator
         size_t mount_len = 0;
diff --git a/src/core/file_sys/fsa/fs_i_directory.h b/src/core/file_sys/fsa/fs_i_directory.h
index c8e895eab0..a4adcd2beb 100644
--- a/src/core/file_sys/fsa/fs_i_directory.h
+++ b/src/core/file_sys/fsa/fs_i_directory.h
@@ -48,7 +48,7 @@ public:
 private:
     Result DoRead(s64* out_count, DirectoryEntry* out_entries, s64 max_entries) {
         const u64 actual_entries =
-            std::min(static_cast<u64>(max_entries), entries.size() - next_entry_index);
+            (std::min)(static_cast<u64>(max_entries), entries.size() - next_entry_index);
         const auto* begin = reinterpret_cast<u8*>(entries.data() + next_entry_index);
         const auto* end = reinterpret_cast<u8*>(entries.data() + next_entry_index + actual_entries);
         const auto range_size = static_cast<std::size_t>(std::distance(begin, end));
diff --git a/src/core/file_sys/fsa/fs_i_file.h b/src/core/file_sys/fsa/fs_i_file.h
index 1188ae8ca7..99468ef0e2 100644
--- a/src/core/file_sys/fsa/fs_i_file.h
+++ b/src/core/file_sys/fsa/fs_i_file.h
@@ -93,7 +93,7 @@ protected:
         R_TRY(this->DoGetSize(std::addressof(file_size)));
         R_UNLESS(offset <= file_size, ResultOutOfRange);
 
-        *out = static_cast<size_t>(std::min(file_size - offset, static_cast<s64>(size)));
+        *out = static_cast<size_t>((std::min)(file_size - offset, static_cast<s64>(size)));
         R_SUCCEED();
     }
 
diff --git a/src/core/file_sys/fssystem/fssystem_aes_ctr_counter_extended_storage.cpp b/src/core/file_sys/fssystem/fssystem_aes_ctr_counter_extended_storage.cpp
index bc1cddbb0c..c9fb5f64d6 100644
--- a/src/core/file_sys/fssystem/fssystem_aes_ctr_counter_extended_storage.cpp
+++ b/src/core/file_sys/fssystem/fssystem_aes_ctr_counter_extended_storage.cpp
@@ -213,7 +213,7 @@ size_t AesCtrCounterExtendedStorage::Read(u8* buffer, size_t size, size_t offset
 
         // Determine how much is left.
         const auto remaining_size = end_offset - cur_offset;
-        const auto cur_size = static_cast<size_t>(std::min(remaining_size, data_size));
+        const auto cur_size = static_cast<size_t>((std::min)(remaining_size, data_size));
         ASSERT(cur_size <= size);
 
         // If necessary, perform decryption.
diff --git a/src/core/file_sys/fssystem/fssystem_aes_ctr_storage.cpp b/src/core/file_sys/fssystem/fssystem_aes_ctr_storage.cpp
index b65aca18d9..c18fde18f4 100644
--- a/src/core/file_sys/fssystem/fssystem_aes_ctr_storage.cpp
+++ b/src/core/file_sys/fssystem/fssystem_aes_ctr_storage.cpp
@@ -94,7 +94,7 @@ size_t AesCtrStorage::Write(const u8* buffer, size_t size, size_t offset) {
     while (remaining > 0) {
         // Determine data we're writing and where.
         const size_t write_size =
-            use_work_buffer ? std::min(pooled_buffer.GetSize(), remaining) : remaining;
+            use_work_buffer ? (std::min)(pooled_buffer.GetSize(), remaining) : remaining;
 
         void* write_buf;
         if (use_work_buffer) {
diff --git a/src/core/file_sys/fssystem/fssystem_aes_xts_storage.cpp b/src/core/file_sys/fssystem/fssystem_aes_xts_storage.cpp
index efc5aa0b12..5ef2544dfb 100644
--- a/src/core/file_sys/fssystem/fssystem_aes_xts_storage.cpp
+++ b/src/core/file_sys/fssystem/fssystem_aes_xts_storage.cpp
@@ -65,7 +65,7 @@ size_t AesXtsStorage::Read(u8* buffer, size_t size, size_t offset) const {
         // Determine the size of the pre-data read.
         const size_t skip_size =
             static_cast<size_t>(offset - Common::AlignDown(offset, m_block_size));
-        const size_t data_size = std::min(size, m_block_size - skip_size);
+        const size_t data_size = (std::min)(size, m_block_size - skip_size);
 
         // Decrypt into a pooled buffer.
         {
@@ -84,14 +84,14 @@ size_t AesXtsStorage::Read(u8* buffer, size_t size, size_t offset) const {
 
         AddCounter(ctr.data(), IvSize, 1);
         processed_size += data_size;
-        ASSERT(processed_size == std::min(size, m_block_size - skip_size));
+        ASSERT(processed_size == (std::min)(size, m_block_size - skip_size));
     }
 
     // Decrypt aligned chunks.
     char* cur = reinterpret_cast<char*>(buffer) + processed_size;
     size_t remaining = size - processed_size;
     while (remaining > 0) {
-        const size_t cur_size = std::min(m_block_size, remaining);
+        const size_t cur_size = (std::min)(m_block_size, remaining);
 
         m_cipher->SetIV(ctr);
         m_cipher->Transcode(cur, cur_size, cur, Core::Crypto::Op::Decrypt);
diff --git a/src/core/file_sys/fssystem/fssystem_alignment_matching_storage_impl.cpp b/src/core/file_sys/fssystem/fssystem_alignment_matching_storage_impl.cpp
index 641c888aed..08b77d790a 100644
--- a/src/core/file_sys/fssystem/fssystem_alignment_matching_storage_impl.cpp
+++ b/src/core/file_sys/fssystem/fssystem_alignment_matching_storage_impl.cpp
@@ -104,7 +104,7 @@ size_t AlignmentMatchingStorageImpl::Read(VirtualFile base_storage, char* work_b
     while (remaining_tail_size > 0) {
         const auto aligned_tail_offset = Common::AlignDown(tail_offset, data_alignment);
         const auto cur_size =
-            std::min(static_cast<size_t>(aligned_tail_offset + data_alignment - tail_offset),
+            (std::min)(static_cast<size_t>(aligned_tail_offset + data_alignment - tail_offset),
                      remaining_tail_size);
         base_storage->Read(reinterpret_cast<u8*>(work_buf), data_alignment, aligned_tail_offset);
 
@@ -186,7 +186,7 @@ size_t AlignmentMatchingStorageImpl::Write(VirtualFile base_storage, char* work_
 
         const auto aligned_tail_offset = Common::AlignDown(tail_offset, data_alignment);
         const auto cur_size =
-            std::min(static_cast<size_t>(aligned_tail_offset + data_alignment - tail_offset),
+            (std::min)(static_cast<size_t>(aligned_tail_offset + data_alignment - tail_offset),
                      remaining_tail_size);
 
         base_storage->Read(reinterpret_cast<u8*>(work_buf), data_alignment, aligned_tail_offset);
diff --git a/src/core/file_sys/fssystem/fssystem_crypto_configuration.cpp b/src/core/file_sys/fssystem/fssystem_crypto_configuration.cpp
index a4f0cde281..0a9f28975b 100644
--- a/src/core/file_sys/fssystem/fssystem_crypto_configuration.cpp
+++ b/src/core/file_sys/fssystem/fssystem_crypto_configuration.cpp
@@ -29,12 +29,12 @@ void GenerateKey(void* dst_key, size_t dst_key_size, const void* src_key, size_t
         key_type == static_cast<s32>(KeyType::NcaHeaderKey2)) {
         const s32 key_index = static_cast<s32>(KeyType::NcaHeaderKey2) == key_type;
         const auto key = instance.GetKey(Core::Crypto::S256KeyType::Header);
-        std::memcpy(dst_key, key.data() + key_index * 0x10, std::min(dst_key_size, key.size() / 2));
+        std::memcpy(dst_key, key.data() + key_index * 0x10, (std::min)(dst_key_size, key.size() / 2));
         return;
     }
 
     const s32 key_generation =
-        std::max(key_type / NcaCryptoConfiguration::KeyAreaEncryptionKeyIndexCount, 1) - 1;
+        (std::max)(key_type / NcaCryptoConfiguration::KeyAreaEncryptionKeyIndexCount, 1) - 1;
     const s32 key_index = key_type % NcaCryptoConfiguration::KeyAreaEncryptionKeyIndexCount;
 
     Core::Crypto::AESCipher<Core::Crypto::Key128> cipher(
diff --git a/src/core/file_sys/fssystem/fssystem_integrity_verification_storage.cpp b/src/core/file_sys/fssystem/fssystem_integrity_verification_storage.cpp
index 046571e9ef..57cdc19248 100644
--- a/src/core/file_sys/fssystem/fssystem_integrity_verification_storage.cpp
+++ b/src/core/file_sys/fssystem/fssystem_integrity_verification_storage.cpp
@@ -34,7 +34,7 @@ void IntegrityVerificationStorage::Initialize(VirtualFile hs,
     ASSERT(m_verification_block_size == 1ll << m_verification_block_order);
 
     // Set upper layer block sizes.
-    upper_layer_verif_block_size = std::max(upper_layer_verif_block_size, HashSize);
+    upper_layer_verif_block_size = (std::max)(upper_layer_verif_block_size, HashSize);
     m_upper_layer_verification_block_size = upper_layer_verif_block_size;
     m_upper_layer_verification_block_order = ILog2(static_cast<u32>(upper_layer_verif_block_size));
     ASSERT(m_upper_layer_verification_block_size == 1ll << m_upper_layer_verification_block_order);
diff --git a/src/core/file_sys/fssystem/fssystem_nca_header.cpp b/src/core/file_sys/fssystem/fssystem_nca_header.cpp
index cef0f0bb94..2226c087c0 100644
--- a/src/core/file_sys/fssystem/fssystem_nca_header.cpp
+++ b/src/core/file_sys/fssystem/fssystem_nca_header.cpp
@@ -9,7 +9,7 @@
 namespace FileSys {
 
 u8 NcaHeader::GetProperKeyGeneration() const {
-    return std::max(this->key_generation, this->key_generation_2);
+    return (std::max)(this->key_generation, this->key_generation_2);
 }
 
 bool NcaPatchInfo::HasIndirectTable() const {
diff --git a/src/core/file_sys/fssystem/fssystem_pooled_buffer.cpp b/src/core/file_sys/fssystem/fssystem_pooled_buffer.cpp
index bbfaab2557..dcd08dac3e 100644
--- a/src/core/file_sys/fssystem/fssystem_pooled_buffer.cpp
+++ b/src/core/file_sys/fssystem/fssystem_pooled_buffer.cpp
@@ -34,7 +34,7 @@ void PooledBuffer::AllocateCore(size_t ideal_size, size_t required_size, bool la
     ASSERT(required_size <= GetAllocatableSizeMaxCore(large));
 
     const size_t target_size =
-        std::min(std::max(ideal_size, required_size), GetAllocatableSizeMaxCore(large));
+        (std::min)((std::max)(ideal_size, required_size), GetAllocatableSizeMaxCore(large));
 
     // Dummy implementation for allocate.
     if (target_size > 0) {
diff --git a/src/core/file_sys/fssystem/fssystem_sparse_storage.h b/src/core/file_sys/fssystem/fssystem_sparse_storage.h
index 6c196ec611..1cc7e7b1eb 100644
--- a/src/core/file_sys/fssystem/fssystem_sparse_storage.h
+++ b/src/core/file_sys/fssystem/fssystem_sparse_storage.h
@@ -18,7 +18,7 @@ private:
         virtual ~ZeroStorage() {}
 
         virtual size_t GetSize() const override {
-            return std::numeric_limits<size_t>::max();
+            return (std::numeric_limits<size_t>::max)();
         }
 
         virtual size_t Read(u8* buffer, size_t size, size_t offset) const override {
@@ -62,7 +62,7 @@ public:
 
 private:
     void SetZeroStorage() {
-        return this->SetStorage(1, m_zero_storage, 0, std::numeric_limits<s64>::max());
+        return this->SetStorage(1, m_zero_storage, 0, (std::numeric_limits<s64>::max)());
     }
 
 private:
diff --git a/src/core/file_sys/nca_metadata.cpp b/src/core/file_sys/nca_metadata.cpp
index 9e855c50d1..55ea4d0803 100644
--- a/src/core/file_sys/nca_metadata.cpp
+++ b/src/core/file_sys/nca_metadata.cpp
@@ -102,7 +102,7 @@ std::vector<u8> CNMT::Serialize() const {
         header.type >= TitleType::Application && header.type <= TitleType::AOC;
     const auto dead_zone = header.table_offset + sizeof(CNMTHeader);
     std::vector<u8> out(
-        std::max(sizeof(CNMTHeader) + (has_opt_header ? sizeof(OptionalHeader) : 0), dead_zone) +
+        (std::max)(sizeof(CNMTHeader) + (has_opt_header ? sizeof(OptionalHeader) : 0), dead_zone) +
         content_records.size() * sizeof(ContentRecord) + meta_records.size() * sizeof(MetaRecord));
     memcpy(out.data(), &header, sizeof(CNMTHeader));
 
diff --git a/src/core/file_sys/registered_cache.cpp b/src/core/file_sys/registered_cache.cpp
index 85d30543c1..cb2089e9b3 100644
--- a/src/core/file_sys/registered_cache.cpp
+++ b/src/core/file_sys/registered_cache.cpp
@@ -273,7 +273,7 @@ std::vector<NcaID> PlaceholderCache::List() const {
 NcaID PlaceholderCache::Generate() {
     std::random_device device;
     std::mt19937 gen(device());
-    std::uniform_int_distribution<u64> distribution(1, std::numeric_limits<u64>::max());
+    std::uniform_int_distribution<u64> distribution(1, (std::numeric_limits<u64>::max)());
 
     NcaID out{};
 
diff --git a/src/core/file_sys/romfs.cpp b/src/core/file_sys/romfs.cpp
index a2b2809734..fee75f9de6 100644
--- a/src/core/file_sys/romfs.cpp
+++ b/src/core/file_sys/romfs.cpp
@@ -75,7 +75,7 @@ std::pair<EntryType, std::string> GetEntry(const RomFSTraversalContext& ctx, siz
     }
     std::memcpy(&entry, data + offset, sizeof(EntryType));
 
-    const size_t name_length = std::min(entry_end + entry.name_length, size) - entry_end;
+    const size_t name_length = (std::min)(entry_end + entry.name_length, size) - entry_end;
     std::string name(reinterpret_cast<const char*>(data + entry_end), name_length);
 
     return {entry, std::move(name)};
diff --git a/src/core/file_sys/vfs/vfs.cpp b/src/core/file_sys/vfs/vfs.cpp
index a04292760f..2be7084209 100644
--- a/src/core/file_sys/vfs/vfs.cpp
+++ b/src/core/file_sys/vfs/vfs.cpp
@@ -507,9 +507,9 @@ bool VfsRawCopy(const VirtualFile& src, const VirtualFile& dest, std::size_t blo
     if (!dest->Resize(src->GetSize()))
         return false;
 
-    std::vector<u8> temp(std::min(block_size, src->GetSize()));
+    std::vector<u8> temp((std::min)(block_size, src->GetSize()));
     for (std::size_t i = 0; i < src->GetSize(); i += block_size) {
-        const auto read = std::min(block_size, src->GetSize() - i);
+        const auto read = (std::min)(block_size, src->GetSize() - i);
 
         if (src->Read(temp.data(), read, i) != read) {
             return false;
diff --git a/src/core/file_sys/vfs/vfs_static.h b/src/core/file_sys/vfs/vfs_static.h
index bb53560ac7..6dc4ef8fbf 100644
--- a/src/core/file_sys/vfs/vfs_static.h
+++ b/src/core/file_sys/vfs/vfs_static.h
@@ -43,7 +43,7 @@ public:
     }
 
     std::size_t Read(u8* data, std::size_t length, std::size_t offset) const override {
-        const auto read = std::min(length, size - offset);
+        const auto read = (std::min)(length, size - offset);
         std::fill(data, data + read, value);
         return read;
     }
@@ -61,7 +61,7 @@ public:
     }
 
     std::vector<u8> ReadBytes(std::size_t length, std::size_t offset) const override {
-        const auto read = std::min(length, size - offset);
+        const auto read = (std::min)(length, size - offset);
         return std::vector<u8>(read, value);
     }
 
diff --git a/src/core/file_sys/vfs/vfs_vector.cpp b/src/core/file_sys/vfs/vfs_vector.cpp
index 0d54461c8f..7576a023cf 100644
--- a/src/core/file_sys/vfs/vfs_vector.cpp
+++ b/src/core/file_sys/vfs/vfs_vector.cpp
@@ -37,7 +37,7 @@ bool VectorVfsFile::IsReadable() const {
 }
 
 std::size_t VectorVfsFile::Read(u8* data_, std::size_t length, std::size_t offset) const {
-    const auto read = std::min(length, data.size() - offset);
+    const auto read = (std::min)(length, data.size() - offset);
     std::memcpy(data_, data.data() + offset, read);
     return read;
 }
@@ -45,7 +45,7 @@ std::size_t VectorVfsFile::Read(u8* data_, std::size_t length, std::size_t offse
 std::size_t VectorVfsFile::Write(const u8* data_, std::size_t length, std::size_t offset) {
     if (offset + length > data.size())
         data.resize(offset + length);
-    const auto write = std::min(length, data.size() - offset);
+    const auto write = (std::min)(length, data.size() - offset);
     std::memcpy(data.data() + offset, data_, write);
     return write;
 }
diff --git a/src/core/file_sys/vfs/vfs_vector.h b/src/core/file_sys/vfs/vfs_vector.h
index 587187dd26..27f2c03ca7 100644
--- a/src/core/file_sys/vfs/vfs_vector.h
+++ b/src/core/file_sys/vfs/vfs_vector.h
@@ -45,7 +45,7 @@ public:
     }
 
     std::size_t Read(u8* data_, std::size_t length, std::size_t offset) const override {
-        const auto read = std::min(length, size - offset);
+        const auto read = (std::min)(length, size - offset);
         std::memcpy(data_, data.data() + offset, read);
         return read;
     }
diff --git a/src/core/frontend/emu_window.cpp b/src/core/frontend/emu_window.cpp
index d1f1ca8c97..ec5cec8fa0 100644
--- a/src/core/frontend/emu_window.cpp
+++ b/src/core/frontend/emu_window.cpp
@@ -28,11 +28,11 @@ std::pair<f32, f32> EmuWindow::MapToTouchScreen(u32 framebuffer_x, u32 framebuff
 }
 
 std::pair<u32, u32> EmuWindow::ClipToTouchScreen(u32 new_x, u32 new_y) const {
-    new_x = std::max(new_x, framebuffer_layout.screen.left);
-    new_x = std::min(new_x, framebuffer_layout.screen.right - 1);
+    new_x = (std::max)(new_x, framebuffer_layout.screen.left);
+    new_x = (std::min)(new_x, framebuffer_layout.screen.right - 1);
 
-    new_y = std::max(new_y, framebuffer_layout.screen.top);
-    new_y = std::min(new_y, framebuffer_layout.screen.bottom - 1);
+    new_y = (std::max)(new_y, framebuffer_layout.screen.top);
+    new_y = (std::min)(new_y, framebuffer_layout.screen.bottom - 1);
 
     return std::make_pair(new_x, new_y);
 }
diff --git a/src/core/frontend/framebuffer_layout.cpp b/src/core/frontend/framebuffer_layout.cpp
index 2590b20da4..3de975c20f 100644
--- a/src/core/frontend/framebuffer_layout.cpp
+++ b/src/core/frontend/framebuffer_layout.cpp
@@ -14,7 +14,7 @@ namespace Layout {
 template <class T>
 static Common::Rectangle<T> MaxRectangle(Common::Rectangle<T> window_area,
                                          float screen_aspect_ratio) {
-    const float scale = std::min(static_cast<float>(window_area.GetWidth()),
+    const float scale = (std::min)(static_cast<float>(window_area.GetWidth()),
                                  static_cast<float>(window_area.GetHeight()) / screen_aspect_ratio);
     return Common::Rectangle<T>{0, 0, static_cast<T>(std::round(scale)),
                                 static_cast<T>(std::round(scale * screen_aspect_ratio))};
diff --git a/src/core/hle/kernel/board/nintendo/nx/k_memory_layout.cpp b/src/core/hle/kernel/board/nintendo/nx/k_memory_layout.cpp
index 24eb3f8866..fa918ff204 100644
--- a/src/core/hle/kernel/board/nintendo/nx/k_memory_layout.cpp
+++ b/src/core/hle/kernel/board/nintendo/nx/k_memory_layout.cpp
@@ -133,7 +133,7 @@ void SetupPoolPartitionMemoryRegions(KMemoryLayout& memory_layout) {
     // Decide on starting addresses for our pools.
     const u64 application_pool_start = pool_end - application_pool_size;
     const u64 applet_pool_start = application_pool_start - applet_pool_size;
-    const u64 unsafe_system_pool_start = std::min(
+    const u64 unsafe_system_pool_start = (std::min)(
         kernel_dram_start + CarveoutSizeMax,
         Common::AlignDown(applet_pool_start - unsafe_system_pool_min_size, CarveoutAlignment));
     const size_t unsafe_system_pool_size = applet_pool_start - unsafe_system_pool_start;
diff --git a/src/core/hle/kernel/board/nintendo/nx/k_system_control.cpp b/src/core/hle/kernel/board/nintendo/nx/k_system_control.cpp
index f62f3e4767..db654d730d 100644
--- a/src/core/hle/kernel/board/nintendo/nx/k_system_control.cpp
+++ b/src/core/hle/kernel/board/nintendo/nx/k_system_control.cpp
@@ -182,13 +182,13 @@ namespace {
 template <typename F>
 u64 GenerateUniformRange(u64 min, u64 max, F f) {
     // Handle the case where the difference is too large to represent.
-    if (max == std::numeric_limits<u64>::max() && min == std::numeric_limits<u64>::min()) {
+    if (max == (std::numeric_limits<u64>::max)() && min == (std::numeric_limits<u64>::min)()) {
         return f();
     }
 
     // Iterate until we get a value in range.
     const u64 range_size = ((max + 1) - min);
-    const u64 effective_max = (std::numeric_limits<u64>::max() / range_size) * range_size;
+    const u64 effective_max = ((std::numeric_limits<u64>::max)() / range_size) * range_size;
     while (true) {
         if (const u64 rnd = f(); rnd < effective_max) {
             return min + (rnd % range_size);
@@ -201,7 +201,7 @@ u64 GenerateUniformRange(u64 min, u64 max, F f) {
 u64 KSystemControl::GenerateRandomU64() {
     std::random_device device;
     std::mt19937 gen(device());
-    std::uniform_int_distribution<u64> distribution(1, std::numeric_limits<u64>::max());
+    std::uniform_int_distribution<u64> distribution(1, (std::numeric_limits<u64>::max)());
     return distribution(gen);
 }
 
diff --git a/src/core/hle/kernel/k_dynamic_page_manager.h b/src/core/hle/kernel/k_dynamic_page_manager.h
index ad11e84b71..2357fe0f4d 100644
--- a/src/core/hle/kernel/k_dynamic_page_manager.h
+++ b/src/core/hle/kernel/k_dynamic_page_manager.h
@@ -110,7 +110,7 @@ public:
 
         // Update our tracking.
         m_page_bitmap.ClearBit(offset);
-        m_peak = std::max(m_peak, (++m_used));
+        m_peak = (std::max)(m_peak, (++m_used));
 
         return GetPointer<PageBuffer>(m_aligned_address) + offset;
     }
@@ -131,7 +131,7 @@ public:
         // Update our tracking.
         m_page_bitmap.ClearRange(offset, count);
         m_used += count;
-        m_peak = std::max(m_peak, m_used);
+        m_peak = (std::max)(m_peak, m_used);
 
         return GetPointer<PageBuffer>(m_aligned_address) + offset;
     }
diff --git a/src/core/hle/kernel/k_handle_table.h b/src/core/hle/kernel/k_handle_table.h
index 1bf68e6b04..22fdc7e47a 100644
--- a/src/core/hle/kernel/k_handle_table.h
+++ b/src/core/hle/kernel/k_handle_table.h
@@ -179,7 +179,7 @@ private:
 
         m_free_head_index = m_entry_infos[index].GetNextFreeIndex();
 
-        m_max_count = std::max(m_max_count, ++m_count);
+        m_max_count = (std::max)(m_max_count, ++m_count);
 
         return index;
     }
diff --git a/src/core/hle/kernel/k_hardware_timer.cpp b/src/core/hle/kernel/k_hardware_timer.cpp
index 4e947dd6bc..f3098a59e0 100644
--- a/src/core/hle/kernel/k_hardware_timer.cpp
+++ b/src/core/hle/kernel/k_hardware_timer.cpp
@@ -19,7 +19,7 @@ void KHardwareTimer::Initialize() {
 
 void KHardwareTimer::Finalize() {
     m_kernel.System().CoreTiming().UnscheduleEvent(m_event_type);
-    m_wakeup_time = std::numeric_limits<s64>::max();
+    m_wakeup_time = (std::numeric_limits<s64>::max)();
     m_event_type.reset();
 }
 
@@ -37,7 +37,7 @@ void KHardwareTimer::DoTask() {
         // Disable the timer interrupt while we handle this.
         // Not necessary due to core timing already having popped this event to call it.
         // this->DisableInterrupt();
-        m_wakeup_time = std::numeric_limits<s64>::max();
+        m_wakeup_time = (std::numeric_limits<s64>::max)();
 
         if (const s64 next_time = this->DoInterruptTaskImpl(GetTick());
             0 < next_time && next_time <= m_wakeup_time) {
@@ -63,7 +63,7 @@ void KHardwareTimer::EnableInterrupt(s64 wakeup_time) {
 void KHardwareTimer::DisableInterrupt() {
     m_kernel.System().CoreTiming().UnscheduleEvent(m_event_type,
                                                    Core::Timing::UnscheduleEventType::NoWait);
-    m_wakeup_time = std::numeric_limits<s64>::max();
+    m_wakeup_time = (std::numeric_limits<s64>::max)();
 }
 
 s64 KHardwareTimer::GetTick() const {
@@ -71,7 +71,7 @@ s64 KHardwareTimer::GetTick() const {
 }
 
 bool KHardwareTimer::GetInterruptEnabled() {
-    return m_wakeup_time != std::numeric_limits<s64>::max();
+    return m_wakeup_time != (std::numeric_limits<s64>::max)();
 }
 
 } // namespace Kernel
diff --git a/src/core/hle/kernel/k_hardware_timer.h b/src/core/hle/kernel/k_hardware_timer.h
index 27f43cd194..cb83e9c5b5 100644
--- a/src/core/hle/kernel/k_hardware_timer.h
+++ b/src/core/hle/kernel/k_hardware_timer.h
@@ -40,7 +40,7 @@ private:
 
 private:
     // Absolute time in nanoseconds
-    s64 m_wakeup_time{std::numeric_limits<s64>::max()};
+    s64 m_wakeup_time{(std::numeric_limits<s64>::max)()};
     std::shared_ptr<Core::Timing::EventType> m_event_type{};
 };
 
diff --git a/src/core/hle/kernel/k_light_server_session.cpp b/src/core/hle/kernel/k_light_server_session.cpp
index e5ceb01f2a..5ea448b998 100644
--- a/src/core/hle/kernel/k_light_server_session.cpp
+++ b/src/core/hle/kernel/k_light_server_session.cpp
@@ -11,7 +11,7 @@ namespace Kernel {
 
 namespace {
 
-constexpr u64 InvalidThreadId = std::numeric_limits<u64>::max();
+constexpr u64 InvalidThreadId = (std::numeric_limits<u64>::max)();
 
 class ThreadQueueImplForKLightServerSessionRequest final : public KThreadQueue {
 private:
diff --git a/src/core/hle/kernel/k_light_server_session.h b/src/core/hle/kernel/k_light_server_session.h
index 8eca3eab69..87ec9db016 100644
--- a/src/core/hle/kernel/k_light_server_session.h
+++ b/src/core/hle/kernel/k_light_server_session.h
@@ -19,7 +19,7 @@ private:
     KLightSession* m_parent{};
     KThread::WaiterList m_request_list{};
     KThread* m_current_request{};
-    u64 m_server_thread_id{std::numeric_limits<u64>::max()};
+    u64 m_server_thread_id{(std::numeric_limits<u64>::max)()};
     KThread* m_server_thread{};
 
 public:
diff --git a/src/core/hle/kernel/k_memory_block.h b/src/core/hle/kernel/k_memory_block.h
index d2b7e9a66e..acf48cb757 100644
--- a/src/core/hle/kernel/k_memory_block.h
+++ b/src/core/hle/kernel/k_memory_block.h
@@ -551,7 +551,7 @@ public:
         }
 
         m_device_disable_merge_left_count =
-            std::min(m_device_disable_merge_left_count, m_device_use_count);
+            (std::min)(m_device_disable_merge_left_count, m_device_use_count);
 
         if (m_device_disable_merge_left_count == 0) {
             m_disable_merge_attribute = static_cast<KMemoryBlockDisableMergeAttribute>(
diff --git a/src/core/hle/kernel/k_memory_layout.cpp b/src/core/hle/kernel/k_memory_layout.cpp
index bec7146688..6821f4c07e 100644
--- a/src/core/hle/kernel/k_memory_layout.cpp
+++ b/src/core/hle/kernel/k_memory_layout.cpp
@@ -66,7 +66,7 @@ bool KMemoryRegionTree::Insert(u64 address, size_t size, u32 type_id, u32 new_at
         this->insert(*found);
 
         // Insert a new region for the split.
-        const u64 new_pair = (old_pair != std::numeric_limits<u64>::max())
+        const u64 new_pair = (old_pair != (std::numeric_limits<u64>::max)())
                                  ? old_pair + (address - old_address)
                                  : old_pair;
         this->insert(*AllocateRegion(m_memory_region_allocator, address, inserted_region_last,
@@ -75,7 +75,7 @@ bool KMemoryRegionTree::Insert(u64 address, size_t size, u32 type_id, u32 new_at
 
     // If we need to insert a region after the region, do so.
     if (old_last != inserted_region_last) {
-        const u64 after_pair = (old_pair != std::numeric_limits<u64>::max())
+        const u64 after_pair = (old_pair != (std::numeric_limits<u64>::max)())
                                    ? old_pair + (inserted_region_end - old_address)
                                    : old_pair;
         this->insert(*AllocateRegion(m_memory_region_allocator, inserted_region_end, old_last,
diff --git a/src/core/hle/kernel/k_memory_manager.cpp b/src/core/hle/kernel/k_memory_manager.cpp
index d6bd272962..2aa393ac06 100644
--- a/src/core/hle/kernel/k_memory_manager.cpp
+++ b/src/core/hle/kernel/k_memory_manager.cpp
@@ -323,7 +323,7 @@ Result KMemoryManager::AllocateAndOpen(KPageGroup* out, size_t num_pages, u32 op
 
             // Process part or all of the block.
             const size_t cur_pages =
-                std::min(remaining_pages, manager.GetPageOffsetToEnd(cur_address));
+                (std::min)(remaining_pages, manager.GetPageOffsetToEnd(cur_address));
             manager.OpenFirst(cur_address, cur_pages);
 
             // Advance.
@@ -385,7 +385,7 @@ Result KMemoryManager::AllocateForProcess(KPageGroup* out, size_t num_pages, u32
 
                     // Process part or all of the block.
                     const size_t cur_pages =
-                        std::min(remaining_pages, manager.GetPageOffsetToEnd(cur_address));
+                        (std::min)(remaining_pages, manager.GetPageOffsetToEnd(cur_address));
                     any_new = manager.ProcessOptimizedAllocation(m_system.Kernel(), cur_address,
                                                                  cur_pages, fill_pattern);
 
@@ -409,7 +409,7 @@ Result KMemoryManager::AllocateForProcess(KPageGroup* out, size_t num_pages, u32
 
                     // Track some or all of the current pages.
                     const size_t cur_pages =
-                        std::min(remaining_pages, manager.GetPageOffsetToEnd(cur_address));
+                        (std::min)(remaining_pages, manager.GetPageOffsetToEnd(cur_address));
                     manager.TrackOptimizedAllocation(m_system.Kernel(), cur_address, cur_pages);
 
                     // Advance.
diff --git a/src/core/hle/kernel/k_memory_manager.h b/src/core/hle/kernel/k_memory_manager.h
index c5a487af92..41d33fa55d 100644
--- a/src/core/hle/kernel/k_memory_manager.h
+++ b/src/core/hle/kernel/k_memory_manager.h
@@ -68,7 +68,7 @@ public:
         // Repeatedly open references until we've done so for all pages.
         while (num_pages) {
             auto& manager = this->GetManager(address);
-            const size_t cur_pages = std::min(num_pages, manager.GetPageOffsetToEnd(address));
+            const size_t cur_pages = (std::min)(num_pages, manager.GetPageOffsetToEnd(address));
 
             {
                 KScopedLightLock lk(m_pool_locks[static_cast<size_t>(manager.GetPool())]);
@@ -84,7 +84,7 @@ public:
         // Repeatedly open references until we've done so for all pages.
         while (num_pages) {
             auto& manager = this->GetManager(address);
-            const size_t cur_pages = std::min(num_pages, manager.GetPageOffsetToEnd(address));
+            const size_t cur_pages = (std::min)(num_pages, manager.GetPageOffsetToEnd(address));
 
             {
                 KScopedLightLock lk(m_pool_locks[static_cast<size_t>(manager.GetPool())]);
@@ -100,7 +100,7 @@ public:
         // Repeatedly close references until we've done so for all pages.
         while (num_pages) {
             auto& manager = this->GetManager(address);
-            const size_t cur_pages = std::min(num_pages, manager.GetPageOffsetToEnd(address));
+            const size_t cur_pages = (std::min)(num_pages, manager.GetPageOffsetToEnd(address));
 
             {
                 KScopedLightLock lk(m_pool_locks[static_cast<size_t>(manager.GetPool())]);
diff --git a/src/core/hle/kernel/k_memory_region.h b/src/core/hle/kernel/k_memory_region.h
index e3044f0227..cad7b31126 100644
--- a/src/core/hle/kernel/k_memory_region.h
+++ b/src/core/hle/kernel/k_memory_region.h
@@ -28,7 +28,7 @@ public:
         : m_address(address), m_last_address(last_address), m_pair_address(pair_address),
           m_attributes(attributes), m_type_id(type_id) {}
     constexpr KMemoryRegion(u64 address, u64 last_address, u32 attributes, u32 type_id)
-        : KMemoryRegion(address, last_address, std::numeric_limits<u64>::max(), attributes,
+        : KMemoryRegion(address, last_address, (std::numeric_limits<u64>::max)(), attributes,
                         type_id) {}
 
     ~KMemoryRegion() = default;
diff --git a/src/core/hle/kernel/k_page_bitmap.h b/src/core/hle/kernel/k_page_bitmap.h
index 0ff9877326..4ad5483b28 100644
--- a/src/core/hle/kernel/k_page_bitmap.h
+++ b/src/core/hle/kernel/k_page_bitmap.h
@@ -83,7 +83,7 @@ public:
                 }
 
                 // Determine how many bits to take this round.
-                const auto cur_bits = std::min(num_bits, m_bits_available);
+                const auto cur_bits = (std::min)(num_bits, m_bits_available);
 
                 // Generate mask for our current bits.
                 const u64 mask = (static_cast<u64>(1) << cur_bits) - 1;
diff --git a/src/core/hle/kernel/k_page_heap.h b/src/core/hle/kernel/k_page_heap.h
index c55225bac6..0d63a6b1f5 100644
--- a/src/core/hle/kernel/k_page_heap.h
+++ b/src/core/hle/kernel/k_page_heap.h
@@ -75,7 +75,7 @@ public:
     }
 
     static constexpr s32 GetAlignedBlockIndex(size_t num_pages, size_t align_pages) {
-        const size_t target_pages = std::max(num_pages, align_pages);
+        const size_t target_pages = (std::max)(num_pages, align_pages);
         for (size_t i = 0; i < NumMemoryBlockPageShifts; i++) {
             if (target_pages <= (static_cast<size_t>(1) << MemoryBlockPageShifts[i]) / PageSize) {
                 return static_cast<s32>(i);
diff --git a/src/core/hle/kernel/k_page_table_base.cpp b/src/core/hle/kernel/k_page_table_base.cpp
index 5e39fbeb14..6b3f60f52e 100644
--- a/src/core/hle/kernel/k_page_table_base.cpp
+++ b/src/core/hle/kernel/k_page_table_base.cpp
@@ -1731,7 +1731,7 @@ void KPageTableBase::RemapPageGroup(PageLinkedList* page_list, KProcessAddress a
             }
 
             // Map whatever we can.
-            const size_t cur_pages = std::min(pg_pages, map_pages);
+            const size_t cur_pages = (std::min)(pg_pages, map_pages);
             R_ASSERT(this->Operate(page_list, map_address, map_pages, pg_phys_addr, true,
                                    map_properties, OperationType::Map, true));
 
@@ -1929,7 +1929,7 @@ Result KPageTableBase::GetContiguousMemoryRangeWithState(
     }
 
     // Take the minimum size for our region.
-    size = std::min(size, contig_size);
+    size = (std::min)(size, contig_size);
 
     // Check that the memory is contiguous (modulo the reference count bit).
     const KMemoryState test_state_mask = state_mask | KMemoryState::FlagReferenceCounted;
@@ -5297,7 +5297,7 @@ Result KPageTableBase::MapPhysicalMemory(KProcessAddress address, size_t size) {
                                     KMemoryPermission::None, false, false,
                                     DisableMergeAttribute::None};
                                 const size_t cur_pages =
-                                    std::min(KProcessAddress(info.GetEndAddress()) - cur_address,
+                                    (std::min)(KProcessAddress(info.GetEndAddress()) - cur_address,
                                              last_unmap_address + 1 - cur_address) /
                                     PageSize;
 
@@ -5345,7 +5345,7 @@ Result KPageTableBase::MapPhysicalMemory(KProcessAddress address, size_t size) {
                                 ? DisableMergeAttribute::DisableHead
                                 : DisableMergeAttribute::None};
                         size_t map_pages =
-                            std::min(KProcessAddress(info.GetEndAddress()) - cur_address,
+                            (std::min)(KProcessAddress(info.GetEndAddress()) - cur_address,
                                      last_address + 1 - cur_address) /
                             PageSize;
 
@@ -5373,7 +5373,7 @@ Result KPageTableBase::MapPhysicalMemory(KProcessAddress address, size_t size) {
                                     }
 
                                     // Add whatever we can to the current block.
-                                    const size_t cur_pages = std::min(pg_pages, remain_pages);
+                                    const size_t cur_pages = (std::min)(pg_pages, remain_pages);
                                     R_TRY(cur_pg.AddBlock(pg_phys_addr +
                                                               ((pg_pages - cur_pages) * PageSize),
                                                           cur_pages));
@@ -5535,7 +5535,7 @@ Result KPageTableBase::UnmapPhysicalMemory(KProcessAddress address, size_t size)
             // Determine the range to unmap.
             const KPageProperties unmap_properties = {KMemoryPermission::None, false, false,
                                                       DisableMergeAttribute::None};
-            const size_t cur_pages = std::min(KProcessAddress(info.GetEndAddress()) - cur_address,
+            const size_t cur_pages = (std::min)(KProcessAddress(info.GetEndAddress()) - cur_address,
                                               last_address + 1 - cur_address) /
                                      PageSize;
 
@@ -5656,7 +5656,7 @@ Result KPageTableBase::UnmapProcessMemory(KProcessAddress dst_address, size_t si
                     }
 
                     // Update our current size.
-                    m_cur_size = std::min(m_remaining_size, m_cur_size + m_entry.block_size);
+                    m_cur_size = (std::min)(m_remaining_size, m_cur_size + m_entry.block_size);
                 }
             }
         };
diff --git a/src/core/hle/kernel/k_process.h b/src/core/hle/kernel/k_process.h
index df3e540dc2..d6742f0637 100644
--- a/src/core/hle/kernel/k_process.h
+++ b/src/core/hle/kernel/k_process.h
@@ -59,7 +59,7 @@ public:
     static constexpr u64 InitialProcessIdMax = 0x50;
 
     static constexpr u64 ProcessIdMin = InitialProcessIdMax + 1;
-    static constexpr u64 ProcessIdMax = std::numeric_limits<u64>::max();
+    static constexpr u64 ProcessIdMax = (std::numeric_limits<u64>::max)();
 
 private:
     using SharedMemoryInfoList = Common::IntrusiveListBaseTraits<KSharedMemoryInfo>::ListType;
diff --git a/src/core/hle/kernel/k_resource_limit.cpp b/src/core/hle/kernel/k_resource_limit.cpp
index d8a63aaf8c..1403317d72 100644
--- a/src/core/hle/kernel/k_resource_limit.cpp
+++ b/src/core/hle/kernel/k_resource_limit.cpp
@@ -111,7 +111,7 @@ bool KResourceLimit::Reserve(LimitableResource which, s64 value, s64 timeout) {
         if (m_current_values[index] + value <= m_limit_values[index]) {
             m_current_values[index] += value;
             m_current_hints[index] += value;
-            m_peak_values[index] = std::max(m_peak_values[index], m_current_values[index]);
+            m_peak_values[index] = (std::max)(m_peak_values[index], m_current_values[index]);
             return true;
         }
 
diff --git a/src/core/hle/kernel/k_slab_heap.h b/src/core/hle/kernel/k_slab_heap.h
index 334afebb71..2ec3d185dc 100644
--- a/src/core/hle/kernel/k_slab_heap.h
+++ b/src/core/hle/kernel/k_slab_heap.h
@@ -149,7 +149,7 @@ public:
     size_t GetObjectIndex(const void* obj) const {
         if constexpr (SupportDynamicExpansion) {
             if (!this->Contains(reinterpret_cast<uintptr_t>(obj))) {
-                return std::numeric_limits<size_t>::max();
+                return (std::numeric_limits<size_t>::max)();
             }
         }
 
diff --git a/src/core/hle/kernel/k_thread.cpp b/src/core/hle/kernel/k_thread.cpp
index 8a360a839b..6aef191c87 100644
--- a/src/core/hle/kernel/k_thread.cpp
+++ b/src/core/hle/kernel/k_thread.cpp
@@ -1016,7 +1016,7 @@ void KThread::RestorePriority(KernelCore& kernel, KThread* thread) {
         s32 new_priority = thread->GetBasePriority();
         for (const auto& held_lock : thread->m_held_lock_info_list) {
             new_priority =
-                std::min(new_priority, held_lock.GetHighestPriorityWaiter()->GetPriority());
+                (std::min)(new_priority, held_lock.GetHighestPriorityWaiter()->GetPriority());
         }
 
         // If the priority we would inherit is not different from ours, don't do anything.
diff --git a/src/core/hle/kernel/kernel.cpp b/src/core/hle/kernel/kernel.cpp
index 00177dc943..0ff81066e9 100644
--- a/src/core/hle/kernel/kernel.cpp
+++ b/src/core/hle/kernel/kernel.cpp
@@ -507,7 +507,7 @@ struct KernelCore::Impl {
         constexpr size_t MiscRegionAlign = KernelAslrAlignment;
         constexpr size_t MiscRegionMinimumSize = 32_MiB;
         const size_t misc_region_size = Common::AlignUp(
-            std::max(misc_region_needed_size, MiscRegionMinimumSize), MiscRegionAlign);
+            (std::max)(misc_region_needed_size, MiscRegionMinimumSize), MiscRegionAlign);
         ASSERT(misc_region_size > 0);
 
         // Setup the misc region.
diff --git a/src/core/hle/kernel/svc/svc_address_arbiter.cpp b/src/core/hle/kernel/svc/svc_address_arbiter.cpp
index ab91d74433..688d6abce0 100644
--- a/src/core/hle/kernel/svc/svc_address_arbiter.cpp
+++ b/src/core/hle/kernel/svc/svc_address_arbiter.cpp
@@ -58,10 +58,10 @@ Result WaitForAddress(Core::System& system, u64 address, ArbitrationType arb_typ
         if (offset_tick > 0) {
             timeout = system.Kernel().HardwareTimer().GetTick() + offset_tick + 2;
             if (timeout <= 0) {
-                timeout = std::numeric_limits<s64>::max();
+                timeout = (std::numeric_limits<s64>::max)();
             }
         } else {
-            timeout = std::numeric_limits<s64>::max();
+            timeout = (std::numeric_limits<s64>::max)();
         }
     } else {
         timeout = timeout_ns;
diff --git a/src/core/hle/kernel/svc/svc_condition_variable.cpp b/src/core/hle/kernel/svc/svc_condition_variable.cpp
index 0f4550a795..2aed6a77be 100644
--- a/src/core/hle/kernel/svc/svc_condition_variable.cpp
+++ b/src/core/hle/kernel/svc/svc_condition_variable.cpp
@@ -31,10 +31,10 @@ Result WaitProcessWideKeyAtomic(Core::System& system, u64 address, u64 cv_key, u
         if (offset_tick > 0) {
             timeout = system.Kernel().HardwareTimer().GetTick() + offset_tick + 2;
             if (timeout <= 0) {
-                timeout = std::numeric_limits<s64>::max();
+                timeout = (std::numeric_limits<s64>::max)();
             }
         } else {
-            timeout = std::numeric_limits<s64>::max();
+            timeout = (std::numeric_limits<s64>::max)();
         }
     } else {
         timeout = timeout_ns;
diff --git a/src/core/hle/kernel/svc/svc_ipc.cpp b/src/core/hle/kernel/svc/svc_ipc.cpp
index b619bd70ab..bc0684e76c 100644
--- a/src/core/hle/kernel/svc/svc_ipc.cpp
+++ b/src/core/hle/kernel/svc/svc_ipc.cpp
@@ -61,10 +61,10 @@ Result ReplyAndReceiveImpl(KernelCore& kernel, int32_t* out_index, uintptr_t mes
             if (offset_tick > 0) {
                 timeout = kernel.HardwareTimer().GetTick() + offset_tick + 2;
                 if (timeout <= 0) {
-                    timeout = std::numeric_limits<s64>::max();
+                    timeout = (std::numeric_limits<s64>::max)();
                 }
             } else {
-                timeout = std::numeric_limits<s64>::max();
+                timeout = (std::numeric_limits<s64>::max)();
             }
         } else {
             timeout = timeout_ns;
diff --git a/src/core/hle/kernel/svc/svc_process.cpp b/src/core/hle/kernel/svc/svc_process.cpp
index 87845d64a6..6d63892a94 100644
--- a/src/core/hle/kernel/svc/svc_process.cpp
+++ b/src/core/hle/kernel/svc/svc_process.cpp
@@ -82,7 +82,7 @@ Result GetProcessList(Core::System& system, s32* out_num_processes, u64 out_proc
 
     const auto num_processes = process_list.size();
     const auto copy_amount =
-        std::min(static_cast<std::size_t>(out_process_ids_size), num_processes);
+        (std::min)(static_cast<std::size_t>(out_process_ids_size), num_processes);
 
     for (std::size_t i = 0; i < copy_amount && it != process_list.end(); ++i, ++it) {
         memory.Write64(out_process_ids, (*it)->GetProcessId());
diff --git a/src/core/hle/kernel/svc/svc_thread.cpp b/src/core/hle/kernel/svc/svc_thread.cpp
index 77cd634c0d..ca5ce6fe07 100644
--- a/src/core/hle/kernel/svc/svc_thread.cpp
+++ b/src/core/hle/kernel/svc/svc_thread.cpp
@@ -117,10 +117,10 @@ void SleepThread(Core::System& system, s64 ns) {
         if (offset_tick > 0) {
             timeout = kernel.HardwareTimer().GetTick() + offset_tick + 2;
             if (timeout <= 0) {
-                timeout = std::numeric_limits<s64>::max();
+                timeout = (std::numeric_limits<s64>::max)();
             }
         } else {
-            timeout = std::numeric_limits<s64>::max();
+            timeout = (std::numeric_limits<s64>::max)();
         }
 
         // Sleep.
@@ -226,7 +226,7 @@ Result GetThreadList(Core::System& system, s32* out_num_threads, u64 out_thread_
     auto& memory = GetCurrentMemory(system.Kernel());
     const auto& thread_list = current_process->GetThreadList();
     const auto num_threads = thread_list.size();
-    const auto copy_amount = std::min(static_cast<std::size_t>(out_thread_ids_size), num_threads);
+    const auto copy_amount = (std::min)(static_cast<std::size_t>(out_thread_ids_size), num_threads);
 
     auto list_iter = thread_list.cbegin();
     for (std::size_t i = 0; i < copy_amount; ++i, ++list_iter) {
diff --git a/src/core/hle/service/acc/acc.cpp b/src/core/hle/service/acc/acc.cpp
index 51a542e5e6..73f54f89b2 100644
--- a/src/core/hle/service/acc/acc.cpp
+++ b/src/core/hle/service/acc/acc.cpp
@@ -72,7 +72,7 @@ static void SanitizeJPEGImageSize(std::vector<u8>& image) {
         }
     }
 
-    image.resize(std::min(image.size(), max_jpeg_image_size));
+    image.resize((std::min)(image.size(), max_jpeg_image_size));
 }
 
 class IManagerForSystemService final : public ServiceFramework<IManagerForSystemService> {
diff --git a/src/core/hle/service/am/frontend/applet_cabinet.cpp b/src/core/hle/service/am/frontend/applet_cabinet.cpp
index 4cbc80d639..58401479d3 100644
--- a/src/core/hle/service/am/frontend/applet_cabinet.cpp
+++ b/src/core/hle/service/am/frontend/applet_cabinet.cpp
@@ -118,7 +118,7 @@ void Cabinet::DisplayCompleted(bool apply_changes, std::string_view amiibo_name)
     case Service::NFP::CabinetMode::StartNicknameAndOwnerSettings: {
         Service::NFP::RegisterInfoPrivate register_info{};
         std::memcpy(register_info.amiibo_name.data(), amiibo_name.data(),
-                    std::min(amiibo_name.size(), register_info.amiibo_name.size() - 1));
+                    (std::min)(amiibo_name.size(), register_info.amiibo_name.size() - 1));
         register_info.mii_store_data.BuildRandom(Mii::Age::All, Mii::Gender::All, Mii::Race::All);
         register_info.mii_store_data.SetNickname({u'y', u'u', u'z', u'u'});
         nfp_device->SetRegisterInfoPrivate(register_info);
diff --git a/src/core/hle/service/am/frontend/applet_controller.cpp b/src/core/hle/service/am/frontend/applet_controller.cpp
index 66f52686d7..d457885773 100644
--- a/src/core/hle/service/am/frontend/applet_controller.cpp
+++ b/src/core/hle/service/am/frontend/applet_controller.cpp
@@ -31,7 +31,7 @@ static Core::Frontend::ControllerParameters ConvertToFrontendParameters(
     npad_style_set.raw = private_arg.style_set;
 
     return {
-        .min_players = std::max(s8{1}, header.player_count_min),
+        .min_players = (std::max)(s8{1}, header.player_count_min),
         .max_players = header.player_count_max,
         .keep_controllers_connected = header.enable_take_over_connection,
         .enable_single_mode = header.enable_single_mode,
diff --git a/src/core/hle/service/am/service/application_accessor.cpp b/src/core/hle/service/am/service/application_accessor.cpp
index 986abc716a..2ac07f838e 100644
--- a/src/core/hle/service/am/service/application_accessor.cpp
+++ b/src/core/hle/service/am/service/application_accessor.cpp
@@ -115,7 +115,7 @@ Result IApplicationAccessor::GetApplicationControlProperty(
     R_TRY(system.GetARPManager().GetControlProperty(&nacp, m_applet->program_id));
 
     std::memcpy(out_control_property.data(), nacp.data(),
-                std::min(out_control_property.size(), nacp.size()));
+                (std::min)(out_control_property.size(), nacp.size()));
 
     R_SUCCEED();
 }
diff --git a/src/core/hle/service/am/service/application_functions.cpp b/src/core/hle/service/am/service/application_functions.cpp
index b736e2821b..eacc345e15 100644
--- a/src/core/hle/service/am/service/application_functions.cpp
+++ b/src/core/hle/service/am/service/application_functions.cpp
@@ -216,7 +216,7 @@ Result IApplicationFunctions::GetDisplayVersion(Out<DisplayVersion> out_display_
     if (res.first != nullptr) {
         const auto& version = res.first->GetVersionString();
         std::memcpy(out_display_version->string.data(), version.data(),
-                    std::min(version.size(), out_display_version->string.size()));
+                    (std::min)(version.size(), out_display_version->string.size()));
     } else {
         static constexpr char default_version[]{"1.0.0"};
         std::memcpy(out_display_version->string.data(), default_version, sizeof(default_version));
@@ -284,7 +284,7 @@ Result IApplicationFunctions::GetCacheStorageMax(Out<u32> out_cache_storage_inde
     R_TRY(system.GetARPManager().GetControlProperty(&nacp, m_applet->program_id));
 
     auto raw_nacp = std::make_unique<FileSys::RawNACP>();
-    std::memcpy(raw_nacp.get(), nacp.data(), std::min(sizeof(*raw_nacp), nacp.size()));
+    std::memcpy(raw_nacp.get(), nacp.data(), (std::min)(sizeof(*raw_nacp), nacp.size()));
 
     *out_cache_storage_index_max = static_cast<u32>(raw_nacp->cache_storage_max_index);
     *out_max_journal_size = static_cast<u64>(raw_nacp->cache_storage_data_and_journal_max_size);
diff --git a/src/core/hle/service/am/service/library_applet_self_accessor.cpp b/src/core/hle/service/am/service/library_applet_self_accessor.cpp
index cbe45189f8..091aadc9fc 100644
--- a/src/core/hle/service/am/service/library_applet_self_accessor.cpp
+++ b/src/core/hle/service/am/service/library_applet_self_accessor.cpp
@@ -162,7 +162,7 @@ Result ILibraryAppletSelfAccessor::GetMainAppletApplicationControlProperty(
         system.GetARPManager().GetControlProperty(&nacp, application.application_id);
 
     if (R_SUCCEEDED(result)) {
-        std::memcpy(out_nacp->data(), nacp.data(), std::min(nacp.size(), out_nacp->size()));
+        std::memcpy(out_nacp->data(), nacp.data(), (std::min)(nacp.size(), out_nacp->size()));
     }
 
     R_RETURN(result);
diff --git a/src/core/hle/service/bcat/bcat_service.cpp b/src/core/hle/service/bcat/bcat_service.cpp
index 63b1072d2a..5c23760113 100644
--- a/src/core/hle/service/bcat/bcat_service.cpp
+++ b/src/core/hle/service/bcat/bcat_service.cpp
@@ -102,7 +102,7 @@ Result IBcatService::SetPassphrase(u64 application_id,
 
     Passphrase passphrase{};
     std::memcpy(passphrase.data(), passphrase_buffer.data(),
-                std::min(passphrase.size(), passphrase_buffer.size()));
+                (std::min)(passphrase.size(), passphrase_buffer.size()));
 
     backend.SetPassphrase(application_id, passphrase);
     R_SUCCEED();
diff --git a/src/core/hle/service/bcat/delivery_cache_directory_service.cpp b/src/core/hle/service/bcat/delivery_cache_directory_service.cpp
index 01f08a2fc5..fea373a607 100644
--- a/src/core/hle/service/bcat/delivery_cache_directory_service.cpp
+++ b/src/core/hle/service/bcat/delivery_cache_directory_service.cpp
@@ -57,12 +57,12 @@ Result IDeliveryCacheDirectoryService::Read(
     R_UNLESS(current_dir != nullptr, ResultNoOpenEntry);
 
     const auto files = current_dir->GetFiles();
-    *out_count = static_cast<s32>(std::min(files.size(), out_buffer.size()));
+    *out_count = static_cast<s32>((std::min)(files.size(), out_buffer.size()));
     std::transform(files.begin(), files.begin() + *out_count, out_buffer.begin(),
                    [](const auto& file) {
                        FileName name{};
                        std::memcpy(name.data(), file->GetName().data(),
-                                   std::min(file->GetName().size(), name.size()));
+                                   (std::min)(file->GetName().size(), name.size()));
                        return DeliveryCacheDirectoryEntry{name, file->GetSize(), DigestFile(file)};
                    });
     R_SUCCEED();
diff --git a/src/core/hle/service/bcat/delivery_cache_storage_service.cpp b/src/core/hle/service/bcat/delivery_cache_storage_service.cpp
index 4c79d71f41..0ce798eb75 100644
--- a/src/core/hle/service/bcat/delivery_cache_storage_service.cpp
+++ b/src/core/hle/service/bcat/delivery_cache_storage_service.cpp
@@ -47,7 +47,7 @@ Result IDeliveryCacheStorageService::EnumerateDeliveryCacheDirectory(
     LOG_DEBUG(Service_BCAT, "called, size={:016X}", out_directories.size());
 
     *out_directory_count =
-        static_cast<s32>(std::min(out_directories.size(), entries.size() - next_read_index));
+        static_cast<s32>((std::min)(out_directories.size(), entries.size() - next_read_index));
     memcpy(out_directories.data(), entries.data() + next_read_index,
            *out_directory_count * sizeof(DirectoryName));
     next_read_index += *out_directory_count;
diff --git a/src/core/hle/service/cmif_serialization.h b/src/core/hle/service/cmif_serialization.h
index 5a5f610f34..03b2a130a1 100644
--- a/src/core/hle/service/cmif_serialization.h
+++ b/src/core/hle/service/cmif_serialization.h
@@ -304,7 +304,7 @@ void ReadInArgument(bool is_domain, CallArguments& args, const u8* raw_data, HLE
                 buffer = ctx.ReadBufferX(InBufferIndex);
             }
 
-            std::memcpy(&std::get<ArgIndex>(args), buffer.data(), std::min(BufferSize, buffer.size()));
+            std::memcpy(&std::get<ArgIndex>(args), buffer.data(), (std::min)(BufferSize, buffer.size()));
 
             return ReadInArgument<MethodArguments, CallArguments, PrevAlign, DataOffset, HandleIndex, InBufferIndex + 1, OutBufferIndex, RawDataFinished, ArgIndex + 1>(is_domain, args, raw_data, ctx, temp);
         } else if constexpr (ArgumentTraits<ArgType>::Type == ArgumentType::InBuffer) {
diff --git a/src/core/hle/service/es/es.cpp b/src/core/hle/service/es/es.cpp
index 9eaae4c4bd..9ad8d0e9b5 100644
--- a/src/core/hle/service/es/es.cpp
+++ b/src/core/hle/service/es/es.cpp
@@ -203,7 +203,7 @@ private:
         std::transform(tickets.begin(), tickets.end(), std::back_inserter(ids),
                        [](const auto& pair) { return pair.first; });
 
-        out_entries = std::min(ids.size(), out_entries);
+        out_entries = (std::min)(ids.size(), out_entries);
         ctx.WriteBuffer(ids.data(), out_entries * sizeof(u128));
 
         IPC::ResponseBuilder rb{ctx, 3};
@@ -225,7 +225,7 @@ private:
         std::transform(tickets.begin(), tickets.end(), std::back_inserter(ids),
                        [](const auto& pair) { return pair.first; });
 
-        out_entries = std::min(ids.size(), out_entries);
+        out_entries = (std::min)(ids.size(), out_entries);
         ctx.WriteBuffer(ids.data(), out_entries * sizeof(u128));
 
         IPC::ResponseBuilder rb{ctx, 3};
diff --git a/src/core/hle/service/filesystem/fsp/fs_i_save_data_info_reader.cpp b/src/core/hle/service/filesystem/fsp/fs_i_save_data_info_reader.cpp
index ff823586b3..490ac49d42 100644
--- a/src/core/hle/service/filesystem/fsp/fs_i_save_data_info_reader.cpp
+++ b/src/core/hle/service/filesystem/fsp/fs_i_save_data_info_reader.cpp
@@ -44,7 +44,7 @@ Result ISaveDataInfoReader::ReadSaveDataInfo(
     const u64 count_entries = out_entries.size();
 
     // Cap at total number of entries.
-    const u64 actual_entries = std::min(count_entries, info.size() - next_entry_index);
+    const u64 actual_entries = (std::min)(count_entries, info.size() - next_entry_index);
 
     // Determine data start and end
     const auto* begin = reinterpret_cast<u8*>(info.data() + next_entry_index);
diff --git a/src/core/hle/service/glue/notif.cpp b/src/core/hle/service/glue/notif.cpp
index 5a03d34c12..dd3f1954de 100644
--- a/src/core/hle/service/glue/notif.cpp
+++ b/src/core/hle/service/glue/notif.cpp
@@ -67,7 +67,7 @@ Result NotificationServiceImpl::ListAlarmSettings(s32* out_count,
                                                   std::span<AlarmSetting> out_alarms) {
     LOG_INFO(Service_NOTIF, "called, alarm_count={}", alarms.size());
 
-    const auto count = std::min(out_alarms.size(), alarms.size());
+    const auto count = (std::min)(out_alarms.size(), alarms.size());
     for (size_t i = 0; i < count; i++) {
         out_alarms[i] = alarms[i];
     }
@@ -90,7 +90,7 @@ Result NotificationServiceImpl::LoadApplicationParameter(u32* out_size,
 
     LOG_WARNING(Service_NOTIF, "(STUBBED) called, alarm_setting_id={}", alarm_setting_id);
     std::memcpy(out_application_parameter.data(), application_parameter.data(),
-                std::min(sizeof(application_parameter), out_application_parameter.size()));
+                (std::min)(sizeof(application_parameter), out_application_parameter.size()));
 
     *out_size = static_cast<u32>(application_parameter.size());
     R_SUCCEED();
diff --git a/src/core/hle/service/glue/time/manager.cpp b/src/core/hle/service/glue/time/manager.cpp
index 77bf8896cd..bfe57999c8 100644
--- a/src/core/hle/service/glue/time/manager.cpp
+++ b/src/core/hle/service/glue/time/manager.cpp
@@ -29,7 +29,7 @@ static s64 CalendarTimeToEpoch(Service::PSC::Time::CalendarTime calendar) {
     };
 
     s16 month_s16{calendar.month};
-    s8 month{static_cast<s8>(((month_s16 * 43) & ~std::numeric_limits<s16>::max()) +
+    s8 month{static_cast<s8>(((month_s16 * 43) & ~(std::numeric_limits<s16>::max)()) +
                              ((month_s16 * 43) >> 9))};
     s8 month_index{static_cast<s8>(calendar.month - 12 * month)};
     if (month_index == 0) {
@@ -71,13 +71,13 @@ static Service::PSC::Time::LocationName GetTimeZoneString(
 
     Service::PSC::Time::LocationName configured_name{};
     std::memcpy(configured_name.data(), configured_zone.data(),
-                std::min(configured_name.size(), configured_zone.size()));
+                (std::min)(configured_name.size(), configured_zone.size()));
 
     if (!time_zone_binary.IsValid(configured_name)) {
         configured_zone = Common::TimeZone::FindSystemTimeZone();
         configured_name = {};
         std::memcpy(configured_name.data(), configured_zone.data(),
-                    std::min(configured_name.size(), configured_zone.size()));
+                    (std::min)(configured_name.size(), configured_zone.size()));
     }
 
     ASSERT_MSG(time_zone_binary.IsValid(configured_name), "Invalid time zone {}!",
diff --git a/src/core/hle/service/hid/hid_debug_server.cpp b/src/core/hle/service/hid/hid_debug_server.cpp
index 738c6d9ae2..450c1e953f 100644
--- a/src/core/hle/service/hid/hid_debug_server.cpp
+++ b/src/core/hle/service/hid/hid_debug_server.cpp
@@ -178,7 +178,7 @@ Result IHidDebugServer::SetTouchScreenAutoPilotState(
     AutoPilotState auto_pilot{};
 
     auto_pilot.count =
-        static_cast<u64>(std::min(auto_pilot_buffer.size(), auto_pilot.state.size()));
+        static_cast<u64>((std::min)(auto_pilot_buffer.size(), auto_pilot.state.size()));
     memcpy(auto_pilot.state.data(), auto_pilot_buffer.data(),
            auto_pilot.count * sizeof(TouchState));
 
diff --git a/src/core/hle/service/jit/jit_context.cpp b/src/core/hle/service/jit/jit_context.cpp
index 0090e8568d..06a2368fe5 100644
--- a/src/core/hle/service/jit/jit_context.cpp
+++ b/src/core/hle/service/jit/jit_context.cpp
@@ -107,7 +107,7 @@ public:
 
     void AddTicks(u64 ticks) override {}
     u64 GetTicksRemaining() override {
-        return std::numeric_limits<u32>::max();
+        return (std::numeric_limits<u32>::max)();
     }
     u64 GetCNTPCT() override {
         return 0;
diff --git a/src/core/hle/service/ldn/ldn_types.h b/src/core/hle/service/ldn/ldn_types.h
index fa0cdcbfa7..56a3cd1b4b 100644
--- a/src/core/hle/service/ldn/ldn_types.h
+++ b/src/core/hle/service/ldn/ldn_types.h
@@ -170,7 +170,7 @@ struct Ssid {
     Ssid() = default;
 
     constexpr explicit Ssid(std::string_view data) {
-        length = static_cast<u8>(std::min(data.size(), SsidLengthMax));
+        length = static_cast<u8>((std::min)(data.size(), SsidLengthMax));
         raw = {};
         data.copy(raw.data(), length);
         raw[length] = 0;
diff --git a/src/core/hle/service/lm/lm.cpp b/src/core/hle/service/lm/lm.cpp
index 20df002330..508f91546d 100644
--- a/src/core/hle/service/lm/lm.cpp
+++ b/src/core/hle/service/lm/lm.cpp
@@ -180,7 +180,7 @@ private:
         if (length == 0) {
             return std::nullopt;
         }
-        const auto length_to_read = std::min(length, data.size() - offset);
+        const auto length_to_read = (std::min)(length, data.size() - offset);
 
         std::string output(length_to_read, '\0');
         std::memcpy(output.data(), data.data() + offset, length_to_read);
diff --git a/src/core/hle/service/nfc/common/device.cpp b/src/core/hle/service/nfc/common/device.cpp
index 30eab469a1..c026d47e86 100644
--- a/src/core/hle/service/nfc/common/device.cpp
+++ b/src/core/hle/service/nfc/common/device.cpp
@@ -978,7 +978,7 @@ Result NfcDevice::GetApplicationArea(std::span<u8> data) const {
     }
 
     memcpy(data.data(), tag_data.application_area.data(),
-           std::min(data.size(), sizeof(NFP::ApplicationArea)));
+           (std::min)(data.size(), sizeof(NFP::ApplicationArea)));
 
     return ResultSuccess;
 }
diff --git a/src/core/hle/service/nifm/nifm.cpp b/src/core/hle/service/nifm/nifm.cpp
index 4710167364..15c7d8d2c7 100644
--- a/src/core/hle/service/nifm/nifm.cpp
+++ b/src/core/hle/service/nifm/nifm.cpp
@@ -613,7 +613,7 @@ void IGeneralService::EnumerateNetworkInterfaces(HLERequestContext& ctx) {
 
     const size_t guest_bytes = ctx.GetWriteBufferSize();
     if (guest_bytes && !blob.empty())
-        ctx.WriteBuffer(blob.data(), std::min(guest_bytes, blob.size()));
+        ctx.WriteBuffer(blob.data(), (std::min)(guest_bytes, blob.size()));
 
     IPC::ResponseBuilder rb{ctx, 3};
     rb.Push(ResultSuccess);
@@ -639,7 +639,7 @@ void IGeneralService::EnumerateNetworkProfiles(HLERequestContext& ctx) {
 
     const size_t guest_sz = ctx.GetWriteBufferSize();
     if (guest_sz && uuids.size()) {
-        const size_t to_copy = std::min(guest_sz, uuids.size() * sizeof(u128));
+        const size_t to_copy = (std::min)(guest_sz, uuids.size() * sizeof(u128));
         ctx.WriteBuffer(uuids.data(), to_copy);
     }
 
diff --git a/src/core/hle/service/ns/application_manager_interface.cpp b/src/core/hle/service/ns/application_manager_interface.cpp
index 517ec75743..60ecd5c2b9 100644
--- a/src/core/hle/service/ns/application_manager_interface.cpp
+++ b/src/core/hle/service/ns/application_manager_interface.cpp
@@ -410,7 +410,7 @@ Result IApplicationManagerInterface::IsAnyApplicationEntityInstalled(
 Result IApplicationManagerInterface::GetApplicationView(
     OutArray<ApplicationView, BufferAttr_HipcMapAlias> out_application_views,
     InArray<u64, BufferAttr_HipcMapAlias> application_ids) {
-    const auto size = std::min(out_application_views.size(), application_ids.size());
+    const auto size = (std::min)(out_application_views.size(), application_ids.size());
     LOG_WARNING(Service_NS, "(STUBBED) called, size={}", application_ids.size());
 
     for (size_t i = 0; i < size; i++) {
@@ -428,7 +428,7 @@ Result IApplicationManagerInterface::GetApplicationView(
 Result IApplicationManagerInterface::GetApplicationViewWithPromotionInfo(
     OutArray<ApplicationViewWithPromotionInfo, BufferAttr_HipcMapAlias> out_application_views,
     InArray<u64, BufferAttr_HipcMapAlias> application_ids) {
-    const auto size = std::min(out_application_views.size(), application_ids.size());
+    const auto size = (std::min)(out_application_views.size(), application_ids.size());
     LOG_WARNING(Service_NS, "(STUBBED) called, size={}", application_ids.size());
 
     for (size_t i = 0; i < size; i++) {
diff --git a/src/core/hle/service/ns/platform_service_manager.cpp b/src/core/hle/service/ns/platform_service_manager.cpp
index 23cf05005c..301cf4ac4f 100644
--- a/src/core/hle/service/ns/platform_service_manager.cpp
+++ b/src/core/hle/service/ns/platform_service_manager.cpp
@@ -254,7 +254,7 @@ Result IPlatformServiceManager::GetSharedFontInOrderOfPriority(
     constexpr size_t MaxElementCount = 6;
 
     // TODO(ogniK): Have actual priority order
-    const auto max_size = std::min({MaxElementCount, out_font_codes.size(), out_font_offsets.size(),
+    const auto max_size = (std::min)({MaxElementCount, out_font_codes.size(), out_font_offsets.size(),
                                     out_font_sizes.size(), impl->shared_font_regions.size()});
 
     for (size_t i = 0; i < max_size; i++) {
diff --git a/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.cpp b/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.cpp
index 02913a5817..140c6eb6e3 100644
--- a/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.cpp
@@ -504,7 +504,7 @@ NvResult nvhost_as_gpu::GetVARegions3(IoctlGetVaRegions& params, std::span<VaReg
 
     GetVARegionsImpl(params);
 
-    const size_t num_regions = std::min(params.regions.size(), regions.size());
+    const size_t num_regions = (std::min)(params.regions.size(), regions.size());
     for (size_t i = 0; i < num_regions; i++) {
         regions[i] = params.regions[i];
     }
diff --git a/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.h b/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.h
index cad6457293..c9a6737ba9 100644
--- a/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.h
+++ b/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.h
@@ -197,12 +197,12 @@ private:
 
     struct VM {
         static constexpr u32 YUZU_PAGESIZE{0x1000};
-        static constexpr u32 PAGE_SIZE_BITS{std::countr_zero(YUZU_PAGESIZE)};
+        static constexpr u32 PAGE_SIZE_BITS{static_cast<u32>(std::countr_zero<u32>(YUZU_PAGESIZE))};
 
         static constexpr u32 SUPPORTED_BIG_PAGE_SIZES{0x30000};
         static constexpr u32 DEFAULT_BIG_PAGE_SIZE{0x20000};
         u32 big_page_size{DEFAULT_BIG_PAGE_SIZE};
-        u32 big_page_size_bits{std::countr_zero(DEFAULT_BIG_PAGE_SIZE)};
+        u32 big_page_size_bits{static_cast<u32>(std::countr_zero<u32>(DEFAULT_BIG_PAGE_SIZE))};
 
         static constexpr u32 VA_START_SHIFT{10};
         static constexpr u64 DEFAULT_VA_SPLIT{1ULL << 34};
diff --git a/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.cpp b/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.cpp
index 9ca6308e6f..fa46c2f280 100644
--- a/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.cpp
@@ -139,7 +139,7 @@ NvResult nvhost_nvdec_common::GetWaitbase(IoctlGetWaitbase& params) {
 
 NvResult nvhost_nvdec_common::MapBuffer(IoctlMapBuffer& params, std::span<MapBufferEntry> entries,
                                         DeviceFD fd) {
-    const size_t num_entries = std::min(params.num_entries, static_cast<u32>(entries.size()));
+    const size_t num_entries = (std::min)(params.num_entries, static_cast<u32>(entries.size()));
     for (size_t i = 0; i < num_entries; i++) {
         DAddr pin_address = nvmap.PinHandle(entries[i].map_handle, true);
         entries[i].map_address = static_cast<u32>(pin_address);
@@ -150,7 +150,7 @@ NvResult nvhost_nvdec_common::MapBuffer(IoctlMapBuffer& params, std::span<MapBuf
 
 NvResult nvhost_nvdec_common::UnmapBuffer(IoctlMapBuffer& params,
                                           std::span<MapBufferEntry> entries) {
-    const size_t num_entries = std::min(params.num_entries, static_cast<u32>(entries.size()));
+    const size_t num_entries = (std::min)(params.num_entries, static_cast<u32>(entries.size()));
     for (size_t i = 0; i < num_entries; i++) {
         nvmap.UnpinHandle(entries[i].map_handle);
         entries[i] = {};
diff --git a/src/core/hle/service/nvnflinger/buffer_queue_consumer.cpp b/src/core/hle/service/nvnflinger/buffer_queue_consumer.cpp
index 91ba35aef5..a9b0f9d2f3 100644
--- a/src/core/hle/service/nvnflinger/buffer_queue_consumer.cpp
+++ b/src/core/hle/service/nvnflinger/buffer_queue_consumer.cpp
@@ -328,7 +328,7 @@ void BufferQueueConsumer::Transact(u32 code, std::span<const u8> parcel_data,
 
     const auto serialized = parcel_out.Serialize();
     std::memcpy(parcel_reply.data(), serialized.data(),
-                std::min(parcel_reply.size(), serialized.size()));
+                (std::min)(parcel_reply.size(), serialized.size()));
 }
 
 Kernel::KReadableEvent* BufferQueueConsumer::GetNativeHandle(u32 type_id) {
diff --git a/src/core/hle/service/nvnflinger/buffer_queue_core.cpp b/src/core/hle/service/nvnflinger/buffer_queue_core.cpp
index 30095b0f73..27ac930f96 100644
--- a/src/core/hle/service/nvnflinger/buffer_queue_core.cpp
+++ b/src/core/hle/service/nvnflinger/buffer_queue_core.cpp
@@ -47,7 +47,7 @@ s32 BufferQueueCore::GetMinMaxBufferCountLocked(bool async) const {
 
 s32 BufferQueueCore::GetMaxBufferCountLocked(bool async) const {
     const auto min_buffer_count = GetMinMaxBufferCountLocked(async);
-    auto max_buffer_count = std::max(default_max_buffer_count, min_buffer_count);
+    auto max_buffer_count = (std::max)(default_max_buffer_count, min_buffer_count);
 
     if (override_max_buffer_count != 0) {
         ASSERT(override_max_buffer_count >= min_buffer_count);
diff --git a/src/core/hle/service/nvnflinger/buffer_queue_producer.cpp b/src/core/hle/service/nvnflinger/buffer_queue_producer.cpp
index 1bb88a45fa..f9e1dba965 100644
--- a/src/core/hle/service/nvnflinger/buffer_queue_producer.cpp
+++ b/src/core/hle/service/nvnflinger/buffer_queue_producer.cpp
@@ -942,7 +942,7 @@ void BufferQueueProducer::Transact(u32 code, std::span<const u8> parcel_data,
 
         std::scoped_lock lock{core->mutex};
 
-        auto buffer_history_count = std::min(parcel_in.Read<s32>(), (s32)core->history.size());
+        auto buffer_history_count = (std::min)(parcel_in.Read<s32>(), (s32)core->history.size());
 
         if (buffer_history_count <= 0) {
             parcel_out.Write(Status::BadValue);
@@ -978,7 +978,7 @@ void BufferQueueProducer::Transact(u32 code, std::span<const u8> parcel_data,
 
     const auto serialized = parcel_out.Serialize();
     std::memcpy(parcel_reply.data(), serialized.data(),
-                std::min(parcel_reply.size(), serialized.size()));
+                (std::min)(parcel_reply.size(), serialized.size()));
 }
 
 
diff --git a/src/core/hle/service/nvnflinger/hardware_composer.cpp b/src/core/hle/service/nvnflinger/hardware_composer.cpp
index 77622a7832..7098f4709d 100644
--- a/src/core/hle/service/nvnflinger/hardware_composer.cpp
+++ b/src/core/hle/service/nvnflinger/hardware_composer.cpp
@@ -101,7 +101,7 @@ u32 HardwareComposer::ComposeLocked(f32* out_speed_scale, Display& display,
         // only swap intervals of 0, 1 and 2 have been observed, but if 3 were
         // to be introduced, this would cause an issue.
         if (swap_interval) {
-            swap_interval = std::min(*swap_interval, item_swap_interval);
+            swap_interval = (std::min)(*swap_interval, item_swap_interval);
         } else {
             swap_interval = item_swap_interval;
         }
diff --git a/src/core/hle/service/psc/time/common.h b/src/core/hle/service/psc/time/common.h
index 954aed666a..0ad2ed51f8 100644
--- a/src/core/hle/service/psc/time/common.h
+++ b/src/core/hle/service/psc/time/common.h
@@ -138,12 +138,12 @@ constexpr inline std::chrono::nanoseconds ConvertToTimeSpan(s64 ticks) {
         std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::seconds(1)).count()};
 
     constexpr s64 max{Common::WallClock::CNTFRQ *
-                      (std::numeric_limits<s64>::max() / one_second_ns)};
+                      ((std::numeric_limits<s64>::max)() / one_second_ns)};
 
     if (ticks > max) {
-        return std::chrono::nanoseconds(std::numeric_limits<s64>::max());
+        return std::chrono::nanoseconds((std::numeric_limits<s64>::max)());
     } else if (ticks < -max) {
-        return std::chrono::nanoseconds(std::numeric_limits<s64>::min());
+        return std::chrono::nanoseconds((std::numeric_limits<s64>::min)());
     }
 
     auto a{ticks / Common::WallClock::CNTFRQ * one_second_ns};
@@ -156,9 +156,9 @@ constexpr inline Result GetSpanBetweenTimePoints(s64* out_seconds, const SteadyC
                                                  const SteadyClockTimePoint& b) {
     R_UNLESS(out_seconds, ResultInvalidArgument);
     R_UNLESS(a.IdMatches(b), ResultInvalidArgument);
-    R_UNLESS(a.time_point >= 0 || b.time_point <= a.time_point + std::numeric_limits<s64>::max(),
+    R_UNLESS(a.time_point >= 0 || b.time_point <= a.time_point + (std::numeric_limits<s64>::max)(),
              ResultOverflow);
-    R_UNLESS(a.time_point < 0 || b.time_point >= a.time_point + std::numeric_limits<s64>::min(),
+    R_UNLESS(a.time_point < 0 || b.time_point >= a.time_point + (std::numeric_limits<s64>::min)(),
              ResultOverflow);
 
     *out_seconds = b.time_point - a.time_point;
diff --git a/src/core/hle/service/psc/time/power_state_request_manager.cpp b/src/core/hle/service/psc/time/power_state_request_manager.cpp
index 17de0bf4dd..15fe8e2918 100644
--- a/src/core/hle/service/psc/time/power_state_request_manager.cpp
+++ b/src/core/hle/service/psc/time/power_state_request_manager.cpp
@@ -17,7 +17,7 @@ PowerStateRequestManager::~PowerStateRequestManager() {
 void PowerStateRequestManager::UpdatePendingPowerStateRequestPriority(u32 priority) {
     std::scoped_lock l{m_mutex};
     if (m_has_pending_request) {
-        m_pending_request_priority = std::max(m_pending_request_priority, priority);
+        m_pending_request_priority = (std::max)(m_pending_request_priority, priority);
     } else {
         m_pending_request_priority = priority;
         m_has_pending_request = true;
diff --git a/src/core/hle/service/set/settings_server.cpp b/src/core/hle/service/set/settings_server.cpp
index aa873bc8c5..7d1869a4e8 100644
--- a/src/core/hle/service/set/settings_server.cpp
+++ b/src/core/hle/service/set/settings_server.cpp
@@ -122,8 +122,8 @@ Result ISettingsServer::GetAvailableLanguageCodes(
     Out<s32> out_count, OutArray<LanguageCode, BufferAttr_HipcPointer> out_language_codes) {
     LOG_DEBUG(Service_SET, "called");
 
-    const std::size_t max_amount = std::min(PRE_4_0_0_MAX_ENTRIES, out_language_codes.size());
-    *out_count = static_cast<s32>(std::min(available_language_codes.size(), max_amount));
+    const std::size_t max_amount = (std::min)(PRE_4_0_0_MAX_ENTRIES, out_language_codes.size());
+    *out_count = static_cast<s32>((std::min)(available_language_codes.size(), max_amount));
 
     memcpy(out_language_codes.data(), available_language_codes.data(),
            static_cast<std::size_t>(*out_count) * sizeof(LanguageCode));
@@ -159,8 +159,8 @@ Result ISettingsServer::GetAvailableLanguageCodes2(
     Out<s32> out_count, OutArray<LanguageCode, BufferAttr_HipcMapAlias> language_codes) {
     LOG_DEBUG(Service_SET, "called");
 
-    const std::size_t max_amount = std::min(POST_4_0_0_MAX_ENTRIES, language_codes.size());
-    *out_count = static_cast<s32>(std::min(available_language_codes.size(), max_amount));
+    const std::size_t max_amount = (std::min)(POST_4_0_0_MAX_ENTRIES, language_codes.size());
+    *out_count = static_cast<s32>((std::min)(available_language_codes.size(), max_amount));
 
     memcpy(language_codes.data(), available_language_codes.data(),
            static_cast<std::size_t>(*out_count) * sizeof(LanguageCode));
@@ -233,7 +233,7 @@ Result ISettingsServer::GetDeviceNickName(
     LOG_DEBUG(Service_SET, "called");
 
     const std::size_t string_size =
-        std::min(Settings::values.device_name.GetValue().size(), out_device_name->size());
+        (std::min)(Settings::values.device_name.GetValue().size(), out_device_name->size());
 
     *out_device_name = {};
     memcpy(out_device_name->data(), Settings::values.device_name.GetValue().data(), string_size);
diff --git a/src/core/hle/service/set/system_settings_server.cpp b/src/core/hle/service/set/system_settings_server.cpp
index d246b95d0e..c70fdea24b 100644
--- a/src/core/hle/service/set/system_settings_server.cpp
+++ b/src/core/hle/service/set/system_settings_server.cpp
@@ -533,7 +533,7 @@ Result ISystemSettingsServer::GetEulaVersions(
     LOG_INFO(Service_SET, "called, elements={}", m_system_settings.eula_version_count);
 
     *out_count =
-        std::min(m_system_settings.eula_version_count, static_cast<s32>(out_eula_versions.size()));
+        (std::min)(m_system_settings.eula_version_count, static_cast<s32>(out_eula_versions.size()));
     memcpy(out_eula_versions.data(), m_system_settings.eula_versions.data(),
            static_cast<std::size_t>(*out_count) * sizeof(EulaVersion));
     R_SUCCEED();
@@ -599,7 +599,7 @@ Result ISystemSettingsServer::GetAccountNotificationSettings(
     LOG_INFO(Service_SET, "called, elements={}",
              m_system_settings.account_notification_settings_count);
 
-    *out_count = std::min(m_system_settings.account_notification_settings_count,
+    *out_count = (std::min)(m_system_settings.account_notification_settings_count,
                           static_cast<s32>(out_account_notification_settings.size()));
     memcpy(out_account_notification_settings.data(),
            m_system_settings.account_notification_settings.data(),
diff --git a/src/core/hle/service/sm/sm_controller.cpp b/src/core/hle/service/sm/sm_controller.cpp
index 9e25eae4d4..3b63d162c4 100644
--- a/src/core/hle/service/sm/sm_controller.cpp
+++ b/src/core/hle/service/sm/sm_controller.cpp
@@ -74,9 +74,9 @@ void Controller::QueryPointerBufferSize(HLERequestContext& ctx) {
     ASSERT(process != nullptr);
 
     u32 buffer_size = process->GetPointerBufferSize();
-    if (buffer_size > std::numeric_limits<u16>::max()) {
+    if (buffer_size > (std::numeric_limits<u16>::max)()) {
         LOG_WARNING(Service, "Pointer buffer size exceeds u16 max, clamping");
-        buffer_size = std::numeric_limits<u16>::max();
+        buffer_size = (std::numeric_limits<u16>::max)();
     }
 
     IPC::ResponseBuilder rb{ctx, 3};
@@ -94,9 +94,9 @@ void Controller::SetPointerBufferSize(HLERequestContext& ctx) {
 
     u32 requested_size = rp.PopRaw<u32>();
 
-    if (requested_size > std::numeric_limits<u16>::max()) {
+    if (requested_size > (std::numeric_limits<u16>::max)()) {
         LOG_WARNING(Service, "Requested pointer buffer size too large, clamping to 0xFFFF");
-        requested_size = std::numeric_limits<u16>::max();
+        requested_size = (std::numeric_limits<u16>::max)();
     }
 
     process->SetPointerBufferSize(requested_size);
diff --git a/src/core/hle/service/sockets/bsd.cpp b/src/core/hle/service/sockets/bsd.cpp
index fffbc413bb..a31bf45238 100644
--- a/src/core/hle/service/sockets/bsd.cpp
+++ b/src/core/hle/service/sockets/bsd.cpp
@@ -45,13 +45,13 @@ bool IsConnectionBased(Type type) {
 template <typename T>
 T GetValue(std::span<const u8> buffer) {
     T t{};
-    std::memcpy(&t, buffer.data(), std::min(sizeof(T), buffer.size()));
+    std::memcpy(&t, buffer.data(), (std::min)(sizeof(T), buffer.size()));
     return t;
 }
 
 template <typename T>
 void PutValue(std::span<u8> buffer, const T& t) {
-    std::memcpy(buffer.data(), &t, std::min(sizeof(T), buffer.size()));
+    std::memcpy(buffer.data(), &t, (std::min)(sizeof(T), buffer.size()));
 }
 
 } // Anonymous namespace
diff --git a/src/core/hle/service/spl/spl_module.cpp b/src/core/hle/service/spl/spl_module.cpp
index 549e6f4fa8..f59eeac06c 100644
--- a/src/core/hle/service/spl/spl_module.cpp
+++ b/src/core/hle/service/spl/spl_module.cpp
@@ -68,7 +68,7 @@ void Module::Interface::GenerateRandomBytes(HLERequestContext& ctx) {
 
     const std::size_t size = ctx.GetWriteBufferSize();
 
-    std::uniform_int_distribution<u16> distribution(0, std::numeric_limits<u8>::max());
+    std::uniform_int_distribution<u16> distribution(0, (std::numeric_limits<u8>::max)());
     std::vector<u8> data(size);
     std::generate(data.begin(), data.end(), [&] { return static_cast<u8>(distribution(rng)); });
 
diff --git a/src/core/hle/service/ssl/ssl.cpp b/src/core/hle/service/ssl/ssl.cpp
index 2d10bd04d2..7720c93d5a 100644
--- a/src/core/hle/service/ssl/ssl.cpp
+++ b/src/core/hle/service/ssl/ssl.cpp
@@ -445,7 +445,7 @@ private:
 
     void GetNextAlpnProto(HLERequestContext& ctx) {
         const size_t writable = ctx.GetWriteBufferSize();
-        const size_t to_write = std::min(next_alpn_proto.size(), writable);
+        const size_t to_write = (std::min)(next_alpn_proto.size(), writable);
 
         if (to_write != 0) {
             ctx.WriteBuffer(std::span<const u8>(next_alpn_proto.data(), to_write));
diff --git a/src/core/hle/service/vi/application_display_service.cpp b/src/core/hle/service/vi/application_display_service.cpp
index 289ad7073c..cd28b81dc1 100644
--- a/src/core/hle/service/vi/application_display_service.cpp
+++ b/src/core/hle/service/vi/application_display_service.cpp
@@ -192,7 +192,7 @@ Result IApplicationDisplayService::OpenLayer(Out<u64> out_size,
 
     const auto buffer = parcel.Serialize();
     std::memcpy(out_native_window.data(), buffer.data(),
-                std::min(out_native_window.size(), buffer.size()));
+                (std::min)(out_native_window.size(), buffer.size()));
     *out_size = buffer.size();
 
     R_SUCCEED();
@@ -226,7 +226,7 @@ Result IApplicationDisplayService::CreateStrayLayer(
 
     const auto buffer = parcel.Serialize();
     std::memcpy(out_native_window.data(), buffer.data(),
-                std::min(out_native_window.size(), buffer.size()));
+                (std::min)(out_native_window.size(), buffer.size()));
 
     *out_size = buffer.size();
 
diff --git a/src/core/internal_network/network.cpp b/src/core/internal_network/network.cpp
index 6a3c6e9c41..400bd04bdf 100644
--- a/src/core/internal_network/network.cpp
+++ b/src/core/internal_network/network.cpp
@@ -105,7 +105,7 @@ sockaddr TranslateFromSockAddrIn(SockAddrIn input) {
 }
 
 LINGER MakeLinger(bool enable, u32 linger_value) {
-    ASSERT(linger_value <= std::numeric_limits<u_short>::max());
+    ASSERT(linger_value <= (std::numeric_limits<u_short>::max)());
 
     LINGER value;
     value.l_onoff = enable ? 1 : 0;
@@ -798,7 +798,7 @@ Errno Socket::Shutdown(ShutdownHow how) {
 
 std::pair<s32, Errno> Socket::Recv(int flags, std::span<u8> message) {
     ASSERT(flags == 0);
-    ASSERT(message.size() < static_cast<size_t>(std::numeric_limits<int>::max()));
+    ASSERT(message.size() < static_cast<size_t>((std::numeric_limits<int>::max)()));
 
     const auto result =
         recv(fd, reinterpret_cast<char*>(message.data()), static_cast<int>(message.size()), 0);
@@ -811,7 +811,7 @@ std::pair<s32, Errno> Socket::Recv(int flags, std::span<u8> message) {
 
 std::pair<s32, Errno> Socket::RecvFrom(int flags, std::span<u8> message, SockAddrIn* addr) {
     ASSERT(flags == 0);
-    ASSERT(message.size() < static_cast<size_t>(std::numeric_limits<int>::max()));
+    ASSERT(message.size() < static_cast<size_t>((std::numeric_limits<int>::max)()));
 
     sockaddr_in addr_in{};
     socklen_t addrlen = sizeof(addr_in);
@@ -831,7 +831,7 @@ std::pair<s32, Errno> Socket::RecvFrom(int flags, std::span<u8> message, SockAdd
 }
 
 std::pair<s32, Errno> Socket::Send(std::span<const u8> message, int flags) {
-    ASSERT(message.size() < static_cast<size_t>(std::numeric_limits<int>::max()));
+    ASSERT(message.size() < static_cast<size_t>((std::numeric_limits<int>::max)()));
     ASSERT(flags == 0);
 
     int native_flags = 0;
diff --git a/src/core/internal_network/network_interface.cpp b/src/core/internal_network/network_interface.cpp
index f62381b9e3..ae9755113a 100644
--- a/src/core/internal_network/network_interface.cpp
+++ b/src/core/internal_network/network_interface.cpp
@@ -147,7 +147,7 @@ std::vector<Network::NetworkInterface> GetAvailableNetworkInterfaces() {
         }
 
         // ignore header
-        file.ignore(std::numeric_limits<std::streamsize>::max(), '\n');
+        file.ignore((std::numeric_limits<std::streamsize>::max)(), '\n');
 
         bool gateway_found = false;
 
diff --git a/src/core/internal_network/socket_proxy.cpp b/src/core/internal_network/socket_proxy.cpp
index c263fb4ca8..1600c061f0 100644
--- a/src/core/internal_network/socket_proxy.cpp
+++ b/src/core/internal_network/socket_proxy.cpp
@@ -105,14 +105,14 @@ Errno ProxySocket::Shutdown(ShutdownHow how) {
 std::pair<s32, Errno> ProxySocket::Recv(int flags, std::span<u8> message) {
     LOG_WARNING(Network, "(STUBBED) called");
     ASSERT(flags == 0);
-    ASSERT(message.size() < static_cast<size_t>(std::numeric_limits<int>::max()));
+    ASSERT(message.size() < static_cast<size_t>((std::numeric_limits<int>::max)()));
 
     return {static_cast<s32>(0), Errno::SUCCESS};
 }
 
 std::pair<s32, Errno> ProxySocket::RecvFrom(int flags, std::span<u8> message, SockAddrIn* addr) {
     ASSERT(flags == 0);
-    ASSERT(message.size() < static_cast<size_t>(std::numeric_limits<int>::max()));
+    ASSERT(message.size() < static_cast<size_t>((std::numeric_limits<int>::max)()));
 
     // TODO (flTobi): Verify the timeout behavior and break when connection is lost
     const auto timestamp = std::chrono::steady_clock::now();
@@ -183,7 +183,7 @@ std::pair<s32, Errno> ProxySocket::ReceivePacket(int flags, std::span<u8> messag
 
 std::pair<s32, Errno> ProxySocket::Send(std::span<const u8> message, int flags) {
     LOG_WARNING(Network, "(STUBBED) called");
-    ASSERT(message.size() < static_cast<size_t>(std::numeric_limits<int>::max()));
+    ASSERT(message.size() < static_cast<size_t>((std::numeric_limits<int>::max)()));
     ASSERT(flags == 0);
 
     return {static_cast<s32>(0), Errno::SUCCESS};
diff --git a/src/core/loader/nca.cpp b/src/core/loader/nca.cpp
index 4a87ab53e7..9a82dae144 100644
--- a/src/core/loader/nca.cpp
+++ b/src/core/loader/nca.cpp
@@ -164,7 +164,7 @@ ResultStatus AppLoader_NCA::VerifyIntegrity(std::function<bool(size_t, size_t)>
     // Begin iterating the file.
     while (processed_size < total_size) {
         // Refill the buffer.
-        const size_t intended_read_size = std::min(buffer.size(), total_size - processed_size);
+        const size_t intended_read_size = (std::min)(buffer.size(), total_size - processed_size);
         const size_t read_size = file->Read(buffer.data(), intended_read_size, processed_size);
 
         // Update the hash function with the buffer contents.
diff --git a/src/core/memory.cpp b/src/core/memory.cpp
index 0035c626e2..2583aae867 100644
--- a/src/core/memory.cpp
+++ b/src/core/memory.cpp
@@ -48,7 +48,7 @@ struct Memory::Impl {
     explicit Impl(Core::System& system_) : system{system_} {
         // Initialize thread count based on available cores for parallel memory operations
         const unsigned int hw_concurrency = std::thread::hardware_concurrency();
-        thread_count = std::max(2u, std::min(hw_concurrency, 8u)); // Limit to 8 threads max
+        thread_count = (std::max)(2u, (std::min)(hw_concurrency, 8u)); // Limit to 8 threads max
     }
 
     void SetCurrentPageTable(Kernel::KProcess& process) {
@@ -263,7 +263,7 @@ struct Memory::Impl {
 
         while (remaining_size) {
             const std::size_t copy_amount =
-                std::min(static_cast<std::size_t>(YUZU_PAGESIZE) - page_offset, remaining_size);
+                (std::min)(static_cast<std::size_t>(YUZU_PAGESIZE) - page_offset, remaining_size);
             const auto current_vaddr =
                 static_cast<u64>((page_index << YUZU_PAGEBITS) + page_offset);
 
@@ -948,7 +948,7 @@ struct Memory::Impl {
         const auto* p = GetPointerImpl(
             v_address, []() {}, []() {});
         constexpr size_t sys_core = Core::Hardware::NUM_CPU_CORES - 1;
-        const size_t core = std::min(system.GetCurrentHostThreadID(),
+        const size_t core = (std::min)(system.GetCurrentHostThreadID(),
                                      sys_core); // any other calls threads go to syscore.
         if (!gpu_device_memory) [[unlikely]] {
             gpu_device_memory = &system.Host1x().MemoryManager();
@@ -989,7 +989,7 @@ struct Memory::Impl {
 
     void InvalidateGPUMemory(u8* p, size_t size) {
         constexpr size_t sys_core = Core::Hardware::NUM_CPU_CORES - 1;
-        const size_t core = std::min(system.GetCurrentHostThreadID(),
+        const size_t core = (std::min)(system.GetCurrentHostThreadID(),
                                      sys_core); // any other calls threads go to syscore.
         if (!gpu_device_memory) [[unlikely]] {
             gpu_device_memory = &system.Host1x().MemoryManager();
diff --git a/src/core/tools/renderdoc.cpp b/src/core/tools/renderdoc.cpp
index 947fa6cb37..d3a47e1d96 100644
--- a/src/core/tools/renderdoc.cpp
+++ b/src/core/tools/renderdoc.cpp
@@ -1,3 +1,6 @@
+// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
+
 // SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later
 
@@ -18,10 +21,12 @@ namespace Tools {
 RenderdocAPI::RenderdocAPI() {
 #ifdef WIN32
     if (HMODULE mod = GetModuleHandleA("renderdoc.dll")) {
-        const auto RENDERDOC_GetAPI =
-            reinterpret_cast<pRENDERDOC_GetAPI>(GetProcAddress(mod, "RENDERDOC_GetAPI"));
-        const s32 ret = RENDERDOC_GetAPI(eRENDERDOC_API_Version_1_6_0, (void**)&rdoc_api);
-        ASSERT(ret == 1);
+        void* proc = reinterpret_cast<void*>(GetProcAddress(mod, "RENDERDOC_GetAPI"));
+        if (proc) {
+            const auto RENDERDOC_GetAPI = reinterpret_cast<pRENDERDOC_GetAPI>(proc);
+            const s32 ret = RENDERDOC_GetAPI(eRENDERDOC_API_Version_1_6_0, (void**)&rdoc_api);
+            ASSERT(ret == 1);
+        }
     }
 #else
 #ifdef ANDROID
diff --git a/src/dynarmic/CMakeLists.txt b/src/dynarmic/CMakeLists.txt
index 0065b1cf7f..38457deb50 100644
--- a/src/dynarmic/CMakeLists.txt
+++ b/src/dynarmic/CMakeLists.txt
@@ -103,7 +103,7 @@ if (MSVC)
              /WX)
     endif()
 
-    if (${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang")
+    if (CXX_CLANG)
         list(APPEND DYNARMIC_CXX_FLAGS
              -Qunused-arguments
              -Wno-missing-braces)
@@ -131,7 +131,7 @@ else()
              -Wfatal-errors)
     endif()
 
-    if (CMAKE_CXX_COMPILER_ID MATCHES "GNU")
+    if (CXX_GCC)
         # GCC produces bogus -Warray-bounds warnings from xbyak headers for code paths that are not
         # actually reachable.  Specifically, it happens in cases where some code casts an Operand&
         # to Address& after first checking isMEM(), and that code is inlined in a situation where
@@ -141,7 +141,7 @@ else()
         list(APPEND DYNARMIC_CXX_FLAGS -Wstack-usage=4096)
     endif()
 
-    if (CMAKE_CXX_COMPILER_ID MATCHES "[Cc]lang")
+    if (CXX_CLANG)
         # Bracket depth determines maximum size of a fold expression in Clang since 9c9974c3ccb6.
         # And this in turns limits the size of a std::array.
         list(APPEND DYNARMIC_CXX_FLAGS -fbracket-depth=1024)
diff --git a/src/dynarmic/src/dynarmic/backend/x64/emit_x64.cpp b/src/dynarmic/src/dynarmic/backend/x64/emit_x64.cpp
index a13baa6a97..3bc93e6fd5 100644
--- a/src/dynarmic/src/dynarmic/backend/x64/emit_x64.cpp
+++ b/src/dynarmic/src/dynarmic/backend/x64/emit_x64.cpp
@@ -277,7 +277,7 @@ void EmitX64::EmitNZCVFromPackedFlags(EmitContext& ctx, IR::Inst* inst) {
 }
 
 void EmitX64::EmitAddCycles(size_t cycles) {
-    ASSERT(cycles < std::numeric_limits<s32>::max());
+    ASSERT(cycles < (std::numeric_limits<s32>::max)());
     code.sub(qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, cycles_remaining)], static_cast<u32>(cycles));
 }
 
diff --git a/src/dynarmic/src/dynarmic/backend/x64/emit_x64_saturation.cpp b/src/dynarmic/src/dynarmic/backend/x64/emit_x64_saturation.cpp
index e795181872..31231c02aa 100644
--- a/src/dynarmic/src/dynarmic/backend/x64/emit_x64_saturation.cpp
+++ b/src/dynarmic/src/dynarmic/backend/x64/emit_x64_saturation.cpp
@@ -38,7 +38,7 @@ void EmitSignedSaturatedOp(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst)
     Xbyak::Reg addend = ctx.reg_alloc.UseGpr(args[1]).changeBit(size);
     Xbyak::Reg overflow = ctx.reg_alloc.ScratchGpr().changeBit(size);
 
-    constexpr u64 int_max = static_cast<u64>(std::numeric_limits<mcl::signed_integer_of_size<size>>::max());
+    constexpr u64 int_max = static_cast<u64>((std::numeric_limits<mcl::signed_integer_of_size<size>>::max)());
     if constexpr (size < 64) {
         code.xor_(overflow.cvt32(), overflow.cvt32());
         code.bt(result.cvt32(), size - 1);
@@ -82,7 +82,7 @@ void EmitUnsignedSaturatedOp(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst
     Xbyak::Reg op_result = ctx.reg_alloc.UseScratchGpr(args[0]).changeBit(size);
     Xbyak::Reg addend = ctx.reg_alloc.UseScratchGpr(args[1]).changeBit(size);
 
-    constexpr u64 boundary = op == Op::Add ? std::numeric_limits<mcl::unsigned_integer_of_size<size>>::max() : 0;
+    constexpr u64 boundary = op == Op::Add ? (std::numeric_limits<mcl::unsigned_integer_of_size<size>>::max)() : 0;
 
     if constexpr (op == Op::Add) {
         code.add(op_result, addend);
diff --git a/src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector.cpp b/src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector.cpp
index e1b9e54df8..99000c2a57 100644
--- a/src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector.cpp
+++ b/src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector.cpp
@@ -548,7 +548,7 @@ void EmitX64::EmitVectorArithmeticShiftRight32(EmitContext& ctx, IR::Inst* inst)
 void EmitX64::EmitVectorArithmeticShiftRight64(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
     const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
-    const u8 shift_amount = std::min(args[1].GetImmediateU8(), u8(63));
+    const u8 shift_amount = (std::min)(args[1].GetImmediateU8(), u8(63));
 
     if (code.HasHostFeature(HostFeature::AVX512_Ortho)) {
         code.vpsraq(result, result, shift_amount);
@@ -2139,7 +2139,7 @@ void EmitX64::EmitVectorMaxS64(EmitContext& ctx, IR::Inst* inst) {
     }
 
     EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s64>& result, const VectorArray<s64>& a, const VectorArray<s64>& b) {
-        std::transform(a.begin(), a.end(), b.begin(), result.begin(), [](auto x, auto y) { return std::max(x, y); });
+        std::transform(a.begin(), a.end(), b.begin(), result.begin(), [](auto x, auto y) { return (std::max)(x, y); });
     });
 }
 
@@ -2201,7 +2201,7 @@ void EmitX64::EmitVectorMaxU64(EmitContext& ctx, IR::Inst* inst) {
     }
 
     EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u64>& result, const VectorArray<u64>& a, const VectorArray<u64>& b) {
-        std::transform(a.begin(), a.end(), b.begin(), result.begin(), [](auto x, auto y) { return std::max(x, y); });
+        std::transform(a.begin(), a.end(), b.begin(), result.begin(), [](auto x, auto y) { return (std::max)(x, y); });
     });
 }
 
@@ -2259,7 +2259,7 @@ void EmitX64::EmitVectorMinS64(EmitContext& ctx, IR::Inst* inst) {
     }
 
     EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s64>& result, const VectorArray<s64>& a, const VectorArray<s64>& b) {
-        std::transform(a.begin(), a.end(), b.begin(), result.begin(), [](auto x, auto y) { return std::min(x, y); });
+        std::transform(a.begin(), a.end(), b.begin(), result.begin(), [](auto x, auto y) { return (std::min)(x, y); });
     });
 }
 
@@ -2321,7 +2321,7 @@ void EmitX64::EmitVectorMinU64(EmitContext& ctx, IR::Inst* inst) {
     }
 
     EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u64>& result, const VectorArray<u64>& a, const VectorArray<u64>& b) {
-        std::transform(a.begin(), a.end(), b.begin(), result.begin(), [](auto x, auto y) { return std::min(x, y); });
+        std::transform(a.begin(), a.end(), b.begin(), result.begin(), [](auto x, auto y) { return (std::min)(x, y); });
     });
 }
 
@@ -2837,22 +2837,22 @@ static void LowerPairedOperation(VectorArray<T>& result, const VectorArray<T>& x
 
 template<typename T>
 static void PairedMax(VectorArray<T>& result, const VectorArray<T>& x, const VectorArray<T>& y) {
-    PairedOperation(result, x, y, [](auto a, auto b) { return std::max(a, b); });
+    PairedOperation(result, x, y, [](auto a, auto b) { return (std::max)(a, b); });
 }
 
 template<typename T>
 static void PairedMin(VectorArray<T>& result, const VectorArray<T>& x, const VectorArray<T>& y) {
-    PairedOperation(result, x, y, [](auto a, auto b) { return std::min(a, b); });
+    PairedOperation(result, x, y, [](auto a, auto b) { return (std::min)(a, b); });
 }
 
 template<typename T>
 static void LowerPairedMax(VectorArray<T>& result, const VectorArray<T>& x, const VectorArray<T>& y) {
-    LowerPairedOperation(result, x, y, [](auto a, auto b) { return std::max(a, b); });
+    LowerPairedOperation(result, x, y, [](auto a, auto b) { return (std::max)(a, b); });
 }
 
 template<typename T>
 static void LowerPairedMin(VectorArray<T>& result, const VectorArray<T>& x, const VectorArray<T>& y) {
-    LowerPairedOperation(result, x, y, [](auto a, auto b) { return std::min(a, b); });
+    LowerPairedOperation(result, x, y, [](auto a, auto b) { return (std::min)(a, b); });
 }
 
 template<typename Function>
@@ -4933,7 +4933,7 @@ static bool VectorSignedSaturatedShiftLeft(VectorArray<T>& dst, const VectorArra
     for (size_t i = 0; i < dst.size(); i++) {
         const T element = data[i];
         const T shift = std::clamp<T>(static_cast<T>(mcl::bit::sign_extend<8>(static_cast<U>(shift_values[i] & 0xFF))),
-                                      -static_cast<T>(bit_size_minus_one), std::numeric_limits<T>::max());
+                                      -static_cast<T>(bit_size_minus_one), (std::numeric_limits<T>::max)());
 
         if (element == 0) {
             dst[i] = 0;
@@ -4995,7 +4995,7 @@ static bool VectorSignedSaturatedShiftLeftUnsigned(VectorArray<T>& dst, const Ve
             const U shifted_test = shifted >> static_cast<U>(shift);
 
             if (shifted_test != static_cast<U>(element)) {
-                dst[i] = static_cast<T>(std::numeric_limits<U>::max());
+                dst[i] = static_cast<T>((std::numeric_limits<U>::max)());
                 qc_flag = true;
             } else {
                 dst[i] = shifted;
@@ -5845,11 +5845,11 @@ static bool EmitVectorUnsignedSaturatedAccumulateSigned(VectorArray<U>& result,
         const s64 y = static_cast<s64>(static_cast<std::make_unsigned_t<U>>(rhs[i]));
         const s64 sum = x + y;
 
-        if (sum > std::numeric_limits<U>::max()) {
-            result[i] = std::numeric_limits<U>::max();
+        if (sum > (std::numeric_limits<U>::max)()) {
+            result[i] = (std::numeric_limits<U>::max)();
             qc_flag = true;
         } else if (sum < 0) {
-            result[i] = std::numeric_limits<U>::min();
+            result[i] = (std::numeric_limits<U>::min)();
             qc_flag = true;
         } else {
             result[i] = static_cast<U>(sum);
@@ -5947,20 +5947,20 @@ static bool VectorUnsignedSaturatedShiftLeft(VectorArray<T>& dst, const VectorAr
     for (size_t i = 0; i < dst.size(); i++) {
         const T element = data[i];
         const S shift = std::clamp(static_cast<S>(mcl::bit::sign_extend<8>(static_cast<T>(shift_values[i] & 0xFF))),
-                                   negative_bit_size, std::numeric_limits<S>::max());
+                                   negative_bit_size, (std::numeric_limits<S>::max)());
 
         if (element == 0 || shift <= negative_bit_size) {
             dst[i] = 0;
         } else if (shift < 0) {
             dst[i] = static_cast<T>(element >> -shift);
         } else if (shift >= static_cast<S>(bit_size)) {
-            dst[i] = std::numeric_limits<T>::max();
+            dst[i] = (std::numeric_limits<T>::max)();
             qc_flag = true;
         } else {
             const T shifted = element << shift;
 
             if ((shifted >> shift) != element) {
-                dst[i] = std::numeric_limits<T>::max();
+                dst[i] = (std::numeric_limits<T>::max)();
                 qc_flag = true;
             } else {
                 dst[i] = shifted;
diff --git a/src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp b/src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp
index c8f0d9575c..a368e6703f 100644
--- a/src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp
+++ b/src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp
@@ -2116,7 +2116,7 @@ void EmitFPVectorToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
                     }
                 } else {
                     using FPT = mcl::unsigned_integer_of_size<fsize>;  // WORKAROUND: For issue 678 on MSVC
-                    constexpr u64 integer_max = static_cast<FPT>(std::numeric_limits<std::conditional_t<unsigned_, FPT, std::make_signed_t<FPT>>>::max());
+                    constexpr u64 integer_max = static_cast<FPT>((std::numeric_limits<std::conditional_t<unsigned_, FPT, std::make_signed_t<FPT>>>::max)());
 
                     code.movaps(xmm0, GetVectorOf<fsize, float_upper_limit_signed>(code));
                     FCODE(cmplep)(xmm0, src);
diff --git a/src/dynarmic/src/dynarmic/backend/x64/reg_alloc.cpp b/src/dynarmic/src/dynarmic/backend/x64/reg_alloc.cpp
index fa6006ed2a..29eab7908b 100644
--- a/src/dynarmic/src/dynarmic/backend/x64/reg_alloc.cpp
+++ b/src/dynarmic/src/dynarmic/backend/x64/reg_alloc.cpp
@@ -85,7 +85,7 @@ void HostLocInfo::ReleaseOne() noexcept {
     if (current_references == 0)
         return;
 
-    ASSERT(size_t(accumulated_uses) + 1 < std::numeric_limits<uint16_t>::max());
+    ASSERT(size_t(accumulated_uses) + 1 < (std::numeric_limits<uint16_t>::max)());
     accumulated_uses++;
     current_references--;
 
@@ -116,7 +116,7 @@ void HostLocInfo::AddValue(IR::Inst* inst) noexcept {
         values.clear();
     }
     values.push_back(inst);
-    ASSERT(size_t(total_uses) + inst->UseCount() < std::numeric_limits<uint16_t>::max());
+    ASSERT(size_t(total_uses) + inst->UseCount() < (std::numeric_limits<uint16_t>::max)());
     total_uses += inst->UseCount();
     max_bit_width = std::max<uint8_t>(max_bit_width, GetBitWidth(inst->GetType()));
 }
@@ -400,14 +400,14 @@ void RegAlloc::HostCall(IR::Inst* result_def,
 }
 
 void RegAlloc::AllocStackSpace(const size_t stack_space) noexcept {
-    ASSERT(stack_space < size_t(std::numeric_limits<s32>::max()));
+    ASSERT(stack_space < size_t((std::numeric_limits<s32>::max)()));
     ASSERT(reserved_stack_space == 0);
     reserved_stack_space = stack_space;
     code->sub(code->rsp, u32(stack_space));
 }
 
 void RegAlloc::ReleaseStackSpace(const size_t stack_space) noexcept {
-    ASSERT(stack_space < size_t(std::numeric_limits<s32>::max()));
+    ASSERT(stack_space < size_t((std::numeric_limits<s32>::max)()));
     ASSERT(reserved_stack_space == stack_space);
     reserved_stack_space = 0;
     code->add(code->rsp, u32(stack_space));
diff --git a/src/dynarmic/src/dynarmic/backend/x64/reg_alloc.h b/src/dynarmic/src/dynarmic/backend/x64/reg_alloc.h
index f70329f471..bfacdcca52 100644
--- a/src/dynarmic/src/dynarmic/backend/x64/reg_alloc.h
+++ b/src/dynarmic/src/dynarmic/backend/x64/reg_alloc.h
@@ -52,18 +52,18 @@ public:
         is_set_last_use = true;
     }
     inline void ReadLock() noexcept {
-        ASSERT(size_t(is_being_used_count) + 1 < std::numeric_limits<uint16_t>::max());
+        ASSERT(size_t(is_being_used_count) + 1 < (std::numeric_limits<uint16_t>::max)());
         ASSERT(!is_scratch);
         is_being_used_count++;
     }
     inline void WriteLock() noexcept {
-        ASSERT(size_t(is_being_used_count) + 1 < std::numeric_limits<uint16_t>::max());
+        ASSERT(size_t(is_being_used_count) + 1 < (std::numeric_limits<uint16_t>::max)());
         ASSERT(is_being_used_count == 0);
         is_being_used_count++;
         is_scratch = true;
     }
     inline void AddArgReference() noexcept {
-        ASSERT(size_t(current_references) + 1 < std::numeric_limits<uint16_t>::max());
+        ASSERT(size_t(current_references) + 1 < (std::numeric_limits<uint16_t>::max)());
         current_references++;
         ASSERT(accumulated_uses + current_references <= total_uses);
     }
diff --git a/src/dynarmic/src/dynarmic/frontend/A32/a32_types.h b/src/dynarmic/src/dynarmic/frontend/A32/a32_types.h
index fdadef8257..3f4501a528 100644
--- a/src/dynarmic/src/dynarmic/frontend/A32/a32_types.h
+++ b/src/dynarmic/src/dynarmic/frontend/A32/a32_types.h
@@ -106,6 +106,7 @@ inline size_t RegNumber(ExtReg reg) {
     }
 
     ASSERT_MSG(false, "Invalid extended register");
+    return 0;
 }
 
 inline Reg operator+(Reg reg, size_t number) {
diff --git a/src/hid_core/frontend/emulated_controller.cpp b/src/hid_core/frontend/emulated_controller.cpp
index 5bdad4a0ad..8a6922c49f 100644
--- a/src/hid_core/frontend/emulated_controller.cpp
+++ b/src/hid_core/frontend/emulated_controller.cpp
@@ -1308,9 +1308,9 @@ bool EmulatedController::SetVibration(DeviceIndex device_index, const VibrationV
                         : Common::Input::VibrationAmplificationType::Linear;
 
     const Common::Input::VibrationStatus status = {
-        .low_amplitude = std::min(vibration.low_amplitude * strength, 1.0f),
+        .low_amplitude = (std::min)(vibration.low_amplitude * strength, 1.0f),
         .low_frequency = vibration.low_frequency,
-        .high_amplitude = std::min(vibration.high_amplitude * strength, 1.0f),
+        .high_amplitude = (std::min)(vibration.high_amplitude * strength, 1.0f),
         .high_frequency = vibration.high_frequency,
         .type = type,
     };
diff --git a/src/hid_core/hidbus/ringcon.cpp b/src/hid_core/hidbus/ringcon.cpp
index 4f5eaa5053..a2bfd82636 100644
--- a/src/hid_core/hidbus/ringcon.cpp
+++ b/src/hid_core/hidbus/ringcon.cpp
@@ -283,7 +283,7 @@ u8 RingController::GetCrcValue(const std::vector<u8>& data) const {
 template <typename T>
 u64 RingController::GetData(const T& reply, std::span<u8> out_data) const {
     static_assert(std::is_trivially_copyable_v<T>);
-    const auto data_size = static_cast<u64>(std::min(sizeof(reply), out_data.size()));
+    const auto data_size = static_cast<u64>((std::min)(sizeof(reply), out_data.size()));
     std::memcpy(out_data.data(), &reply, data_size);
     return data_size;
 }
diff --git a/src/hid_core/irsensor/image_transfer_processor.cpp b/src/hid_core/irsensor/image_transfer_processor.cpp
index 2b5a50ef6f..9040390946 100644
--- a/src/hid_core/irsensor/image_transfer_processor.cpp
+++ b/src/hid_core/irsensor/image_transfer_processor.cpp
@@ -146,7 +146,7 @@ void ImageTransferProcessor::SetTransferMemoryAddress(Common::ProcessAddress t_m
 
 Core::IrSensor::ImageTransferProcessorState ImageTransferProcessor::GetState(
     std::span<u8> data) const {
-    const auto size = std::min(GetDataSize(current_config.trimming_format), data.size());
+    const auto size = (std::min)(GetDataSize(current_config.trimming_format), data.size());
     system.ApplicationMemory().ReadBlock(transfer_memory, data.data(), size);
     return processor_state;
 }
diff --git a/src/hid_core/resources/abstracted_pad/abstract_battery_handler.cpp b/src/hid_core/resources/abstracted_pad/abstract_battery_handler.cpp
index 62fbbb0a7e..b3e17b389d 100644
--- a/src/hid_core/resources/abstracted_pad/abstract_battery_handler.cpp
+++ b/src/hid_core/resources/abstracted_pad/abstract_battery_handler.cpp
@@ -30,7 +30,7 @@ void NpadAbstractBatteryHandler::SetPropertiesHandler(NpadAbstractPropertiesHand
 }
 
 Result NpadAbstractBatteryHandler::IncrementRefCounter() {
-    if (ref_counter == std::numeric_limits<s32>::max() - 1) {
+    if (ref_counter == (std::numeric_limits<s32>::max)() - 1) {
         return ResultNpadHandlerOverflow;
     }
     ref_counter++;
diff --git a/src/hid_core/resources/abstracted_pad/abstract_button_handler.cpp b/src/hid_core/resources/abstracted_pad/abstract_button_handler.cpp
index 5871694335..e4166b3735 100644
--- a/src/hid_core/resources/abstracted_pad/abstract_button_handler.cpp
+++ b/src/hid_core/resources/abstracted_pad/abstract_button_handler.cpp
@@ -30,7 +30,7 @@ void NpadAbstractButtonHandler::SetPropertiesHandler(NpadAbstractPropertiesHandl
 }
 
 Result NpadAbstractButtonHandler::IncrementRefCounter() {
-    if (ref_counter == std::numeric_limits<s32>::max() - 1) {
+    if (ref_counter == (std::numeric_limits<s32>::max)() - 1) {
         return ResultNpadHandlerOverflow;
     }
     ref_counter++;
diff --git a/src/hid_core/resources/abstracted_pad/abstract_ir_sensor_handler.cpp b/src/hid_core/resources/abstracted_pad/abstract_ir_sensor_handler.cpp
index e399edfd70..4367dcaa56 100644
--- a/src/hid_core/resources/abstracted_pad/abstract_ir_sensor_handler.cpp
+++ b/src/hid_core/resources/abstracted_pad/abstract_ir_sensor_handler.cpp
@@ -24,7 +24,7 @@ void NpadAbstractIrSensorHandler::SetPropertiesHandler(NpadAbstractPropertiesHan
 }
 
 Result NpadAbstractIrSensorHandler::IncrementRefCounter() {
-    if (ref_counter == std::numeric_limits<s32>::max() - 1) {
+    if (ref_counter == (std::numeric_limits<s32>::max)() - 1) {
         return ResultNpadHandlerOverflow;
     }
     ref_counter++;
diff --git a/src/hid_core/resources/abstracted_pad/abstract_led_handler.cpp b/src/hid_core/resources/abstracted_pad/abstract_led_handler.cpp
index 0b2bfe88da..b4375e57f3 100644
--- a/src/hid_core/resources/abstracted_pad/abstract_led_handler.cpp
+++ b/src/hid_core/resources/abstracted_pad/abstract_led_handler.cpp
@@ -29,7 +29,7 @@ void NpadAbstractLedHandler::SetPropertiesHandler(NpadAbstractPropertiesHandler*
 }
 
 Result NpadAbstractLedHandler::IncrementRefCounter() {
-    if (ref_counter == std::numeric_limits<s32>::max() - 1) {
+    if (ref_counter == (std::numeric_limits<s32>::max)() - 1) {
         return ResultNpadHandlerOverflow;
     }
     ref_counter++;
diff --git a/src/hid_core/resources/abstracted_pad/abstract_mcu_handler.cpp b/src/hid_core/resources/abstracted_pad/abstract_mcu_handler.cpp
index 6f35bd95cc..accbfe0def 100644
--- a/src/hid_core/resources/abstracted_pad/abstract_mcu_handler.cpp
+++ b/src/hid_core/resources/abstracted_pad/abstract_mcu_handler.cpp
@@ -22,7 +22,7 @@ void NpadAbstractMcuHandler::SetPropertiesHandler(NpadAbstractPropertiesHandler*
 }
 
 Result NpadAbstractMcuHandler::IncrementRefCounter() {
-    if (ref_counter == std::numeric_limits<s32>::max() - 1) {
+    if (ref_counter == (std::numeric_limits<s32>::max)() - 1) {
         return ResultNpadHandlerOverflow;
     }
     ref_counter++;
diff --git a/src/hid_core/resources/abstracted_pad/abstract_nfc_handler.cpp b/src/hid_core/resources/abstracted_pad/abstract_nfc_handler.cpp
index bd9b79333c..7a47786d42 100644
--- a/src/hid_core/resources/abstracted_pad/abstract_nfc_handler.cpp
+++ b/src/hid_core/resources/abstracted_pad/abstract_nfc_handler.cpp
@@ -24,7 +24,7 @@ void NpadAbstractNfcHandler::SetPropertiesHandler(NpadAbstractPropertiesHandler*
 }
 
 Result NpadAbstractNfcHandler::IncrementRefCounter() {
-    if (ref_counter == std::numeric_limits<s32>::max() - 1) {
+    if (ref_counter == (std::numeric_limits<s32>::max)() - 1) {
         return ResultNpadHandlerOverflow;
     }
     ref_counter++;
diff --git a/src/hid_core/resources/abstracted_pad/abstract_pad.cpp b/src/hid_core/resources/abstracted_pad/abstract_pad.cpp
index 435b095f02..39906fe33f 100644
--- a/src/hid_core/resources/abstracted_pad/abstract_pad.cpp
+++ b/src/hid_core/resources/abstracted_pad/abstract_pad.cpp
@@ -68,7 +68,7 @@ void AbstractPad::SetNpadId(Core::HID::NpadIdType npad_id) {
 }
 
 Result AbstractPad::Activate() {
-    if (ref_counter == std::numeric_limits<s32>::max() - 1) {
+    if (ref_counter == (std::numeric_limits<s32>::max)() - 1) {
         return ResultNpadHandlerOverflow;
     }
 
diff --git a/src/hid_core/resources/abstracted_pad/abstract_pad_holder.cpp b/src/hid_core/resources/abstracted_pad/abstract_pad_holder.cpp
index 8334dc34f6..80f86459b9 100644
--- a/src/hid_core/resources/abstracted_pad/abstract_pad_holder.cpp
+++ b/src/hid_core/resources/abstracted_pad/abstract_pad_holder.cpp
@@ -73,7 +73,7 @@ u64 NpadAbstractedPadHolder::RemoveAbstractPadByAssignmentStyle(
 }
 
 u32 NpadAbstractedPadHolder::GetAbstractedPads(std::span<IAbstractedPad*> list) const {
-    u32 num_elements = std::min(static_cast<u32>(list.size()), list_size);
+    u32 num_elements = (std::min)(static_cast<u32>(list.size()), list_size);
     for (std::size_t i = 0; i < num_elements; i++) {
         list[i] = assignment_list[i].abstracted_pad;
     }
diff --git a/src/hid_core/resources/abstracted_pad/abstract_palma_handler.cpp b/src/hid_core/resources/abstracted_pad/abstract_palma_handler.cpp
index 04d276d617..c10d0c4070 100644
--- a/src/hid_core/resources/abstracted_pad/abstract_palma_handler.cpp
+++ b/src/hid_core/resources/abstracted_pad/abstract_palma_handler.cpp
@@ -25,7 +25,7 @@ void NpadAbstractPalmaHandler::SetPalmaResource(PalmaResource* resource) {
 }
 
 Result NpadAbstractPalmaHandler::IncrementRefCounter() {
-    if (ref_counter == std::numeric_limits<s32>::max() - 1) {
+    if (ref_counter == (std::numeric_limits<s32>::max)() - 1) {
         return ResultNpadHandlerOverflow;
     }
     ref_counter++;
diff --git a/src/hid_core/resources/abstracted_pad/abstract_properties_handler.cpp b/src/hid_core/resources/abstracted_pad/abstract_properties_handler.cpp
index 36b630c7f4..90c46cbe8c 100644
--- a/src/hid_core/resources/abstracted_pad/abstract_properties_handler.cpp
+++ b/src/hid_core/resources/abstracted_pad/abstract_properties_handler.cpp
@@ -38,7 +38,7 @@ Core::HID::NpadIdType NpadAbstractPropertiesHandler::GetNpadId() const {
 }
 
 Result NpadAbstractPropertiesHandler::IncrementRefCounter() {
-    if (ref_counter == std::numeric_limits<s32>::max() - 1) {
+    if (ref_counter == (std::numeric_limits<s32>::max)() - 1) {
         return ResultNpadHandlerOverflow;
     }
 
diff --git a/src/hid_core/resources/abstracted_pad/abstract_sixaxis_handler.cpp b/src/hid_core/resources/abstracted_pad/abstract_sixaxis_handler.cpp
index 0dde244ef8..10c00ef95c 100644
--- a/src/hid_core/resources/abstracted_pad/abstract_sixaxis_handler.cpp
+++ b/src/hid_core/resources/abstracted_pad/abstract_sixaxis_handler.cpp
@@ -33,7 +33,7 @@ void NpadAbstractSixAxisHandler::SetSixaxisResource(SixAxisResource* resource) {
 }
 
 Result NpadAbstractSixAxisHandler::IncrementRefCounter() {
-    if (ref_counter == std::numeric_limits<s32>::max() - 1) {
+    if (ref_counter == (std::numeric_limits<s32>::max)() - 1) {
         return ResultNpadHandlerOverflow;
     }
     ref_counter++;
diff --git a/src/hid_core/resources/abstracted_pad/abstract_vibration_handler.cpp b/src/hid_core/resources/abstracted_pad/abstract_vibration_handler.cpp
index ca64b0a437..07a35b2147 100644
--- a/src/hid_core/resources/abstracted_pad/abstract_vibration_handler.cpp
+++ b/src/hid_core/resources/abstracted_pad/abstract_vibration_handler.cpp
@@ -55,7 +55,7 @@ void NpadAbstractVibrationHandler::SetGcVibration(NpadGcVibrationDevice* gc_devi
 }
 
 Result NpadAbstractVibrationHandler::IncrementRefCounter() {
-    if (ref_counter == std::numeric_limits<s32>::max() - 1) {
+    if (ref_counter == (std::numeric_limits<s32>::max)() - 1) {
         return ResultNpadHandlerOverflow;
     }
     ref_counter++;
diff --git a/src/hid_core/resources/applet_resource.cpp b/src/hid_core/resources/applet_resource.cpp
index 243beb1c7f..a533ca4319 100644
--- a/src/hid_core/resources/applet_resource.cpp
+++ b/src/hid_core/resources/applet_resource.cpp
@@ -271,7 +271,7 @@ void AppletResource::EnablePalmaBoostMode(u64 aruid, bool is_enabled) {
 }
 
 Result AppletResource::RegisterCoreAppletResource() {
-    if (ref_counter == std::numeric_limits<s32>::max() - 1) {
+    if (ref_counter == (std::numeric_limits<s32>::max)() - 1) {
         return ResultAppletResourceOverflow;
     }
     if (ref_counter == 0) {
diff --git a/src/hid_core/resources/npad/npad.cpp b/src/hid_core/resources/npad/npad.cpp
index ca1ccd659c..f1f5ee5e9f 100644
--- a/src/hid_core/resources/npad/npad.cpp
+++ b/src/hid_core/resources/npad/npad.cpp
@@ -55,7 +55,7 @@ NPad::~NPad() {
 }
 
 Result NPad::Activate() {
-    if (ref_counter == std::numeric_limits<s32>::max() - 1) {
+    if (ref_counter == (std::numeric_limits<s32>::max)() - 1) {
         return ResultNpadResourceOverflow;
     }
 
diff --git a/src/hid_core/resources/npad/npad_data.cpp b/src/hid_core/resources/npad/npad_data.cpp
index 29ad5cb08c..fbcc3ef89a 100644
--- a/src/hid_core/resources/npad/npad_data.cpp
+++ b/src/hid_core/resources/npad/npad_data.cpp
@@ -46,7 +46,7 @@ Result NPadData::SetSupportedNpadIdType(std::span<const Core::HID::NpadIdType> l
 }
 
 std::size_t NPadData::GetSupportedNpadIdType(std::span<Core::HID::NpadIdType> out_list) const {
-    std::size_t out_size = std::min(supported_npad_id_types_count, out_list.size());
+    std::size_t out_size = (std::min)(supported_npad_id_types_count, out_list.size());
 
     memcpy(out_list.data(), supported_npad_id_types.data(),
            out_size * sizeof(Core::HID::NpadIdType));
diff --git a/src/hid_core/resources/npad/npad_resource.cpp b/src/hid_core/resources/npad/npad_resource.cpp
index 79f7d74c0c..21a514dce6 100644
--- a/src/hid_core/resources/npad/npad_resource.cpp
+++ b/src/hid_core/resources/npad/npad_resource.cpp
@@ -126,7 +126,7 @@ Result NPadResource::Activate(u64 aruid) {
 }
 
 Result NPadResource::Activate() {
-    if (ref_counter == std::numeric_limits<s32>::max() - 1) {
+    if (ref_counter == (std::numeric_limits<s32>::max)() - 1) {
         return ResultAppletResourceOverflow;
     }
     if (ref_counter == 0) {
diff --git a/src/hid_core/resources/palma/palma.cpp b/src/hid_core/resources/palma/palma.cpp
index be3d3c0edd..9210b456e2 100644
--- a/src/hid_core/resources/palma/palma.cpp
+++ b/src/hid_core/resources/palma/palma.cpp
@@ -62,7 +62,7 @@ Result Palma::GetPalmaOperationInfo(const PalmaConnectionHandle& handle,
     }
     operation_type = static_cast<PalmaOperationType>(operation.operation);
     std::memcpy(out_data.data(), operation.data.data(),
-                std::min(out_data.size(), operation.data.size()));
+                (std::min)(out_data.size(), operation.data.size()));
 
     return ResultSuccess;
 }
diff --git a/src/hid_core/resources/touch_screen/gesture_handler.cpp b/src/hid_core/resources/touch_screen/gesture_handler.cpp
index 4fcaf6ecf1..6309019796 100644
--- a/src/hid_core/resources/touch_screen/gesture_handler.cpp
+++ b/src/hid_core/resources/touch_screen/gesture_handler.cpp
@@ -16,7 +16,7 @@ GestureHandler::~GestureHandler() {}
 
 void GestureHandler::SetTouchState(std::span<TouchState> touch_state, u32 count, s64 timestamp) {
     gesture = {};
-    gesture.active_points = std::min(MaxPoints, static_cast<std::size_t>(count));
+    gesture.active_points = (std::min)(MaxPoints, static_cast<std::size_t>(count));
 
     for (size_t id = 0; id < gesture.active_points; ++id) {
         const auto& [active_x, active_y] = touch_state[id].position;
diff --git a/src/hid_core/resources/touch_screen/touch_screen_resource.cpp b/src/hid_core/resources/touch_screen/touch_screen_resource.cpp
index 79ddaa4dfa..51b94b2466 100644
--- a/src/hid_core/resources/touch_screen/touch_screen_resource.cpp
+++ b/src/hid_core/resources/touch_screen/touch_screen_resource.cpp
@@ -25,8 +25,8 @@ TouchResource::~TouchResource() {
 };
 
 Result TouchResource::ActivateTouch() {
-    if (global_ref_counter == std::numeric_limits<s32>::max() - 1 ||
-        touch_ref_counter == std::numeric_limits<s32>::max() - 1) {
+    if (global_ref_counter == (std::numeric_limits<s32>::max)() - 1 ||
+        touch_ref_counter == (std::numeric_limits<s32>::max)() - 1) {
         return ResultTouchOverflow;
     }
 
@@ -91,8 +91,8 @@ Result TouchResource::ActivateTouch(u64 aruid) {
 }
 
 Result TouchResource::ActivateGesture() {
-    if (global_ref_counter == std::numeric_limits<s32>::max() - 1 ||
-        gesture_ref_counter == std::numeric_limits<s32>::max() - 1) {
+    if (global_ref_counter == (std::numeric_limits<s32>::max)() - 1 ||
+        gesture_ref_counter == (std::numeric_limits<s32>::max)() - 1) {
         return ResultGestureOverflow;
     }
 
diff --git a/src/input_common/drivers/mouse.cpp b/src/input_common/drivers/mouse.cpp
index 4af2dd36f5..34bf877bf5 100644
--- a/src/input_common/drivers/mouse.cpp
+++ b/src/input_common/drivers/mouse.cpp
@@ -102,11 +102,11 @@ void Mouse::UpdateStickInput() {
     SetAxis(identifier, mouse_axis_y, -last_mouse_change.y);
 
     // Decay input over time
-    const float clamped_length = std::min(1.0f, length);
+    const float clamped_length = (std::min)(1.0f, length);
     const float decay_strength = Settings::values.mouse_panning_decay_strength.GetValue();
     const float decay = 1 - clamped_length * clamped_length * decay_strength * 0.01f;
     const float min_decay = Settings::values.mouse_panning_min_decay.GetValue();
-    const float clamped_decay = std::min(1 - min_decay / 100.0f, decay);
+    const float clamped_decay = (std::min)(1 - min_decay / 100.0f, decay);
     last_mouse_change *= clamped_decay;
 }
 
diff --git a/src/input_common/drivers/sdl_driver.cpp b/src/input_common/drivers/sdl_driver.cpp
index 51169c6f2b..972abec9fe 100644
--- a/src/input_common/drivers/sdl_driver.cpp
+++ b/src/input_common/drivers/sdl_driver.cpp
@@ -120,7 +120,7 @@ public:
         f32 low_frequency_scale = 1.0f;
         if (vibration.low_frequency > low_start_sensitivity_limit) {
             low_frequency_scale =
-                std::max(1.0f - (vibration.low_frequency - low_start_sensitivity_limit) /
+                (std::max)(1.0f - (vibration.low_frequency - low_start_sensitivity_limit) /
                                     low_width_sensitivity_limit,
                          0.3f);
         }
@@ -129,7 +129,7 @@ public:
         f32 high_frequency_scale = 1.0f;
         if (vibration.high_frequency > high_start_sensitivity_limit) {
             high_frequency_scale =
-                std::max(1.0f - (vibration.high_frequency - high_start_sensitivity_limit) /
+                (std::max)(1.0f - (vibration.high_frequency - high_start_sensitivity_limit) /
                                     high_width_sensitivity_limit,
                          0.3f);
         }
diff --git a/src/input_common/drivers/udp_client.cpp b/src/input_common/drivers/udp_client.cpp
index d483cd3490..df1819904b 100644
--- a/src/input_common/drivers/udp_client.cpp
+++ b/src/input_common/drivers/udp_client.cpp
@@ -615,8 +615,8 @@ CalibrationConfigurationJob::CalibrationConfigurationJob(
                                     }
                                     LOG_DEBUG(Input, "Current touch: {} {}", data.touch[0].x,
                                               data.touch[0].y);
-                                    min_x = std::min(min_x, static_cast<u16>(data.touch[0].x));
-                                    min_y = std::min(min_y, static_cast<u16>(data.touch[0].y));
+                                    min_x = (std::min)(min_x, static_cast<u16>(data.touch[0].x));
+                                    min_y = (std::min)(min_y, static_cast<u16>(data.touch[0].y));
                                     if (current_status == Status::Ready) {
                                         // First touch - min data (min_x/min_y)
                                         current_status = Status::Stage1Completed;
diff --git a/src/input_common/helpers/joycon_protocol/calibration.cpp b/src/input_common/helpers/joycon_protocol/calibration.cpp
index 1300ecaf53..057bf29f71 100644
--- a/src/input_common/helpers/joycon_protocol/calibration.cpp
+++ b/src/input_common/helpers/joycon_protocol/calibration.cpp
@@ -140,8 +140,8 @@ Common::Input::DriverResult CalibrationProtocol::GetRingCalibration(RingCalibrat
         ring_data_min = current_value - DefaultRingRange;
         ring_data_default = current_value;
     }
-    ring_data_max = std::max(ring_data_max, current_value);
-    ring_data_min = std::min(ring_data_min, current_value);
+    ring_data_max = (std::max)(ring_data_max, current_value);
+    ring_data_min = (std::min)(ring_data_min, current_value);
     calibration = {
         .default_value = ring_data_default,
         .max_value = ring_data_max,
diff --git a/src/input_common/helpers/joycon_protocol/nfc.cpp b/src/input_common/helpers/joycon_protocol/nfc.cpp
index db83f9ef48..bfdaa74a62 100644
--- a/src/input_common/helpers/joycon_protocol/nfc.cpp
+++ b/src/input_common/helpers/joycon_protocol/nfc.cpp
@@ -327,7 +327,7 @@ Common::Input::DriverResult NfcProtocol::IsTagInRange(TagFoundData& data,
              (output.mcu_data[6] != 0x09 && output.mcu_data[6] != 0x04));
 
     data.type = output.mcu_data[12];
-    data.uuid_size = std::min(output.mcu_data[14], static_cast<u8>(sizeof(TagUUID)));
+    data.uuid_size = (std::min)(output.mcu_data[14], static_cast<u8>(sizeof(TagUUID)));
     memcpy(data.uuid.data(), output.mcu_data.data() + 15, data.uuid.size());
 
     return Common::Input::DriverResult::Success;
@@ -433,7 +433,7 @@ Common::Input::DriverResult NfcProtocol::WriteAmiiboData(const TagUUID& tag_uuid
     // Send Data. Nfc buffer size is 31, Send the data in smaller packages
     while (current_position < buffer.size() && tries++ < timeout_limit) {
         const std::size_t next_position =
-            std::min(current_position + sizeof(NFCRequestState::raw_data), buffer.size());
+            (std::min)(current_position + sizeof(NFCRequestState::raw_data), buffer.size());
         const std::size_t block_size = next_position - current_position;
         const bool is_last_packet = block_size < sizeof(NFCRequestState::raw_data);
 
@@ -479,7 +479,7 @@ Common::Input::DriverResult NfcProtocol::GetMifareData(
     // Send data request. Nfc buffer size is 31, Send the data in smaller packages
     while (current_position < buffer.size() && tries++ < timeout_limit) {
         const std::size_t next_position =
-            std::min(current_position + sizeof(NFCRequestState::raw_data), buffer.size());
+            (std::min)(current_position + sizeof(NFCRequestState::raw_data), buffer.size());
         const std::size_t block_size = next_position - current_position;
         const bool is_last_packet = block_size < sizeof(NFCRequestState::raw_data);
 
@@ -559,7 +559,7 @@ Common::Input::DriverResult NfcProtocol::WriteMifareData(
     // Send data request. Nfc buffer size is 31, Send the data in smaller packages
     while (current_position < buffer.size() && tries++ < timeout_limit) {
         const std::size_t next_position =
-            std::min(current_position + sizeof(NFCRequestState::raw_data), buffer.size());
+            (std::min)(current_position + sizeof(NFCRequestState::raw_data), buffer.size());
         const std::size_t block_size = next_position - current_position;
         const bool is_last_packet = block_size < sizeof(NFCRequestState::raw_data);
 
@@ -731,7 +731,7 @@ Common::Input::DriverResult NfcProtocol::SendWriteDataAmiiboRequest(MCUCommandRe
                                                                     u8 block_id,
                                                                     bool is_last_packet,
                                                                     std::span<const u8> data) {
-    const auto data_size = std::min(data.size(), sizeof(NFCRequestState::raw_data));
+    const auto data_size = (std::min)(data.size(), sizeof(NFCRequestState::raw_data));
     NFCRequestState request{
         .command_argument = NFCCommand::WriteNtag,
         .block_id = block_id,
@@ -754,7 +754,7 @@ Common::Input::DriverResult NfcProtocol::SendWriteDataAmiiboRequest(MCUCommandRe
 Common::Input::DriverResult NfcProtocol::SendReadDataMifareRequest(MCUCommandResponse& output,
                                                                    u8 block_id, bool is_last_packet,
                                                                    std::span<const u8> data) {
-    const auto data_size = std::min(data.size(), sizeof(NFCRequestState::raw_data));
+    const auto data_size = (std::min)(data.size(), sizeof(NFCRequestState::raw_data));
     NFCRequestState request{
         .command_argument = NFCCommand::Mifare,
         .block_id = block_id,
diff --git a/src/input_common/helpers/joycon_protocol/rumble.cpp b/src/input_common/helpers/joycon_protocol/rumble.cpp
index 9fd0b84708..db3420dc0b 100644
--- a/src/input_common/helpers/joycon_protocol/rumble.cpp
+++ b/src/input_common/helpers/joycon_protocol/rumble.cpp
@@ -29,7 +29,7 @@ Common::Input::DriverResult RumbleProtocol::SendVibration(const VibrationValue&
 
     // Protect joycons from damage from strong vibrations
     const f32 clamp_amplitude =
-        1.0f / std::max(1.0f, vibration.high_amplitude + vibration.low_amplitude);
+        1.0f / (std::max)(1.0f, vibration.high_amplitude + vibration.low_amplitude);
 
     const u16 encoded_high_frequency = EncodeHighFrequency(vibration.high_frequency);
     const u8 encoded_high_amplitude =
diff --git a/src/network/room.cpp b/src/network/room.cpp
index 99dcf0c3b4..1a3ad75d2b 100644
--- a/src/network/room.cpp
+++ b/src/network/room.cpp
@@ -951,7 +951,7 @@ void Room::RoomImpl::HandleChatPacket(const ENetEvent* event) {
     }
 
     // Limit the size of chat messages to MaxMessageSize
-    message.resize(std::min(static_cast<u32>(message.size()), MaxMessageSize));
+    message.resize((std::min)(static_cast<u32>(message.size()), MaxMessageSize));
 
     Packet out_packet;
     out_packet.Write(static_cast<u8>(IdChatMessage));
diff --git a/src/shader_recompiler/backend/glasm/reg_alloc.cpp b/src/shader_recompiler/backend/glasm/reg_alloc.cpp
index 3919d63268..8cd20a2f5c 100644
--- a/src/shader_recompiler/backend/glasm/reg_alloc.cpp
+++ b/src/shader_recompiler/backend/glasm/reg_alloc.cpp
@@ -123,7 +123,7 @@ Id RegAlloc::Alloc(bool is_long) {
             if (use[reg]) {
                 continue;
             }
-            num_regs = std::max(num_regs, reg + 1);
+            num_regs = (std::max)(num_regs, reg + 1);
             use[reg] = true;
             Id ret{};
             ret.is_valid.Assign(1);
diff --git a/src/shader_recompiler/backend/glsl/emit_glsl_integer.cpp b/src/shader_recompiler/backend/glsl/emit_glsl_integer.cpp
index 49397c9b2e..99ed4cbc19 100644
--- a/src/shader_recompiler/backend/glsl/emit_glsl_integer.cpp
+++ b/src/shader_recompiler/backend/glsl/emit_glsl_integer.cpp
@@ -39,7 +39,7 @@ void EmitIAdd32(EmitContext& ctx, IR::Inst& inst, std::string_view a, std::strin
     // which may be overwritten by the result of the addition
     if (IR::Inst * overflow{inst.GetAssociatedPseudoOperation(IR::Opcode::GetOverflowFromOp)}) {
         // https://stackoverflow.com/questions/55468823/how-to-detect-integer-overflow-in-c
-        constexpr u32 s32_max{static_cast<u32>(std::numeric_limits<s32>::max())};
+        constexpr u32 s32_max{static_cast<u32>((std::numeric_limits<s32>::max)())};
         const auto sub_a{fmt::format("{}u-{}", s32_max, a)};
         const auto positive_result{fmt::format("int({})>int({})", b, sub_a)};
         const auto negative_result{fmt::format("int({})<int({})", b, sub_a)};
diff --git a/src/shader_recompiler/backend/glsl/glsl_emit_context.cpp b/src/shader_recompiler/backend/glsl/glsl_emit_context.cpp
index c5ac7b8f2a..af4b6c41e7 100644
--- a/src/shader_recompiler/backend/glsl/glsl_emit_context.cpp
+++ b/src/shader_recompiler/backend/glsl/glsl_emit_context.cpp
@@ -314,9 +314,9 @@ EmitContext::EmitContext(IR::Program& program, Bindings& bindings, const Profile
         break;
     case Stage::Compute:
         stage_name = "cs";
-        const u32 local_x{std::max(program.workgroup_size[0], 1u)};
-        const u32 local_y{std::max(program.workgroup_size[1], 1u)};
-        const u32 local_z{std::max(program.workgroup_size[2], 1u)};
+        const u32 local_x{(std::max)(program.workgroup_size[0], 1u)};
+        const u32 local_y{(std::max)(program.workgroup_size[1], 1u)};
+        const u32 local_z{(std::max)(program.workgroup_size[2], 1u)};
         header += fmt::format("layout(local_size_x={},local_size_y={},local_size_z={}) in;",
                               local_x, local_y, local_z);
         break;
diff --git a/src/shader_recompiler/backend/glsl/var_alloc.cpp b/src/shader_recompiler/backend/glsl/var_alloc.cpp
index ff6e2d5d5b..d70eefb6ab 100644
--- a/src/shader_recompiler/backend/glsl/var_alloc.cpp
+++ b/src/shader_recompiler/backend/glsl/var_alloc.cpp
@@ -155,7 +155,7 @@ Id VarAlloc::Alloc(GlslVarType type) {
         if (use_tracker.var_use[var]) {
             continue;
         }
-        use_tracker.num_used = std::max(use_tracker.num_used, var + 1);
+        use_tracker.num_used = (std::max)(use_tracker.num_used, var + 1);
         use_tracker.var_use[var] = true;
         Id ret{};
         ret.is_valid.Assign(1);
diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp
index 5dab8b62ea..680bc9aae3 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp
+++ b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp
@@ -366,7 +366,7 @@ Id EmitGetAttribute(EmitContext& ctx, IR::Attribute attr, Id vertex) {
         return ctx.OpBitcast(ctx.F32[1], ctx.OpLoad(ctx.U32[1], ctx.draw_index));
     case IR::Attribute::FrontFace:
         return ctx.OpSelect(ctx.F32[1], ctx.OpLoad(ctx.U1, ctx.front_face),
-                            ctx.OpBitcast(ctx.F32[1], ctx.Const(std::numeric_limits<u32>::max())),
+                            ctx.OpBitcast(ctx.F32[1], ctx.Const((std::numeric_limits<u32>::max)())),
                             ctx.f32_zero_value);
     case IR::Attribute::PointSpriteS:
         return ctx.OpLoad(ctx.F32[1],
diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_integer.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_integer.cpp
index 960bdea6f1..bf47d4ee96 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv_integer.cpp
+++ b/src/shader_recompiler/backend/spirv/emit_spirv_integer.cpp
@@ -42,7 +42,7 @@ Id EmitIAdd32(EmitContext& ctx, IR::Inst* inst, Id a, Id b) {
     SetSignFlag(ctx, inst, result);
     if (IR::Inst * overflow{inst->GetAssociatedPseudoOperation(IR::Opcode::GetOverflowFromOp)}) {
         // https://stackoverflow.com/questions/55468823/how-to-detect-integer-overflow-in-c
-        constexpr u32 s32_max{static_cast<u32>(std::numeric_limits<s32>::max())};
+        constexpr u32 s32_max{static_cast<u32>((std::numeric_limits<s32>::max)())};
         const Id is_positive{ctx.OpSGreaterThanEqual(ctx.U1, a, ctx.u32_zero_value)};
         const Id sub_a{ctx.OpISub(ctx.U32[1], ctx.Const(s32_max), a)};
 
diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
index 388ddce2c8..745dead6c4 100644
--- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
+++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
@@ -1593,7 +1593,7 @@ void EmitContext::DefineOutputs(const IR::Program& program) {
             throw NotImplementedException("Storing ClipDistance in fragment stage");
         }
         if (profile.max_user_clip_distances > 0) {
-            const u32 used{std::min(profile.max_user_clip_distances, 8u)};
+            const u32 used{(std::min)(profile.max_user_clip_distances, 8u)};
             const std::array<Id, 8> zero{f32_zero_value, f32_zero_value, f32_zero_value,
                                          f32_zero_value, f32_zero_value, f32_zero_value,
                                          f32_zero_value, f32_zero_value};
diff --git a/src/shader_recompiler/frontend/maxwell/decode.cpp b/src/shader_recompiler/frontend/maxwell/decode.cpp
index 47111a0501..5afe8cbb14 100644
--- a/src/shader_recompiler/frontend/maxwell/decode.cpp
+++ b/src/shader_recompiler/frontend/maxwell/decode.cpp
@@ -73,7 +73,7 @@ constexpr auto ENCODINGS{SortedEncodings()};
 constexpr int WidestLeftBits() {
     int bits{64};
     for (const InstEncoding& encoding : ENCODINGS) {
-        bits = std::min(bits, std::countr_zero(encoding.mask_value.mask));
+        bits = (std::min)(bits, std::countr_zero(encoding.mask_value.mask));
     }
     return 64 - bits;
 }
@@ -87,7 +87,7 @@ constexpr size_t ToFastLookupIndex(u64 value) {
 constexpr size_t FastLookupSize() {
     size_t max_width{};
     for (const InstEncoding& encoding : ENCODINGS) {
-        max_width = std::max(max_width, ToFastLookupIndex(encoding.mask_value.mask));
+        max_width = (std::max)(max_width, ToFastLookupIndex(encoding.mask_value.mask));
     }
     return max_width + 1;
 }
diff --git a/src/shader_recompiler/frontend/maxwell/translate/impl/floating_point_conversion_integer.cpp b/src/shader_recompiler/frontend/maxwell/translate/impl/floating_point_conversion_integer.cpp
index 85c18d9422..21d3c2fe53 100644
--- a/src/shader_recompiler/frontend/maxwell/translate/impl/floating_point_conversion_integer.cpp
+++ b/src/shader_recompiler/frontend/maxwell/translate/impl/floating_point_conversion_integer.cpp
@@ -60,28 +60,28 @@ std::pair<f64, f64> ClampBounds(DestFormat format, bool is_signed) {
     if (is_signed) {
         switch (format) {
         case DestFormat::I16:
-            return {static_cast<f64>(std::numeric_limits<s16>::max()),
-                    static_cast<f64>(std::numeric_limits<s16>::min())};
+            return {static_cast<f64>((std::numeric_limits<s16>::max)()),
+                    static_cast<f64>((std::numeric_limits<s16>::min)())};
         case DestFormat::I32:
-            return {static_cast<f64>(std::numeric_limits<s32>::max()),
-                    static_cast<f64>(std::numeric_limits<s32>::min())};
+            return {static_cast<f64>((std::numeric_limits<s32>::max)()),
+                    static_cast<f64>((std::numeric_limits<s32>::min)())};
         case DestFormat::I64:
-            return {static_cast<f64>(std::numeric_limits<s64>::max()),
-                    static_cast<f64>(std::numeric_limits<s64>::min())};
+            return {static_cast<f64>((std::numeric_limits<s64>::max)()),
+                    static_cast<f64>((std::numeric_limits<s64>::min)())};
         default:
             break;
         }
     } else {
         switch (format) {
         case DestFormat::I16:
-            return {static_cast<f64>(std::numeric_limits<u16>::max()),
-                    static_cast<f64>(std::numeric_limits<u16>::min())};
+            return {static_cast<f64>((std::numeric_limits<u16>::max)()),
+                    static_cast<f64>((std::numeric_limits<u16>::min)())};
         case DestFormat::I32:
-            return {static_cast<f64>(std::numeric_limits<u32>::max()),
-                    static_cast<f64>(std::numeric_limits<u32>::min())};
+            return {static_cast<f64>((std::numeric_limits<u32>::max)()),
+                    static_cast<f64>((std::numeric_limits<u32>::min)())};
         case DestFormat::I64:
-            return {static_cast<f64>(std::numeric_limits<u64>::max()),
-                    static_cast<f64>(std::numeric_limits<u64>::min())};
+            return {static_cast<f64>((std::numeric_limits<u64>::max)()),
+                    static_cast<f64>((std::numeric_limits<u64>::min)())};
         default:
             break;
         }
diff --git a/src/shader_recompiler/frontend/maxwell/translate/impl/integer_floating_point_conversion.cpp b/src/shader_recompiler/frontend/maxwell/translate/impl/integer_floating_point_conversion.cpp
index a2dc0f4a6e..9631fd9dfe 100644
--- a/src/shader_recompiler/frontend/maxwell/translate/impl/integer_floating_point_conversion.cpp
+++ b/src/shader_recompiler/frontend/maxwell/translate/impl/integer_floating_point_conversion.cpp
@@ -114,9 +114,9 @@ void I2F(TranslatorVisitor& v, u64 insn, IR::U32U64 src) {
             // Only negate if the input isn't the lowest value
             IR::U1 is_least;
             if (src_bitsize == 64) {
-                is_least = v.ir.IEqual(src, v.ir.Imm64(std::numeric_limits<s64>::min()));
+                is_least = v.ir.IEqual(src, v.ir.Imm64((std::numeric_limits<s64>::min)()));
             } else if (src_bitsize == 32) {
-                is_least = v.ir.IEqual(src, v.ir.Imm32(std::numeric_limits<s32>::min()));
+                is_least = v.ir.IEqual(src, v.ir.Imm32((std::numeric_limits<s32>::min)()));
             } else {
                 const IR::U32 least_value{v.ir.Imm32(-(1 << (src_bitsize - 1)))};
                 is_least = v.ir.IEqual(src, least_value);
diff --git a/src/shader_recompiler/frontend/maxwell/translate_program.cpp b/src/shader_recompiler/frontend/maxwell/translate_program.cpp
index 321ea625bc..a9559b0d7a 100644
--- a/src/shader_recompiler/frontend/maxwell/translate_program.cpp
+++ b/src/shader_recompiler/frontend/maxwell/translate_program.cpp
@@ -336,7 +336,7 @@ IR::Program MergeDualVertexPrograms(IR::Program& vertex_a, IR::Program& vertex_b
     }
     result.stage = Stage::VertexB;
     result.info = vertex_a.info;
-    result.local_memory_size = std::max(vertex_a.local_memory_size, vertex_b.local_memory_size);
+    result.local_memory_size = (std::max)(vertex_a.local_memory_size, vertex_b.local_memory_size);
     result.info.loads.mask |= vertex_b.info.loads.mask;
     result.info.stores.mask |= vertex_b.info.stores.mask;
 
diff --git a/src/shader_recompiler/ir_opt/collect_shader_info_pass.cpp b/src/shader_recompiler/ir_opt/collect_shader_info_pass.cpp
index cb82a326c1..1fa39034a8 100644
--- a/src/shader_recompiler/ir_opt/collect_shader_info_pass.cpp
+++ b/src/shader_recompiler/ir_opt/collect_shader_info_pass.cpp
@@ -509,7 +509,7 @@ void VisitUsages(Info& info, IR::Inst& inst) {
             u32 element_size = GetElementSize(info.used_constant_buffer_types, inst.GetOpcode());
             u32& size{info.constant_buffer_used_sizes[index.U32()]};
             if (offset.IsImmediate()) {
-                size = Common::AlignUp(std::max(size, offset.U32() + element_size), 16u);
+                size = Common::AlignUp((std::max)(size, offset.U32() + element_size), 16u);
             } else {
                 size = 0x10'000;
             }
diff --git a/src/shader_recompiler/ir_opt/constant_propagation_pass.cpp b/src/shader_recompiler/ir_opt/constant_propagation_pass.cpp
index 12d7b2d7fa..160c1aaea5 100644
--- a/src/shader_recompiler/ir_opt/constant_propagation_pass.cpp
+++ b/src/shader_recompiler/ir_opt/constant_propagation_pass.cpp
@@ -905,7 +905,7 @@ void FoldConstBuffer(Environment& env, IR::Block& block, IR::Inst& inst) {
 }
 
 void FoldDriverConstBuffer(Environment& env, IR::Block& block, IR::Inst& inst, u32 which_bank,
-                           u32 offset_start = 0, u32 offset_end = std::numeric_limits<u16>::max()) {
+                           u32 offset_start = 0, u32 offset_end = (std::numeric_limits<u16>::max)()) {
     const IR::Value bank{inst.Arg(0)};
     const IR::Value offset{inst.Arg(1)};
     if (!bank.IsImmediate() || !offset.IsImmediate()) {
diff --git a/src/shader_recompiler/ir_opt/texture_pass.cpp b/src/shader_recompiler/ir_opt/texture_pass.cpp
index afd880526a..9f04c0afaf 100644
--- a/src/shader_recompiler/ir_opt/texture_pass.cpp
+++ b/src/shader_recompiler/ir_opt/texture_pass.cpp
@@ -517,11 +517,11 @@ void PatchTexelFetch(IR::Block& block, IR::Inst& inst, TexturePixelFormat pixel_
         case TexturePixelFormat::A8B8G8R8_SNORM:
         case TexturePixelFormat::R8G8_SNORM:
         case TexturePixelFormat::R8_SNORM:
-            return 1.f / std::numeric_limits<char>::max();
+            return 1.f / (std::numeric_limits<char>::max)();
         case TexturePixelFormat::R16G16B16A16_SNORM:
         case TexturePixelFormat::R16G16_SNORM:
         case TexturePixelFormat::R16_SNORM:
-            return 1.f / std::numeric_limits<short>::max();
+            return 1.f / (std::numeric_limits<short>::max)();
         default:
             throw InvalidArgument("Invalid texture pixel format");
         }
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index e0f7f82fbe..3c2473266a 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -373,7 +373,7 @@ else()
     set_source_files_properties(vulkan_common/vma.cpp PROPERTIES COMPILE_OPTIONS "-Wno-conversion;-Wno-unused-variable;-Wno-unused-parameter;-Wno-missing-field-initializers")
 
     # Get around GCC failing with intrinsics in Debug
-    if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_BUILD_TYPE MATCHES "Debug")
+    if (CXX_GCC AND CMAKE_BUILD_TYPE MATCHES "Debug")
         set_source_files_properties(host1x/vic.cpp PROPERTIES COMPILE_OPTIONS "-O2")
     endif()
 endif()
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index 0cd6861b6d..94ef1a48df 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -36,14 +36,14 @@ BufferCache<P>::BufferCache(Tegra::MaxwellDeviceMemoryManager& device_memory_, R
     const s64 device_local_memory = static_cast<s64>(runtime.GetDeviceLocalMemory());
     const s64 min_spacing_expected = device_local_memory - 1_GiB;
     const s64 min_spacing_critical = device_local_memory - 512_MiB;
-    const s64 mem_threshold = std::min(device_local_memory, TARGET_THRESHOLD);
+    const s64 mem_threshold = (std::min)(device_local_memory, TARGET_THRESHOLD);
     const s64 min_vacancy_expected = (6 * mem_threshold) / 10;
     const s64 min_vacancy_critical = (2 * mem_threshold) / 10;
     minimum_memory = static_cast<u64>(
-        std::max(std::min(device_local_memory - min_vacancy_expected, min_spacing_expected),
+        (std::max)((std::min)(device_local_memory - min_vacancy_expected, min_spacing_expected),
                  DEFAULT_EXPECTED_MEMORY));
     critical_memory = static_cast<u64>(
-        std::max(std::min(device_local_memory - min_vacancy_critical, min_spacing_critical),
+        (std::max)((std::min)(device_local_memory - min_vacancy_critical, min_spacing_critical),
                  DEFAULT_CRITICAL_MEMORY));
 }
 
@@ -553,8 +553,8 @@ void BufferCache<P>::CommitAsyncFlushesHigh() {
             ForEachBufferInRange(device_addr, size, [&](BufferId buffer_id, Buffer& buffer) {
                 const DAddr buffer_start = buffer.CpuAddr();
                 const DAddr buffer_end = buffer_start + buffer.SizeBytes();
-                const DAddr new_start = std::max(buffer_start, device_addr);
-                const DAddr new_end = std::min(buffer_end, device_addr + size);
+                const DAddr new_start = (std::max)(buffer_start, device_addr);
+                const DAddr new_end = (std::min)(buffer_end, device_addr + size);
                 memory_tracker.ForEachDownloadRange(
                     new_start, new_end - new_start, false,
                     [&](u64 device_addr_out, u64 range_size) {
@@ -574,7 +574,7 @@ void BufferCache<P>::CommitAsyncFlushesHigh() {
                             constexpr u64 align = 64ULL;
                             constexpr u64 mask = ~(align - 1ULL);
                             total_size_bytes += (new_size + align - 1) & mask;
-                            largest_copy = std::max(largest_copy, new_size);
+                            largest_copy = (std::max)(largest_copy, new_size);
                         };
 
                         gpu_modified_ranges.ForEachInRange(device_addr_out, range_size,
@@ -729,8 +729,8 @@ void BufferCache<P>::BindHostVertexBuffers() {
         }
         flags[Dirty::VertexBuffer0 + index] = false;
 
-        host_bindings.min_index = std::min(host_bindings.min_index, index);
-        host_bindings.max_index = std::max(host_bindings.max_index, index);
+        host_bindings.min_index = (std::min)(host_bindings.min_index, index);
+        host_bindings.max_index = (std::max)(host_bindings.max_index, index);
         any_valid = true;
     }
 
@@ -789,7 +789,7 @@ void BufferCache<P>::BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32
                                                    bool needs_bind) {
     const Binding& binding = channel_state->uniform_buffers[stage][index];
     const DAddr device_addr = binding.device_addr;
-    const u32 size = std::min(binding.size, (*channel_state->uniform_buffer_sizes)[stage][index]);
+    const u32 size = (std::min)(binding.size, (*channel_state->uniform_buffer_sizes)[stage][index]);
     Buffer& buffer = slot_buffers[binding.buffer_id];
     TouchBuffer(buffer, binding.buffer_id);
     const bool use_fast_buffer = binding.buffer_id != NULL_BUFFER_ID &&
@@ -956,7 +956,7 @@ void BufferCache<P>::BindHostComputeUniformBuffers() {
         Buffer& buffer = slot_buffers[binding.buffer_id];
         TouchBuffer(buffer, binding.buffer_id);
         const u32 size =
-            std::min(binding.size, (*channel_state->compute_uniform_buffer_sizes)[index]);
+            (std::min)(binding.size, (*channel_state->compute_uniform_buffer_sizes)[index]);
         SynchronizeBuffer(buffer, binding.device_addr, size);
 
         const u32 offset = buffer.Offset(binding.device_addr);
@@ -1090,7 +1090,7 @@ void BufferCache<P>::UpdateIndexBuffer() {
     const u32 address_size = static_cast<u32>(gpu_addr_end - gpu_addr_begin);
     const u32 draw_size =
         (index_buffer_ref.count + index_buffer_ref.first) * index_buffer_ref.FormatSizeInBytes();
-    const u32 size = std::min(address_size, draw_size);
+    const u32 size = (std::min)(address_size, draw_size);
     if (size == 0 || !device_addr) {
         channel_state->index_buffer = NULL_BINDING;
         return;
@@ -1459,7 +1459,7 @@ bool BufferCache<P>::SynchronizeBuffer(Buffer& buffer, DAddr device_addr, u32 si
             .size = range_size,
         });
         total_size_bytes += range_size;
-        largest_copy = std::max(largest_copy, range_size);
+        largest_copy = (std::max)(largest_copy, range_size);
     });
     if (total_size_bytes == 0) {
         return true;
@@ -1594,7 +1594,7 @@ void BufferCache<P>::DownloadBufferMemory(Buffer& buffer, DAddr device_addr, u64
                 constexpr u64 align = 64ULL;
                 constexpr u64 mask = ~(align - 1ULL);
                 total_size_bytes += (new_size + align - 1) & mask;
-                largest_copy = std::max(largest_copy, new_size);
+                largest_copy = (std::max)(largest_copy, new_size);
             };
 
             gpu_modified_ranges.ForEachInRange(device_addr_out, range_size, add_download);
@@ -1715,7 +1715,7 @@ Binding BufferCache<P>::StorageBufferBinding(GPUVAddr ssbo_addr, u32 cbuf_index,
         // cbufs, which do not store the sizes adjacent to the addresses, so use the fully
         // mapped buffer size for now.
         const u32 memory_layout_size = static_cast<u32>(gpu_memory->GetMemoryLayoutSize(gpu_addr));
-        return std::min(memory_layout_size, static_cast<u32>(8_MiB));
+        return (std::min)(memory_layout_size, static_cast<u32>(8_MiB));
     }();
     // Alignment only applies to the offset of the buffer
     const u32 alignment = runtime.GetStorageBufferAlignment();
diff --git a/src/video_core/buffer_cache/memory_tracker_base.h b/src/video_core/buffer_cache/memory_tracker_base.h
index c95eed1f62..fe68bdbf23 100644
--- a/src/video_core/buffer_cache/memory_tracker_base.h
+++ b/src/video_core/buffer_cache/memory_tracker_base.h
@@ -230,7 +230,7 @@ private:
         std::size_t remaining_size{size};
         std::size_t page_index{cpu_address >> HIGHER_PAGE_BITS};
         u64 page_offset{cpu_address & HIGHER_PAGE_MASK};
-        u64 begin = std::numeric_limits<u64>::max();
+        u64 begin = (std::numeric_limits<u64>::max)();
         u64 end = 0;
         while (remaining_size > 0) {
             const std::size_t copy_amount{
@@ -240,8 +240,8 @@ private:
                 auto [new_begin, new_end] = func(manager, page_offset, copy_amount);
                 if (new_begin != 0 || new_end != 0) {
                     const u64 base_address = page_index << HIGHER_PAGE_BITS;
-                    begin = std::min(new_begin + base_address, begin);
-                    end = std::max(new_end + base_address, end);
+                    begin = (std::min)(new_begin + base_address, begin);
+                    end = (std::max)(new_end + base_address, end);
                 }
             };
             if (manager) {
diff --git a/src/video_core/buffer_cache/word_manager.h b/src/video_core/buffer_cache/word_manager.h
index 3db9d8b423..8dc073240e 100644
--- a/src/video_core/buffer_cache/word_manager.h
+++ b/src/video_core/buffer_cache/word_manager.h
@@ -181,7 +181,7 @@ public:
 
     static u64 ExtractBits(u64 word, size_t page_start, size_t page_end) {
         constexpr size_t number_bits = sizeof(u64) * 8;
-        const size_t limit_page_end = number_bits - std::min(page_end, number_bits);
+        const size_t limit_page_end = number_bits - (std::min)(page_end, number_bits);
         u64 bits = (word >> page_start) << page_start;
         bits = (bits << limit_page_end) >> limit_page_end;
         return bits;
@@ -206,11 +206,11 @@ public:
         auto [start_word, start_page] = GetWordPage(start);
         auto [end_word, end_page] = GetWordPage(end + BYTES_PER_PAGE - 1ULL);
         const size_t num_words = NumWords();
-        start_word = std::min(start_word, num_words);
-        end_word = std::min(end_word, num_words);
+        start_word = (std::min)(start_word, num_words);
+        end_word = (std::min)(end_word, num_words);
         const size_t diff = end_word - start_word;
         end_word += (end_page + PAGES_PER_WORD - 1ULL) / PAGES_PER_WORD;
-        end_word = std::min(end_word, num_words);
+        end_word = (std::min)(end_word, num_words);
         end_page += diff * PAGES_PER_WORD;
         constexpr u64 base_mask{~0ULL};
         for (size_t word_index = start_word; word_index < end_word; word_index++) {
@@ -382,7 +382,7 @@ public:
         const std::span<const u64> state_words = words.template Span<type>();
         [[maybe_unused]] const std::span<const u64> untracked_words =
             words.template Span<Type::Untracked>();
-        u64 begin = std::numeric_limits<u64>::max();
+        u64 begin = (std::numeric_limits<u64>::max)();
         u64 end = 0;
         IterateWords(offset, size, [&](size_t index, u64 mask) {
             if constexpr (type == Type::GPU) {
@@ -395,7 +395,7 @@ public:
             const u64 local_page_begin = std::countr_zero(word);
             const u64 local_page_end = PAGES_PER_WORD - std::countl_zero(word);
             const u64 page_index = index * PAGES_PER_WORD;
-            begin = std::min(begin, page_index + local_page_begin);
+            begin = (std::min)(begin, page_index + local_page_begin);
             end = page_index + local_page_end;
         });
         static constexpr std::pair<u64, u64> EMPTY{0, 0};
diff --git a/src/video_core/control/channel_state_cache.h b/src/video_core/control/channel_state_cache.h
index 7480d60d13..038c5b8fd1 100644
--- a/src/video_core/control/channel_state_cache.h
+++ b/src/video_core/control/channel_state_cache.h
@@ -73,7 +73,7 @@ public:
     }
 
 protected:
-    static constexpr size_t UNSET_CHANNEL{std::numeric_limits<size_t>::max()};
+    static constexpr size_t UNSET_CHANNEL{(std::numeric_limits<size_t>::max)()};
 
     P* channel_state;
     size_t current_channel_id{UNSET_CHANNEL};
diff --git a/src/video_core/engines/engine_interface.h b/src/video_core/engines/engine_interface.h
index 54631ee6cc..93de389a90 100644
--- a/src/video_core/engines/engine_interface.h
+++ b/src/video_core/engines/engine_interface.h
@@ -37,7 +37,7 @@ public:
         ConsumeSinkImpl();
     }
 
-    std::bitset<std::numeric_limits<u16>::max()> execution_mask{};
+    std::bitset<(std::numeric_limits<u16>::max)()> execution_mask{};
     std::vector<std::pair<u32, u32>> method_sink{};
     bool current_dirty{};
     GPUVAddr current_dma_segment;
diff --git a/src/video_core/engines/engine_upload.cpp b/src/video_core/engines/engine_upload.cpp
index e5cc04ec4f..e6f34c7cca 100644
--- a/src/video_core/engines/engine_upload.cpp
+++ b/src/video_core/engines/engine_upload.cpp
@@ -30,7 +30,7 @@ void State::ProcessExec(const bool is_linear_) {
 }
 
 void State::ProcessData(const u32 data, const bool is_last_call) {
-    const u32 sub_copy_size = std::min(4U, copy_size - write_offset);
+    const u32 sub_copy_size = (std::min)(4U, copy_size - write_offset);
     std::memcpy(&inner_buffer[write_offset], &data, sub_copy_size);
     write_offset += sub_copy_size;
     if (!is_last_call) {
@@ -58,7 +58,7 @@ void State::ProcessData(std::span<const u8> read_buffer) {
         u32 x_elements = regs.line_length_in;
         u32 x_offset = regs.dest.x;
         const u32 bpp_shift = Common::FoldRight(
-            4U, [](u32 x, u32 y) { return std::min(x, static_cast<u32>(std::countr_zero(y))); },
+            4U, [](u32 x, u32 y) { return (std::min)(x, static_cast<u32>(std::countr_zero(y))); },
             width, x_elements, x_offset, static_cast<u32>(address));
         width >>= bpp_shift;
         x_elements >>= bpp_shift;
diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp
index 0d47b032c8..c63f908bcc 100644
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -245,15 +245,15 @@ u32 Maxwell3D::GetMaxCurrentVertices() {
         }
         const auto& attribute = regs.vertex_attrib_format[index];
         if (attribute.constant) {
-            num_vertices = std::max(num_vertices, 1U);
+            num_vertices = (std::max)(num_vertices, 1U);
             continue;
         }
         const auto& limit = regs.vertex_stream_limits[index];
         const GPUVAddr gpu_addr_begin = array.Address();
         const GPUVAddr gpu_addr_end = limit.Address() + 1;
         const u32 address_size = static_cast<u32>(gpu_addr_end - gpu_addr_begin);
-        num_vertices = std::max(
-            num_vertices, address_size / std::max(attribute.SizeInBytes(), array.stride.Value()));
+        num_vertices = (std::max)(
+            num_vertices, address_size / (std::max)(attribute.SizeInBytes(), array.stride.Value()));
         break;
     }
     return num_vertices;
@@ -262,9 +262,9 @@ u32 Maxwell3D::GetMaxCurrentVertices() {
 size_t Maxwell3D::EstimateIndexBufferSize() {
     GPUVAddr start_address = regs.index_buffer.StartAddress();
     GPUVAddr end_address = regs.index_buffer.EndAddress();
-    static constexpr std::array<size_t, 3> max_sizes = {std::numeric_limits<u8>::max(),
-                                                        std::numeric_limits<u16>::max(),
-                                                        std::numeric_limits<u32>::max()};
+    static constexpr std::array<size_t, 3> max_sizes = {(std::numeric_limits<u8>::max)(),
+                                                        (std::numeric_limits<u16>::max)(),
+                                                        (std::numeric_limits<u32>::max)()};
     const size_t byte_size = regs.index_buffer.FormatSizeInBytes();
     const size_t log2_byte_size = Common::Log2Ceil64(byte_size);
     const size_t cap{GetMaxCurrentVertices() * 4 * byte_size};
diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h
index 6b4f1c570e..ce0434f3d7 100644
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -1180,11 +1180,11 @@ public:
             }
 
             f32 GetX() const {
-                return std::max(0.0f, translate_x - std::fabs(scale_x));
+                return (std::max)(0.0f, translate_x - std::fabs(scale_x));
             }
 
             f32 GetY() const {
-                return std::max(0.0f, translate_y - std::fabs(scale_y));
+                return (std::max)(0.0f, translate_y - std::fabs(scale_y));
             }
 
             f32 GetWidth() const {
@@ -3091,7 +3091,7 @@ public:
     }
 
     struct DirtyState {
-        using Flags = std::bitset<std::numeric_limits<u8>::max()>;
+        using Flags = std::bitset<(std::numeric_limits<u8>::max)()>;
         using Table = std::array<u8, Regs::NUM_REGS>;
         using Tables = std::array<Table, 2>;
 
diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp
index a4b2c1098b..73a62db37d 100644
--- a/src/video_core/engines/maxwell_dma.cpp
+++ b/src/video_core/engines/maxwell_dma.cpp
@@ -198,7 +198,7 @@ void MaxwellDMA::CopyBlockLinearToPitch() {
     u32 bpp_shift = 0U;
     if (!is_remapping) {
         bpp_shift = Common::FoldRight(
-            4U, [](u32 x, u32 y) { return std::min(x, static_cast<u32>(std::countr_zero(y))); },
+            4U, [](u32 x, u32 y) { return (std::min)(x, static_cast<u32>(std::countr_zero(y))); },
             width, x_elements, x_offset, static_cast<u32>(regs.offset_in));
         width >>= bpp_shift;
         x_elements >>= bpp_shift;
@@ -261,7 +261,7 @@ void MaxwellDMA::CopyPitchToBlockLinear() {
     u32 bpp_shift = 0U;
     if (!is_remapping) {
         bpp_shift = Common::FoldRight(
-            4U, [](u32 x, u32 y) { return std::min(x, static_cast<u32>(std::countr_zero(y))); },
+            4U, [](u32 x, u32 y) { return (std::min)(x, static_cast<u32>(std::countr_zero(y))); },
             width, x_elements, x_offset, static_cast<u32>(regs.offset_out));
         width >>= bpp_shift;
         x_elements >>= bpp_shift;
@@ -312,7 +312,7 @@ void MaxwellDMA::CopyBlockLinearToBlockLinear() {
     u32 bpp_shift = 0U;
     if (!is_remapping) {
         bpp_shift = Common::FoldRight(
-            4U, [](u32 x, u32 y) { return std::min(x, static_cast<u32>(std::countr_zero(y))); },
+            4U, [](u32 x, u32 y) { return (std::min)(x, static_cast<u32>(std::countr_zero(y))); },
             src_width, dst_width, x_elements, src_x_offset, dst_x_offset,
             static_cast<u32>(regs.offset_in), static_cast<u32>(regs.offset_out));
         src_width >>= bpp_shift;
diff --git a/src/video_core/engines/sw_blitter/converter.cpp b/src/video_core/engines/sw_blitter/converter.cpp
index 2419b56321..785d209f98 100644
--- a/src/video_core/engines/sw_blitter/converter.cpp
+++ b/src/video_core/engines/sw_blitter/converter.cpp
@@ -771,7 +771,7 @@ private:
         };
         const auto to_fp_n = [](f32 base_value, size_t bits, size_t mantissa) {
             constexpr size_t fp32_mantissa_bits = 23;
-            u32 tmp_value = Common::BitCast<u32>(std::max(base_value, 0.0f));
+            u32 tmp_value = Common::BitCast<u32>((std::max)(base_value, 0.0f));
             size_t shift_towards = fp32_mantissa_bits - mantissa;
             return tmp_value >> shift_towards;
         };
diff --git a/src/video_core/host1x/codecs/h264.cpp b/src/video_core/host1x/codecs/h264.cpp
index 782d11d725..a0b13cbffc 100644
--- a/src/video_core/host1x/codecs/h264.cpp
+++ b/src/video_core/host1x/codecs/h264.cpp
@@ -117,7 +117,7 @@ std::span<const u8> H264::ComposeFrame() {
                            (current_context.h264_parameter_set.frame_mbs_only_flag ? 1 : 2);
 
     u32 max_num_ref_frames =
-        std::max(std::max(current_context.h264_parameter_set.num_refidx_l0_default_active,
+        (std::max)((std::max)(current_context.h264_parameter_set.num_refidx_l0_default_active,
                           current_context.h264_parameter_set.num_refidx_l1_default_active) +
                      1,
                  4);
diff --git a/src/video_core/host1x/codecs/vp9.cpp b/src/video_core/host1x/codecs/vp9.cpp
index c70d0a506f..7b3dbd7642 100644
--- a/src/video_core/host1x/codecs/vp9.cpp
+++ b/src/video_core/host1x/codecs/vp9.cpp
@@ -228,10 +228,10 @@ constexpr std::array<u8, 254> map_lut{
     std::size_t index{};
 
     if (old_prob * 2 <= 0xff) {
-        index = static_cast<std::size_t>(std::max(0, RecenterNonNeg(new_prob, old_prob) - 1));
+        index = static_cast<std::size_t>((std::max)(0, RecenterNonNeg(new_prob, old_prob) - 1));
     } else {
         index = static_cast<std::size_t>(
-            std::max(0, RecenterNonNeg(0xff - 1 - new_prob, 0xff - 1 - old_prob) - 1));
+            (std::max)(0, RecenterNonNeg(0xff - 1 - new_prob, 0xff - 1 - old_prob) - 1));
     }
 
     return static_cast<s32>(map_lut[index]);
diff --git a/src/video_core/host1x/vic.cpp b/src/video_core/host1x/vic.cpp
index 18b3077f9a..9c33370337 100644
--- a/src/video_core/host1x/vic.cpp
+++ b/src/video_core/host1x/vic.cpp
@@ -201,8 +201,8 @@ void Vic::ReadProgressiveY8__V8U8_N420(const SlotStruct& slot,
 
     slot_surface.resize_destructive(out_luma_width * out_luma_height);
 
-    const auto in_luma_width{std::min(frame->GetWidth(), static_cast<s32>(out_luma_width))};
-    const auto in_luma_height{std::min(frame->GetHeight(), static_cast<s32>(out_luma_height))};
+    const auto in_luma_width{(std::min)(frame->GetWidth(), static_cast<s32>(out_luma_width))};
+    const auto in_luma_height{(std::min)(frame->GetHeight(), static_cast<s32>(out_luma_height))};
     const auto in_luma_stride{frame->GetStride(0)};
 
     const auto in_chroma_stride{frame->GetStride(1)};
@@ -425,9 +425,9 @@ void Vic::ReadInterlacedY8__V8U8_N420(const SlotStruct& slot, std::span<const Pl
 
     slot_surface.resize_destructive(out_luma_width * out_luma_height);
 
-    const auto in_luma_width{std::min(frame->GetWidth(), static_cast<s32>(out_luma_width))};
+    const auto in_luma_width{(std::min)(frame->GetWidth(), static_cast<s32>(out_luma_width))};
     [[maybe_unused]] const auto in_luma_height{
-        std::min(frame->GetHeight(), static_cast<s32>(out_luma_height))};
+        (std::min)(frame->GetHeight(), static_cast<s32>(out_luma_height))};
     const auto in_luma_stride{frame->GetStride(0)};
 
     [[maybe_unused]] const auto in_chroma_width{(frame->GetWidth() + 1) / 2};
@@ -543,15 +543,15 @@ void Vic::Blend(const ConfigStruct& config, const SlotStruct& slot) {
     auto rect_top{add_one(config.output_config.target_rect_top.Value())};
     auto rect_bottom{add_one(config.output_config.target_rect_bottom.Value())};
 
-    rect_left = std::max(rect_left, dest_left);
-    rect_right = std::min(rect_right, dest_right);
-    rect_top = std::max(rect_top, dest_top);
-    rect_bottom = std::min(rect_bottom, dest_bottom);
+    rect_left = (std::max)(rect_left, dest_left);
+    rect_right = (std::min)(rect_right, dest_right);
+    rect_top = (std::max)(rect_top, dest_top);
+    rect_bottom = (std::min)(rect_bottom, dest_bottom);
 
-    source_left = std::max(source_left, rect_left);
-    source_right = std::min(source_right, rect_right);
-    source_top = std::max(source_top, rect_top);
-    source_bottom = std::min(source_bottom, rect_bottom);
+    source_left = (std::max)(source_left, rect_left);
+    source_right = (std::min)(source_right, rect_right);
+    source_top = (std::max)(source_top, rect_top);
+    source_bottom = (std::min)(source_bottom, rect_bottom);
 
     if (source_left >= source_right || source_top >= source_bottom) {
         return;
@@ -562,14 +562,14 @@ void Vic::Blend(const ConfigStruct& config, const SlotStruct& slot) {
                                                    1};
     const auto in_surface_width{slot.surface_config.slot_surface_width + 1};
 
-    source_bottom = std::min(source_bottom, out_surface_height);
-    source_right = std::min(source_right, out_surface_width);
+    source_bottom = (std::min)(source_bottom, out_surface_height);
+    source_right = (std::min)(source_right, out_surface_width);
 
     // TODO Alpha blending. No games I've seen use more than a single surface or supply an alpha
     // below max, so it's ignored for now.
 
     if (!slot.color_matrix.matrix_enable) {
-        const auto copy_width = std::min(source_right - source_left, rect_right - rect_left);
+        const auto copy_width = (std::min)(source_right - source_left, rect_right - rect_left);
 
         for (u32 y = source_top; y < source_bottom; y++) {
             const auto dst_line = y * out_surface_width;
@@ -818,8 +818,8 @@ void Vic::WriteY8__V8U8_N420(const OutputSurfaceConfig& output_surface_config) {
     const auto out_chroma_stride = Common::AlignUp(out_chroma_width * BytesPerPixel * 2, 0x10);
     const auto out_chroma_size = out_chroma_height * out_chroma_stride;
 
-    surface_width = std::min(surface_width, out_luma_width);
-    surface_height = std::min(surface_height, out_luma_height);
+    surface_width = (std::min)(surface_width, out_luma_width);
+    surface_height = (std::min)(surface_height, out_luma_height);
 
     [[maybe_unused]] auto DecodeLinear = [&](std::span<u8> out_luma, std::span<u8> out_chroma) {
         for (u32 y = 0; y < surface_height; ++y) {
@@ -1089,8 +1089,8 @@ void Vic::WriteABGR(const OutputSurfaceConfig& output_surface_config) {
     const auto out_luma_stride = Common ::AlignUp(out_luma_width * BytesPerPixel, 0x10);
     const auto out_luma_size = out_luma_height * out_luma_stride;
 
-    surface_width = std::min(surface_width, out_luma_width);
-    surface_height = std::min(surface_height, out_luma_height);
+    surface_width = (std::min)(surface_width, out_luma_width);
+    surface_height = (std::min)(surface_height, out_luma_height);
 
     [[maybe_unused]] auto DecodeLinear = [&](std::span<u8> out_buffer) {
         for (u32 y = 0; y < surface_height; y++) {
diff --git a/src/video_core/macro/macro_hle.cpp b/src/video_core/macro/macro_hle.cpp
index fb529f88b7..328abd0ba4 100644
--- a/src/video_core/macro/macro_hle.cpp
+++ b/src/video_core/macro/macro_hle.cpp
@@ -301,7 +301,7 @@ private:
         const u32 indirect_words = 5 + padding;
         const std::size_t first_draw = start_indirect;
         const std::size_t effective_draws = end_indirect - start_indirect;
-        const std::size_t last_draw = start_indirect + std::min(effective_draws, max_draws);
+        const std::size_t last_draw = start_indirect + (std::min)(effective_draws, max_draws);
 
         for (std::size_t index = first_draw; index < last_draw; index++) {
             const std::size_t base = index * indirect_words + 5;
diff --git a/src/video_core/memory_manager.cpp b/src/video_core/memory_manager.cpp
index ffafc48eff..13f0ea8d96 100644
--- a/src/video_core/memory_manager.cpp
+++ b/src/video_core/memory_manager.cpp
@@ -293,7 +293,7 @@ const u8* MemoryManager::GetPointer(GPUVAddr gpu_addr) const {
     return memory.GetPointer<u8>(*address);
 }
 
-#ifdef _MSC_VER // no need for gcc / clang but msvc's compiler is more conservative with inlining.
+#if defined(_MSC_VER) && !defined(__clang__) // no need for gcc / clang but msvc's compiler is more conservative with inlining.
 #pragma inline_recursion(on)
 #endif
 
@@ -329,7 +329,7 @@ inline void MemoryManager::MemoryOperation(GPUVAddr gpu_src_addr, std::size_t si
 
     while (remaining_size > 0) {
         const std::size_t copy_amount{
-            std::min(static_cast<std::size_t>(used_page_size) - page_offset, remaining_size)};
+            (std::min)(static_cast<std::size_t>(used_page_size) - page_offset, remaining_size)};
         auto entry = GetEntry<is_big_pages>(current_address);
         if (entry == EntryType::Mapped) [[likely]] {
             if constexpr (BOOL_BREAK_MAPPED) {
diff --git a/src/video_core/memory_manager.h b/src/video_core/memory_manager.h
index 448624aa99..9be419932c 100644
--- a/src/video_core/memory_manager.h
+++ b/src/video_core/memory_manager.h
@@ -152,7 +152,7 @@ public:
     PTEKind GetPageKind(GPUVAddr gpu_addr) const;
 
     size_t GetMemoryLayoutSize(GPUVAddr gpu_addr,
-                               size_t max_size = std::numeric_limits<size_t>::max()) const;
+                               size_t max_size = (std::numeric_limits<size_t>::max)()) const;
 
     void FlushCaching();
 
diff --git a/src/video_core/renderer_opengl/blit_image.cpp b/src/video_core/renderer_opengl/blit_image.cpp
index 3b03e8d5ac..b9a502577f 100644
--- a/src/video_core/renderer_opengl/blit_image.cpp
+++ b/src/video_core/renderer_opengl/blit_image.cpp
@@ -45,8 +45,8 @@ void BlitImageHelper::BlitColor(GLuint dst_framebuffer, GLuint src_image_view, G
                        static_cast<float>(src_region.start.x) / static_cast<float>(src_size.width),
                        static_cast<float>(src_region.start.y) /
                            static_cast<float>(src_size.height));
-    glViewport(std::min(dst_region.start.x, dst_region.end.x),
-               std::min(dst_region.start.y, dst_region.end.y),
+    glViewport((std::min)(dst_region.start.x, dst_region.end.x),
+               (std::min)(dst_region.start.y, dst_region.end.y),
                std::abs(dst_region.end.x - dst_region.start.x),
                std::abs(dst_region.end.y - dst_region.start.y));
     glBindFramebuffer(GL_DRAW_FRAMEBUFFER, dst_framebuffer);
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
index ade72e1f95..9d7089c5de 100644
--- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
@@ -248,7 +248,7 @@ void BufferCacheRuntime::BindVertexBuffers(VideoCommon::HostBindings<Buffer>& bi
     std::ranges::transform(bindings.strides, buffer_strides.begin(),
                            [](u64 stride) { return static_cast<GLsizei>(stride); });
     const u32 count =
-        std::min(static_cast<u32>(bindings.buffers.size()), max_attributes - bindings.min_index);
+        (std::min)(static_cast<u32>(bindings.buffers.size()), max_attributes - bindings.min_index);
     if (has_unified_vertex_buffers) {
         for (u32 index = 0; index < count; ++index) {
             Buffer& buffer = *bindings.buffers[index];
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h
index fd471e9795..59d1329d7e 100644
--- a/src/video_core/renderer_opengl/gl_buffer_cache.h
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.h
@@ -59,7 +59,7 @@ class BufferCacheRuntime {
     friend Buffer;
 
 public:
-    static constexpr u8 INVALID_BINDING = std::numeric_limits<u8>::max();
+    static constexpr u8 INVALID_BINDING = (std::numeric_limits<u8>::max)();
 
     explicit BufferCacheRuntime(const Device& device_, StagingBufferPool& staging_buffer_pool_);
 
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 2ea42abf4b..2746177fab 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -1266,7 +1266,7 @@ void RasterizerOpenGL::SyncPointState() {
     oglEnable(GL_PROGRAM_POINT_SIZE, maxwell3d->regs.point_size_attribute.enabled);
     const bool is_rescaling{texture_cache.IsRescaling()};
     const float scale = is_rescaling ? Settings::values.resolution_info.up_factor : 1.0f;
-    glPointSize(std::max(1.0f, maxwell3d->regs.point_size * scale));
+    glPointSize((std::max)(1.0f, maxwell3d->regs.point_size * scale));
 }
 
 void RasterizerOpenGL::SyncLineState() {
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp
index edf0bdd2f1..b6ce57b819 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp
@@ -617,7 +617,7 @@ std::unique_ptr<ComputePipeline> ShaderCache::CreateComputePipeline(
 }
 
 std::unique_ptr<ShaderWorker> ShaderCache::CreateWorkers() const {
-    return std::make_unique<ShaderWorker>(std::max(std::thread::hardware_concurrency(), 2U) - 1,
+    return std::make_unique<ShaderWorker>((std::max)(std::thread::hardware_concurrency(), 2U) - 1,
                                           "GlShaderBuilder",
                                           [this] { return Context{emu_window}; });
 }
diff --git a/src/video_core/renderer_opengl/gl_staging_buffer_pool.cpp b/src/video_core/renderer_opengl/gl_staging_buffer_pool.cpp
index 5767d6b7de..d9535c277d 100644
--- a/src/video_core/renderer_opengl/gl_staging_buffer_pool.cpp
+++ b/src/video_core/renderer_opengl/gl_staging_buffer_pool.cpp
@@ -68,7 +68,7 @@ size_t StagingBuffers::RequestBuffer(size_t requested_size) {
 
 std::optional<size_t> StagingBuffers::FindBuffer(size_t requested_size) {
     size_t known_unsignaled_index = current_sync_index + 1;
-    size_t smallest_buffer = std::numeric_limits<size_t>::max();
+    size_t smallest_buffer = (std::numeric_limits<size_t>::max)();
     std::optional<size_t> found;
     const size_t num_buffers = allocs.size();
     for (size_t index = 0; index < num_buffers; ++index) {
@@ -88,7 +88,7 @@ std::optional<size_t> StagingBuffers::FindBuffer(size_t requested_size) {
             if (!alloc.sync.IsSignaled()) {
                 // Since this fence hasn't been signaled, it's safe to assume all later
                 // fences haven't been signaled either
-                known_unsignaled_index = std::min(known_unsignaled_index, alloc.sync_index);
+                known_unsignaled_index = (std::min)(known_unsignaled_index, alloc.sync_index);
                 continue;
             }
             alloc.sync.Release();
@@ -120,7 +120,7 @@ std::pair<std::span<u8>, size_t> StreamBuffer::Request(size_t size) noexcept {
     used_iterator = iterator;
 
     for (size_t region = Region(free_iterator) + 1,
-                region_end = std::min(Region(iterator + size) + 1, NUM_SYNCS);
+                region_end = (std::min)(Region(iterator + size) + 1, NUM_SYNCS);
          region < region_end; ++region) {
         glClientWaitSync(fences[region].handle, 0, GL_TIMEOUT_IGNORED);
         fences[region].Release();
diff --git a/src/video_core/renderer_opengl/gl_state_tracker.h b/src/video_core/renderer_opengl/gl_state_tracker.h
index 19bcf3f355..4027807da1 100644
--- a/src/video_core/renderer_opengl/gl_state_tracker.h
+++ b/src/video_core/renderer_opengl/gl_state_tracker.h
@@ -79,7 +79,7 @@ enum : u8 {
 
     Last
 };
-static_assert(Last <= std::numeric_limits<u8>::max());
+static_assert(Last <= (std::numeric_limits<u8>::max)());
 
 } // namespace Dirty
 
diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp
index be14494ca5..8b1737ff51 100644
--- a/src/video_core/renderer_opengl/gl_texture_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp
@@ -717,7 +717,7 @@ Image::Image(TextureCacheRuntime& runtime_, const VideoCommon::ImageInfo& info_,
         gl_type = tuple.type;
     }
     const int max_host_mip_levels = std::bit_width(info.size.width);
-    gl_num_levels = std::min(info.resources.levels, max_host_mip_levels);
+    gl_num_levels = (std::min)(info.resources.levels, max_host_mip_levels);
     texture = MakeImage(info, gl_internal_format, gl_num_levels);
     current_texture = texture.handle;
     if (runtime->device.HasDebuggingToolAttached()) {
@@ -742,8 +742,8 @@ void Image::UploadMemory(GLuint buffer_handle, size_t buffer_offset,
 
     glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
 
-    u32 current_row_length = std::numeric_limits<u32>::max();
-    u32 current_image_height = std::numeric_limits<u32>::max();
+    u32 current_row_length = (std::numeric_limits<u32>::max)();
+    u32 current_image_height = (std::numeric_limits<u32>::max)();
 
     for (const VideoCommon::BufferImageCopy& copy : copies) {
         if (copy.image_subresource.base_level >= gl_num_levels) {
@@ -788,8 +788,8 @@ void Image::DownloadMemory(std::span<GLuint> buffer_handles, std::span<size_t> b
         glBindBuffer(GL_PIXEL_PACK_BUFFER, buffer_handle);
         glPixelStorei(GL_PACK_ALIGNMENT, 1);
 
-        u32 current_row_length = std::numeric_limits<u32>::max();
-        u32 current_image_height = std::numeric_limits<u32>::max();
+        u32 current_row_length = (std::numeric_limits<u32>::max)();
+        u32 current_image_height = (std::numeric_limits<u32>::max)();
 
         for (const VideoCommon::BufferImageCopy& copy : copies) {
             if (copy.image_subresource.base_level >= gl_num_levels) {
@@ -1033,10 +1033,10 @@ void Image::Scale(bool up_scale) {
     const GLuint draw_fbo = runtime->rescale_draw_fbos[fbo_index].handle;
     for (s32 layer = 0; layer < info.resources.layers; ++layer) {
         for (s32 level = 0; level < info.resources.levels; ++level) {
-            const u32 src_level_width = std::max(1u, src_width >> level);
-            const u32 src_level_height = std::max(1u, src_height >> level);
-            const u32 dst_level_width = std::max(1u, dst_width >> level);
-            const u32 dst_level_height = std::max(1u, dst_height >> level);
+            const u32 src_level_width = (std::max)(1u, src_width >> level);
+            const u32 src_level_height = (std::max)(1u, src_height >> level);
+            const u32 dst_level_width = (std::max)(1u, dst_width >> level);
+            const u32 dst_level_height = (std::max)(1u, dst_height >> level);
 
             glNamedFramebufferTextureLayer(read_fbo, attachment, src_handle, level, layer);
             glNamedFramebufferTextureLayer(draw_fbo, attachment, dst_handle, level, layer);
diff --git a/src/video_core/renderer_vulkan/blit_image.cpp b/src/video_core/renderer_vulkan/blit_image.cpp
index 37213912e3..7bfcd6503b 100644
--- a/src/video_core/renderer_vulkan/blit_image.cpp
+++ b/src/video_core/renderer_vulkan/blit_image.cpp
@@ -340,8 +340,8 @@ void UpdateTwoTexturesDescriptorSet(const Device& device, VkDescriptorSet descri
 
 void BindBlitState(vk::CommandBuffer cmdbuf, const Region2D& dst_region) {
     const VkOffset2D offset{
-        .x = std::min(dst_region.start.x, dst_region.end.x),
-        .y = std::min(dst_region.start.y, dst_region.end.y),
+        .x = (std::min)(dst_region.start.x, dst_region.end.x),
+        .y = (std::min)(dst_region.start.y, dst_region.end.y),
     };
     const VkExtent2D extent{
         .width = static_cast<u32>(std::abs(dst_region.end.x - dst_region.start.x)),
diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
index e5e1e3ab63..f61f4456fe 100644
--- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
@@ -573,8 +573,8 @@ void BufferCacheRuntime::BindVertexBuffers(VideoCommon::HostBindings<Buffer>& bi
         buffer_handles.push_back(handle);
     }
     const u32 device_max = device.GetMaxVertexInputBindings();
-    const u32 min_binding = std::min(bindings.min_index, device_max);
-    const u32 max_binding = std::min(bindings.max_index, device_max);
+    const u32 min_binding = (std::min)(bindings.min_index, device_max);
+    const u32 max_binding = (std::min)(bindings.max_index, device_max);
     const u32 binding_count = max_binding - min_binding;
     if (binding_count == 0) {
         return;
diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp
index dc068c5e52..f5594450c2 100644
--- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp
+++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp
@@ -562,7 +562,7 @@ void GraphicsPipeline::MakePipeline(VkRenderPass render_pass) {
     static_vector<VkVertexInputBindingDivisorDescriptionEXT, 32> vertex_binding_divisors;
     static_vector<VkVertexInputAttributeDescription, 32> vertex_attributes;
     if (!key.state.dynamic_vertex_input) {
-        const size_t num_vertex_arrays = std::min(
+        const size_t num_vertex_arrays = (std::min)(
             Maxwell::NumVertexArrays, static_cast<size_t>(device.GetMaxVertexInputBindings()));
         for (size_t index = 0; index < num_vertex_arrays; ++index) {
             const bool instanced = key.state.binding_divisors[index] != 0;
diff --git a/src/video_core/renderer_vulkan/vk_present_manager.cpp b/src/video_core/renderer_vulkan/vk_present_manager.cpp
index 9422110895..2c76584c72 100644
--- a/src/video_core/renderer_vulkan/vk_present_manager.cpp
+++ b/src/video_core/renderer_vulkan/vk_present_manager.cpp
@@ -86,8 +86,8 @@ bool CanBlitToSwapchain(const vk::PhysicalDevice& physical_device, VkFormat form
             },
         .extent =
             {
-                .width = std::min(frame_width, swapchain_width),
-                .height = std::min(frame_height, swapchain_height),
+                .width = (std::min)(frame_width, swapchain_width),
+                .height = (std::min)(frame_height, swapchain_height),
                 .depth = 1,
             },
     };
diff --git a/src/video_core/renderer_vulkan/vk_query_cache.cpp b/src/video_core/renderer_vulkan/vk_query_cache.cpp
index d6ecc2b65c..89e0b1114e 100644
--- a/src/video_core/renderer_vulkan/vk_query_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_query_cache.cpp
@@ -202,8 +202,8 @@ public:
         });
         rasterizer->SyncOperation(std::move(func));
         accumulation_since_last_sync = false;
-        first_accumulation_checkpoint = std::min(first_accumulation_checkpoint, num_slots_used);
-        last_accumulation_checkpoint = std::max(last_accumulation_checkpoint, num_slots_used);
+        first_accumulation_checkpoint = (std::min)(first_accumulation_checkpoint, num_slots_used);
+        last_accumulation_checkpoint = (std::max)(last_accumulation_checkpoint, num_slots_used);
     }
 
     void CloseCounter() override {
@@ -311,9 +311,9 @@ public:
 
         if (has_multi_queries) {
             const size_t min_accumulation_limit =
-                std::min(first_accumulation_checkpoint, num_slots_used);
+                (std::min)(first_accumulation_checkpoint, num_slots_used);
             const size_t max_accumulation_limit =
-                std::max(last_accumulation_checkpoint, num_slots_used);
+                (std::max)(last_accumulation_checkpoint, num_slots_used);
             const size_t intermediary_buffer_index = ObtainBuffer<false>(num_slots_used);
             resolve_buffers.push_back(intermediary_buffer_index);
             queries_prefix_scan_pass->Run(*accumulation_buffer, *buffers[intermediary_buffer_index],
@@ -332,7 +332,7 @@ public:
         rasterizer->SyncOperation(std::move(func));
         AbandonCurrentQuery();
         num_slots_used = 0;
-        first_accumulation_checkpoint = std::numeric_limits<size_t>::max();
+        first_accumulation_checkpoint = (std::numeric_limits<size_t>::max)();
         last_accumulation_checkpoint = 0;
         accumulation_since_last_sync = has_multi_queries;
         pending_sync.clear();
@@ -414,7 +414,7 @@ private:
         size_t start_slot = query->start_slot;
         for (size_t i = 0; i < banks_set; i++) {
             auto& the_bank = bank_pool.GetBank(bank_id);
-            size_t amount = std::min(the_bank.Size() - start_slot, size_slots);
+            size_t amount = (std::min)(the_bank.Size() - start_slot, size_slots);
             func(&the_bank, start_slot, amount);
             bank_id = the_bank.next_bank - 1;
             start_slot = 0;
@@ -431,11 +431,11 @@ private:
             auto* query = GetQuery(q);
             ApplyBankOp(query, [&indexer](SamplesQueryBank* bank, size_t start, size_t amount) {
                 auto id_ = bank->GetIndex();
-                auto pair = indexer.try_emplace(id_, std::numeric_limits<size_t>::max(),
-                                                std::numeric_limits<size_t>::min());
+                auto pair = indexer.try_emplace(id_, (std::numeric_limits<size_t>::max)(),
+                                                (std::numeric_limits<size_t>::min)());
                 auto& current_pair = pair.first->second;
-                current_pair.first = std::min(current_pair.first, start);
-                current_pair.second = std::max(current_pair.second, amount + start);
+                current_pair.first = (std::min)(current_pair.first, start);
+                current_pair.second = (std::max)(current_pair.second, amount + start);
             });
         }
         for (auto& cont : indexer) {
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index 2d12fc658f..70ca9583f9 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -131,8 +131,8 @@ VkRect2D GetScissorState(const Maxwell& regs, size_t index, u32 up_scale = 1, u3
     s32 max_y = lower_left ? (clip_height - src.min_y) : src.max_y.Value();
 
     // Bound to render area
-    min_y = std::max(min_y, 0);
-    max_y = std::max(max_y, 0);
+    min_y = (std::max)(min_y, 0);
+    max_y = (std::max)(max_y, 0);
 
     if (src.enable) {
         scissor.offset.x = scale_up(src.min_x);
@@ -142,8 +142,8 @@ VkRect2D GetScissorState(const Maxwell& regs, size_t index, u32 up_scale = 1, u3
     } else {
         scissor.offset.x = 0;
         scissor.offset.y = 0;
-        scissor.extent.width = std::numeric_limits<s32>::max();
-        scissor.extent.height = std::numeric_limits<s32>::max();
+        scissor.extent.width = (std::numeric_limits<s32>::max)();
+        scissor.extent.height = (std::numeric_limits<s32>::max)();
     }
     return scissor;
 }
@@ -380,8 +380,8 @@ void RasterizerVulkan::Clear(u32 layer_count) {
     VkRect2D default_scissor;
     default_scissor.offset.x = 0;
     default_scissor.offset.y = 0;
-    default_scissor.extent.width = std::numeric_limits<s32>::max();
-    default_scissor.extent.height = std::numeric_limits<s32>::max();
+    default_scissor.extent.width = (std::numeric_limits<s32>::max)();
+    default_scissor.extent.height = (std::numeric_limits<s32>::max)();
 
     VkClearRect clear_rect{
         .rect = regs.clear_control.use_scissor ? GetScissorState(regs, 0, up_scale, down_shift)
@@ -393,8 +393,8 @@ void RasterizerVulkan::Clear(u32 layer_count) {
         return;
     }
     clear_rect.rect.extent = VkExtent2D{
-        .width = std::min(clear_rect.rect.extent.width, render_area.width),
-        .height = std::min(clear_rect.rect.extent.height, render_area.height),
+        .width = (std::min)(clear_rect.rect.extent.width, render_area.width),
+        .height = (std::min)(clear_rect.rect.extent.height, render_area.height),
     };
 
     const u32 color_attachment = regs.clear_surface.RT;
diff --git a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp
index 72d5ec35f9..35f497493b 100644
--- a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp
+++ b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp
@@ -31,7 +31,7 @@ size_t GetStreamBufferSize(const Device& device) {
     VkDeviceSize size{0};
     if (device.HasDebuggingToolAttached()) {
         ForEachDeviceLocalHostVisibleHeap(device, [&size](size_t index, VkMemoryHeap& heap) {
-            size = std::max(size, heap.size);
+            size = (std::max)(size, heap.size);
         });
         // If rebar is not supported, cut the max heap size to 40%. This will allow 2 captures to be
         // loaded at the same time in RenderDoc. If rebar is supported, this shouldn't be an issue
@@ -42,7 +42,7 @@ size_t GetStreamBufferSize(const Device& device) {
     } else {
         size = MAX_STREAM_BUFFER_SIZE;
     }
-    return std::min(Common::AlignUp(size, MAX_ALIGNMENT), MAX_STREAM_BUFFER_SIZE);
+    return (std::min)(Common::AlignUp(size, MAX_ALIGNMENT), MAX_STREAM_BUFFER_SIZE);
 }
 } // Anonymous namespace
 
@@ -104,7 +104,7 @@ void StagingBufferPool::TickFrame() {
 
 StagingBufferRef StagingBufferPool::GetStreamBuffer(size_t size) {
     if (AreRegionsActive(Region(free_iterator) + 1,
-                         std::min(Region(iterator + size) + 1, NUM_SYNCS))) {
+                         (std::min)(Region(iterator + size) + 1, NUM_SYNCS))) {
         // Avoid waiting for the previous usages to be free
         return GetStagingBuffer(size, MemoryUsage::Upload);
     }
@@ -112,7 +112,7 @@ StagingBufferRef StagingBufferPool::GetStreamBuffer(size_t size) {
     std::fill(sync_ticks.begin() + Region(used_iterator), sync_ticks.begin() + Region(iterator),
               current_tick);
     used_iterator = iterator;
-    free_iterator = std::max(free_iterator, iterator + size);
+    free_iterator = (std::max)(free_iterator, iterator + size);
 
     if (iterator + size >= stream_buffer_size) {
         std::fill(sync_ticks.begin() + Region(used_iterator), sync_ticks.begin() + NUM_SYNCS,
@@ -170,7 +170,7 @@ std::optional<StagingBufferRef> StagingBufferPool::TryGetReservedBuffer(size_t s
         }
     }
     cache_level.iterate_index = std::distance(entries.begin(), it) + 1;
-    it->tick = deferred ? std::numeric_limits<u64>::max() : scheduler.CurrentTick();
+    it->tick = deferred ? (std::numeric_limits<u64>::max)() : scheduler.CurrentTick();
     ASSERT(!it->deferred);
     it->deferred = deferred;
     return it->Ref();
@@ -206,7 +206,7 @@ StagingBufferRef StagingBufferPool::CreateStagingBuffer(size_t size, MemoryUsage
         .usage = usage,
         .log2_level = log2,
         .index = unique_ids++,
-        .tick = deferred ? std::numeric_limits<u64>::max() : scheduler.CurrentTick(),
+        .tick = deferred ? (std::numeric_limits<u64>::max)() : scheduler.CurrentTick(),
         .deferred = deferred,
     });
     return entry.Ref();
@@ -240,7 +240,7 @@ void StagingBufferPool::ReleaseLevel(StagingBuffersCache& cache, size_t log2) {
         return scheduler.IsFree(entry.tick);
     };
     const size_t begin_offset = staging.delete_index;
-    const size_t end_offset = std::min(begin_offset + deletions_per_tick, old_size);
+    const size_t end_offset = (std::min)(begin_offset + deletions_per_tick, old_size);
     const auto begin = entries.begin() + begin_offset;
     const auto end = entries.begin() + end_offset;
     entries.erase(std::remove_if(begin, end, is_deletable), end);
diff --git a/src/video_core/renderer_vulkan/vk_state_tracker.h b/src/video_core/renderer_vulkan/vk_state_tracker.h
index a78d2113fb..aef726658a 100644
--- a/src/video_core/renderer_vulkan/vk_state_tracker.h
+++ b/src/video_core/renderer_vulkan/vk_state_tracker.h
@@ -70,7 +70,7 @@ enum : u8 {
 
     Last,
 };
-static_assert(Last <= std::numeric_limits<u8>::max());
+static_assert(Last <= (std::numeric_limits<u8>::max)());
 
 } // namespace Dirty
 
diff --git a/src/video_core/renderer_vulkan/vk_swapchain.cpp b/src/video_core/renderer_vulkan/vk_swapchain.cpp
index a002ca83a0..096b9df087 100644
--- a/src/video_core/renderer_vulkan/vk_swapchain.cpp
+++ b/src/video_core/renderer_vulkan/vk_swapchain.cpp
@@ -79,15 +79,15 @@ static VkPresentModeKHR ChooseSwapPresentMode(bool has_imm, bool has_mailbox,
 }
 
 VkExtent2D ChooseSwapExtent(const VkSurfaceCapabilitiesKHR& capabilities, u32 width, u32 height) {
-    constexpr auto undefined_size{std::numeric_limits<u32>::max()};
+    constexpr auto undefined_size{(std::numeric_limits<u32>::max)()};
     if (capabilities.currentExtent.width != undefined_size) {
         return capabilities.currentExtent;
     }
     VkExtent2D extent;
-    extent.width = std::max(capabilities.minImageExtent.width,
-                            std::min(capabilities.maxImageExtent.width, width));
-    extent.height = std::max(capabilities.minImageExtent.height,
-                             std::min(capabilities.maxImageExtent.height, height));
+    extent.width = (std::max)(capabilities.minImageExtent.width,
+                              (std::min)(capabilities.maxImageExtent.width, width));
+    extent.height = (std::max)(capabilities.minImageExtent.height,
+                               (std::min)(capabilities.maxImageExtent.height, height));
     return extent;
 }
 
@@ -172,7 +172,7 @@ void Swapchain::Create(
 
 bool Swapchain::AcquireNextImage() {
     const VkResult result = device.GetLogical().AcquireNextImageKHR(
-        *swapchain, std::numeric_limits<u64>::max(), *present_semaphores[frame_index],
+        *swapchain, (std::numeric_limits<u64>::max)(), *present_semaphores[frame_index],
         VK_NULL_HANDLE, &image_index);
     switch (result) {
     case VK_SUCCESS:
@@ -261,10 +261,10 @@ void Swapchain::CreateSwapchain(const VkSurfaceCapabilitiesKHR& capabilities) {
             requested_image_count = capabilities.maxImageCount;
         } else {
             requested_image_count =
-                std::max(requested_image_count, std::min(3U, capabilities.maxImageCount));
+                (std::max)(requested_image_count, (std::min)(3U, capabilities.maxImageCount));
         }
     } else {
-        requested_image_count = std::max(requested_image_count, 3U);
+        requested_image_count = (std::max)(requested_image_count, 3U);
     }
     VkSwapchainCreateInfoKHR swapchain_ci{
         .sType = VK_STRUCTURE_TYPE_SWAPCHAIN_CREATE_INFO_KHR,
diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp
index eda9ff2a5a..1e89652f50 100644
--- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp
@@ -509,16 +509,16 @@ TransformBufferCopies(std::span<const VideoCommon::BufferCopy> copies, size_t bu
     }
 }
 struct RangedBarrierRange {
-    u32 min_mip = std::numeric_limits<u32>::max();
-    u32 max_mip = std::numeric_limits<u32>::min();
-    u32 min_layer = std::numeric_limits<u32>::max();
-    u32 max_layer = std::numeric_limits<u32>::min();
+    u32 min_mip = (std::numeric_limits<u32>::max)();
+    u32 max_mip = (std::numeric_limits<u32>::min)();
+    u32 min_layer = (std::numeric_limits<u32>::max)();
+    u32 max_layer = (std::numeric_limits<u32>::min)();
 
     void AddLayers(const VkImageSubresourceLayers& layers) {
-        min_mip = std::min(min_mip, layers.mipLevel);
-        max_mip = std::max(max_mip, layers.mipLevel + 1);
-        min_layer = std::min(min_layer, layers.baseArrayLayer);
-        max_layer = std::max(max_layer, layers.baseArrayLayer + layers.layerCount);
+        min_mip = (std::min)(min_mip, layers.mipLevel);
+        max_mip = (std::max)(max_mip, layers.mipLevel + 1);
+        min_layer = (std::min)(min_layer, layers.baseArrayLayer);
+        max_layer = (std::max)(max_layer, layers.baseArrayLayer + layers.layerCount);
     }
 
     VkImageSubresourceRange SubresourceRange(VkImageAspectFlags aspect_mask) const noexcept {
@@ -747,8 +747,8 @@ void BlitScale(Scheduler& scheduler, VkImage src_image, VkImage dst_image, const
                         .z = 0,
                     },
                     {
-                        .x = std::max(1, src_size.x >> level),
-                        .y = std::max(1, src_size.y >> level),
+                        .x = (std::max)(1, src_size.x >> level),
+                        .y = (std::max)(1, src_size.y >> level),
                         .z = 1,
                     },
                 },
@@ -765,8 +765,8 @@ void BlitScale(Scheduler& scheduler, VkImage src_image, VkImage dst_image, const
                         .z = 0,
                     },
                     {
-                        .x = std::max(1, dst_size.x >> level),
-                        .y = std::max(1, dst_size.y >> level),
+                        .x = (std::max)(1, dst_size.x >> level),
+                        .y = (std::max)(1, dst_size.y >> level),
                         .z = 1,
                     },
                 },
@@ -1956,8 +1956,8 @@ bool Image::BlitScaleHelper(bool scale_up) {
         .end = {static_cast<s32>(dst_width), static_cast<s32>(dst_height)},
     };
     const VkExtent2D extent{
-        .width = std::max(scaled_width, info.size.width),
-        .height = std::max(scaled_height, info.size.height),
+        .width = (std::max)(scaled_width, info.size.width),
+        .height = (std::max)(scaled_height, info.size.height),
     };
 
     auto* view_ptr = blit_view.get();
@@ -2310,21 +2310,21 @@ void Framebuffer::CreateFramebuffer(TextureCacheRuntime& runtime,
     is_rescaled = is_rescaled_;
     const auto& resolution = runtime.resolution;
 
-    u32 width = std::numeric_limits<u32>::max();
-    u32 height = std::numeric_limits<u32>::max();
+    u32 width = (std::numeric_limits<u32>::max)();
+    u32 height = (std::numeric_limits<u32>::max)();
     for (size_t index = 0; index < NUM_RT; ++index) {
         const ImageView* const color_buffer = color_buffers[index];
         if (!color_buffer) {
             renderpass_key.color_formats[index] = PixelFormat::Invalid;
             continue;
         }
-        width = std::min(width, is_rescaled ? resolution.ScaleUp(color_buffer->size.width)
+        width = (std::min)(width, is_rescaled ? resolution.ScaleUp(color_buffer->size.width)
                                             : color_buffer->size.width);
-        height = std::min(height, is_rescaled ? resolution.ScaleUp(color_buffer->size.height)
+        height = (std::min)(height, is_rescaled ? resolution.ScaleUp(color_buffer->size.height)
                                               : color_buffer->size.height);
         attachments.push_back(color_buffer->RenderTarget());
         renderpass_key.color_formats[index] = color_buffer->format;
-        num_layers = std::max(num_layers, color_buffer->range.extent.layers);
+        num_layers = (std::max)(num_layers, color_buffer->range.extent.layers);
         images[num_images] = color_buffer->ImageHandle();
         image_ranges[num_images] = MakeSubresourceRange(color_buffer);
         rt_map[index] = num_images;
@@ -2333,13 +2333,13 @@ void Framebuffer::CreateFramebuffer(TextureCacheRuntime& runtime,
     }
     const size_t num_colors = attachments.size();
     if (depth_buffer) {
-        width = std::min(width, is_rescaled ? resolution.ScaleUp(depth_buffer->size.width)
+        width = (std::min)(width, is_rescaled ? resolution.ScaleUp(depth_buffer->size.width)
                                             : depth_buffer->size.width);
-        height = std::min(height, is_rescaled ? resolution.ScaleUp(depth_buffer->size.height)
+        height = (std::min)(height, is_rescaled ? resolution.ScaleUp(depth_buffer->size.height)
                                               : depth_buffer->size.height);
         attachments.push_back(depth_buffer->RenderTarget());
         renderpass_key.depth_format = depth_buffer->format;
-        num_layers = std::max(num_layers, depth_buffer->range.extent.layers);
+        num_layers = (std::max)(num_layers, depth_buffer->range.extent.layers);
         images[num_images] = depth_buffer->ImageHandle();
         const VkImageSubresourceRange subresource_range = MakeSubresourceRange(depth_buffer);
         image_ranges[num_images] = subresource_range;
@@ -2353,8 +2353,8 @@ void Framebuffer::CreateFramebuffer(TextureCacheRuntime& runtime,
     renderpass_key.samples = samples;
 
     renderpass = runtime.render_pass_cache.Get(renderpass_key);
-    render_area.width = std::min(render_area.width, width);
-    render_area.height = std::min(render_area.height, height);
+    render_area.width = (std::min)(render_area.width, width);
+    render_area.height = (std::min)(render_area.height, height);
 
     num_color_buffers = static_cast<u32>(num_colors);
     framebuffer = runtime.device.GetLogical().CreateFramebuffer({
@@ -2366,7 +2366,7 @@ void Framebuffer::CreateFramebuffer(TextureCacheRuntime& runtime,
         .pAttachments = attachments.data(),
         .width = render_area.width,
         .height = render_area.height,
-        .layers = static_cast<u32>(std::max(num_layers, 1)),
+        .layers = static_cast<u32>((std::max)(num_layers, 1)),
     });
 }
 
diff --git a/src/video_core/shader_environment.cpp b/src/video_core/shader_environment.cpp
index baeb8b23a0..573694a145 100644
--- a/src/video_core/shader_environment.cpp
+++ b/src/video_core/shader_environment.cpp
@@ -139,8 +139,8 @@ std::array<u32, 3> GenericEnvironment::WorkgroupSize() const {
 }
 
 u64 GenericEnvironment::ReadInstruction(u32 address) {
-    read_lowest = std::min(read_lowest, address);
-    read_highest = std::max(read_highest, address);
+    read_lowest = (std::min)(read_lowest, address);
+    read_highest = (std::max)(read_highest, address);
 
     if (address >= cached_lowest && address < cached_highest) {
         return code[(address - cached_lowest) / INST_SIZE];
@@ -319,7 +319,7 @@ GraphicsEnvironment::GraphicsEnvironment(Tegra::Engines::Maxwell3D& maxwell3d_,
         break;
     }
     const u64 local_size{sph.LocalMemorySize()};
-    ASSERT(local_size <= std::numeric_limits<u32>::max());
+    ASSERT(local_size <= (std::numeric_limits<u32>::max)());
     local_memory_size = static_cast<u32>(local_size) + sph.common3.shader_local_memory_crs_size;
     texture_bound = maxwell3d->regs.bindless_texture_const_buffer_slot;
     is_proprietary_driver = texture_bound == 2;
diff --git a/src/video_core/shader_environment.h b/src/video_core/shader_environment.h
index 6b372e3365..2d781d82f7 100644
--- a/src/video_core/shader_environment.h
+++ b/src/video_core/shader_environment.h
@@ -86,10 +86,10 @@ protected:
     u32 shared_memory_size{};
     std::array<u32, 3> workgroup_size{};
 
-    u32 read_lowest = std::numeric_limits<u32>::max();
+    u32 read_lowest = (std::numeric_limits<u32>::max)();
     u32 read_highest = 0;
 
-    u32 cached_lowest = std::numeric_limits<u32>::max();
+    u32 cached_lowest = (std::numeric_limits<u32>::max)();
     u32 cached_highest = 0;
     u32 initial_offset = 0;
 
diff --git a/src/video_core/texture_cache/decode_bc.cpp b/src/video_core/texture_cache/decode_bc.cpp
index a018c6df46..5279ff2a0a 100644
--- a/src/video_core/texture_cache/decode_bc.cpp
+++ b/src/video_core/texture_cache/decode_bc.cpp
@@ -67,8 +67,8 @@ void DecompressBlocks(std::span<const u8> input, std::span<u8> output, BufferIma
     const u32 width = copy.image_extent.width;
     const u32 height = copy.image_extent.height * copy.image_subresource.num_layers;
     const u32 depth = copy.image_extent.depth;
-    const u32 block_width = std::min(width, BLOCK_SIZE);
-    const u32 block_height = std::min(height, BLOCK_SIZE);
+    const u32 block_width = (std::min)(width, BLOCK_SIZE);
+    const u32 block_height = (std::min)(height, BLOCK_SIZE);
     const u32 pitch = width * out_bpp;
     size_t input_offset = 0;
     size_t output_offset = 0;
diff --git a/src/video_core/texture_cache/image_base.cpp b/src/video_core/texture_cache/image_base.cpp
index d79594ce55..01413f0c9d 100644
--- a/src/video_core/texture_cache/image_base.cpp
+++ b/src/video_core/texture_cache/image_base.cpp
@@ -185,7 +185,7 @@ bool AddImageAlias(ImageBase& lhs, ImageBase& rhs, ImageId lhs_id, ImageId rhs_i
     const bool is_rhs_compressed = rhs_block.width > 1 || rhs_block.height > 1;
     const s32 lhs_mips = lhs.info.resources.levels;
     const s32 rhs_mips = rhs.info.resources.levels;
-    const s32 num_mips = std::min(lhs_mips - base->level, rhs_mips);
+    const s32 num_mips = (std::min)(lhs_mips - base->level, rhs_mips);
     AliasedImage lhs_alias;
     AliasedImage rhs_alias;
     lhs_alias.id = rhs_id;
@@ -204,9 +204,9 @@ bool AddImageAlias(ImageBase& lhs, ImageBase& rhs, ImageId lhs_id, ImageId rhs_i
             rhs_size.height = Common::DivCeil(rhs_size.height, rhs_block.height);
         }
         const Extent3D copy_size{
-            .width = std::min(lhs_size.width, rhs_size.width),
-            .height = std::min(lhs_size.height, rhs_size.height),
-            .depth = std::min(lhs_size.depth, rhs_size.depth),
+            .width = (std::min)(lhs_size.width, rhs_size.width),
+            .height = (std::min)(lhs_size.height, rhs_size.height),
+            .depth = (std::min)(lhs_size.depth, rhs_size.depth),
         };
         if (copy_size.width == 0 || copy_size.height == 0) {
             LOG_WARNING(HW_GPU, "Copy size is smaller than block size. Mip cannot be aliased.");
@@ -218,7 +218,7 @@ bool AddImageAlias(ImageBase& lhs, ImageBase& rhs, ImageId lhs_id, ImageId rhs_i
         const Offset3D rhs_offset{0, 0, is_rhs_3d ? base->layer : 0};
         const s32 lhs_layers = is_lhs_3d ? 1 : lhs.info.resources.layers - base->layer;
         const s32 rhs_layers = is_rhs_3d ? 1 : rhs.info.resources.layers;
-        const s32 num_layers = std::min(lhs_layers, rhs_layers);
+        const s32 num_layers = (std::min)(lhs_layers, rhs_layers);
         const SubresourceLayers lhs_subresource{
             .base_level = mip_level,
             .base_layer = 0,
diff --git a/src/video_core/texture_cache/image_view_base.cpp b/src/video_core/texture_cache/image_view_base.cpp
index 18b9250f91..b7e4049f35 100644
--- a/src/video_core/texture_cache/image_view_base.cpp
+++ b/src/video_core/texture_cache/image_view_base.cpp
@@ -18,9 +18,9 @@ ImageViewBase::ImageViewBase(const ImageViewInfo& info, const ImageInfo& image_i
                              ImageId image_id_, GPUVAddr addr)
     : image_id{image_id_}, gpu_addr{addr}, format{info.format}, type{info.type}, range{info.range},
       size{
-          .width = std::max(image_info.size.width >> range.base.level, 1u),
-          .height = std::max(image_info.size.height >> range.base.level, 1u),
-          .depth = std::max(image_info.size.depth >> range.base.level, 1u),
+          .width = (std::max)(image_info.size.width >> range.base.level, 1u),
+          .height = (std::max)(image_info.size.height >> range.base.level, 1u),
+          .depth = (std::max)(image_info.size.depth >> range.base.level, 1u),
       } {
     ASSERT_MSG(VideoCore::Surface::IsViewCompatible(image_info.format, info.format, false, true),
                "Image view format {} is incompatible with image format {}", info.format,
diff --git a/src/video_core/texture_cache/image_view_info.cpp b/src/video_core/texture_cache/image_view_info.cpp
index 0766a3b79a..8dac8383e0 100644
--- a/src/video_core/texture_cache/image_view_info.cpp
+++ b/src/video_core/texture_cache/image_view_info.cpp
@@ -19,7 +19,7 @@ namespace {
 
 using Tegra::Texture::TextureType;
 
-constexpr u8 RENDER_TARGET_SWIZZLE = std::numeric_limits<u8>::max();
+constexpr u8 RENDER_TARGET_SWIZZLE = (std::numeric_limits<u8>::max)();
 
 [[nodiscard]] u8 CastSwizzle(SwizzleSource source) {
     const u8 casted = static_cast<u8>(source);
diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h
index 6c733fe902..e5d559b591 100644
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -56,14 +56,14 @@ TextureCache<P>::TextureCache(Runtime& runtime_, Tegra::MaxwellDeviceMemoryManag
         const s64 device_local_memory = static_cast<s64>(runtime.GetDeviceLocalMemory());
         const s64 min_spacing_expected = device_local_memory - 1_GiB;
         const s64 min_spacing_critical = device_local_memory - 512_MiB;
-        const s64 mem_threshold = std::min(device_local_memory, TARGET_THRESHOLD);
+        const s64 mem_threshold = (std::min)(device_local_memory, TARGET_THRESHOLD);
         const s64 min_vacancy_expected = (6 * mem_threshold) / 10;
         const s64 min_vacancy_critical = (2 * mem_threshold) / 10;
         expected_memory = static_cast<u64>(
-            std::max(std::min(device_local_memory - min_vacancy_expected, min_spacing_expected),
+            (std::max)((std::min)(device_local_memory - min_vacancy_expected, min_spacing_expected),
                      DEFAULT_EXPECTED_MEMORY));
         critical_memory = static_cast<u64>(
-            std::max(std::min(device_local_memory - min_vacancy_critical, min_spacing_critical),
+            (std::max)((std::min)(device_local_memory - min_vacancy_critical, min_spacing_critical),
                      DEFAULT_CRITICAL_MEMORY));
         minimum_memory = static_cast<u64>((device_local_memory - mem_threshold) / 2);
     } else {
@@ -586,8 +586,8 @@ std::optional<VideoCore::RasterizerDownloadArea> TextureCache<P>::GetFlushArea(D
             area->end_address = cpu_addr + size;
             area->preemtive = true;
         }
-        area->start_address = std::min(area->start_address, image.cpu_addr);
-        area->end_address = std::max(area->end_address, image.cpu_addr_end);
+        area->start_address = (std::min)(area->start_address, image.cpu_addr);
+        area->end_address = (std::max)(area->end_address, image.cpu_addr_end);
         for (auto image_view_id : image.image_view_ids) {
             auto& image_view = slot_image_views[image_view_id];
             image_view.flags |= ImageViewFlagBits::PreemtiveDownload;
@@ -1273,7 +1273,7 @@ u64 TextureCache<P>::GetScaledImageSizeBytes(const ImageBase& image) {
     const u64 down_shift = static_cast<u64>(Settings::values.resolution_info.down_shift +
                                             Settings::values.resolution_info.down_shift);
     const u64 image_size_bytes =
-        static_cast<u64>(std::max(image.guest_size_bytes, image.unswizzled_size_bytes));
+        static_cast<u64>((std::max)(image.guest_size_bytes, image.unswizzled_size_bytes));
     const u64 tentative_size = (image_size_bytes * scale_up) >> down_shift;
     const u64 fitted_size = Common::AlignUp(tentative_size, 1024);
     return fitted_size;
@@ -1994,7 +1994,7 @@ void TextureCache<P>::RegisterImage(ImageId image_id) {
     ASSERT_MSG(False(image.flags & ImageFlagBits::Registered),
                "Trying to register an already registered image");
     image.flags |= ImageFlagBits::Registered;
-    u64 tentative_size = std::max(image.guest_size_bytes, image.unswizzled_size_bytes);
+    u64 tentative_size = (std::max)(image.guest_size_bytes, image.unswizzled_size_bytes);
     if ((IsPixelFormatASTC(image.info.format) &&
          True(image.flags & ImageFlagBits::AcceleratedUpload)) ||
         True(image.flags & ImageFlagBits::Converted)) {
@@ -2168,7 +2168,7 @@ void TextureCache<P>::DeleteImage(ImageId image_id, bool immediate_delete) {
     if (image.HasScaled()) {
         total_used_memory -= GetScaledImageSizeBytes(image);
     }
-    u64 tentative_size = std::max(image.guest_size_bytes, image.unswizzled_size_bytes);
+    u64 tentative_size = (std::max)(image.guest_size_bytes, image.unswizzled_size_bytes);
     if ((IsPixelFormatASTC(image.info.format) &&
          True(image.flags & ImageFlagBits::AcceleratedUpload)) ||
         True(image.flags & ImageFlagBits::Converted)) {
@@ -2302,7 +2302,7 @@ void TextureCache<P>::SynchronizeAliases(ImageId image_id) {
     for (const AliasedImage& aliased : image.aliased_images) {
         ImageBase& aliased_image = slot_images[aliased.id];
         if (image.modification_tick < aliased_image.modification_tick) {
-            most_recent_tick = std::max(most_recent_tick, aliased_image.modification_tick);
+            most_recent_tick = (std::max)(most_recent_tick, aliased_image.modification_tick);
             aliased_images.push_back(&aliased);
             any_rescaled |= True(aliased_image.flags & ImageFlagBits::Rescaled);
             any_modified |= True(aliased_image.flags & ImageFlagBits::GpuModified);
@@ -2443,9 +2443,9 @@ void TextureCache<P>::CopyImage(ImageId dst_id, ImageId src_id, std::vector<Imag
         ImageView& dst_view = slot_image_views[dst_view_id];
         ImageView& src_view = slot_image_views[src_view_id];
         [[maybe_unused]] const Extent3D expected_size{
-            .width = std::min(dst_view.size.width, src_view.size.width),
-            .height = std::min(dst_view.size.height, src_view.size.height),
-            .depth = std::min(dst_view.size.depth, src_view.size.depth),
+            .width = (std::min)(dst_view.size.width, src_view.size.width),
+            .height = (std::min)(dst_view.size.height, src_view.size.height),
+            .depth = (std::min)(dst_view.size.depth, src_view.size.depth),
         };
         const Extent3D scaled_extent = [is_rescaled, expected_size]() {
             if (!is_rescaled) {
diff --git a/src/video_core/texture_cache/texture_cache_base.h b/src/video_core/texture_cache/texture_cache_base.h
index cbc27344b0..6210d63940 100644
--- a/src/video_core/texture_cache/texture_cache_base.h
+++ b/src/video_core/texture_cache/texture_cache_base.h
@@ -108,7 +108,7 @@ class TextureCache : public VideoCommon::ChannelSetupCaches<TextureCacheChannelI
     /// True when the API can do asynchronous texture downloads.
     static constexpr bool IMPLEMENTS_ASYNC_DOWNLOADS = P::IMPLEMENTS_ASYNC_DOWNLOADS;
 
-    static constexpr size_t UNSET_CHANNEL{std::numeric_limits<size_t>::max()};
+    static constexpr size_t UNSET_CHANNEL{(std::numeric_limits<size_t>::max)()};
 
     static constexpr s64 TARGET_THRESHOLD = 4_GiB;
     static constexpr s64 DEFAULT_EXPECTED_MEMORY = 1_GiB + 125_MiB;
diff --git a/src/video_core/texture_cache/util.cpp b/src/video_core/texture_cache/util.cpp
index e3faa5bf95..ede451b166 100644
--- a/src/video_core/texture_cache/util.cpp
+++ b/src/video_core/texture_cache/util.cpp
@@ -327,8 +327,8 @@ template <u32 GOB_EXTENT>
     }
     const SubresourceExtent resources = new_info.resources;
     return SubresourceExtent{
-        .levels = std::max(resources.levels, info.resources.levels),
-        .layers = std::max(resources.layers, info.resources.layers),
+        .levels = (std::max)(resources.levels, info.resources.levels),
+        .layers = (std::max)(resources.layers, info.resources.layers),
     };
 }
 
@@ -354,7 +354,7 @@ template <u32 GOB_EXTENT>
         return std::nullopt;
     }
     return SubresourceExtent{
-        .levels = std::max(new_info.resources.levels, info.resources.levels + base.level),
+        .levels = (std::max)(new_info.resources.levels, info.resources.levels + base.level),
         .layers = 1,
     };
 }
@@ -388,8 +388,8 @@ template <u32 GOB_EXTENT>
         return std::nullopt;
     }
     return SubresourceExtent{
-        .levels = std::max(new_info.resources.levels, info.resources.levels + base.level),
-        .layers = std::max(new_info.resources.layers, info.resources.layers + base.layer),
+        .levels = (std::max)(new_info.resources.levels, info.resources.levels + base.level),
+        .layers = (std::max)(new_info.resources.layers, info.resources.layers + base.layer),
     };
 }
 
@@ -439,14 +439,14 @@ template <u32 GOB_EXTENT>
         }
         layers = 1;
     } else {
-        layers = std::max(resources.layers, info.resources.layers + base->layer);
+        layers = (std::max)(resources.layers, info.resources.layers + base->layer);
     }
     return OverlapResult{
         .gpu_addr = overlap.gpu_addr,
         .cpu_addr = overlap.cpu_addr,
         .resources =
             {
-                .levels = std::max(resources.levels + base->level, info.resources.levels),
+                .levels = (std::max)(resources.levels + base->level, info.resources.levels),
                 .layers = layers,
             },
     };
diff --git a/src/video_core/textures/astc.cpp b/src/video_core/textures/astc.cpp
index fef0be31d8..85fd06957e 100644
--- a/src/video_core/textures/astc.cpp
+++ b/src/video_core/textures/astc.cpp
@@ -1291,7 +1291,7 @@ static void ComputeEndpoints(Pixel& ep1, Pixel& ep2, const u32*& colorValues,
     case 1: {
         READ_UINT_VALUES(2)
         u32 L0 = (v[0] >> 2) | (v[1] & 0xC0);
-        u32 L1 = std::min(L0 + (v[1] & 0x3F), 0xFFU);
+        u32 L1 = (std::min)(L0 + (v[1] & 0x3F), 0xFFU);
         ep1 = Pixel(0xFF, L0, L0, L0);
         ep2 = Pixel(0xFF, L1, L1, L1);
     } break;
@@ -1522,7 +1522,7 @@ static void DecompressBlock(std::span<const u8, 16> inBuf, const u32 blockWidth,
     // Read color data...
     u32 colorDataBits = remainingBits;
     while (remainingBits > 0) {
-        u32 nb = std::min(remainingBits, 8);
+        u32 nb = (std::min)(remainingBits, 8);
         u32 b = strm.ReadBits(nb);
         colorEndpointStream.WriteBits(b, nb);
         remainingBits -= 8;
@@ -1603,7 +1603,7 @@ static void DecompressBlock(std::span<const u8, 16> inBuf, const u32 blockWidth,
         texelWeightData[clearByteStart - 1] &=
             static_cast<u8>((1 << (weightParams.GetPackedBitSize() % 8)) - 1);
         std::memset(texelWeightData.data() + clearByteStart, 0,
-                    std::min(16U - clearByteStart, 16U));
+                    (std::min)(16U - clearByteStart, 16U));
     }
 
     IntegerEncodedVector texelWeightValues;
@@ -1674,8 +1674,8 @@ void Decompress(std::span<const uint8_t> data, uint32_t width, uint32_t height,
                     std::array<u32, 12 * 12> uncompData;
                     DecompressBlock(blockPtr, block_width, block_height, uncompData);
 
-                    u32 decompWidth = std::min(block_width, width - x);
-                    u32 decompHeight = std::min(block_height, height - y);
+                    u32 decompWidth = (std::min)(block_width, width - x);
+                    u32 decompHeight = (std::min)(block_height, height - y);
 
                     const std::span<u8> outRow = output.subspan(depth_offset + (y * width + x) * 4);
                     for (u32 h = 0; h < decompHeight; ++h) {
diff --git a/src/video_core/textures/decoders.cpp b/src/video_core/textures/decoders.cpp
index 95bcdd37b2..12e4ddf165 100644
--- a/src/video_core/textures/decoders.cpp
+++ b/src/video_core/textures/decoders.cpp
@@ -111,13 +111,13 @@ void SwizzleSubrectImpl(std::span<u8> output, std::span<const u8> input, u32 wid
     const u32 x_shift = GOB_SIZE_SHIFT + block_height + block_depth;
 
     u32 unprocessed_lines = num_lines;
-    u32 extent_y = std::min(num_lines, height - origin_y);
+    u32 extent_y = (std::min)(num_lines, height - origin_y);
 
     for (u32 slice = 0; slice < depth; ++slice) {
         const u32 z = slice + origin_z;
         const u32 offset_z = (z >> block_depth) * slice_size +
                              ((z & block_depth_mask) << (GOB_SIZE_SHIFT + block_height));
-        const u32 lines_in_y = std::min(unprocessed_lines, extent_y);
+        const u32 lines_in_y = (std::min)(unprocessed_lines, extent_y);
         for (u32 line = 0; line < lines_in_y; ++line) {
             const u32 y = line + origin_y;
             const u32 swizzled_y = pdep<SWIZZLE_Y_BITS>(y);
@@ -180,7 +180,7 @@ void UnswizzleTexture(std::span<u8> output, std::span<const u8> input, u32 bytes
                       u32 width, u32 height, u32 depth, u32 block_height, u32 block_depth,
                       u32 stride_alignment) {
     const u32 stride = Common::AlignUpLog2(width, stride_alignment) * bytes_per_pixel;
-    const u32 new_bpp = std::min(4U, static_cast<u32>(std::countr_zero(width * bytes_per_pixel)));
+    const u32 new_bpp = (std::min)(4U, static_cast<u32>(std::countr_zero(width * bytes_per_pixel)));
     width = (width * bytes_per_pixel) >> new_bpp;
     bytes_per_pixel = 1U << new_bpp;
     Swizzle<false>(output, input, bytes_per_pixel, width, height, depth, block_height, block_depth,
@@ -191,7 +191,7 @@ void SwizzleTexture(std::span<u8> output, std::span<const u8> input, u32 bytes_p
                     u32 height, u32 depth, u32 block_height, u32 block_depth,
                     u32 stride_alignment) {
     const u32 stride = Common::AlignUpLog2(width, stride_alignment) * bytes_per_pixel;
-    const u32 new_bpp = std::min(4U, static_cast<u32>(std::countr_zero(width * bytes_per_pixel)));
+    const u32 new_bpp = (std::min)(4U, static_cast<u32>(std::countr_zero(width * bytes_per_pixel)));
     width = (width * bytes_per_pixel) >> new_bpp;
     bytes_per_pixel = 1U << new_bpp;
     Swizzle<true>(output, input, bytes_per_pixel, width, height, depth, block_height, block_depth,
diff --git a/src/video_core/textures/texture.cpp b/src/video_core/textures/texture.cpp
index 39c08b5ae1..2798d5839f 100644
--- a/src/video_core/textures/texture.cpp
+++ b/src/video_core/textures/texture.cpp
@@ -75,7 +75,7 @@ float TSCEntry::MaxAnisotropy() const noexcept {
     if (anisotropic_settings == Settings::AnisotropyMode::Automatic) {
         added_anisotropic = Settings::values.resolution_info.up_scale >>
                             Settings::values.resolution_info.down_shift;
-        added_anisotropic = std::max(added_anisotropic - 1, 0);
+        added_anisotropic = (std::max)(added_anisotropic - 1, 0);
     } else {
         added_anisotropic = static_cast<u32>(Settings::values.max_anisotropy.GetValue()) - 1U;
     }
diff --git a/src/video_core/textures/workers.cpp b/src/video_core/textures/workers.cpp
index a71c305f49..01aa716e11 100644
--- a/src/video_core/textures/workers.cpp
+++ b/src/video_core/textures/workers.cpp
@@ -6,7 +6,7 @@
 namespace Tegra::Texture {
 
 Common::ThreadWorker& GetThreadWorkers() {
-    static Common::ThreadWorker workers{std::max(std::thread::hardware_concurrency(), 2U) / 2,
+    static Common::ThreadWorker workers{(std::max)(std::thread::hardware_concurrency(), 2U) / 2,
                                         "ImageTranscode"};
 
     return workers;
diff --git a/src/video_core/transform_feedback.cpp b/src/video_core/transform_feedback.cpp
index 1f353d2df0..5dda1ffafc 100644
--- a/src/video_core/transform_feedback.cpp
+++ b/src/video_core/transform_feedback.cpp
@@ -104,8 +104,8 @@ std::pair<std::array<Shader::TransformFeedbackVarying, 256>, u32> MakeTransformF
                 }
             }
             xfb[attribute] = varying;
-            count = std::max(count, attribute);
-            highest = std::max(highest, (base_offset + varying.components) * 4);
+            count = (std::max)(count, attribute);
+            highest = (std::max)(highest, (base_offset + varying.components) * 4);
         }
         UNIMPLEMENTED_IF(highest != layout.stride);
     }
diff --git a/src/video_core/vulkan_common/vulkan_device.cpp b/src/video_core/vulkan_common/vulkan_device.cpp
index 6fdf1e7874..6d7c33099b 100644
--- a/src/video_core/vulkan_common/vulkan_device.cpp
+++ b/src/video_core/vulkan_common/vulkan_device.cpp
@@ -699,9 +699,9 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR
         LOG_WARNING(Render_Vulkan,
                     "MVK driver breaks when using more than 16 vertex attributes/bindings");
         properties.properties.limits.maxVertexInputAttributes =
-            std::min(properties.properties.limits.maxVertexInputAttributes, 16U);
+            (std::min)(properties.properties.limits.maxVertexInputAttributes, 16U);
         properties.properties.limits.maxVertexInputBindings =
-            std::min(properties.properties.limits.maxVertexInputBindings, 16U);
+            (std::min)(properties.properties.limits.maxVertexInputBindings, 16U);
     }
 
     if (is_turnip) {
diff --git a/src/video_core/vulkan_common/vulkan_memory_allocator.cpp b/src/video_core/vulkan_common/vulkan_memory_allocator.cpp
index 4ab420afea..675dede61c 100644
--- a/src/video_core/vulkan_common/vulkan_memory_allocator.cpp
+++ b/src/video_core/vulkan_common/vulkan_memory_allocator.cpp
@@ -136,7 +136,7 @@ namespace Vulkan {
             if (vmaMapMemory(allocator, allocation, &mapped_ptr) != VK_SUCCESS) return {};
         }
         const size_t n = static_cast<size_t>(std::min<VkDeviceSize>(size,
-                                                                    std::numeric_limits<size_t>::max()));
+                                                                    (std::numeric_limits<size_t>::max)()));
         return std::span<u8>{static_cast<u8 *>(mapped_ptr), n};
     }
 
@@ -149,7 +149,7 @@ namespace Vulkan {
             const_cast<MemoryCommit *>(this)->mapped_ptr = p;
         }
         const size_t n = static_cast<size_t>(std::min<VkDeviceSize>(size,
-                                                                    std::numeric_limits<size_t>::max()));
+                                                                    (std::numeric_limits<size_t>::max)()));
         return std::span<const u8>{static_cast<const u8 *>(mapped_ptr), n};
     }
 
diff --git a/src/video_core/vulkan_common/vulkan_wrapper.h b/src/video_core/vulkan_common/vulkan_wrapper.h
index 8fd0bff6af..6501094f05 100644
--- a/src/video_core/vulkan_common/vulkan_wrapper.h
+++ b/src/video_core/vulkan_common/vulkan_wrapper.h
@@ -860,7 +860,7 @@ public:
     /// Set object name.
     void SetObjectNameEXT(const char* name) const;
 
-    VkResult Wait(u64 timeout = std::numeric_limits<u64>::max()) const noexcept {
+    VkResult Wait(u64 timeout = (std::numeric_limits<u64>::max)()) const noexcept {
         return dld->vkWaitForFences(owner, 1, &handle, true, timeout);
     }
 
@@ -961,7 +961,7 @@ public:
      * @param timeout Time in nanoseconds to timeout
      * @return        True on successful wait, false on timeout
      */
-    bool Wait(u64 value, u64 timeout = std::numeric_limits<u64>::max()) const {
+    bool Wait(u64 value, u64 timeout = (std::numeric_limits<u64>::max)()) const {
         const VkSemaphoreWaitInfo wait_info{
             .sType = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO,
             .pNext = nullptr,
diff --git a/src/yuzu/CMakeLists.txt b/src/yuzu/CMakeLists.txt
index 38b7b0eec7..d663f6c282 100644
--- a/src/yuzu/CMakeLists.txt
+++ b/src/yuzu/CMakeLists.txt
@@ -256,7 +256,7 @@ if (YUZU_CRASH_DUMPS)
     target_compile_definitions(yuzu PRIVATE YUZU_CRASH_DUMPS)
 endif()
 
-if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+if (CXX_CLANG)
     target_compile_definitions(yuzu PRIVATE
         $<$<VERSION_LESS:$<CXX_COMPILER_VERSION>,15>:CANNOT_EXPLICITLY_INSTANTIATE>
     )
diff --git a/src/yuzu/about_dialog.cpp b/src/yuzu/about_dialog.cpp
index 5b6e32149d..c8edb90268 100644
--- a/src/yuzu/about_dialog.cpp
+++ b/src/yuzu/about_dialog.cpp
@@ -11,14 +11,15 @@ AboutDialog::AboutDialog(QWidget* parent)
     : QDialog(parent)
     , ui{std::make_unique<Ui::AboutDialog>()}
 {
-    const auto description = std::string(Common::g_build_version);
-    const auto build_id = std::string(Common::g_build_id);
+    static const std::string description = std::string(Common::g_build_version);
+    static const std::string build_id = std::string(Common::g_build_id);
+    static const std::string compiler = std::string(Common::g_compiler_id);
 
     std::string yuzu_build;
     if (Common::g_is_dev_build) {
-        yuzu_build = fmt::format("Eden Nightly | {}-{}", description, build_id);
+        yuzu_build = fmt::format("Eden Nightly | {}-{} | {}", description, build_id, compiler);
     } else {
-        yuzu_build = fmt::format("Eden | {}", description);
+        yuzu_build = fmt::format("Eden | {} | {}", description, compiler);
     }
 
     const auto override_build = fmt::format(fmt::runtime(
diff --git a/src/yuzu/bootmanager.cpp b/src/yuzu/bootmanager.cpp
index b1ca497e32..63e7a74003 100644
--- a/src/yuzu/bootmanager.cpp
+++ b/src/yuzu/bootmanager.cpp
@@ -381,8 +381,8 @@ qreal GRenderWindow::windowPixelRatio() const {
 
 std::pair<u32, u32> GRenderWindow::ScaleTouch(const QPointF& pos) const {
     const qreal pixel_ratio = windowPixelRatio();
-    return {static_cast<u32>(std::max(std::round(pos.x() * pixel_ratio), qreal{0.0})),
-            static_cast<u32>(std::max(std::round(pos.y() * pixel_ratio), qreal{0.0}))};
+    return {static_cast<u32>((std::max)(std::round(pos.x() * pixel_ratio), qreal{0.0})),
+            static_cast<u32>((std::max)(std::round(pos.y() * pixel_ratio), qreal{0.0}))};
 }
 
 void GRenderWindow::closeEvent(QCloseEvent* event) {
diff --git a/src/yuzu/configuration/configure_touch_from_button.cpp b/src/yuzu/configuration/configure_touch_from_button.cpp
index a6237ab72f..2a4ae3bc89 100644
--- a/src/yuzu/configuration/configure_touch_from_button.cpp
+++ b/src/yuzu/configuration/configure_touch_from_button.cpp
@@ -484,8 +484,8 @@ void TouchScreenPreview::resizeEvent(QResizeEvent* event) {
         return;
     }
 
-    const int target_width = std::min(width(), height() * 4 / 3);
-    const int target_height = std::min(height(), width() * 3 / 4);
+    const int target_width = (std::min)(width(), height() * 4 / 3);
+    const int target_height = (std::min)(height(), width() * 3 / 4);
     if (target_width == width() && target_height == height()) {
         return;
     }
diff --git a/src/yuzu/game_list.cpp b/src/yuzu/game_list.cpp
index 80dd90d876..1ecef4af92 100644
--- a/src/yuzu/game_list.cpp
+++ b/src/yuzu/game_list.cpp
@@ -490,7 +490,7 @@ void GameList::DonePopulating(const QStringList& watch_list) {
     // Also artificially caps the watcher to a certain number of directories
     constexpr int LIMIT_WATCH_DIRECTORIES = 5000;
     constexpr int SLICE_SIZE = 25;
-    int len = std::min(static_cast<int>(watch_list.size()), LIMIT_WATCH_DIRECTORIES);
+    int len = (std::min)(static_cast<int>(watch_list.size()), LIMIT_WATCH_DIRECTORIES);
     for (int i = 0; i < len; i += SLICE_SIZE) {
         watcher->addPaths(watch_list.mid(i, i + SLICE_SIZE));
         QCoreApplication::processEvents();
diff --git a/src/yuzu/main.cpp b/src/yuzu/main.cpp
index 4604a7b904..c6e004813c 100644
--- a/src/yuzu/main.cpp
+++ b/src/yuzu/main.cpp
@@ -386,6 +386,7 @@ static void OverrideWindowsFont() {
 }
 #endif
 
+#ifndef _WIN32
 inline static bool isDarkMode() {
 #if QT_VERSION >= QT_VERSION_CHECK(6, 5, 0)
     const auto scheme = QGuiApplication::styleHints()->colorScheme();
@@ -397,6 +398,7 @@ inline static bool isDarkMode() {
     return text.lightness() > window.lightness();
 #endif // QT_VERSION
 }
+#endif // _WIN32
 
 GMainWindow::GMainWindow(bool has_broken_vulkan)
     : ui{std::make_unique<Ui::MainWindow>()}, system{std::make_unique<Core::System>()},
@@ -2473,7 +2475,7 @@ void GMainWindow::StoreRecentFile(const QString& filename) {
 
 void GMainWindow::UpdateRecentFiles() {
     const int num_recent_files =
-        std::min(static_cast<int>(UISettings::values.recent_files.size()), max_recent_files_item);
+        (std::min)(static_cast<int>(UISettings::values.recent_files.size()), max_recent_files_item);
 
     for (int i = 0; i < num_recent_files; i++) {
         const QString text = QStringLiteral("&%1. %2").arg(i + 1).arg(
@@ -2652,7 +2654,7 @@ static bool RomFSRawCopy(size_t total_size, size_t& read_size, QProgressDialog&
             if ((new_timestamp - last_timestamp) > 33ms) {
                 last_timestamp = new_timestamp;
                 dialog.setValue(
-                    static_cast<int>(std::min(read_size, total_size) * 100 / total_size));
+                    static_cast<int>((std::min)(read_size, total_size) * 100 / total_size));
                 QCoreApplication::processEvents();
             }
 
@@ -4115,7 +4117,7 @@ void GMainWindow::OnDecreaseVolume() {
     if (current_volume <= 6) {
         step = 1;
     }
-    Settings::values.volume.SetValue(std::max(current_volume - step, 0));
+    Settings::values.volume.SetValue((std::max)(current_volume - step, 0));
     UpdateVolumeUI();
 }
 
@@ -5020,14 +5022,15 @@ void GMainWindow::OnEmulatorUpdateAvailable() {
 
 void GMainWindow::UpdateWindowTitle(std::string_view title_name, std::string_view title_version,
                                     std::string_view gpu_vendor) {
-    const auto description = std::string(Common::g_build_version);
-    const auto build_id = std::string(Common::g_build_id);
+    static const std::string description = std::string(Common::g_build_version);
+    static const std::string build_id = std::string(Common::g_build_id);
+    static const std::string compiler = std::string(Common::g_compiler_id);
 
     std::string yuzu_title;
     if (Common::g_is_dev_build) {
-        yuzu_title = fmt::format("Eden Nightly | {}-{}", description, build_id);
+        yuzu_title = fmt::format("Eden Nightly | {}-{} | {}", description, build_id, compiler);
     } else {
-        yuzu_title = fmt::format("Eden | {}", description);
+        yuzu_title = fmt::format("Eden | {} | {}", description, compiler);
     }
 
     const auto override_title =
@@ -5674,13 +5677,13 @@ void VolumeButton::wheelEvent(QWheelEvent* event) {
 
     if (num_steps > 0) {
         Settings::values.volume.SetValue(
-            std::min(200, Settings::values.volume.GetValue() + num_steps));
+            (std::min)(200, Settings::values.volume.GetValue() + num_steps));
     } else {
         Settings::values.volume.SetValue(
-            std::max(0, Settings::values.volume.GetValue() + num_steps));
+            (std::max)(0, Settings::values.volume.GetValue() + num_steps));
     }
 
-    scroll_multiplier = std::min(MaxMultiplier, scroll_multiplier * 2);
+    scroll_multiplier = (std::min)(MaxMultiplier, scroll_multiplier * 2);
     scroll_timer.start(100); // reset the multiplier if no scroll event occurs within 100 ms
 
     emit VolumeChanged();
@@ -5721,11 +5724,11 @@ static void SetHighDPIAttributes() {
     constexpr float minimum_width = 1350.0f;
     constexpr float minimum_height = 900.0f;
 
-    const float width_ratio = std::max(1.0f, real_width / minimum_width);
-    const float height_ratio = std::max(1.0f, real_height / minimum_height);
+    const float width_ratio = (std::max)(1.0f, real_width / minimum_width);
+    const float height_ratio = (std::max)(1.0f, real_height / minimum_height);
 
     // Get the lower of the 2 ratios and truncate, this is the maximum integer scale.
-    const float max_ratio = std::trunc(std::min(width_ratio, height_ratio));
+    const float max_ratio = std::trunc((std::min)(width_ratio, height_ratio));
 
     if (max_ratio > real_ratio) {
         QApplication::setHighDpiScaleFactorRoundingPolicy(
diff --git a/src/yuzu/play_time_manager.cpp b/src/yuzu/play_time_manager.cpp
index 2669e3a7ab..8317386816 100644
--- a/src/yuzu/play_time_manager.cpp
+++ b/src/yuzu/play_time_manager.cpp
@@ -168,7 +168,7 @@ QString ReadablePlayTime(qulonglong time_seconds) {
     if (time_seconds == 0) {
         return {};
     }
-    const auto time_minutes = std::max(static_cast<double>(time_seconds) / 60, 1.0);
+    const auto time_minutes = (std::max)(static_cast<double>(time_seconds) / 60, 1.0);
     const auto time_hours = static_cast<double>(time_seconds) / 3600;
     const bool is_minutes = time_minutes < 60;
     const char* unit = is_minutes ? "m" : "h";
diff --git a/src/yuzu/util/util.cpp b/src/yuzu/util/util.cpp
index 551df7b4cd..844da5c401 100644
--- a/src/yuzu/util/util.cpp
+++ b/src/yuzu/util/util.cpp
@@ -30,7 +30,7 @@ QString ReadableByteSize(qulonglong size) {
         return QStringLiteral("0");
     }
 
-    const int digit_groups = std::min(static_cast<int>(std::log10(size) / std::log10(1024)),
+    const int digit_groups = (std::min)(static_cast<int>(std::log10(size) / std::log10(1024)),
                                       static_cast<int>(units.size()));
     return QStringLiteral("%L1 %2")
         .arg(size / std::pow(1024, digit_groups), 0, 'f', 1)
diff --git a/tools/cpm-fetch.sh b/tools/cpm-fetch.sh
index 648bbae1c8..5620996433 100755
--- a/tools/cpm-fetch.sh
+++ b/tools/cpm-fetch.sh
@@ -59,7 +59,7 @@ download_package() {
   if grep -e "patches" <<< "$JSON" > /dev/null; then
     PATCHES=$(jq -r '.patches | join(" ")' <<< "$JSON")
     for patch in $PATCHES; do
-      patch -p1 < "$ROOTDIR"/.patch/$package/$patch
+      patch --binary -p1 < "$ROOTDIR"/.patch/$package/$patch
     done
   fi
 
@@ -118,6 +118,14 @@ do
     continue
   fi
 
+  VERSION=$(jq -r ".version" <<< "$JSON")
+  GIT_VERSION=$(jq -r ".git_version" <<< "$JSON")
+  TAG=$(jq -r ".tag" <<< "$JSON")
+  SHA=$(jq -r ".sha" <<< "$JSON")
+
+  [ "$GIT_VERSION" == null ] && GIT_VERSION="$VERSION"
+  [ "$GIT_VERSION" == null ] && GIT_VERSION="$TAG"
+
   # url parsing WOOOHOOHOHOOHOHOH
   URL=$(jq -r ".url" <<< "$JSON")
   REPO=$(jq -r ".repo" <<< "$JSON")
@@ -173,7 +181,7 @@ do
   # key parsing
   KEY=$(jq -r ".key" <<< "$JSON")
 
-  if [ "$KEY" == null ]; then    
+  if [ "$KEY" == null ]; then
     if [ "$SHA" != null ]; then
       KEY=$(cut -c1-4 - <<< "$SHA")
     elif [ "$GIT_VERSION" != null ]; then

From 2502352180c6a96b2b544f3bbf41b5569af3441d Mon Sep 17 00:00:00 2001
From: Caio Oliveira <caiooliveirafarias0@gmail.com>
Date: Wed, 10 Sep 2025 02:22:07 +0200
Subject: [PATCH 32/38] [core, desktop] "fixes" from building on
 mxe/x86_64-w64-mingw32 (#396)

* I couldn't build the executable in the end, but in any case these
  build errors could come back to bite us later
* add missing includes (<optional>)
* guard MSVC-only `#pragma comment(lib, ...)` directives with _MSC_VER

* I saw some of these changes in another PR, but I can't find it at the moment

Signed-off-by: Caio Oliveira <caiooliveirafarias0@gmail.com>

Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/396
Reviewed-by: crueter <crueter@eden-emu.dev>
Co-authored-by: Caio Oliveira <caiooliveirafarias0@gmail.com>
Co-committed-by: Caio Oliveira <caiooliveirafarias0@gmail.com>
---
 src/core/file_sys/vfs/vfs_real.cpp                    | 6 +++++-
 src/core/hle/service/nifm/nifm.cpp                    | 3 +++
 src/core/hle/service/nvnflinger/hardware_composer.cpp | 2 ++
 src/core/internal_network/emu_net_state.cpp           | 2 ++
 src/core/internal_network/wifi_scanner.cpp            | 2 ++
 src/yuzu/main.cpp                                     | 2 ++
 6 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/src/core/file_sys/vfs/vfs_real.cpp b/src/core/file_sys/vfs/vfs_real.cpp
index 052684e9db..4199667171 100644
--- a/src/core/file_sys/vfs/vfs_real.cpp
+++ b/src/core/file_sys/vfs/vfs_real.cpp
@@ -1,3 +1,6 @@
+// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
+
 // SPDX-FileCopyrightText: Copyright 2018 yuzu Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later
 
@@ -442,11 +445,12 @@ std::vector<VirtualFile> RealVfsDirectory::GetFiles() const {
 FileTimeStampRaw RealVfsDirectory::GetFileTimeStamp(std::string_view path_) const {
     const auto full_path = FS::SanitizePath(path + '/' + std::string(path_));
     const auto fs_path = std::filesystem::path{FS::ToU8String(full_path)};
-    struct stat file_status;
 
 #ifdef _WIN32
+    struct _stat64 file_status;
     const auto stat_result = _wstat64(fs_path.c_str(), &file_status);
 #else
+    struct stat file_status;
     const auto stat_result = stat(fs_path.c_str(), &file_status);
 #endif
 
diff --git a/src/core/hle/service/nifm/nifm.cpp b/src/core/hle/service/nifm/nifm.cpp
index 15c7d8d2c7..7d43677c6f 100644
--- a/src/core/hle/service/nifm/nifm.cpp
+++ b/src/core/hle/service/nifm/nifm.cpp
@@ -21,6 +21,7 @@
 #include <atomic>
 #include <chrono>
 #include <mutex>
+#include <optional>
 #include <thread>
 #include <unordered_set>
 #include <common/settings.h>
@@ -32,8 +33,10 @@
 #undef interface
 #include <wlanapi.h>
 #pragma pop_macro("interface")
+#ifdef _MSC_VER
 #pragma comment(lib, "wlanapi.lib")
 #endif
+#endif
 
 namespace {
 
diff --git a/src/core/hle/service/nvnflinger/hardware_composer.cpp b/src/core/hle/service/nvnflinger/hardware_composer.cpp
index 7098f4709d..a262a3dcd5 100644
--- a/src/core/hle/service/nvnflinger/hardware_composer.cpp
+++ b/src/core/hle/service/nvnflinger/hardware_composer.cpp
@@ -4,6 +4,8 @@
 // SPDX-FileCopyrightText: Copyright 2024 yuzu Emulator Project
 // SPDX-License-Identifier: GPL-3.0-or-later
 
+#include <optional>
+
 #include <boost/container/small_vector.hpp>
 
 #include "core/hle/service/nvdrv/devices/nvdisp_disp0.h"
diff --git a/src/core/internal_network/emu_net_state.cpp b/src/core/internal_network/emu_net_state.cpp
index 17fa58fa08..d6d1a70a60 100644
--- a/src/core/internal_network/emu_net_state.cpp
+++ b/src/core/internal_network/emu_net_state.cpp
@@ -10,8 +10,10 @@
 #define NOMINMAX
 #include <windows.h>
 #include <wlanapi.h>
+#ifdef _MSC_VER
 #pragma comment(lib, "wlanapi.lib")
 #endif
+#endif
 #include <common/settings.h>
 
 #include <mutex>
diff --git a/src/core/internal_network/wifi_scanner.cpp b/src/core/internal_network/wifi_scanner.cpp
index f4b1738e69..127221099f 100644
--- a/src/core/internal_network/wifi_scanner.cpp
+++ b/src/core/internal_network/wifi_scanner.cpp
@@ -15,8 +15,10 @@ using namespace std::chrono_literals;
 #define NOMINMAX
 #include <windows.h>
 #include <wlanapi.h>
+#ifdef _MSC_VER
 #pragma comment(lib, "wlanapi.lib")
 #endif
+#endif
 
 namespace Network {
 #ifdef ENABLE_WIFI_SCAN
diff --git a/src/yuzu/main.cpp b/src/yuzu/main.cpp
index c6e004813c..e23e9a6a48 100644
--- a/src/yuzu/main.cpp
+++ b/src/yuzu/main.cpp
@@ -178,7 +178,9 @@ static FileSys::VirtualFile VfsDirectoryCreateFileWrapper(const FileSys::Virtual
 #include <QPlatformSurfaceEvent>
 #include <dwmapi.h>
 #include <windows.h>
+#ifdef _MSC_VER
 #pragma comment(lib, "Dwmapi.lib")
+#endif
 
 static inline void ApplyWindowsTitleBarDarkMode(HWND hwnd, bool enabled) {
     if (!hwnd)

From 13ecc1e481cbb455ef89291ee9f2421386657dcb Mon Sep 17 00:00:00 2001
From: Marcin Serwin <marcin@serwin.dev>
Date: Wed, 10 Sep 2025 18:36:42 +0200
Subject: [PATCH 33/38] [cmake] fix issues when using CPMUTIL_FORCE_SYSTEM
 (#399)

Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/399
Reviewed-by: crueter <crueter@eden-emu.dev>
Reviewed-by: MaranBr <maranbr@eden-emu.dev>
Co-authored-by: Marcin Serwin <marcin@serwin.dev>
Co-committed-by: Marcin Serwin <marcin@serwin.dev>
---
 CMakeLists.txt                           |  7 +++++++
 CMakeModules/Findsirit.cmake             | 11 +++++++++++
 externals/cpmfile.json                   |  1 +
 externals/nx_tzdb/CMakeLists.txt         |  9 ++++++---
 src/core/CMakeLists.txt                  |  2 +-
 src/dynarmic/src/dynarmic/CMakeLists.txt |  4 ++++
 6 files changed, 30 insertions(+), 4 deletions(-)
 create mode 100644 CMakeModules/Findsirit.cmake

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 144e77684e..6a9e15cfbd 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -204,6 +204,8 @@ CMAKE_DEPENDENT_OPTION(YUZU_USE_FASTER_LD "Check if a faster linker is available
 
 CMAKE_DEPENDENT_OPTION(USE_SYSTEM_MOLTENVK "Use the system MoltenVK lib (instead of the bundled one)" OFF "APPLE" OFF)
 
+set(YUZU_TZDB_PATH "" CACHE STRING "Path to a pre-downloaded timezone database")
+
 set(DEFAULT_ENABLE_OPENSSL ON)
 if (ANDROID OR WIN32 OR APPLE OR PLATFORM_SUN)
     # - Windows defaults to the Schannel backend.
@@ -465,6 +467,7 @@ else()
     find_package(Opus 1.3 MODULE REQUIRED)
     find_package(ZLIB 1.2 REQUIRED)
     find_package(zstd 1.5 REQUIRED MODULE)
+    find_package(Boost 1.79.0 REQUIRED headers context system fiber)
 
     if (YUZU_TESTS)
         find_package(Catch2 3.0.1 REQUIRED)
@@ -596,6 +599,10 @@ find_package(libusb)
 find_package(VulkanMemoryAllocator)
 find_package(SPIRV-Tools)
 
+if (ARCHITECTURE_x86 OR ARCHITECTURE_x86_64)
+    find_package(xbyak)
+endif()
+
 if (ENABLE_WEB_SERVICE)
     find_package(httplib)
 endif()
diff --git a/CMakeModules/Findsirit.cmake b/CMakeModules/Findsirit.cmake
new file mode 100644
index 0000000000..1611efaad8
--- /dev/null
+++ b/CMakeModules/Findsirit.cmake
@@ -0,0 +1,11 @@
+# SPDX-FileCopyrightText: 2025 Eden Emulator Project
+# SPDX-License-Identifier: GPL-3.0-or-later
+
+include(FindPackageHandleStandardArgs)
+
+find_package(PkgConfig QUIET)
+pkg_search_module(sirit QUIET IMPORTED_TARGET sirit)
+find_package_handle_standard_args(sirit
+    REQUIRED_VARS sirit_LINK_LIBRARIES
+    VERSION_VAR sirit_VERSION
+)
diff --git a/externals/cpmfile.json b/externals/cpmfile.json
index 57258f771b..65f50ffdfc 100644
--- a/externals/cpmfile.json
+++ b/externals/cpmfile.json
@@ -15,6 +15,7 @@
         "repo": "eden-emulator/sirit",
         "sha": "db1f1e8ab5",
         "hash": "73eb3a042848c63a10656545797e85f40d142009dfb7827384548a385e1e28e1ac72f42b25924ce530d58275f8638554281e884d72f9c7aaf4ed08690a414b05",
+        "find_args": "MODULE",
         "options": [
             "SIRIT_USE_SYSTEM_SPIRV_HEADERS ON"
         ]
diff --git a/externals/nx_tzdb/CMakeLists.txt b/externals/nx_tzdb/CMakeLists.txt
index 242e1e1fcf..a08c80f2bd 100644
--- a/externals/nx_tzdb/CMakeLists.txt
+++ b/externals/nx_tzdb/CMakeLists.txt
@@ -33,9 +33,12 @@ if (CAN_BUILD_NX_TZDB AND NOT YUZU_DOWNLOAD_TIME_ZONE_DATA)
     set(NX_TZDB_TZ_DIR "${NX_TZDB_BASE_DIR}/zoneinfo")
 endif()
 
-# TODO(crueter): This is a terrible solution, but MSVC fails to link without it
-# Need to investigate further but I still can't reproduce...
-if (MSVC)
+if(NOT YUZU_TZDB_PATH STREQUAL "")
+    set(NX_TZDB_BASE_DIR "${YUZU_TZDB_PATH}")
+    set(NX_TZDB_TZ_DIR "${NX_TZDB_BASE_DIR}/zoneinfo")
+elseif (MSVC)
+    # TODO(crueter): This is a terrible solution, but MSVC fails to link without it
+    # Need to investigate further but I still can't reproduce...
     set(NX_TZDB_VERSION "250725")
     set(NX_TZDB_ARCHIVE "${CPM_SOURCE_CACHE}/nx_tzdb/${NX_TZDB_VERSION}.zip")
 
diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
index 0be60b55c6..1e8e4ec07a 100644
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -1200,7 +1200,7 @@ else()
     target_link_libraries(core PUBLIC Boost::headers)
 endif()
 
-target_link_libraries(core PRIVATE fmt::fmt nlohmann_json::nlohmann_json RenderDoc::API mbedtls)
+target_link_libraries(core PRIVATE fmt::fmt nlohmann_json::nlohmann_json RenderDoc::API mbedtls mbedcrypto)
 if (MINGW)
     target_link_libraries(core PRIVATE ${MSWSOCK_LIBRARY})
 endif()
diff --git a/src/dynarmic/src/dynarmic/CMakeLists.txt b/src/dynarmic/src/dynarmic/CMakeLists.txt
index efae44d917..b74626bcd5 100644
--- a/src/dynarmic/src/dynarmic/CMakeLists.txt
+++ b/src/dynarmic/src/dynarmic/CMakeLists.txt
@@ -160,6 +160,10 @@ if ("A64" IN_LIST DYNARMIC_FRONTENDS)
 endif()
 
 if ("x86_64" IN_LIST ARCHITECTURE)
+    # Newer versions of xbyak (>= 7.25.0) have stricter checks that currently
+    # fail in dynarmic
+    target_compile_definitions(dynarmic PRIVATE XBYAK_STRICT_CHECK_MEM_REG_SIZE=0)
+
     target_compile_definitions(dynarmic PRIVATE XBYAK_OLD_DISP_CHECK=1)
     target_link_libraries(dynarmic
         PRIVATE

From 3fbfd64722526ade3e4cd33d97838611c27f35ad Mon Sep 17 00:00:00 2001
From: MaranBr <maranbr@outlook.com>
Date: Fri, 12 Sep 2025 16:02:12 +0200
Subject: [PATCH 34/38] [fs] Fix integrity check validation for new updates
 (#395)

This fixes the integrity check validation for new updates.

Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/395
Reviewed-by: Shinmegumi <shinmegumi@eden-emu.dev>
Reviewed-by: crueter <crueter@eden-emu.dev>
Co-authored-by: MaranBr <maranbr@outlook.com>
Co-committed-by: MaranBr <maranbr@outlook.com>
---
 src/core/file_sys/content_archive.cpp         | 13 +++----
 ...ssystem_integrity_verification_storage.cpp | 36 +++++--------------
 .../fssystem_nca_file_system_driver.cpp       |  8 ++---
 3 files changed, 17 insertions(+), 40 deletions(-)

diff --git a/src/core/file_sys/content_archive.cpp b/src/core/file_sys/content_archive.cpp
index 6652523589..b961cdb096 100644
--- a/src/core/file_sys/content_archive.cpp
+++ b/src/core/file_sys/content_archive.cpp
@@ -34,12 +34,9 @@ NCA::NCA(VirtualFile file_, const NCA* base_nca)
     }
 
     reader = std::make_shared<NcaReader>();
-    if (Result rc =
-            reader->Initialize(file, GetCryptoConfiguration(), GetNcaCompressionConfiguration());
-        R_FAILED(rc)) {
+    if (Result rc = reader->Initialize(file, GetCryptoConfiguration(), GetNcaCompressionConfiguration()); R_FAILED(rc)) {
         if (rc != ResultInvalidNcaSignature) {
-            LOG_ERROR(Loader, "File reader errored out during header read: {:#x}",
-                      rc.GetInnerValue());
+            LOG_ERROR(Loader, "File reader errored out during header read: {:#x}", rc.GetInnerValue());
         }
         status = Loader::ResultStatus::ErrorBadNCAHeader;
         return;
@@ -84,10 +81,8 @@ NCA::NCA(VirtualFile file_, const NCA* base_nca)
     std::vector<VirtualFile> filesystems(fs_count);
     for (s32 i = 0; i < fs_count; i++) {
         NcaFsHeaderReader header_reader;
-        const Result rc = fs.OpenStorage(&filesystems[i], &header_reader, i);
-        if (R_FAILED(rc)) {
-            LOG_ERROR(Loader, "File reader errored out during read of section {}: {:#x}", i,
-                      rc.GetInnerValue());
+        if (Result rc = fs.OpenStorage(&filesystems[i], &header_reader, i); R_FAILED(rc)) {
+            LOG_DEBUG(Loader, "File reader errored out during read of section {}: {:#x}", i, rc.GetInnerValue());
             status = Loader::ResultStatus::ErrorBadNCAHeader;
             return;
         }
diff --git a/src/core/file_sys/fssystem/fssystem_integrity_verification_storage.cpp b/src/core/file_sys/fssystem/fssystem_integrity_verification_storage.cpp
index 57cdc19248..c1bad3ec36 100644
--- a/src/core/file_sys/fssystem/fssystem_integrity_verification_storage.cpp
+++ b/src/core/file_sys/fssystem/fssystem_integrity_verification_storage.cpp
@@ -4,23 +4,18 @@
 // SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later
 
-#include "core/file_sys/fssystem/fssystem_integrity_verification_storage.h"
 #include "common/alignment.h"
+#include "core/file_sys/fssystem/fssystem_integrity_verification_storage.h"
 
 namespace FileSys {
 
-constexpr inline u32 ILog2(u32 val)
-{
+constexpr inline u32 ILog2(u32 val) {
     ASSERT(val > 0);
     return static_cast<u32>((sizeof(u32) * 8) - 1 - std::countl_zero<u32>(val));
 }
 
-void IntegrityVerificationStorage::Initialize(VirtualFile hs,
-                                              VirtualFile ds,
-                                              s64 verif_block_size,
-                                              s64 upper_layer_verif_block_size,
-                                              bool is_real_data)
-{
+void IntegrityVerificationStorage::Initialize(VirtualFile hs, VirtualFile ds, s64 verif_block_size,
+                                              s64 upper_layer_verif_block_size, bool is_real_data) {
     // Validate preconditions.
     ASSERT(verif_block_size >= HashSize);
 
@@ -40,28 +35,22 @@ void IntegrityVerificationStorage::Initialize(VirtualFile hs,
     ASSERT(m_upper_layer_verification_block_size == 1ll << m_upper_layer_verification_block_order);
 
     // Validate sizes.
-    if (m_data_storage != nullptr) {
+    {
         s64 hash_size = m_hash_storage->GetSize();
         s64 data_size = m_data_storage->GetSize();
         ASSERT(((hash_size / HashSize) * m_verification_block_size) >= data_size);
-    } else {
-        LOG_ERROR(Loader,
-                  "Failed to initialize integrity verification store. Game, update, or DLC may not "
-                  "work.");
     }
 
     // Set data.
     m_is_real_data = is_real_data;
 }
 
-void IntegrityVerificationStorage::Finalize()
-{
+void IntegrityVerificationStorage::Finalize() {
     m_hash_storage = VirtualFile();
     m_data_storage = VirtualFile();
 }
 
-size_t IntegrityVerificationStorage::Read(u8* buffer, size_t size, size_t offset) const
-{
+size_t IntegrityVerificationStorage::Read(u8* buffer, size_t size, size_t offset) const {
     // Succeed if zero size.
     if (size == 0) {
         return size;
@@ -70,13 +59,7 @@ size_t IntegrityVerificationStorage::Read(u8* buffer, size_t size, size_t offset
     // Validate arguments.
     ASSERT(buffer != nullptr);
 
-    if (m_data_storage == nullptr) {
-        LOG_ERROR(Loader,
-                  "Integrity verification store failed read operation. Game, update or DLC may not "
-                  "work.");
-        return 0;
-    }
-
+    // Validate the offset.
     s64 data_size = m_data_storage->GetSize();
     ASSERT(offset <= static_cast<size_t>(data_size));
 
@@ -104,8 +87,7 @@ size_t IntegrityVerificationStorage::Read(u8* buffer, size_t size, size_t offset
     return m_data_storage->Read(buffer, read_size, offset);
 }
 
-size_t IntegrityVerificationStorage::GetSize() const
-{
+size_t IntegrityVerificationStorage::GetSize() const {
     return m_data_storage->GetSize();
 }
 
diff --git a/src/core/file_sys/fssystem/fssystem_nca_file_system_driver.cpp b/src/core/file_sys/fssystem/fssystem_nca_file_system_driver.cpp
index 1bc7039318..4cfa5c58f8 100644
--- a/src/core/file_sys/fssystem/fssystem_nca_file_system_driver.cpp
+++ b/src/core/file_sys/fssystem/fssystem_nca_file_system_driver.cpp
@@ -1051,8 +1051,8 @@ Result NcaFileSystemDriver::CreatePatchMetaStorage(
     ASSERT(out_aes_ctr_ex_meta != nullptr);
     ASSERT(out_indirect_meta != nullptr);
     ASSERT(base_storage != nullptr);
-    ASSERT(patch_info.HasAesCtrExTable());
-    ASSERT(patch_info.HasIndirectTable());
+    //ASSERT(patch_info.HasAesCtrExTable());
+    //ASSERT(patch_info.HasIndirectTable());
     ASSERT(Common::IsAligned<s64>(patch_info.aes_ctr_ex_size, NcaHeader::XtsBlockSize));
 
     // Validate patch info extents.
@@ -1334,8 +1334,8 @@ Result NcaFileSystemDriver::CreateIntegrityVerificationStorageImpl(
             R_UNLESS(last_layer_info_offset + layer_info.size <= layer_info_offset,
                      ResultRomNcaInvalidIntegrityLayerInfoOffset);
         }
-        storage_info.SetDataStorage(std::make_shared<OffsetVfsFile>(
-            std::move(base_storage), layer_info.size, last_layer_info_offset));
+        storage_info[level_hash_info.max_layers - 1] = std::make_shared<OffsetVfsFile>(
+            std::move(base_storage), layer_info.size, last_layer_info_offset);
 
         // Make the integrity romfs storage.
         auto integrity_storage = std::make_shared<IntegrityRomFsStorage>();

From ad6045d9a4bb068054a0a713bb1d8940c1068e30 Mon Sep 17 00:00:00 2001
From: MaranBr <maranbr@outlook.com>
Date: Fri, 12 Sep 2025 16:02:25 +0200
Subject: [PATCH 35/38] [vk] Fix regression on PR 321 (#394)

Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/394
Reviewed-by: Shinmegumi <shinmegumi@eden-emu.dev>
Reviewed-by: crueter <crueter@eden-emu.dev>
Co-authored-by: MaranBr <maranbr@outlook.com>
Co-committed-by: MaranBr <maranbr@outlook.com>
---
 .../renderer_vulkan/vk_texture_cache.cpp      | 32 ++++++-------------
 1 file changed, 9 insertions(+), 23 deletions(-)

diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp
index 1e89652f50..466be26577 100644
--- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp
@@ -2160,34 +2160,20 @@ VkImageView ImageView::StorageView(Shader::TextureType texture_type,
     if (!image_handle) {
         return VK_NULL_HANDLE;
     }
-
+    if (image_format == Shader::ImageFormat::Typeless) {
+        return Handle(texture_type);
+    }
+    const bool is_signed{image_format == Shader::ImageFormat::R8_SINT ||
+                         image_format == Shader::ImageFormat::R16_SINT};
     if (!storage_views) {
         storage_views = std::make_unique<StorageViews>();
     }
-
-    // Storage images MUST use identity component mapping.
-    // Typeless: use the underlying image's native format.
-    if (image_format == Shader::ImageFormat::Typeless) {
-        auto& view = storage_views->unsigneds[static_cast<size_t>(texture_type)];
-        if (view) {
-            return *view;
-        }
-        const auto fmt_info =
-                MaxwellToVK::SurfaceFormat(*device, FormatType::Optimal, /*is_image=*/true, format);
-        const VkFormat vk_format = fmt_info.format;
-        // Storage images are color-aspect only
-        view = MakeView(vk_format, VK_IMAGE_ASPECT_COLOR_BIT); // identity components inside
+    auto& views{is_signed ? storage_views->signeds : storage_views->unsigneds};
+    auto& view{views[static_cast<size_t>(texture_type)]};
+    if (view) {
         return *view;
     }
-    const bool is_signed = (image_format == Shader::ImageFormat::R8_SINT ||image_format == Shader::ImageFormat::R16_SINT);
-    auto& views = is_signed ? storage_views->signeds : storage_views->unsigneds;
-    auto& view  = views[static_cast<size_t>(texture_type)];
-    if (view) {
-       return *view;
-    }
-
-    const VkFormat vk_format = Format(image_format);
-    view = MakeView(vk_format, VK_IMAGE_ASPECT_COLOR_BIT);// identity components inside
+    view = MakeView(Format(image_format), VK_IMAGE_ASPECT_COLOR_BIT);
     return *view;
 }
 

From 28d26b0d7677801a0da2bd9253c5bc342e1c8a4c Mon Sep 17 00:00:00 2001
From: Caio Oliveira <caiooliveirafarias0@gmail.com>
Date: Fri, 12 Sep 2025 16:07:35 +0200
Subject: [PATCH 36/38] [ci, docs] Refactor building with Windows (#400)

* this a initial PR to improve documentation of building under windows

Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/400
Reviewed-by: MaranBr <maranbr@eden-emu.dev>
Reviewed-by: crueter <crueter@eden-emu.dev>
Reviewed-by: Shinmegumi <shinmegumi@eden-emu.dev>
Co-authored-by: Caio Oliveira <caiooliveirafarias0@gmail.com>
Co-committed-by: Caio Oliveira <caiooliveirafarias0@gmail.com>
---
 .ci/windows/install-msvc.ps1       |  42 +++++
 .ci/windows/install-vulkan-sdk.ps1 |   6 +
 docs/build/Windows.md              | 262 ++++++++++++++++++-----------
 3 files changed, 212 insertions(+), 98 deletions(-)
 create mode 100755 .ci/windows/install-msvc.ps1

diff --git a/.ci/windows/install-msvc.ps1 b/.ci/windows/install-msvc.ps1
new file mode 100755
index 0000000000..b88f727ed8
--- /dev/null
+++ b/.ci/windows/install-msvc.ps1
@@ -0,0 +1,42 @@
+# SPDX-FileCopyrightText: 2025 Eden Emulator Project
+# SPDX-License-Identifier: GPL-3.0-or-later
+
+$ErrorActionPreference = "Stop"
+
+# Check if running as administrator
+if (-not ([bool](net session 2>$null))) {
+    Write-Host "This script must be run with administrator privileges!"
+    Exit 1
+}
+
+$VSVer = "17"
+$ExeFile = "vs_BuildTools.exe"
+$Uri = "https://aka.ms/vs/$VSVer/release/$ExeFile"
+$Destination = "./$ExeFile"
+
+Write-Host "Downloading Visual Studio Build Tools from $Uri"
+$WebClient = New-Object System.Net.WebClient
+$WebClient.DownloadFile($Uri, $Destination)
+Write-Host "Finished downloading $ExeFile"
+
+$VSROOT = "C:/VSBuildTools/$VSVer"
+$Arguments = @(
+    "--installPath `"$VSROOT`"",                               # set custom installation path
+    "--quiet",                                                  # suppress UI
+    "--wait",                                                   # wait for installation to complete
+    "--norestart",                                              # prevent automatic restart
+    "--add Microsoft.VisualStudio.Workload.VCTools",            # add C++ build tools workload
+    "--add Microsoft.VisualStudio.Component.VC.Tools.x86.x64",  # add core x86/x64 C++ tools
+    "--add Microsoft.VisualStudio.Component.Windows10SDK.19041" # add specific Windows SDK
+)
+
+Write-Host "Installing Visual Studio Build Tools"
+$InstallProcess = Start-Process -FilePath $Destination -NoNewWindow -PassThru -Wait -ArgumentList $Arguments
+$ExitCode = $InstallProcess.ExitCode
+
+if ($ExitCode -ne 0) {
+    Write-Host "Error installing Visual Studio Build Tools (Error: $ExitCode)"
+    Exit $ExitCode
+}
+
+Write-Host "Finished installing Visual Studio Build Tools"
diff --git a/.ci/windows/install-vulkan-sdk.ps1 b/.ci/windows/install-vulkan-sdk.ps1
index 1d4e1b20bf..4c5274d1b7 100755
--- a/.ci/windows/install-vulkan-sdk.ps1
+++ b/.ci/windows/install-vulkan-sdk.ps1
@@ -3,6 +3,12 @@
 
 $ErrorActionPreference = "Stop"
 
+# Check if running as administrator
+if (-not ([bool](net session 2>$null))) {
+    Write-Host "This script must be run with administrator privileges!"
+    Exit 1
+}
+
 $VulkanSDKVer = "1.4.321.1"
 $ExeFile = "vulkansdk-windows-X64-$VulkanSDKVer.exe"
 $Uri = "https://sdk.lunarg.com/sdk/download/$VulkanSDKVer/windows/$ExeFile"
diff --git a/docs/build/Windows.md b/docs/build/Windows.md
index c1792983aa..76602e6d69 100644
--- a/docs/build/Windows.md
+++ b/docs/build/Windows.md
@@ -1,149 +1,195 @@
-# THIS GUIDE IS INTENDED FOR DEVELOPERS ONLY, SUPPORT WILL ONLY BE GIVEN IF YOU'RE A DEVELOPER.
+# ⚠️ This guide is for developers ONLY! Support will be provided to developers ONLY.
 
-## Method I: MSVC Build for Windows
+## 📋 Current building methods:
 
-### Minimal Dependencies
+* [ Minimal Dependencies](#minimal-dependencies)
+* [⚡ Method I: MSVC Build for Windows](#method-i-msvc-build-for-windows)
+* [🐧 Method II: MinGW-w64 Build with MSYS2](#method-ii-mingw-w64-build-with-msys2)
+* [🖥️ Method III: CLion Environment Setup](#method-iii-clion-environment-setup)
+* [💻 Building from the command line with MSVC](#building-from-the-command-line-with-msvc)
+* [📜 Building with Scripts](#building-with-scripts)
 
-On Windows, all library dependencies are automatically included within the `externals` folder, or can be downloaded on-demand. To build Eden, you need to install:
+---
 
-  * **[Visual Studio 2022 Community](https://visualstudio.microsoft.com/downloads/)** - **Make sure to select C++ support in the installer. Make sure to update to the latest version if already installed.**
-  * **[CMake](https://cmake.org/download/)** - Used to generate Visual Studio project files. Does not matter if either 32-bit or 64-bit version is installed.
-  * **[Vulkan SDK](https://vulkan.lunarg.com/sdk/home#windows)** - **Make sure to select Latest SDK.**
-    - A convenience script to install the latest SDK is provided in `.ci\windows\install-vulkan-sdk.ps1`.
 
-  ![2](https://i.imgur.com/giDwuTm.png)
+## Minimal Dependencies
 
-  * **Git** - We recommend [Git for Windows](https://gitforwindows.org).
+On Windows, **all** library dependencies are **automatically included** within the `externals` folder.
 
-  ![3](https://i.imgur.com/UeSzkBw.png)
+You still need to install:
 
-  * While installing Git Bash, you should tell it to include Git in your system path. (Choose the "Git from the command line and also from 3rd-party software" option.) If you missed that, don't worry, you'll just have to manually tell CMake where your git.exe is, since it's used to include version info into the built executable.
+* **[CMake](https://cmake.org/download/)** - Used to generate Visual Studio project files.
+* **[Vulkan SDK](https://vulkan.lunarg.com/sdk/home#windows)** - Make sure to select **Latest SDK**.
 
-  ![4](https://i.imgur.com/x0rRs1t.png)
+  * *A convenience script to install the latest SDK is provided in `.ci/windows/install-vulkan-sdk.ps1`*
+* **[Git for Windows](https://gitforwindows.org)** - We recommend installing Git for command line use and version control integration.
 
-### Cloning Eden with Git
+  <img src="https://i.imgur.com/x0rRs1t.png" width="500">
 
-**Master:**
-  ```cmd
-  git clone --recursive https://git.eden-emu.dev/eden-emu/eden
-  cd eden
-  ```
+  * *While installing Git Bash, select "Git from the command line and also from 3rd-party software". If missed, manually set `git.exe` path in CMake.*
 
-  ![9](https://i.imgur.com/CcxIAht.png)
+---
 
-* *(Note: eden by default downloads to `C:\Users\<user-name>\eden` (Master)
+## ⚡ Method I: MSVC Build for Windows
 
-### Building
+### a. Prerequisites to MSVC Build
 
-* Open the CMake GUI application and point it to the `eden` (Master) 
+* **[Visual Studio 2022 Community](https://visualstudio.microsoft.com/downloads/)** - Make sure to **select C++ support** in the installer, or **update to the latest version** if already installed.
 
-  ![10](https://i.imgur.com/qOslIWv.png)
+  * *A convenience script to install the **minimal** version (Visual Studio Build Tools) is provided in `.ci/windows/install-msvc.ps1`*
 
-* For the build directory, use a `/build` subdirectory inside the source directory or some other directory of your choice. (Tell CMake to create it.)
+---
+
+### b. Clone the eden repository with Git
+
+Open a terminal and run:
+
+```cmd
+git clone https://git.eden-emu.dev/eden-emu/eden
+cd eden
+```
+
+* *By default `eden` downloads to `C:\Users\<user-name>\eden`*
+
+---
+
+### c. Building
+
+* Open the CMake GUI application and point it to the `eden` source directory.
+
+  <img src="https://i.imgur.com/qOslIWv.png" width="500">
+
+* For the build directory, use a `build/` subdirectory inside the source directory or some other directory of your choice. (Tell CMake to create it.)
 
 * Click the "Configure" button and choose `Visual Studio 17 2022`, with `x64` for the optional platform.
 
-  ![12](https://i.imgur.com/DKiREaK.png)
-
-  * *(Note: If you used GitHub's own app to clone, run `git submodule update --init --recursive` to get the remaining dependencies)*
+  <img src="https://i.imgur.com/DKiREaK.png" width="500">
 
   * *(You may also want to disable `YUZU_TESTS` in this case since Catch2 is not yet supported with this.)*
 
-  ![13](https://user-images.githubusercontent.com/22451773/180585999-07316d6e-9751-4d11-b957-1cf57cd7cd58.png)
+  <img src="https://user-images.githubusercontent.com/22451773/180585999-07316d6e-9751-4d11-b957-1cf57cd7cd58.png" width="500">
 
 * Click "Generate" to create the project files.
 
-  ![15](https://i.imgur.com/5LKg92k.png)
+  <img src="https://i.imgur.com/5LKg92k.png" width="500">
 
 * Open the solution file `yuzu.sln` in Visual Studio 2022, which is located in the build folder.
 
-  ![16](https://i.imgur.com/208yMml.png)
+  <img src="https://i.imgur.com/208yMml.png" width="500">
 
-* Depending if you want a graphical user interface or not (`eden` has the graphical user interface, while `eden-cmd` doesn't), select `eden` or `eden-cmd` in the Solution Explorer, right-click and `Set as StartUp Project`.
+* Depending on whether you want a graphical user interface or not, select in the Solution Explorer:
+  * `eden` (GUI)
+  * `eden-cmd` (command-line only)
+* Then **right-click** it and choose `Set as StartUp Project`.
 
-  ![17](https://i.imgur.com/nPMajnn.png)  ![18](https://i.imgur.com/BDMLzRZ.png)
+  <img src="https://i.imgur.com/nPMajnn.png" height="500">
+  <img src="https://i.imgur.com/BDMLzRZ.png" height="500">
 
-* Select the appropriate build type, Debug for debug purposes or Release for performance (in case of doubt choose Release).
+* Select the appropriate build type, `Debug` for debug purposes or `Release` for performance (in case of doubt choose `Release`).
 
-  ![19](https://i.imgur.com/qxg4roC.png)
+  <img src="https://i.imgur.com/qxg4roC.png" width="500">
 
-* Right-click the project you want to build and press Build in the submenu or press F5.
+* **Right-click** the project you want to build and press **Build** in the submenu or press `F5`.
 
-  ![20](https://i.imgur.com/CkQgOFW.png)
+  <img src="https://i.imgur.com/CkQgOFW.png" height="500">
 
-## Method II: MinGW-w64 Build with MSYS2
+---
 
-### Prerequisites to install
+## 🐧 Method II: MinGW-w64 Build with MSYS2
 
-* [MSYS2](https://www.msys2.org)
-* [Vulkan SDK](https://vulkan.lunarg.com/sdk/home#windows) - **Make sure to select Latest SDK.**
-* Make sure to follow the instructions and update to the latest version by running `pacman -Syu` as many times as needed.
+### a. Prerequisites to MinGW-w64
 
-### Install eden dependencies for MinGW-w64
+* **[MSYS2](https://www.msys2.org)** - A versatile and up-to-date development environment for Windows, providing a Unix-like shell, package manager, and toolchain.
 
-* Open the `MSYS2 MinGW 64-bit` (mingw64.exe) shell
-* Download and install all dependencies using: `pacman -Syu git make mingw-w64-x86_64-SDL2 mingw-w64-x86_64-cmake mingw-w64-x86_64-python-pip mingw-w64-x86_64-qt6 mingw-w64-x86_64-toolchain autoconf libtool automake-wrapper`
-* Add MinGW binaries to the PATH: `echo 'PATH=/mingw64/bin:$PATH' >> ~/.bashrc`
-* Add glslangValidator to the PATH: `echo 'PATH=$(readlink -e /c/VulkanSDK/*/Bin/):$PATH' >> ~/.bashrc`
+---
 
-### Clone the eden repository with Git
+### b. Install eden dependencies for MinGW-w64
 
-  ```bash
-  git clone --recursive https://git.eden-emu.dev/eden-emu/eden
-  cd eden
-  ```
+* Open the `MSYS2 MinGW 64-bit` shell (`mingw64.exe`)
+* Download and install all dependencies using:
+  * `pacman -Syu git make mingw-w64-x86_64-SDL2 mingw-w64-x86_64-cmake mingw-w64-x86_64-python-pip mingw-w64-x86_64-qt6 mingw-w64-x86_64-toolchain autoconf libtool automake-wrapper`
+* Add MinGW binaries to the PATH:
+  * `echo 'PATH=/mingw64/bin:$PATH' >> ~/.bashrc`
+* Add VulkanSDK to the PATH:
+  * `echo 'PATH=$(readlink -e /c/VulkanSDK/*/Bin/):$PATH' >> ~/.bashrc`
 
-### Run the following commands to build eden (dynamically linked build)
+---
+
+### c. Clone the eden repository with Git
+
+```cmd
+git clone https://git.eden-emu.dev/eden-emu/eden
+cd eden
+```
+
+---
+
+### d. Building dynamically-linked eden
+
+* This process will generate a *dynamically* linked build
 
 ```bash
+# Make build dir and enter
 mkdir build && cd build
-cmake -G "MSYS Makefiles" -DYUZU_TESTS=OFF ..
+
+# Generate CMake Makefiles
+cmake .. -G "MSYS Makefiles" -DYUZU_TESTS=OFF
+
+# Build
 make -j$(nproc)
-# test eden out with
+
+# Run eden!
 ./bin/eden.exe
 ```
 
-* *(Note: This build is not a static build meaning that you need to include all of the DLLs with the .exe in order to use it!)*
+* *Warning: This build is not a **static** build, meaning that you **need** to include all of the DLLs with the .exe in order to use it on other systems!*
 
-e.g.
-```Bash
-cp externals/ffmpeg-*/bin/*.dll bin/
+---
+
+### Additional notes
+
+
+* Eden doesn't require the rather large Qt dependency, but you will lack a GUI frontend
+
+```bash
+# ...
+
+# Generate CMake Makefiles (without Qt)
+cmake .. -G "MSYS Makefiles" -DYUZU_TESTS=OFF -DENABLE_QT=no
+
+# ...
 ```
 
-Bonus Note: Running programs from inside `MSYS2 MinGW x64` shell has a different %PATH% than directly from explorer. This different %PATH% has the locations of the other DLLs required.
-![image](https://user-images.githubusercontent.com/190571/165000848-005e8428-8a82-41b1-bb4d-4ce7797cdac8.png)
+* Running programs from inside `MSYS2 MinGW x64` shell has a different `%PATH%` than directly from explorer.
+  * This different `%PATH%` has the locations of the other DLLs required.
 
+    <img src="https://user-images.githubusercontent.com/190571/165000848-005e8428-8a82-41b1-bb4d-4ce7797cdac8.png" width="500">
 
-### Building without Qt (Optional)
+---
 
-Doesn't require the rather large Qt dependency, but you will lack a GUI frontend:
+## 🖥️ Method III: CLion Environment Setup
 
-  * Pass the `-DENABLE_QT=no` flag to cmake
-
-## Method III: CLion Environment Setup
-
-### Minimal Dependencies
-
-To build eden, you need to install the following:
+### a. Prerequisites to CLion
 
 * [CLion](https://www.jetbrains.com/clion/) - This IDE is not free; for a free alternative, check Method I
-* [Vulkan SDK](https://vulkan.lunarg.com/sdk/home#windows) - Make sure to select the Latest SDK.
 
-### Cloning eden with CLion
+---
+
+### b. Cloning eden with CLion
 
 * Clone the Repository:
 
-![1](https://user-images.githubusercontent.com/42481638/216899046-0d41d7d6-8e4d-4ed2-9587-b57088af5214.png)
-![2](https://user-images.githubusercontent.com/42481638/216899061-b2ea274a-e88c-40ae-bf0b-4450b46e9fea.png)
-![3](https://user-images.githubusercontent.com/42481638/216899076-0e5988c4-d431-4284-a5ff-9ecff973db76.png)
+<img src="https://user-images.githubusercontent.com/42481638/216899046-0d41d7d6-8e4d-4ed2-9587-b57088af5214.png" width="500">
+<img src="https://user-images.githubusercontent.com/42481638/216899061-b2ea274a-e88c-40ae-bf0b-4450b46e9fea.png" width="500">
+<img src="https://user-images.githubusercontent.com/42481638/216899076-0e5988c4-d431-4284-a5ff-9ecff973db76.png" width="500">
 
+---
 
-
-### Building & Setup
+### c. Building & Setup
 
 * Once Cloned, You will be taken to a prompt like the image below:
 
-![4](https://user-images.githubusercontent.com/42481638/216899092-3fe4cec6-a540-44e3-9e1e-3de9c2fffc2f.png)
+<img src="https://user-images.githubusercontent.com/42481638/216899092-3fe4cec6-a540-44e3-9e1e-3de9c2fffc2f.png" width="500">
 
 * Set the settings to the image below:
 * Change `Build type: Release`
@@ -152,42 +198,62 @@ To build eden, you need to install the following:
 * Change `Generator: Let CMake decide`
 * Change `Build directory: build`
 
-![5](https://user-images.githubusercontent.com/42481638/216899164-6cee8482-3d59-428f-b1bc-e6dc793c9b20.png)
+<img src="https://user-images.githubusercontent.com/42481638/216899164-6cee8482-3d59-428f-b1bc-e6dc793c9b20.png" width="500">
 
 * Click OK; now Clion will build a directory and index your code to allow for IntelliSense. Please be patient.
 * Once this process has been completed (No loading bar bottom right), you can now build eden
 * In the top right, click on the drop-down menu, select all configurations, then select eden
 
-![6](https://user-images.githubusercontent.com/42481638/216899226-975048e9-bc6d-4ec1-bc2d-bd8a1e15ed04.png)
+<img src="https://user-images.githubusercontent.com/42481638/216899226-975048e9-bc6d-4ec1-bc2d-bd8a1e15ed04.png" height="500" >
 
 * Now run by clicking the play button or pressing Shift+F10, and eden will auto-launch once built.
 
-![7](https://user-images.githubusercontent.com/42481638/216899275-d514ec6a-e563-470e-81e2-3e04f0429b68.png)
+<img src="https://user-images.githubusercontent.com/42481638/216899275-d514ec6a-e563-470e-81e2-3e04f0429b68.png" width="500">
 
-## Building from the command line with MSVC
+---
+
+## 💻 Building from the command line with MSVC
 
 ```cmd
-git clone --recursive https://git.eden-emu.dev/eden-emu/eden
+# Clone eden and enter
+git clone https://git.eden-emu.dev/eden-emu/eden
 cd eden
-mkdir build
-cd build
-cmake .. -G "Visual Studio 17 2022" -A x64
+
+# Make build dir and enter
+mkdir build && cd build
+
+# Generate Visual Studio project files with CMake
+cmake .. -G "Visual Studio 17 2022" -A x64 -DYUZU_TESTS=OFF
+
+# Build
 cmake --build . --config Release
 ```
 
-### Building with Scripts
-A convenience script for building is provided in `.ci/windows/build.sh`. You must run this with Bash, e.g. Git Bash or MinGW TTY. To use this script, you must have windeployqt installed (usually bundled with Qt) and set the `WINDEPLOYQT` environment variable to its canonical Bash location, e.g. `WINDEPLOYQT="/c/Qt/6.9.1/msvc2022_64/bin/windeployqt6.exe" .ci/windows/build.sh`.
+## 📜 Building with Scripts
 
-Extra CMake flags should be placed in the arguments of the script.
+* A convenience script for building is provided in `.ci/windows/build.sh`.
+* You must run this with Bash, e.g. Git Bash or MinGW TTY.
+* To use this script, you must have `windeployqt` installed (usually bundled with Qt) and set the `WINDEPLOYQT` environment variable to its canonical Bash location:
+  * `WINDEPLOYQT="/c/Qt/6.9.1/msvc2022_64/bin/windeployqt6.exe" .ci/windows/build.sh`.
+* Qt can be installed with `aqtinstall`; see <https://github.com/miurahr/aqtinstall> and <https://ddalcino.github.io/aqt-list-server/> for more information.
 
-Additional environment variables can be used to control building:
-- `BUILD_TYPE`: Sets the build type to use. Defaults to `Release`
 
-The following environment variables are boolean flags. Set to `true` to enable or `false` to disable:
-- `DEVEL` (default FALSE): Disable Qt update checker
-- `USE_WEBENGINE` (default FALSE): Enable Qt WebEngine
-- `USE_MULTIMEDIA` (default TRUE): Enable Qt Multimedia
-- `BUNDLE_QT` (default FALSE): Use bundled Qt
-  * Note that using system Qt requires you to include the Qt CMake directory in `CMAKE_PREFIX_PATH`, e.g. `.ci/windows/build.sh -DCMAKE_PREFIX_PATH=C:/Qt/6.9.0/msvc2022_64/lib/cmake/Qt6`
+* Extra CMake flags should be placed in the arguments of the script.
+
+#### Additional environment variables can be used to control building:
+
+* `BUILD_TYPE` (default `Release`): Sets the build type to use.
+
+* The following environment variables are boolean flags. Set to `true` to enable or `false` to disable:
+
+  * `DEVEL` (default FALSE): Disable Qt update checker
+  * `USE_WEBENGINE` (default FALSE): Enable Qt WebEngine
+  * `USE_MULTIMEDIA` (default TRUE): Enable Qt Multimedia
+  * `BUNDLE_QT` (default FALSE): Use bundled Qt
+
+  * Note that using **system Qt** requires you to include the Qt CMake directory in `CMAKE_PREFIX_PATH`
+    * `.ci/windows/build.sh -DCMAKE_PREFIX_PATH=C:/Qt/6.9.0/msvc2022_64/lib/cmake/Qt6`
+
+* After building, a zip can be packaged via `.ci/windows/package.sh`. You must have 7-zip installed and in your PATH.
+  * The resulting zip will be placed into `artifacts` in the source directory.
 
-After building, a zip can be packaged via `.ci/windows/package.sh`. Note that you must have 7-zip installed and in your PATH. The resulting zip will be placed into `artifacts` in the source directory.

From 7b830b6c8b12f590423f87e9666df76e021500c7 Mon Sep 17 00:00:00 2001
From: lizzie <lizzie@eden-emu.dev>
Date: Wed, 23 Jul 2025 22:33:15 +0100
Subject: [PATCH 37/38] [sse2neon] Update to v1.8.0

---
 externals/sse2neon/sse2neon.h | 1453 ++++++++++++++++++---------------
 1 file changed, 809 insertions(+), 644 deletions(-)

diff --git a/externals/sse2neon/sse2neon.h b/externals/sse2neon/sse2neon.h
index 67ad0ae6f8..4626e923fd 100755
--- a/externals/sse2neon/sse2neon.h
+++ b/externals/sse2neon/sse2neon.h
@@ -54,6 +54,7 @@
 //   Cuda Chen <clh960524@gmail.com>
 //   Aymen Qader <aymen.qader@arm.com>
 //   Anthony Roberts <anthony.roberts@linaro.org>
+//   Sean Luchen <seanluchen@google.com>
 
 /* Tunable configurations */
 
@@ -65,7 +66,7 @@
 #ifndef SSE2NEON_PRECISE_MINMAX
 #define SSE2NEON_PRECISE_MINMAX (0)
 #endif
-/* _mm_rcp_ps and _mm_div_ps */
+/* _mm_rcp_ps */
 #ifndef SSE2NEON_PRECISE_DIV
 #define SSE2NEON_PRECISE_DIV (0)
 #endif
@@ -113,6 +114,11 @@
 #warning "GCC versions earlier than 10 are not supported."
 #endif
 
+#if defined(__OPTIMIZE__) && !defined(SSE2NEON_SUPPRESS_WARNINGS)
+#warning \
+    "Report any potential compiler optimization issues when using SSE2NEON. See the 'Optimization' section at https://github.com/DLTcollab/sse2neon."
+#endif
+
 /* C language does not allow initializing a variable with a function call. */
 #ifdef __cplusplus
 #define _sse2neon_const static const
@@ -120,18 +126,34 @@
 #define _sse2neon_const const
 #endif
 
+#include <fenv.h>
 #include <stdint.h>
 #include <stdlib.h>
+#include <string.h>
 
-#if defined(_WIN32)
-/* Definitions for _mm_{malloc,free} are provided by <malloc.h>
- * from both MinGW-w64 and MSVC.
- */
+FORCE_INLINE double sse2neon_recast_u64_f64(uint64_t val)
+{
+    double tmp;
+    memcpy(&tmp, &val, sizeof(uint64_t));
+    return tmp;
+}
+FORCE_INLINE int64_t sse2neon_recast_f64_s64(double val)
+{
+    int64_t tmp;
+    memcpy(&tmp, &val, sizeof(uint64_t));
+    return tmp;
+}
+
+#if defined(_WIN32) && !defined(__MINGW32__)
+/* Definitions for _mm_{malloc,free} are provided by <malloc.h> from MSVC. */
 #define SSE2NEON_ALLOC_DEFINED
 #endif
 
 /* If using MSVC */
 #ifdef _MSC_VER
+#if defined(_M_ARM64EC)
+#define _DISABLE_SOFTINTRIN_ 1
+#endif
 #include <intrin.h>
 #if SSE2NEON_INCLUDE_WINDOWS_H
 #include <processthreadsapi.h>
@@ -147,7 +169,7 @@
 #endif
 
 #if (defined(_M_AMD64) || defined(__x86_64__)) || \
-    (defined(_M_ARM64) || defined(__arm64__))
+    (defined(_M_ARM64) || defined(_M_ARM64EC) || defined(__arm64__))
 #define SSE2NEON_HAS_BITSCAN64
 #endif
 #endif
@@ -230,7 +252,7 @@ FORCE_INLINE void _sse2neon_smp_mb(void)
 #pragma GCC push_options
 #pragma GCC target("fpu=neon")
 #endif
-#elif defined(__aarch64__) || defined(_M_ARM64)
+#elif defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
 #if !defined(__clang__) && !defined(_MSC_VER)
 #pragma GCC push_options
 #pragma GCC target("+simd")
@@ -244,12 +266,15 @@ FORCE_INLINE void _sse2neon_smp_mb(void)
 #pragma GCC push_options
 #endif
 #else
-#error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A."
+#error \
+    "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A \
+(you could try setting target explicitly with -march or -mcpu)"
 #endif
 #endif
 
 #include <arm_neon.h>
-#if (!defined(__aarch64__) && !defined(_M_ARM64)) && (__ARM_ARCH == 8)
+#if (!defined(__aarch64__) && !defined(_M_ARM64) && !defined(_M_ARM64EC)) && \
+    (__ARM_ARCH == 8)
 #if defined __has_include && __has_include(<arm_acle.h>)
 #include <arm_acle.h>
 #endif
@@ -267,7 +292,7 @@ FORCE_INLINE void _sse2neon_smp_mb(void)
 #endif
 
 /* Rounding functions require either Aarch64 instructions or libm fallback */
-#if !defined(__aarch64__) && !defined(_M_ARM64)
+#if !defined(__aarch64__) && !defined(_M_ARM64) && !defined(_M_ARM64EC)
 #include <math.h>
 #endif
 
@@ -276,7 +301,7 @@ FORCE_INLINE void _sse2neon_smp_mb(void)
  * To write or access to these registers in user mode,
  * we have to perform syscall instead.
  */
-#if (!defined(__aarch64__) && !defined(_M_ARM64))
+#if !defined(__aarch64__) && !defined(_M_ARM64) && !defined(_M_ARM64EC)
 #include <sys/time.h>
 #endif
 
@@ -315,6 +340,15 @@ FORCE_INLINE void _sse2neon_smp_mb(void)
 #define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
     (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
 
+/**
+ * MACRO for shuffle parameter for _mm_shuffle_pd().
+ * Argument fp1 is a digit[01] that represents the fp from argument "b"
+ * of mm_shuffle_pd that will be placed in fp1 of result.
+ * fp0 is a digit[01] that represents the fp from argument "a" of mm_shuffle_pd
+ * that will be placed in fp0 of result.
+ */
+#define _MM_SHUFFLE2(fp1, fp0) (((fp1) << 1) | (fp0))
+
 #if __has_builtin(__builtin_shufflevector)
 #define _sse2neon_shuffle(type, a, b, ...) \
     __builtin_shufflevector(a, b, __VA_ARGS__)
@@ -376,13 +410,18 @@ typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */
 // On ARM 32-bit architecture, the float64x2_t is not supported.
 // The data type __m128d should be represented in a different way for related
 // intrinsic conversion.
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
 typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */
 #else
 typedef float32x4_t __m128d;
 #endif
 typedef int64x2_t __m128i; /* 128-bit vector containing integers */
 
+// Some intrinsics operate on unaligned data types.
+typedef int16_t ALIGN_STRUCT(1) unaligned_int16_t;
+typedef int32_t ALIGN_STRUCT(1) unaligned_int32_t;
+typedef int64_t ALIGN_STRUCT(1) unaligned_int64_t;
+
 // __int64 is defined in the Intrinsics Guide which maps to different datatype
 // in different data model
 #if !(defined(_WIN32) || defined(_WIN64) || defined(__int64))
@@ -472,7 +511,7 @@ typedef int64x2_t __m128i; /* 128-bit vector containing integers */
 
 #define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x)
 
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
 #define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x)
 #define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x)
 
@@ -604,7 +643,7 @@ FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
 }
 #endif
 
-#if !defined(__aarch64__) && !defined(_M_ARM64)
+#if !defined(__aarch64__) && !defined(_M_ARM64) && !defined(_M_ARM64EC)
 /* emulate vaddv u8 variant */
 FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8)
 {
@@ -619,7 +658,7 @@ FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8)
 }
 #endif
 
-#if !defined(__aarch64__) && !defined(_M_ARM64)
+#if !defined(__aarch64__) && !defined(_M_ARM64) && !defined(_M_ARM64EC)
 /* emulate vaddvq u8 variant */
 FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a)
 {
@@ -637,7 +676,7 @@ FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a)
 }
 #endif
 
-#if !defined(__aarch64__) && !defined(_M_ARM64)
+#if !defined(__aarch64__) && !defined(_M_ARM64) && !defined(_M_ARM64EC)
 /* emulate vaddvq u16 variant */
 FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a)
 {
@@ -692,6 +731,13 @@ FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a)
  */
 
 /* Constants for use with _mm_prefetch. */
+#if defined(_M_ARM64EC)
+/* winnt.h already defines these constants as macros, so undefine them first. */
+#undef _MM_HINT_NTA
+#undef _MM_HINT_T0
+#undef _MM_HINT_T1
+#undef _MM_HINT_T2
+#endif
 enum _mm_hint {
     _MM_HINT_NTA = 0, /* load data to L1 and L2 cache, mark it as NTA */
     _MM_HINT_T0 = 1,  /* load data to L1 and L2 cache */
@@ -707,7 +753,7 @@ typedef struct {
     uint8_t bit23 : 1;
     uint8_t bit24 : 1;
     uint8_t res2 : 7;
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     uint32_t res3;
 #endif
 } fpcr_bitfield;
@@ -851,8 +897,8 @@ FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b)
 // supported by WoA has crypto extensions. If this changes in the future,
 // this can be verified via the runtime-only method of:
 // IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE)
-#if (defined(_M_ARM64) && !defined(__clang__)) || \
-    (defined(__ARM_FEATURE_CRYPTO) &&             \
+#if ((defined(_M_ARM64) || defined(_M_ARM64EC)) && !defined(__clang__)) || \
+    (defined(__ARM_FEATURE_CRYPTO) &&                                      \
      (defined(__aarch64__) || __has_builtin(__builtin_arm_crypto_vmullp64)))
 // Wraps vmull_p64
 FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
@@ -977,8 +1023,8 @@ static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
 //   __m128i _mm_shuffle_epi32_default(__m128i a,
 //                                     __constrange(0, 255) int imm) {
 //       __m128i ret;
-//       ret[0] = a[imm        & 0x3];   ret[1] = a[(imm >> 2) & 0x3];
-//       ret[2] = a[(imm >> 4) & 0x03];  ret[3] = a[(imm >> 6) & 0x03];
+//       ret[0] = a[(imm)        & 0x3];   ret[1] = a[((imm) >> 2) & 0x3];
+//       ret[2] = a[((imm) >> 4) & 0x03];  ret[3] = a[((imm) >> 6) & 0x03];
 //       return ret;
 //   }
 #define _mm_shuffle_epi32_default(a, imm)                                   \
@@ -1076,7 +1122,7 @@ FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
     return vreinterpretq_m128i_s32(vcombine_s32(a32, a33));
 }
 
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
 #define _mm_shuffle_epi32_splat(a, imm) \
     vreinterpretq_m128i_s32(vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm)))
 #else
@@ -1093,8 +1139,8 @@ FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
 //   __m128 _mm_shuffle_ps_default(__m128 a, __m128 b,
 //                                 __constrange(0, 255) int imm) {
 //       __m128 ret;
-//       ret[0] = a[imm        & 0x3];   ret[1] = a[(imm >> 2) & 0x3];
-//       ret[2] = b[(imm >> 4) & 0x03];  ret[3] = b[(imm >> 6) & 0x03];
+//       ret[0] = a[(imm)        & 0x3];   ret[1] = a[((imm) >> 2) & 0x3];
+//       ret[2] = b[((imm) >> 4) & 0x03];  ret[3] = b[((imm) >> 6) & 0x03];
 //       return ret;
 //   }
 //
@@ -1516,7 +1562,7 @@ FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ps2pi
 FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a)
 {
-#if (defined(__aarch64__) || defined(_M_ARM64)) || \
+#if (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) || \
     defined(__ARM_FEATURE_DIRECTED_ROUNDING)
     return vreinterpret_m64_s32(
         vget_low_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a)))));
@@ -1541,7 +1587,7 @@ FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ss2si
 FORCE_INLINE int _mm_cvt_ss2si(__m128 a)
 {
-#if (defined(__aarch64__) || defined(_M_ARM64)) || \
+#if (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) || \
     defined(__ARM_FEATURE_DIRECTED_ROUNDING)
     return vgetq_lane_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a))),
                           0);
@@ -1672,7 +1718,7 @@ FORCE_INLINE float _mm_cvtss_f32(__m128 a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si64
 FORCE_INLINE int64_t _mm_cvtss_si64(__m128 a)
 {
-#if (defined(__aarch64__) || defined(_M_ARM64)) || \
+#if (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) || \
     defined(__ARM_FEATURE_DIRECTED_ROUNDING)
     return (int64_t) vgetq_lane_f32(vrndiq_f32(vreinterpretq_f32_m128(a)), 0);
 #else
@@ -1725,7 +1771,7 @@ FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ps
 FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
 {
-#if (defined(__aarch64__) || defined(_M_ARM64)) && !SSE2NEON_PRECISE_DIV
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128_f32(
         vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
 #else
@@ -1763,7 +1809,11 @@ FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
 #if !defined(SSE2NEON_ALLOC_DEFINED)
 FORCE_INLINE void _mm_free(void *addr)
 {
+#if defined(_WIN32)
+    _aligned_free(addr);
+#else
     free(addr);
+#endif
 }
 #endif
 
@@ -1783,7 +1833,7 @@ FORCE_INLINE void _sse2neon_set_fpcr(uint64_t value)
 #if defined(_MSC_VER) && !defined(__clang__)
     _WriteStatusReg(ARM64_FPCR, value);
 #else
-    __asm__ __volatile__("msr FPCR, %0" ::"r"(value));  /* write */
+    __asm__ __volatile__("msr FPCR, %0" ::"r"(value)); /* write */
 #endif
 }
 
@@ -1795,14 +1845,14 @@ FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode(void)
 {
     union {
         fpcr_bitfield field;
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
         uint64_t value;
 #else
         uint32_t value;
 #endif
     } r;
 
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     r.value = _sse2neon_get_fpcr();
 #else
     __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
@@ -1817,25 +1867,20 @@ FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode(void)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_ROUNDING_MODE
 FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void)
 {
-    union {
-        fpcr_bitfield field;
-#if defined(__aarch64__) || defined(_M_ARM64)
-        uint64_t value;
-#else
-        uint32_t value;
-#endif
-    } r;
-
-#if defined(__aarch64__) || defined(_M_ARM64)
-    r.value = _sse2neon_get_fpcr();
-#else
-    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
-#endif
-
-    if (r.field.bit22) {
-        return r.field.bit23 ? _MM_ROUND_TOWARD_ZERO : _MM_ROUND_UP;
-    } else {
-        return r.field.bit23 ? _MM_ROUND_DOWN : _MM_ROUND_NEAREST;
+    switch (fegetround()) {
+    case FE_TONEAREST:
+        return _MM_ROUND_NEAREST;
+    case FE_DOWNWARD:
+        return _MM_ROUND_DOWN;
+    case FE_UPWARD:
+        return _MM_ROUND_UP;
+    case FE_TOWARDZERO:
+        return _MM_ROUND_TOWARD_ZERO;
+    default:
+        // fegetround() returns FE_TONEAREST, FE_DOWNWARD, FE_UPWARD or
+        // FE_TOWARDZERO on success. Any other (error) value is reported as
+        // _MM_ROUND_TOWARD_ZERO (truncate).
+        return _MM_ROUND_TOWARD_ZERO;
     }
 }
 
@@ -1928,7 +1973,7 @@ FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
 FORCE_INLINE __m128i _mm_loadu_si16(const void *p)
 {
     return vreinterpretq_m128i_s16(
-        vsetq_lane_s16(*(const int16_t *) p, vdupq_n_s16(0), 0));
+        vsetq_lane_s16(*(const unaligned_int16_t *) p, vdupq_n_s16(0), 0));
 }
 
 // Load unaligned 64-bit integer from memory into the first element of dst.
@@ -1936,7 +1981,7 @@ FORCE_INLINE __m128i _mm_loadu_si16(const void *p)
 FORCE_INLINE __m128i _mm_loadu_si64(const void *p)
 {
     return vreinterpretq_m128i_s64(
-        vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0)));
+        vsetq_lane_s64(*(const unaligned_int64_t *) p, vdupq_n_s64(0), 0));
 }
 
 // Allocate size bytes of memory, aligned to the alignment specified in align,
@@ -1946,6 +1991,9 @@ FORCE_INLINE __m128i _mm_loadu_si64(const void *p)
 #if !defined(SSE2NEON_ALLOC_DEFINED)
 FORCE_INLINE void *_mm_malloc(size_t size, size_t align)
 {
+#if defined(_WIN32)
+    return _aligned_malloc(size, align);
+#else
     void *ptr;
     if (align == 1)
         return malloc(size);
@@ -1954,6 +2002,7 @@ FORCE_INLINE void *_mm_malloc(size_t size, size_t align)
     if (!posix_memalign(&ptr, align, size))
         return ptr;
     return NULL;
+#endif
 }
 #endif
 
@@ -2117,7 +2166,7 @@ FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)
 FORCE_INLINE int _mm_movemask_pi8(__m64 a)
 {
     uint8x8_t input = vreinterpret_u8_m64(a);
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     static const int8_t shift[8] = {0, 1, 2, 3, 4, 5, 6, 7};
     uint8x8_t tmp = vshr_n_u8(input, 7);
     return vaddv_u8(vshl_u8(tmp, vld1_s8(shift)));
@@ -2138,7 +2187,7 @@ FORCE_INLINE int _mm_movemask_pi8(__m64 a)
 FORCE_INLINE int _mm_movemask_ps(__m128 a)
 {
     uint32x4_t input = vreinterpretq_u32_m128(a);
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     static const int32_t shift[4] = {0, 1, 2, 3};
     uint32x4_t tmp = vshrq_n_u32(input, 31);
     return vaddvq_u32(vshlq_u32(tmp, vld1q_s32(shift)));
@@ -2372,7 +2421,7 @@ FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
     uint64x1_t t = vpaddl_u32(vpaddl_u16(
         vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)))));
     return vreinterpret_m64_u16(
-        vset_lane_u16((int) vget_lane_u64(t, 0), vdup_n_u16(0), 0));
+        vset_lane_u16((uint16_t) vget_lane_u64(t, 0), vdup_n_u16(0), 0));
 }
 
 // Macro: Set the flush zero bits of the MXCSR control and status register to
@@ -2385,14 +2434,14 @@ FORCE_INLINE void _sse2neon_mm_set_flush_zero_mode(unsigned int flag)
     // regardless of the value of the FZ bit.
     union {
         fpcr_bitfield field;
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
         uint64_t value;
 #else
         uint32_t value;
 #endif
     } r;
 
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     r.value = _sse2neon_get_fpcr();
 #else
     __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
@@ -2400,10 +2449,10 @@ FORCE_INLINE void _sse2neon_mm_set_flush_zero_mode(unsigned int flag)
 
     r.field.bit24 = (flag & _MM_FLUSH_ZERO_MASK) == _MM_FLUSH_ZERO_ON;
 
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     _sse2neon_set_fpcr(r.value);
 #else
-    __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r));        /* write */
+    __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
 #endif
 }
 
@@ -2431,44 +2480,26 @@ FORCE_INLINE __m128 _mm_set_ps1(float _w)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE
 FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
 {
-    union {
-        fpcr_bitfield field;
-#if defined(__aarch64__) || defined(_M_ARM64)
-        uint64_t value;
-#else
-        uint32_t value;
-#endif
-    } r;
-
-#if defined(__aarch64__) || defined(_M_ARM64)
-    r.value = _sse2neon_get_fpcr();
-#else
-    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
-#endif
-
     switch (rounding) {
-    case _MM_ROUND_TOWARD_ZERO:
-        r.field.bit22 = 1;
-        r.field.bit23 = 1;
+    case _MM_ROUND_NEAREST:
+        rounding = FE_TONEAREST;
         break;
     case _MM_ROUND_DOWN:
-        r.field.bit22 = 0;
-        r.field.bit23 = 1;
+        rounding = FE_DOWNWARD;
         break;
     case _MM_ROUND_UP:
-        r.field.bit22 = 1;
-        r.field.bit23 = 0;
+        rounding = FE_UPWARD;
         break;
-    default:  //_MM_ROUND_NEAREST
-        r.field.bit22 = 0;
-        r.field.bit23 = 0;
+    case _MM_ROUND_TOWARD_ZERO:
+        rounding = FE_TOWARDZERO;
+        break;
+    default:
+        // rounding must be _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP or
+        // _MM_ROUND_TOWARD_ZERO. Any other (invalid) value is treated as
+        // FE_TOWARDZERO (truncate).
+        rounding = FE_TOWARDZERO;
     }
-
-#if defined(__aarch64__) || defined(_M_ARM64)
-    _sse2neon_set_fpcr(r.value);
-#else
-    __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r));        /* write */
-#endif
+    fesetround(rounding);
 }
 
 // Copy single-precision (32-bit) floating-point element a to the lower element
@@ -2524,10 +2555,10 @@ FORCE_INLINE __m128 _mm_setzero_ps(void)
 // in dst.
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi16
 #ifdef _sse2neon_shuffle
-#define _mm_shuffle_pi16(a, imm)                                       \
-    vreinterpret_m64_s16(vshuffle_s16(                                 \
-        vreinterpret_s16_m64(a), vreinterpret_s16_m64(a), (imm & 0x3), \
-        ((imm >> 2) & 0x3), ((imm >> 4) & 0x3), ((imm >> 6) & 0x3)))
+#define _mm_shuffle_pi16(a, imm)                                         \
+    vreinterpret_m64_s16(vshuffle_s16(                                   \
+        vreinterpret_s16_m64(a), vreinterpret_s16_m64(a), ((imm) & 0x3), \
+        (((imm) >> 2) & 0x3), (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3)))
 #else
 #define _mm_shuffle_pi16(a, imm)                                              \
     _sse2neon_define1(                                                        \
@@ -2658,7 +2689,8 @@ FORCE_INLINE void _mm_lfence(void)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ps
 FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
 {
-#if (defined(__aarch64__) || defined(_M_ARM64)) && !SSE2NEON_PRECISE_SQRT
+#if (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) && \
+    !SSE2NEON_PRECISE_SQRT
     return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in)));
 #else
     float32x4_t recip = vrsqrteq_f32(vreinterpretq_f32_m128(in));
@@ -2887,7 +2919,7 @@ FORCE_INLINE __m128 _mm_undefined_ps(void)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_ps
 FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128_f32(
         vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
 #else
@@ -2903,7 +2935,7 @@ FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_ps
 FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128_f32(
         vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
 #else
@@ -2962,15 +2994,21 @@ FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_pd
 FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_f64(
         vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
-    double *da = (double *) &a;
-    double *db = (double *) &b;
+    double a0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
     double c[2];
-    c[0] = da[0] + db[0];
-    c[1] = da[1] + db[1];
+    c[0] = a0 + b0;
+    c[1] = a1 + b1;
     return vld1q_f32((float32_t *) c);
 #endif
 }
@@ -2981,14 +3019,16 @@ FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_sd
 FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return _mm_move_sd(a, _mm_add_pd(a, b));
 #else
-    double *da = (double *) &a;
-    double *db = (double *) &b;
+    double a0, a1, b0;
+    a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
     double c[2];
-    c[0] = da[0] + db[0];
-    c[1] = da[1];
+    c[0] = a0 + b0;
+    c[1] = a1;
     return vld1q_f32((float32_t *) c);
 #endif
 }
@@ -3140,7 +3180,7 @@ FORCE_INLINE __m128i _mm_castps_si128(__m128 a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_pd
 FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_f64(vreinterpretq_f64_m128i(a));
 #else
     return vreinterpretq_m128d_f32(vreinterpretq_f32_m128i(a));
@@ -3212,7 +3252,7 @@ FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_pd
 FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_u64(
         vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
@@ -3238,17 +3278,21 @@ FORCE_INLINE __m128d _mm_cmpeq_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_pd
 FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_u64(
         vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    double a0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
     uint64_t d[2];
-    d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = (*(double *) &a1) >= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
+    d[0] = a0 >= b0 ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = a1 >= b1 ? ~UINT64_C(0) : UINT64_C(0);
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
@@ -3260,15 +3304,16 @@ FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_sd
 FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return _mm_move_sd(a, _mm_cmpge_pd(a, b));
 #else
     // expand "_mm_cmpge_pd()" to reduce unnecessary operations
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    double a0, b0;
+    a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
+    b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
     uint64_t d[2];
-    d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[0] = a0 >= b0 ? ~UINT64_C(0) : UINT64_C(0);
     d[1] = a1;
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
@@ -3307,17 +3352,21 @@ FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_pd
 FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_u64(
         vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    double a0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
     uint64_t d[2];
-    d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = (*(double *) &a1) > (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
+    d[0] = a0 > b0 ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = a1 > b1 ? ~UINT64_C(0) : UINT64_C(0);
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
@@ -3329,15 +3378,16 @@ FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_sd
 FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return _mm_move_sd(a, _mm_cmpgt_pd(a, b));
 #else
     // expand "_mm_cmpge_pd()" to reduce unnecessary operations
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    double a0, b0;
+    a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
+    b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
     uint64_t d[2];
-    d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[0] = a0 > b0 ? ~UINT64_C(0) : UINT64_C(0);
     d[1] = a1;
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
@@ -3349,17 +3399,21 @@ FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_pd
 FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_u64(
         vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    double a0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
     uint64_t d[2];
-    d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = (*(double *) &a1) <= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
+    d[0] = a0 <= b0 ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = a1 <= b1 ? ~UINT64_C(0) : UINT64_C(0);
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
@@ -3371,15 +3425,16 @@ FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_sd
 FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return _mm_move_sd(a, _mm_cmple_pd(a, b));
 #else
     // expand "_mm_cmpge_pd()" to reduce unnecessary operations
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    double a0, b0;
+    a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
+    b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
     uint64_t d[2];
-    d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[0] = a0 <= b0 ? ~UINT64_C(0) : UINT64_C(0);
     d[1] = a1;
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
@@ -3421,17 +3476,21 @@ FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_pd
 FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_u64(
         vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    double a0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
     uint64_t d[2];
-    d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = (*(double *) &a1) < (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
+    d[0] = a0 < b0 ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = a1 < b1 ? ~UINT64_C(0) : UINT64_C(0);
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
@@ -3443,14 +3502,15 @@ FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_sd
 FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return _mm_move_sd(a, _mm_cmplt_pd(a, b));
 #else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    double a0, b0;
+    a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
+    b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
     uint64_t d[2];
-    d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[0] = a0 < b0 ? ~UINT64_C(0) : UINT64_C(0);
     d[1] = a1;
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
@@ -3462,7 +3522,7 @@ FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_pd
 FORCE_INLINE __m128d _mm_cmpneq_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_s32(vmvnq_s32(vreinterpretq_s32_u64(
         vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)))));
 #else
@@ -3488,20 +3548,22 @@ FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_pd
 FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_u64(veorq_u64(
         vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
         vdupq_n_u64(UINT64_MAX)));
 #else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    double a0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
     uint64_t d[2];
-    d[0] =
-        !((*(double *) &a0) >= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] =
-        !((*(double *) &a1) >= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
+    d[0] = !(a0 >= b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = !(a1 >= b1) ? ~UINT64_C(0) : UINT64_C(0);
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
@@ -3521,20 +3583,22 @@ FORCE_INLINE __m128d _mm_cmpnge_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_cmpngt_pd
 FORCE_INLINE __m128d _mm_cmpngt_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_u64(veorq_u64(
         vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
         vdupq_n_u64(UINT64_MAX)));
 #else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    double a0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
     uint64_t d[2];
-    d[0] =
-        !((*(double *) &a0) > (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] =
-        !((*(double *) &a1) > (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
+    d[0] = !(a0 > b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = !(a1 > b1) ? ~UINT64_C(0) : UINT64_C(0);
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
@@ -3554,20 +3618,22 @@ FORCE_INLINE __m128d _mm_cmpngt_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_pd
 FORCE_INLINE __m128d _mm_cmpnle_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_u64(veorq_u64(
         vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
         vdupq_n_u64(UINT64_MAX)));
 #else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    double a0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
     uint64_t d[2];
-    d[0] =
-        !((*(double *) &a0) <= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] =
-        !((*(double *) &a1) <= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
+    d[0] = !(a0 <= b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = !(a1 <= b1) ? ~UINT64_C(0) : UINT64_C(0);
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
@@ -3587,20 +3653,22 @@ FORCE_INLINE __m128d _mm_cmpnle_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_pd
 FORCE_INLINE __m128d _mm_cmpnlt_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_u64(veorq_u64(
         vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
         vdupq_n_u64(UINT64_MAX)));
 #else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    double a0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
     uint64_t d[2];
-    d[0] =
-        !((*(double *) &a0) < (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] =
-        !((*(double *) &a1) < (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
+    d[0] = !(a0 < b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = !(a1 < b1) ? ~UINT64_C(0) : UINT64_C(0);
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
@@ -3620,7 +3688,7 @@ FORCE_INLINE __m128d _mm_cmpnlt_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_pd
 FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     // Excluding NaNs, any two floating point numbers can be compared.
     uint64x2_t not_nan_a =
         vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
@@ -3628,19 +3696,17 @@ FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b)
         vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));
     return vreinterpretq_m128d_u64(vandq_u64(not_nan_a, not_nan_b));
 #else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    double a0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
     uint64_t d[2];
-    d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
-            (*(double *) &b0) == (*(double *) &b0))
-               ? ~UINT64_C(0)
-               : UINT64_C(0);
-    d[1] = ((*(double *) &a1) == (*(double *) &a1) &&
-            (*(double *) &b1) == (*(double *) &b1))
-               ? ~UINT64_C(0)
-               : UINT64_C(0);
+    d[0] = (a0 == a0 && b0 == b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = (a1 == a1 && b1 == b1) ? ~UINT64_C(0) : UINT64_C(0);
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
@@ -3652,17 +3718,15 @@ FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_sd
 FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return _mm_move_sd(a, _mm_cmpord_pd(a, b));
 #else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    double a0, b0;
+    a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
+    b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
     uint64_t d[2];
-    d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
-            (*(double *) &b0) == (*(double *) &b0))
-               ? ~UINT64_C(0)
-               : UINT64_C(0);
+    d[0] = (a0 == a0 && b0 == b0) ? ~UINT64_C(0) : UINT64_C(0);
     d[1] = a1;
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
@@ -3674,7 +3738,7 @@ FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_pd
 FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     // Two NaNs are not equal in comparison operation.
     uint64x2_t not_nan_a =
         vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
@@ -3683,19 +3747,17 @@ FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b)
     return vreinterpretq_m128d_s32(
         vmvnq_s32(vreinterpretq_s32_u64(vandq_u64(not_nan_a, not_nan_b))));
 #else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    double a0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
     uint64_t d[2];
-    d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
-            (*(double *) &b0) == (*(double *) &b0))
-               ? UINT64_C(0)
-               : ~UINT64_C(0);
-    d[1] = ((*(double *) &a1) == (*(double *) &a1) &&
-            (*(double *) &b1) == (*(double *) &b1))
-               ? UINT64_C(0)
-               : ~UINT64_C(0);
+    d[0] = (a0 == a0 && b0 == b0) ? UINT64_C(0) : ~UINT64_C(0);
+    d[1] = (a1 == a1 && b1 == b1) ? UINT64_C(0) : ~UINT64_C(0);
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
@@ -3707,17 +3769,15 @@ FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_sd
 FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return _mm_move_sd(a, _mm_cmpunord_pd(a, b));
 #else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    double a0, b0;
+    a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
+    b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
     uint64_t d[2];
-    d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
-            (*(double *) &b0) == (*(double *) &b0))
-               ? UINT64_C(0)
-               : ~UINT64_C(0);
+    d[0] = (a0 == a0 && b0 == b0) ? UINT64_C(0) : ~UINT64_C(0);
     d[1] = a1;
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
@@ -3729,13 +3789,13 @@ FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_sd
 FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vgetq_lane_u64(vcgeq_f64(a, b), 0) & 0x1;
 #else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-
-    return (*(double *) &a0 >= *(double *) &b0);
+    double a0, b0;
+    a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    return a0 >= b0;
 #endif
 }
 
@@ -3744,13 +3804,14 @@ FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_sd
 FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vgetq_lane_u64(vcgtq_f64(a, b), 0) & 0x1;
 #else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    double a0, b0;
+    a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
 
-    return (*(double *) &a0 > *(double *) &b0);
+    return a0 > b0;
 #endif
 }
 
@@ -3759,13 +3820,14 @@ FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_sd
 FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vgetq_lane_u64(vcleq_f64(a, b), 0) & 0x1;
 #else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    double a0, b0;
+    a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
 
-    return (*(double *) &a0 <= *(double *) &b0);
+    return a0 <= b0;
 #endif
 }
 
@@ -3774,13 +3836,14 @@ FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_sd
 FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vgetq_lane_u64(vcltq_f64(a, b), 0) & 0x1;
 #else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    double a0, b0;
+    a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
 
-    return (*(double *) &a0 < *(double *) &b0);
+    return a0 < b0;
 #endif
 }
 
@@ -3789,7 +3852,7 @@ FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_sd
 FORCE_INLINE int _mm_comieq_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vgetq_lane_u64(vceqq_f64(a, b), 0) & 0x1;
 #else
     uint32x4_t a_not_nan =
@@ -3818,7 +3881,7 @@ FORCE_INLINE int _mm_comineq_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_pd
 FORCE_INLINE __m128d _mm_cvtepi32_pd(__m128i a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_f64(
         vcvtq_f64_s64(vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a)))));
 #else
@@ -3849,8 +3912,11 @@ FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a)
         vcombine_s32(vmovn_s64(integers), vdup_n_s32(0)));
 #else
     __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
-    double d0 = ((double *) &rnd)[0];
-    double d1 = ((double *) &rnd)[1];
+    double d0, d1;
+    d0 = sse2neon_recast_u64_f64(
+        vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0));
+    d1 = sse2neon_recast_u64_f64(
+        vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 1));
     return _mm_set_epi32(0, 0, (int32_t) d1, (int32_t) d0);
 #endif
 }
@@ -3861,8 +3927,11 @@ FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a)
 FORCE_INLINE __m64 _mm_cvtpd_pi32(__m128d a)
 {
     __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
-    double d0 = ((double *) &rnd)[0];
-    double d1 = ((double *) &rnd)[1];
+    double d0, d1;
+    d0 = sse2neon_recast_u64_f64(
+        vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0));
+    d1 = sse2neon_recast_u64_f64(
+        vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 1));
     int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) d0, (int32_t) d1};
     return vreinterpret_m64_s32(vld1_s32(data));
 }
@@ -3873,13 +3942,14 @@ FORCE_INLINE __m64 _mm_cvtpd_pi32(__m128d a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_ps
 FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a));
     return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0)));
 #else
-    float a0 = (float) ((double *) &a)[0];
-    float a1 = (float) ((double *) &a)[1];
-    return _mm_set_ps(0, 0, a1, a0);
+    double a0, a1;
+    a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    return _mm_set_ps(0, 0, (float) a1, (float) a0);
 #endif
 }
 
@@ -3888,7 +3958,7 @@ FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_pd
 FORCE_INLINE __m128d _mm_cvtpi32_pd(__m64 a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_f64(
         vcvtq_f64_s64(vmovl_s32(vreinterpret_s32_m64(a))));
 #else
@@ -3907,7 +3977,7 @@ FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)
 {
 #if defined(__ARM_FEATURE_FRINT)
     return vreinterpretq_m128i_s32(vcvtq_s32_f32(vrnd32xq_f32(a)));
-#elif (defined(__aarch64__) || defined(_M_ARM64)) || \
+#elif (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) || \
     defined(__ARM_FEATURE_DIRECTED_ROUNDING)
     switch (_MM_GET_ROUNDING_MODE()) {
     case _MM_ROUND_NEAREST:
@@ -3961,7 +4031,7 @@ FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pd
 FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_f64(
         vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a))));
 #else
@@ -3975,10 +4045,12 @@ FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_f64
 FORCE_INLINE double _mm_cvtsd_f64(__m128d a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0);
 #else
-    return ((double *) &a)[0];
+    double _a =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    return _a;
 #endif
 }
 
@@ -3987,11 +4059,12 @@ FORCE_INLINE double _mm_cvtsd_f64(__m128d a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si32
 FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return (int32_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
 #else
     __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
-    double ret = ((double *) &rnd)[0];
+    double ret = sse2neon_recast_u64_f64(
+        vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0));
     return (int32_t) ret;
 #endif
 }
@@ -4001,11 +4074,12 @@ FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64
 FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return (int64_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
 #else
     __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
-    double ret = ((double *) &rnd)[0];
+    double ret = sse2neon_recast_u64_f64(
+        vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0));
     return (int64_t) ret;
 #endif
 }
@@ -4022,13 +4096,15 @@ FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_ss
 FORCE_INLINE __m128 _mm_cvtsd_ss(__m128 a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128_f32(vsetq_lane_f32(
         vget_lane_f32(vcvt_f32_f64(vreinterpretq_f64_m128d(b)), 0),
         vreinterpretq_f32_m128(a), 0));
 #else
-    return vreinterpretq_m128_f32(vsetq_lane_f32((float) ((double *) &b)[0],
-                                                 vreinterpretq_f32_m128(a), 0));
+    double b0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    return vreinterpretq_m128_f32(
+        vsetq_lane_f32((float) b0, vreinterpretq_f32_m128(a), 0));
 #endif
 }
 
@@ -4056,13 +4132,13 @@ FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_sd
 FORCE_INLINE __m128d _mm_cvtsi32_sd(__m128d a, int32_t b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_f64(
         vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
 #else
-    double bf = (double) b;
+    int64_t _b = sse2neon_recast_f64_s64((double) b);
     return vreinterpretq_m128d_s64(
-        vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0));
+        vsetq_lane_s64(_b, vreinterpretq_s64_m128d(a), 0));
 #endif
 }
 
@@ -4084,13 +4160,13 @@ FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_sd
 FORCE_INLINE __m128d _mm_cvtsi64_sd(__m128d a, int64_t b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_f64(
         vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
 #else
-    double bf = (double) b;
+    int64_t _b = sse2neon_recast_f64_s64((double) b);
     return vreinterpretq_m128d_s64(
-        vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0));
+        vsetq_lane_s64(_b, vreinterpretq_s64_m128d(a), 0));
 #endif
 }
 
@@ -4121,12 +4197,12 @@ FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)
 FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b)
 {
     double d = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_f64(
         vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0));
 #else
-    return vreinterpretq_m128d_s64(
-        vsetq_lane_s64(*(int64_t *) &d, vreinterpretq_s64_m128d(a), 0));
+    return vreinterpretq_m128d_s64(vsetq_lane_s64(
+        sse2neon_recast_f64_s64(d), vreinterpretq_s64_m128d(a), 0));
 #endif
 }
 
@@ -4135,8 +4211,9 @@ FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_epi32
 FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a)
 {
-    double a0 = ((double *) &a)[0];
-    double a1 = ((double *) &a)[1];
+    double a0, a1;
+    a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
     return _mm_set_epi32(0, 0, (int32_t) a1, (int32_t) a0);
 }
 
@@ -4145,8 +4222,9 @@ FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_pi32
 FORCE_INLINE __m64 _mm_cvttpd_pi32(__m128d a)
 {
-    double a0 = ((double *) &a)[0];
-    double a1 = ((double *) &a)[1];
+    double a0, a1;
+    a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
     int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) a0, (int32_t) a1};
     return vreinterpret_m64_s32(vld1_s32(data));
 }
@@ -4164,8 +4242,9 @@ FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si32
 FORCE_INLINE int32_t _mm_cvttsd_si32(__m128d a)
 {
-    double ret = *((double *) &a);
-    return (int32_t) ret;
+    double _a =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    return (int32_t) _a;
 }
 
 // Convert the lower double-precision (64-bit) floating-point element in a to a
@@ -4173,11 +4252,12 @@ FORCE_INLINE int32_t _mm_cvttsd_si32(__m128d a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64
 FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0);
 #else
-    double ret = *((double *) &a);
-    return (int64_t) ret;
+    double _a =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    return (int64_t) _a;
 #endif
 }
 
@@ -4191,15 +4271,21 @@ FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_pd
 FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_f64(
         vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
-    double *da = (double *) &a;
-    double *db = (double *) &b;
+    double a0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
     double c[2];
-    c[0] = da[0] / db[0];
-    c[1] = da[1] / db[1];
+    c[0] = a0 / b0;
+    c[1] = a1 / b1;
     return vld1q_f32((float32_t *) c);
 #endif
 }
@@ -4211,7 +4297,7 @@ FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_sd
 FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     float64x2_t tmp =
         vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b));
     return vreinterpretq_m128d_f64(
@@ -4243,7 +4329,7 @@ FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd
 FORCE_INLINE __m128d _mm_load_pd(const double *p)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_f64(vld1q_f64(p));
 #else
     const float *fp = (const float *) p;
@@ -4263,7 +4349,7 @@ FORCE_INLINE __m128d _mm_load_pd(const double *p)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_sd
 FORCE_INLINE __m128d _mm_load_sd(const double *p)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0));
 #else
     const float *fp = (const float *) p;
@@ -4285,7 +4371,7 @@ FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_pd
 FORCE_INLINE __m128d _mm_load1_pd(const double *p)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_f64(vld1q_dup_f64(p));
 #else
     return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *) p));
@@ -4298,7 +4384,7 @@ FORCE_INLINE __m128d _mm_load1_pd(const double *p)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pd
 FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_f64(
         vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p)));
 #else
@@ -4324,7 +4410,7 @@ FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pd
 FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_f64(
         vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a))));
 #else
@@ -4340,7 +4426,7 @@ FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_pd
 FORCE_INLINE __m128d _mm_loadr_pd(const double *p)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     float64x2_t v = vld1q_f64(p);
     return vreinterpretq_m128d_f64(vextq_f64(v, v, 1));
 #else
@@ -4361,7 +4447,7 @@ FORCE_INLINE __m128d _mm_loadu_pd(const double *p)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si128
 FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
 {
-    return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
+    return vreinterpretq_m128i_s32(vld1q_s32((const unaligned_int32_t *) p));
 }
 
 // Load unaligned 32-bit integer from memory into the first element of dst.
@@ -4369,7 +4455,7 @@ FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
 FORCE_INLINE __m128i _mm_loadu_si32(const void *p)
 {
     return vreinterpretq_m128i_s32(
-        vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0));
+        vsetq_lane_s32(*(const unaligned_int32_t *) p, vdupq_n_s32(0), 0));
 }
 
 // Multiply packed signed 16-bit integers in a and b, producing intermediate
@@ -4380,7 +4466,7 @@ FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
 {
     int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
                               vget_low_s16(vreinterpretq_s16_m128i(b)));
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     int32x4_t high =
         vmull_high_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b));
 
@@ -4434,7 +4520,7 @@ FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pd
 FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
 #if SSE2NEON_PRECISE_MINMAX
     float64x2_t _a = vreinterpretq_f64_m128d(a);
     float64x2_t _b = vreinterpretq_f64_m128d(b);
@@ -4444,15 +4530,19 @@ FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b)
         vmaxq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #endif
 #else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
-    uint64_t d[2];
-    d[0] = (*(double *) &a0) > (*(double *) &b0) ? a0 : b0;
-    d[1] = (*(double *) &a1) > (*(double *) &b1) ? a1 : b1;
+    double a0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
+    int64_t d[2];
+    d[0] = a0 > b0 ? sse2neon_recast_f64_s64(a0) : sse2neon_recast_f64_s64(b0);
+    d[1] = a1 > b1 ? sse2neon_recast_f64_s64(a1) : sse2neon_recast_f64_s64(b1);
 
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
+    return vreinterpretq_m128d_s64(vld1q_s64(d));
 #endif
 }
 
@@ -4462,12 +4552,14 @@ FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_sd
 FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return _mm_move_sd(a, _mm_max_pd(a, b));
 #else
-    double *da = (double *) &a;
-    double *db = (double *) &b;
-    double c[2] = {da[0] > db[0] ? da[0] : db[0], da[1]};
+    double a0, a1, b0;
+    a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double c[2] = {a0 > b0 ? a0 : b0, a1};
     return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c));
 #endif
 }
@@ -4495,7 +4587,7 @@ FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pd
 FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
 #if SSE2NEON_PRECISE_MINMAX
     float64x2_t _a = vreinterpretq_f64_m128d(a);
     float64x2_t _b = vreinterpretq_f64_m128d(b);
@@ -4505,14 +4597,18 @@ FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b)
         vminq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #endif
 #else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
-    uint64_t d[2];
-    d[0] = (*(double *) &a0) < (*(double *) &b0) ? a0 : b0;
-    d[1] = (*(double *) &a1) < (*(double *) &b1) ? a1 : b1;
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
+    double a0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
+    int64_t d[2];
+    d[0] = a0 < b0 ? sse2neon_recast_f64_s64(a0) : sse2neon_recast_f64_s64(b0);
+    d[1] = a1 < b1 ? sse2neon_recast_f64_s64(a1) : sse2neon_recast_f64_s64(b1);
+    return vreinterpretq_m128d_s64(vld1q_s64(d));
 #endif
 }
 
@@ -4522,12 +4618,14 @@ FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_sd
 FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return _mm_move_sd(a, _mm_min_pd(a, b));
 #else
-    double *da = (double *) &a;
-    double *db = (double *) &b;
-    double c[2] = {da[0] < db[0] ? da[0] : db[0], da[1]};
+    double a0, a1, b0;  // b's upper lane is never read; the result's upper lane is a1
+    a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));  // raw lane bits -> double via the memcpy-based helper
+    a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double c[2] = {a0 < b0 ? a0 : b0, a1};  // b0 is chosen whenever a0 < b0 is false
     return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c));
 #endif
 }
@@ -4678,15 +4776,21 @@ FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_pd
 FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_f64(
         vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
-    double *da = (double *) &a;
-    double *db = (double *) &b;
+    double a0 =  // lane 0 of a: 64-bit lane extracted, then recast with the memcpy-based helper
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 =  // lane 1 of a
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 =  // lane 0 of b
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 =  // lane 1 of b
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
     double c[2];
-    c[0] = da[0] * db[0];
-    c[1] = da[1] * db[1];
+    c[0] = a0 * b0;  // scalar product of the lane-0 values
+    c[1] = a1 * b1;  // scalar product of the lane-1 values
     return vld1q_f32((float32_t *) c);
 #endif
 }
@@ -4739,7 +4843,7 @@ FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b)
     uint16x4_t a3210 = vget_low_u16(vreinterpretq_u16_m128i(a));
     uint16x4_t b3210 = vget_low_u16(vreinterpretq_u16_m128i(b));
     uint32x4_t ab3210 = vmull_u16(a3210, b3210);
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     uint32x4_t ab7654 =
         vmull_high_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b));
     uint16x8_t r = vuzp2q_u16(vreinterpretq_u16_u32(ab3210),
@@ -4895,11 +4999,11 @@ FORCE_INLINE __m128i _mm_set_epi8(signed char b15,
                                   signed char b1,
                                   signed char b0)
 {
-    int8_t ALIGN_STRUCT(16)
-        data[16] = {(int8_t) b0,  (int8_t) b1,  (int8_t) b2,  (int8_t) b3,
-                    (int8_t) b4,  (int8_t) b5,  (int8_t) b6,  (int8_t) b7,
-                    (int8_t) b8,  (int8_t) b9,  (int8_t) b10, (int8_t) b11,
-                    (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
+    int8_t ALIGN_STRUCT(16) data[16] = {
+        (int8_t) b0,  (int8_t) b1,  (int8_t) b2,  (int8_t) b3,
+        (int8_t) b4,  (int8_t) b5,  (int8_t) b6,  (int8_t) b7,
+        (int8_t) b8,  (int8_t) b9,  (int8_t) b10, (int8_t) b11,
+        (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
     return (__m128i) vld1q_s8(data);
 }
 
@@ -4909,7 +5013,7 @@ FORCE_INLINE __m128i _mm_set_epi8(signed char b15,
 FORCE_INLINE __m128d _mm_set_pd(double e1, double e0)
 {
     double ALIGN_STRUCT(16) data[2] = {e0, e1};
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data));
 #else
     return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) data));
@@ -4926,7 +5030,7 @@ FORCE_INLINE __m128d _mm_set_pd(double e1, double e0)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_sd
 FORCE_INLINE __m128d _mm_set_sd(double a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_f64(vsetq_lane_f64(a, vdupq_n_f64(0), 0));
 #else
     return _mm_set_pd(0, a);
@@ -4973,10 +5077,11 @@ FORCE_INLINE __m128i _mm_set1_epi8(signed char w)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_pd
 FORCE_INLINE __m128d _mm_set1_pd(double d)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_f64(vdupq_n_f64(d));
 #else
-    return vreinterpretq_m128d_s64(vdupq_n_s64(*(int64_t *) &d));
+    int64_t _d = sse2neon_recast_f64_s64(d);  // bit pattern of d via the memcpy-based helper (replaces the pointer cast)
+    return vreinterpretq_m128d_s64(vdupq_n_s64(_d));  // same 64-bit pattern duplicated into both lanes
 #endif
 }
 
@@ -5029,11 +5134,11 @@ FORCE_INLINE __m128i _mm_setr_epi8(signed char b0,
                                    signed char b14,
                                    signed char b15)
 {
-    int8_t ALIGN_STRUCT(16)
-        data[16] = {(int8_t) b0,  (int8_t) b1,  (int8_t) b2,  (int8_t) b3,
-                    (int8_t) b4,  (int8_t) b5,  (int8_t) b6,  (int8_t) b7,
-                    (int8_t) b8,  (int8_t) b9,  (int8_t) b10, (int8_t) b11,
-                    (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
+    int8_t ALIGN_STRUCT(16) data[16] = {
+        (int8_t) b0,  (int8_t) b1,  (int8_t) b2,  (int8_t) b3,
+        (int8_t) b4,  (int8_t) b5,  (int8_t) b6,  (int8_t) b7,
+        (int8_t) b8,  (int8_t) b9,  (int8_t) b10, (int8_t) b11,
+        (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
     return (__m128i) vld1q_s8(data);
 }
 
@@ -5049,7 +5154,7 @@ FORCE_INLINE __m128d _mm_setr_pd(double e1, double e0)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_pd
 FORCE_INLINE __m128d _mm_setzero_pd(void)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_f64(vdupq_n_f64(0));
 #else
     return vreinterpretq_m128d_f32(vdupq_n_f32(0));
@@ -5136,12 +5241,12 @@ FORCE_INLINE __m128i _mm_setzero_si128(void)
 #define _mm_shuffle_pd(a, b, imm8)                                            \
     vreinterpretq_m128d_s64(                                                  \
         vshuffleq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b), \
-                      imm8 & 0x1, ((imm8 & 0x2) >> 1) + 2))
+                      (imm8) & 0x1, (((imm8) & 0x2) >> 1) + 2))
 #else
-#define _mm_shuffle_pd(a, b, imm8)                                     \
-    _mm_castsi128_pd(_mm_set_epi64x(                                   \
-        vgetq_lane_s64(vreinterpretq_s64_m128d(b), (imm8 & 0x2) >> 1), \
-        vgetq_lane_s64(vreinterpretq_s64_m128d(a), imm8 & 0x1)))
+#define _mm_shuffle_pd(a, b, imm8)                                       \
+    _mm_castsi128_pd(_mm_set_epi64x(                                     \
+        vgetq_lane_s64(vreinterpretq_s64_m128d(b), ((imm8) & 0x2) >> 1), \
+        vgetq_lane_s64(vreinterpretq_s64_m128d(a), (imm8) & 0x1)))
 #endif
 
 // FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a,
@@ -5222,7 +5327,7 @@ FORCE_INLINE __m128i _mm_slli_epi16(__m128i a, int imm)
     if (_sse2neon_unlikely(imm & ~15))
         return _mm_setzero_si128();
     return vreinterpretq_m128i_s16(
-        vshlq_s16(vreinterpretq_s16_m128i(a), vdupq_n_s16(imm)));
+        vshlq_s16(vreinterpretq_s16_m128i(a), vdupq_n_s16((int16_t) imm)));
 }
 
 // Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and
@@ -5250,13 +5355,13 @@ FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm)
 // Shift a left by imm8 bytes while shifting in zeros, and store the results in
 // dst.
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_si128
-#define _mm_slli_si128(a, imm)                                              \
-    _sse2neon_define1(                                                      \
-        __m128i, a, int8x16_t ret;                                          \
-        if (_sse2neon_unlikely(imm == 0)) ret = vreinterpretq_s8_m128i(_a); \
-        else if (_sse2neon_unlikely((imm) & ~15)) ret = vdupq_n_s8(0);      \
-        else ret = vextq_s8(vdupq_n_s8(0), vreinterpretq_s8_m128i(_a),      \
-                            ((imm <= 0 || imm > 15) ? 0 : (16 - imm)));     \
+#define _mm_slli_si128(a, imm)                                                \
+    _sse2neon_define1(                                                        \
+        __m128i, a, int8x16_t ret; /* every use of imm below is parenthesized */ \
+        if (_sse2neon_unlikely((imm) == 0)) ret = vreinterpretq_s8_m128i(_a); \
+        else if (_sse2neon_unlikely((imm) & ~15)) ret = vdupq_n_s8(0); /* imm outside 0..15: all-zero result */ \
+        else ret = vextq_s8(vdupq_n_s8(0), vreinterpretq_s8_m128i(_a),        \
+                            (((imm) <= 0 || (imm) > 15) ? 0 : (16 - (imm)))); \
         _sse2neon_return(vreinterpretq_m128i_s8(ret));)
 
 // Compute the square root of packed double-precision (64-bit) floating-point
@@ -5264,12 +5369,15 @@ FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_pd
 FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_f64(vsqrtq_f64(vreinterpretq_f64_m128d(a)));
 #else
-    double a0 = sqrt(((double *) &a)[0]);
-    double a1 = sqrt(((double *) &a)[1]);
-    return _mm_set_pd(a1, a0);
+    double a0, a1;  // the two lanes of a, read as doubles
+    a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));  // raw lane bits -> double via the memcpy-based helper
+    a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double _a0 = sqrt(a0);  // scalar square root of lane 0
+    double _a1 = sqrt(a1);  // scalar square root of lane 1
+    return _mm_set_pd(_a1, _a0);  // _mm_set_pd(e1, e0) takes the upper element first
 #endif
 }
 
@@ -5279,10 +5387,13 @@ FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_sd
 FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return _mm_move_sd(a, _mm_sqrt_pd(b));
 #else
-    return _mm_set_pd(((double *) &a)[1], sqrt(((double *) &b)[0]));
+    double _a, _b;  // _a: lane 1 of a, _b: lane 0 of b
+    _a = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));  // raw lane bits -> double via the memcpy-based helper
+    _b = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    return _mm_set_pd(_a, sqrt(_b));  // upper element: a's lane 1 unchanged; lower element: sqrt of b's lane 0
 #endif
 }
 
@@ -5295,7 +5406,7 @@ FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count)
     if (_sse2neon_unlikely(c & ~15))
         return _mm_cmplt_epi16(a, _mm_setzero_si128());
     return vreinterpretq_m128i_s16(
-        vshlq_s16((int16x8_t) a, vdupq_n_s16((int) -c)));
+        vshlq_s16((int16x8_t) a, vdupq_n_s16((int16_t) -c)));
 }
 
 // Shift packed 32-bit integers in a right by count while shifting in sign bits,
@@ -5315,7 +5426,7 @@ FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi16
 FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm)
 {
-    const int count = (imm & ~15) ? 15 : imm;
+    const int16_t count = (imm & ~15) ? 15 : (int16_t) imm;  // any imm outside 0..15 becomes 15; typed int16_t to match vdupq_n_s16
     return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count));
 }
 
@@ -5377,13 +5488,13 @@ FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
 // Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and
 // store the results in dst.
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi16
-#define _mm_srli_epi16(a, imm)                                                \
-    _sse2neon_define0(                                                        \
-        __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) & ~15)) {       \
-            ret = _mm_setzero_si128();                                        \
-        } else {                                                              \
-            ret = vreinterpretq_m128i_u16(                                    \
-                vshlq_u16(vreinterpretq_u16_m128i(_a), vdupq_n_s16(-(imm)))); \
+#define _mm_srli_epi16(a, imm)                                                 \
+    _sse2neon_define0(                                                         \
+        __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) & ~15)) {        \
+            ret = _mm_setzero_si128(); /* imm outside 0..15: all-zero result */ \
+        } else {                                                               \
+            ret = vreinterpretq_m128i_u16(vshlq_u16( /* the count passed to vshlq_u16 is the negated imm, cast to int16_t */ \
+                vreinterpretq_u16_m128i(_a), vdupq_n_s16((int16_t) - (imm)))); \
         } _sse2neon_return(ret);)
 
 // Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and
@@ -5419,7 +5530,7 @@ FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
         __m128i, a, int8x16_t ret;                                     \
         if (_sse2neon_unlikely((imm) & ~15)) ret = vdupq_n_s8(0);      \
         else ret = vextq_s8(vreinterpretq_s8_m128i(_a), vdupq_n_s8(0), \
-                            (imm > 15 ? 0 : imm));                     \
+                            ((imm) > 15 ? 0 : (imm)));                 \
         _sse2neon_return(vreinterpretq_m128i_s8(ret));)
 
 // Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
@@ -5428,7 +5539,7 @@ FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd
 FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a));
 #else
     vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(a));
@@ -5441,7 +5552,7 @@ FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd1
 FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     float64x1_t a_low = vget_low_f64(vreinterpretq_f64_m128d(a));
     vst1q_f64((float64_t *) mem_addr,
               vreinterpretq_f64_m128d(vcombine_f64(a_low, a_low)));
@@ -5457,7 +5568,7 @@ FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_store_sd
 FORCE_INLINE void _mm_store_sd(double *mem_addr, __m128d a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
 #else
     vst1_u64((uint64_t *) mem_addr, vget_low_u64(vreinterpretq_u64_m128d(a)));
@@ -5483,7 +5594,7 @@ FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pd
 FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     vst1_f64((float64_t *) mem_addr, vget_high_f64(vreinterpretq_f64_m128d(a)));
 #else
     vst1_f32((float32_t *) mem_addr, vget_high_f32(vreinterpretq_f32_m128d(a)));
@@ -5502,7 +5613,7 @@ FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pd
 FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
 #else
     vst1_f32((float32_t *) mem_addr, vget_low_f32(vreinterpretq_f32_m128d(a)));
@@ -5553,7 +5664,7 @@ FORCE_INLINE void _mm_stream_pd(double *p, __m128d a)
 {
 #if __has_builtin(__builtin_nontemporal_store)
     __builtin_nontemporal_store(a, (__m128d *) p);
-#elif defined(__aarch64__) || defined(_M_ARM64)
+#elif defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     vst1q_f64(p, vreinterpretq_f64_m128d(a));
 #else
     vst1q_s64((int64_t *) p, vreinterpretq_s64_m128d(a));
@@ -5633,15 +5744,21 @@ FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b)
 //  https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_sub_pd
 FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_f64(
         vsubq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
-    double *da = (double *) &a;
-    double *db = (double *) &b;
+    double a0 =  // lane 0 of a: 64-bit lane extracted, then recast with the memcpy-based helper
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 =  // lane 1 of a
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 =  // lane 0 of b
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 =  // lane 1 of b
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
     double c[2];
-    c[0] = da[0] - db[0];
-    c[1] = da[1] - db[1];
+    c[0] = a0 - b0;  // lane-0 difference, a minus b
+    c[1] = a1 - b1;  // lane-1 difference, a minus b
     return vld1q_f32((float32_t *) c);
 #endif
 }
@@ -5730,7 +5847,7 @@ FORCE_INLINE __m128d _mm_undefined_pd(void)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi16
 FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128i_s16(
         vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
 #else
@@ -5746,7 +5863,7 @@ FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi32
 FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128i_s32(
         vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
 #else
@@ -5762,7 +5879,7 @@ FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi64
 FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128i_s64(
         vzip2q_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
 #else
@@ -5777,7 +5894,7 @@ FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi8
 FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128i_s8(
         vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
 #else
@@ -5795,7 +5912,7 @@ FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_pd
 FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_f64(
         vzip2q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
@@ -5810,7 +5927,7 @@ FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi16
 FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128i_s16(
         vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
 #else
@@ -5826,7 +5943,7 @@ FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi32
 FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128i_s32(
         vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
 #else
@@ -5842,7 +5959,7 @@ FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi64
 FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128i_s64(
         vzip1q_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
 #else
@@ -5857,7 +5974,7 @@ FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi8
 FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128i_s8(
         vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
 #else
@@ -5873,7 +5990,7 @@ FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_pd
 FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_f64(
         vzip1q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
@@ -5910,7 +6027,7 @@ FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
 FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b)
 {
     _sse2neon_const __m128d mask = _mm_set_pd(1.0f, -1.0f);
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_f64(vfmaq_f64(vreinterpretq_f64_m128d(a),
                                              vreinterpretq_f64_m128d(b),
                                              vreinterpretq_f64_m128d(mask)));
@@ -5926,7 +6043,7 @@ FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b)
 FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b)
 {
     _sse2neon_const __m128 mask = _mm_setr_ps(-1.0f, 1.0f, -1.0f, 1.0f);
-#if (defined(__aarch64__) || defined(_M_ARM64)) || \
+#if (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) || \
     defined(__ARM_FEATURE_FMA) /* VFPv4+ */
     return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(a),
                                             vreinterpretq_f32_m128(mask),
@@ -5941,13 +6058,19 @@ FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pd
 FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_f64(
         vpaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
-    double *da = (double *) &a;
-    double *db = (double *) &b;
-    double c[] = {da[0] + da[1], db[0] + db[1]};
+    double a0 =  // lane 0 of a: 64-bit lane extracted, then recast with the memcpy-based helper
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 =  // lane 1 of a
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 =  // lane 0 of b
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 =  // lane 1 of b
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
+    double c[] = {a0 + a1, b0 + b1};  // element 0: sum of a's two lanes; element 1: sum of b's two lanes
     return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
 #endif
 }
@@ -5957,7 +6080,7 @@ FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_ps
 FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128_f32(
         vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
 #else
@@ -5973,17 +6096,23 @@ FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
 // Horizontally subtract adjacent pairs of double-precision (64-bit)
 // floating-point elements in a and b, and pack the results in dst.
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pd
-FORCE_INLINE __m128d _mm_hsub_pd(__m128d _a, __m128d _b)
+FORCE_INLINE __m128d _mm_hsub_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
-    float64x2_t a = vreinterpretq_f64_m128d(_a);
-    float64x2_t b = vreinterpretq_f64_m128d(_b);
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+    float64x2_t _a = vreinterpretq_f64_m128d(a);  // parameters are now a/b; the underscore names are the f64 views
+    float64x2_t _b = vreinterpretq_f64_m128d(b);
     return vreinterpretq_m128d_f64(
-        vsubq_f64(vuzp1q_f64(a, b), vuzp2q_f64(a, b)));
+        vsubq_f64(vuzp1q_f64(_a, _b), vuzp2q_f64(_a, _b)));
 #else
-    double *da = (double *) &_a;
-    double *db = (double *) &_b;
-    double c[] = {da[0] - da[1], db[0] - db[1]};
+    double a0 =  // lane 0 of a: 64-bit lane extracted, then recast with the memcpy-based helper
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 =  // lane 1 of a
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 =  // lane 0 of b
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 =  // lane 1 of b
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
+    double c[] = {a0 - a1, b0 - b1};  // element 0: a's lane 0 minus lane 1; element 1: b's lane 0 minus lane 1
     return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
 #endif
 }
@@ -5995,7 +6124,7 @@ FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)
 {
     float32x4_t a = vreinterpretq_f32_m128(_a);
     float32x4_t b = vreinterpretq_f32_m128(_b);
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128_f32(
         vsubq_f32(vuzp1q_f32(a, b), vuzp2q_f32(a, b)));
 #else
@@ -6020,7 +6149,7 @@ FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movedup_pd
 FORCE_INLINE __m128d _mm_movedup_pd(__m128d a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_f64(
         vdupq_laneq_f64(vreinterpretq_f64_m128d(a), 0));
 #else
@@ -6034,7 +6163,7 @@ FORCE_INLINE __m128d _mm_movedup_pd(__m128d a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehdup_ps
 FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128_f32(
         vtrn2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)));
 #elif defined(_sse2neon_shuffle)
@@ -6053,7 +6182,7 @@ FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_moveldup_ps
 FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128_f32(
         vtrn1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)));
 #elif defined(_sse2neon_shuffle)
@@ -6121,32 +6250,32 @@ FORCE_INLINE __m64 _mm_abs_pi8(__m64 a)
 // the result right by imm8 bytes, and store the low 16 bytes in dst.
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_epi8
 #if defined(__GNUC__) && !defined(__clang__)
-#define _mm_alignr_epi8(a, b, imm)                                            \
-    __extension__({                                                           \
-        uint8x16_t _a = vreinterpretq_u8_m128i(a);                            \
-        uint8x16_t _b = vreinterpretq_u8_m128i(b);                            \
-        __m128i ret;                                                          \
-        if (_sse2neon_unlikely((imm) & ~31))                                  \
-            ret = vreinterpretq_m128i_u8(vdupq_n_u8(0));                      \
-        else if (imm >= 16)                                                   \
-            ret = _mm_srli_si128(a, imm >= 16 ? imm - 16 : 0);                \
-        else                                                                  \
-            ret =                                                             \
-                vreinterpretq_m128i_u8(vextq_u8(_b, _a, imm < 16 ? imm : 0)); \
-        ret;                                                                  \
+#define _mm_alignr_epi8(a, b, imm)                                 \
+    __extension__({                                                \
+        uint8x16_t _a = vreinterpretq_u8_m128i(a);                 \
+        uint8x16_t _b = vreinterpretq_u8_m128i(b);                 \
+        __m128i ret;                                               \
+        if (_sse2neon_unlikely((imm) & ~31))                       \
+            ret = vreinterpretq_m128i_u8(vdupq_n_u8(0));           \
+        else if ((imm) >= 16)                                      \
+            ret = _mm_srli_si128(a, (imm) >= 16 ? (imm) - 16 : 0); \
+        else                                                       \
+            ret = vreinterpretq_m128i_u8(                          \
+                vextq_u8(_b, _a, (imm) < 16 ? (imm) : 0));         \
+        ret;                                                       \
     })
 
 #else
-#define _mm_alignr_epi8(a, b, imm)                                          \
-    _sse2neon_define2(                                                      \
-        __m128i, a, b, uint8x16_t __a = vreinterpretq_u8_m128i(_a);         \
-        uint8x16_t __b = vreinterpretq_u8_m128i(_b); __m128i ret;           \
-        if (_sse2neon_unlikely((imm) & ~31)) ret =                          \
-            vreinterpretq_m128i_u8(vdupq_n_u8(0));                          \
-        else if (imm >= 16) ret =                                           \
-            _mm_srli_si128(_a, imm >= 16 ? imm - 16 : 0);                   \
-        else ret =                                                          \
-            vreinterpretq_m128i_u8(vextq_u8(__b, __a, imm < 16 ? imm : 0)); \
+#define _mm_alignr_epi8(a, b, imm)                                  \
+    _sse2neon_define2(                                              \
+        __m128i, a, b, uint8x16_t __a = vreinterpretq_u8_m128i(_a); \
+        uint8x16_t __b = vreinterpretq_u8_m128i(_b); __m128i ret;   \
+        if (_sse2neon_unlikely((imm) & ~31)) ret =                  \
+            vreinterpretq_m128i_u8(vdupq_n_u8(0));                  \
+        else if ((imm) >= 16) ret =                                 \
+            _mm_srli_si128(_a, (imm) >= 16 ? (imm) - 16 : 0);       \
+        else ret = vreinterpretq_m128i_u8(                          \
+            vextq_u8(__b, __a, (imm) < 16 ? (imm) : 0));            \
         _sse2neon_return(ret);)
 
 #endif
@@ -6162,7 +6291,7 @@ FORCE_INLINE __m64 _mm_abs_pi8(__m64 a)
             uint8x8_t tmp_low;                                              \
             uint8x8_t tmp_high;                                             \
             if ((imm) >= 8) {                                               \
-                const int idx = (imm) -8;                                   \
+                const int idx = (imm) - 8;                                  \
                 tmp_low = vreinterpret_u8_m64(_a);                          \
                 tmp_high = vdup_n_u8(0);                                    \
                 ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
@@ -6181,7 +6310,7 @@ FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b)
 {
     int16x8_t a = vreinterpretq_s16_m128i(_a);
     int16x8_t b = vreinterpretq_s16_m128i(_b);
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128i_s16(vpaddq_s16(a, b));
 #else
     return vreinterpretq_m128i_s16(
@@ -6197,7 +6326,7 @@ FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b)
 {
     int32x4_t a = vreinterpretq_s32_m128i(_a);
     int32x4_t b = vreinterpretq_s32_m128i(_b);
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128i_s32(vpaddq_s32(a, b));
 #else
     return vreinterpretq_m128i_s32(
@@ -6229,7 +6358,7 @@ FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_epi16
 FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     int16x8_t a = vreinterpretq_s16_m128i(_a);
     int16x8_t b = vreinterpretq_s16_m128i(_b);
     return vreinterpretq_s64_s16(
@@ -6254,7 +6383,7 @@ FORCE_INLINE __m64 _mm_hadds_pi16(__m64 _a, __m64 _b)
 {
     int16x4_t a = vreinterpret_s16_m64(_a);
     int16x4_t b = vreinterpret_s16_m64(_b);
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpret_s64_s16(vqadd_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
 #else
     int16x4x2_t res = vuzp_s16(a, b);
@@ -6269,7 +6398,7 @@ FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)
 {
     int16x8_t a = vreinterpretq_s16_m128i(_a);
     int16x8_t b = vreinterpretq_s16_m128i(_b);
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128i_s16(
         vsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
 #else
@@ -6285,7 +6414,7 @@ FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b)
 {
     int32x4_t a = vreinterpretq_s32_m128i(_a);
     int32x4_t b = vreinterpretq_s32_m128i(_b);
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128i_s32(
         vsubq_s32(vuzp1q_s32(a, b), vuzp2q_s32(a, b)));
 #else
@@ -6301,7 +6430,7 @@ FORCE_INLINE __m64 _mm_hsub_pi16(__m64 _a, __m64 _b)
 {
     int16x4_t a = vreinterpret_s16_m64(_a);
     int16x4_t b = vreinterpret_s16_m64(_b);
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpret_m64_s16(vsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
 #else
     int16x4x2_t c = vuzp_s16(a, b);
@@ -6316,7 +6445,7 @@ FORCE_INLINE __m64 _mm_hsub_pi32(__m64 _a, __m64 _b)
 {
     int32x2_t a = vreinterpret_s32_m64(_a);
     int32x2_t b = vreinterpret_s32_m64(_b);
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpret_m64_s32(vsub_s32(vuzp1_s32(a, b), vuzp2_s32(a, b)));
 #else
     int32x2x2_t c = vuzp_s32(a, b);
@@ -6331,7 +6460,7 @@ FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b)
 {
     int16x8_t a = vreinterpretq_s16_m128i(_a);
     int16x8_t b = vreinterpretq_s16_m128i(_b);
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128i_s16(
         vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
 #else
@@ -6347,7 +6476,7 @@ FORCE_INLINE __m64 _mm_hsubs_pi16(__m64 _a, __m64 _b)
 {
     int16x4_t a = vreinterpret_s16_m64(_a);
     int16x4_t b = vreinterpret_s16_m64(_b);
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpret_m64_s16(vqsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
 #else
     int16x4x2_t c = vuzp_s16(a, b);
@@ -6362,7 +6491,7 @@ FORCE_INLINE __m64 _mm_hsubs_pi16(__m64 _a, __m64 _b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_epi16
 FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     uint8x16_t a = vreinterpretq_u8_m128i(_a);
     int8x16_t b = vreinterpretq_s8_m128i(_b);
     int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
@@ -6466,7 +6595,7 @@ FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
     uint8x16_t idx = vreinterpretq_u8_m128i(b);  // input b
     uint8x16_t idx_masked =
         vandq_u8(idx, vdupq_n_u8(0x8F));  // avoid using meaningless bits
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked));
 #elif defined(__GNUC__)
     int8x16_t ret;
@@ -6512,7 +6641,7 @@ FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)
     // (b < 0) ? 0xFFFF : 0
     uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15));
     // (b == 0) ? 0xFFFF : 0
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b));
 #else
     int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0)));
@@ -6541,7 +6670,7 @@ FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b)
     uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31));
 
     // (b == 0) ? 0xFFFFFFFF : 0
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b));
 #else
     int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0)));
@@ -6570,7 +6699,7 @@ FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b)
     uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7));
 
     // (b == 0) ? 0xFF : 0
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b));
 #else
     int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0)));
@@ -6599,7 +6728,7 @@ FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b)
     uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15));
 
     // (b == 0) ? 0xFFFF : 0
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b));
 #else
     int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0)));
@@ -6628,7 +6757,7 @@ FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b)
     uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31));
 
     // (b == 0) ? 0xFFFFFFFF : 0
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b));
 #else
     int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0)));
@@ -6657,7 +6786,7 @@ FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
     uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7));
 
     // (b == 0) ? 0xFF : 0
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b));
 #else
     int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0)));
@@ -6683,14 +6812,14 @@ FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
     _sse2neon_define2(                                                  \
         __m128i, a, b,                                                  \
         const uint16_t _mask[8] =                                       \
-            _sse2neon_init(((imm) & (1 << 0)) ? (uint16_t) -1 : 0x0,    \
-                           ((imm) & (1 << 1)) ? (uint16_t) -1 : 0x0,    \
-                           ((imm) & (1 << 2)) ? (uint16_t) -1 : 0x0,    \
-                           ((imm) & (1 << 3)) ? (uint16_t) -1 : 0x0,    \
-                           ((imm) & (1 << 4)) ? (uint16_t) -1 : 0x0,    \
-                           ((imm) & (1 << 5)) ? (uint16_t) -1 : 0x0,    \
-                           ((imm) & (1 << 6)) ? (uint16_t) -1 : 0x0,    \
-                           ((imm) & (1 << 7)) ? (uint16_t) -1 : 0x0);   \
+            _sse2neon_init(((imm) & (1 << 0)) ? (uint16_t) - 1 : 0x0,   \
+                           ((imm) & (1 << 1)) ? (uint16_t) - 1 : 0x0,   \
+                           ((imm) & (1 << 2)) ? (uint16_t) - 1 : 0x0,   \
+                           ((imm) & (1 << 3)) ? (uint16_t) - 1 : 0x0,   \
+                           ((imm) & (1 << 4)) ? (uint16_t) - 1 : 0x0,   \
+                           ((imm) & (1 << 5)) ? (uint16_t) - 1 : 0x0,   \
+                           ((imm) & (1 << 6)) ? (uint16_t) - 1 : 0x0,   \
+                           ((imm) & (1 << 7)) ? (uint16_t) - 1 : 0x0);  \
         uint16x8_t _mask_vec = vld1q_u16(_mask);                        \
         uint16x8_t __a = vreinterpretq_u16_m128i(_a);                   \
         uint16x8_t __b = vreinterpretq_u16_m128i(_b); _sse2neon_return( \
@@ -6715,11 +6844,9 @@ FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_ps
 FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8)
 {
-    const uint32_t ALIGN_STRUCT(16)
-        data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0,
-                   ((imm8) & (1 << 1)) ? UINT32_MAX : 0,
-                   ((imm8) & (1 << 2)) ? UINT32_MAX : 0,
-                   ((imm8) & (1 << 3)) ? UINT32_MAX : 0};
+    const uint32_t ALIGN_STRUCT(16) data[4] = {
+        (imm8 & (1 << 0)) ? UINT32_MAX : 0, (imm8 & (1 << 1)) ? UINT32_MAX : 0,
+        (imm8 & (1 << 2)) ? UINT32_MAX : 0, (imm8 & (1 << 3)) ? UINT32_MAX : 0};
     uint32x4_t mask = vld1q_u32(data);
     float32x4_t a = vreinterpretq_f32_m128(_a);
     float32x4_t b = vreinterpretq_f32_m128(_b);
@@ -6746,7 +6873,7 @@ FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask)
 {
     uint64x2_t mask =
         vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_m128d(_mask), 63));
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     float64x2_t a = vreinterpretq_f64_m128d(_a);
     float64x2_t b = vreinterpretq_f64_m128d(_b);
     return vreinterpretq_m128d_f64(vbslq_f64(mask, b, a));
@@ -6776,11 +6903,13 @@ FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_pd
 FORCE_INLINE __m128d _mm_ceil_pd(__m128d a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_f64(vrndpq_f64(vreinterpretq_f64_m128d(a)));
 #else
-    double *f = (double *) &a;
-    return _mm_set_pd(ceil(f[1]), ceil(f[0]));
+    double a0, a1;
+    a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    return _mm_set_pd(ceil(a1), ceil(a0));
 #endif
 }
 
@@ -6790,7 +6919,7 @@ FORCE_INLINE __m128d _mm_ceil_pd(__m128d a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ps
 FORCE_INLINE __m128 _mm_ceil_ps(__m128 a)
 {
-#if (defined(__aarch64__) || defined(_M_ARM64)) || \
+#if (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) || \
     defined(__ARM_FEATURE_DIRECTED_ROUNDING)
     return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a)));
 #else
@@ -6823,7 +6952,7 @@ FORCE_INLINE __m128 _mm_ceil_ss(__m128 a, __m128 b)
 // in dst
 FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128i_u64(
         vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b)));
 #else
@@ -6980,7 +7109,7 @@ FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm)
         _mm_castsi128_pd(_mm_set_epi64x(bit5Mask, bit4Mask));
     __m128d tmp = _mm_and_pd(mul, mulMask);
 #else
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     double d0 = (imm & 0x10) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0) *
                                    vgetq_lane_f64(vreinterpretq_f64_m128d(b), 0)
                              : 0;
@@ -6988,16 +7117,28 @@ FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm)
                                    vgetq_lane_f64(vreinterpretq_f64_m128d(b), 1)
                              : 0;
 #else
-    double d0 = (imm & 0x10) ? ((double *) &a)[0] * ((double *) &b)[0] : 0;
-    double d1 = (imm & 0x20) ? ((double *) &a)[1] * ((double *) &b)[1] : 0;
+    double a0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
+    double d0 = (imm & 0x10) ? a0 * b0 : 0;
+    double d1 = (imm & 0x20) ? a1 * b1 : 0;
 #endif
     __m128d tmp = _mm_set_pd(d1, d0);
 #endif
     // Sum the products
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     double sum = vpaddd_f64(vreinterpretq_f64_m128d(tmp));
 #else
-    double sum = *((double *) &tmp) + *(((double *) &tmp) + 1);
+    double _tmp0 = sse2neon_recast_u64_f64(
+        vgetq_lane_u64(vreinterpretq_u64_m128d(tmp), 0));
+    double _tmp1 = sse2neon_recast_u64_f64(
+        vgetq_lane_u64(vreinterpretq_u64_m128d(tmp), 1));
+    double sum = _tmp0 + _tmp1;
 #endif
     // Conditionally store the sum
     const __m128d sumMask =
@@ -7014,7 +7155,7 @@ FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
 {
     float32x4_t elementwise_prod = _mm_mul_ps(a, b);
 
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     /* shortcuts */
     if (imm == 0xFF) {
         return _mm_set1_ps(vaddvq_f32(elementwise_prod));
@@ -7084,11 +7225,13 @@ FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_pd
 FORCE_INLINE __m128d _mm_floor_pd(__m128d a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128d_f64(vrndmq_f64(vreinterpretq_f64_m128d(a)));
 #else
-    double *f = (double *) &a;
-    return _mm_set_pd(floor(f[1]), floor(f[0]));
+    double a0, a1;
+    a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    return _mm_set_pd(floor(a1), floor(a0));
 #endif
 }
 
@@ -7098,7 +7241,7 @@ FORCE_INLINE __m128d _mm_floor_pd(__m128d a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ps
 FORCE_INLINE __m128 _mm_floor_ps(__m128 a)
 {
-#if (defined(__aarch64__) || defined(_M_ARM64)) || \
+#if (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) || \
     defined(__ARM_FEATURE_DIRECTED_ROUNDING)
     return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a)));
 #else
@@ -7157,24 +7300,24 @@ FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b)
 // element from b into tmp using the control in imm8. Store tmp to dst using
 // the mask in imm8 (elements are zeroed out when the corresponding bit is set).
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=insert_ps
-#define _mm_insert_ps(a, b, imm8)                                            \
-    _sse2neon_define2(                                                       \
-        __m128, a, b,                                                        \
-        float32x4_t tmp1 =                                                   \
-            vsetq_lane_f32(vgetq_lane_f32(_b, (imm8 >> 6) & 0x3),            \
-                           vreinterpretq_f32_m128(_a), 0);                   \
-        float32x4_t tmp2 =                                                   \
-            vsetq_lane_f32(vgetq_lane_f32(tmp1, 0),                          \
-                           vreinterpretq_f32_m128(_a), ((imm8 >> 4) & 0x3)); \
-        const uint32_t data[4] =                                             \
-            _sse2neon_init(((imm8) & (1 << 0)) ? UINT32_MAX : 0,             \
-                           ((imm8) & (1 << 1)) ? UINT32_MAX : 0,             \
-                           ((imm8) & (1 << 2)) ? UINT32_MAX : 0,             \
-                           ((imm8) & (1 << 3)) ? UINT32_MAX : 0);            \
-        uint32x4_t mask = vld1q_u32(data);                                   \
-        float32x4_t all_zeros = vdupq_n_f32(0);                              \
-                                                                             \
-        _sse2neon_return(vreinterpretq_m128_f32(                             \
+#define _mm_insert_ps(a, b, imm8)                                              \
+    _sse2neon_define2(                                                         \
+        __m128, a, b,                                                          \
+        float32x4_t tmp1 =                                                     \
+            vsetq_lane_f32(vgetq_lane_f32(_b, ((imm8) >> 6) & 0x3),            \
+                           vreinterpretq_f32_m128(_a), 0);                     \
+        float32x4_t tmp2 =                                                     \
+            vsetq_lane_f32(vgetq_lane_f32(tmp1, 0),                            \
+                           vreinterpretq_f32_m128(_a), (((imm8) >> 4) & 0x3)); \
+        const uint32_t data[4] =                                               \
+            _sse2neon_init(((imm8) & (1 << 0)) ? UINT32_MAX : 0,               \
+                           ((imm8) & (1 << 1)) ? UINT32_MAX : 0,               \
+                           ((imm8) & (1 << 2)) ? UINT32_MAX : 0,               \
+                           ((imm8) & (1 << 3)) ? UINT32_MAX : 0);              \
+        uint32x4_t mask = vld1q_u32(data);                                     \
+        float32x4_t all_zeros = vdupq_n_f32(0);                                \
+                                                                               \
+        _sse2neon_return(vreinterpretq_m128_f32(                               \
             vbslq_f32(mask, all_zeros, vreinterpretq_f32_m128(tmp2))));)
 
 // Compare packed signed 32-bit integers in a and b, and store packed maximum
@@ -7256,7 +7399,7 @@ FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a)
 {
     __m128i dst;
     uint16_t min, idx = 0;
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     // Find the minimum value
     min = vminvq_u16(vreinterpretq_u16_m128i(a));
 
@@ -7359,7 +7502,7 @@ FORCE_INLINE __m128i _mm_mpsadbw_epu8(__m128i a, __m128i b, const int imm)
     c26 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_2), low_b));
     uint8x16_t _a_3 = vextq_u8(_a, _a, 3);
     c37 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_3), low_b));
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     // |0|4|2|6|
     c04 = vpaddq_s16(c04, c26);
     // |1|5|3|7|
@@ -7419,7 +7562,7 @@ FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_pd
 FORCE_INLINE __m128d _mm_round_pd(__m128d a, int rounding)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     switch (rounding) {
     case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
         return vreinterpretq_m128d_f64(vrndnq_f64(vreinterpretq_f64_m128d(a)));
@@ -7488,7 +7631,7 @@ FORCE_INLINE __m128d _mm_round_pd(__m128d a, int rounding)
 // software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps
 FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding)
 {
-#if (defined(__aarch64__) || defined(_M_ARM64)) || \
+#if (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) || \
     defined(__ARM_FEATURE_DIRECTED_ROUNDING)
     switch (rounding) {
     case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
@@ -7621,7 +7764,7 @@ FORCE_INLINE int _mm_test_mix_ones_zeros(__m128i a, __m128i mask)
     uint64x2_t zeros = vbicq_u64(m, v);
 
     // If both 128-bit variables are populated (non-zero) then return 1.
-    // For comparision purposes, first compact each var down to 32-bits.
+    // For comparison purposes, first compact each var down to 32-bits.
     uint32x2_t reduced = vpmax_u32(vqmovn_u64(ones), vqmovn_u64(zeros));
 
     // if folding minimum is non-zero then both vars must be non-zero
@@ -7635,9 +7778,9 @@ FORCE_INLINE int _mm_test_mix_ones_zeros(__m128i a, __m128i mask)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_si128
 FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b)
 {
-    int64x2_t s64 =
+    int64x2_t s64_vec =
         vbicq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a));
-    return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
+    return !(vgetq_lane_s64(s64_vec, 0) | vgetq_lane_s64(s64_vec, 1));
 }
 
 // Compute the bitwise AND of 128 bits (representing integer data) in a and b,
@@ -7655,9 +7798,9 @@ FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_si128
 FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b)
 {
-    int64x2_t s64 =
+    int64x2_t s64_vec =
         vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b));
-    return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
+    return !(vgetq_lane_s64(s64_vec, 0) | vgetq_lane_s64(s64_vec, 1));
 }
 
 /* SSE4.2 */
@@ -7825,40 +7968,40 @@ static const uint8_t ALIGN_STRUCT(16) _sse2neon_cmpestr_mask8b[16] = {
                                       SSE2NEON_CAT(u, size)))                \
     } while (0)
 
-#define SSE2NEON_CMP_EQUAL_ANY_IMPL(type)                                     \
-    static int _sse2neon_cmp_##type##_equal_any(__m128i a, int la, __m128i b, \
-                                                int lb)                       \
-    {                                                                         \
-        __m128i mtx[16];                                                      \
-        PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),          \
-                   SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type));            \
-        return SSE2NEON_CAT(                                                  \
-            _sse2neon_aggregate_equal_any_,                                   \
-            SSE2NEON_CAT(                                                     \
-                SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),                        \
-                SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_,       \
-                                             type))))(la, lb, mtx);           \
+#define SSE2NEON_CMP_EQUAL_ANY_IMPL(type)                               \
+    static uint16_t _sse2neon_cmp_##type##_equal_any(__m128i a, int la, \
+                                                     __m128i b, int lb) \
+    {                                                                   \
+        __m128i mtx[16];                                                \
+        PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),    \
+                   SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type));      \
+        return SSE2NEON_CAT(                                            \
+            _sse2neon_aggregate_equal_any_,                             \
+            SSE2NEON_CAT(                                               \
+                SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),                  \
+                SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, \
+                                             type))))(la, lb, mtx);     \
     }
 
-#define SSE2NEON_CMP_RANGES_IMPL(type, data_type, us, byte_or_word)            \
-    static int _sse2neon_cmp_##us##type##_ranges(__m128i a, int la, __m128i b, \
-                                                 int lb)                       \
-    {                                                                          \
-        __m128i mtx[16];                                                       \
-        PCMPSTR_RANGES(                                                        \
-            a, b, mtx, data_type, us, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),   \
-            SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), byte_or_word);      \
-        return SSE2NEON_CAT(                                                   \
-            _sse2neon_aggregate_ranges_,                                       \
-            SSE2NEON_CAT(                                                      \
-                SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),                         \
-                SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_,        \
-                                             type))))(la, lb, mtx);            \
+#define SSE2NEON_CMP_RANGES_IMPL(type, data_type, us, byte_or_word)          \
+    static uint16_t _sse2neon_cmp_##us##type##_ranges(__m128i a, int la,     \
+                                                      __m128i b, int lb)     \
+    {                                                                        \
+        __m128i mtx[16];                                                     \
+        PCMPSTR_RANGES(                                                      \
+            a, b, mtx, data_type, us, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \
+            SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), byte_or_word);    \
+        return SSE2NEON_CAT(                                                 \
+            _sse2neon_aggregate_ranges_,                                     \
+            SSE2NEON_CAT(                                                    \
+                SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),                       \
+                SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_,      \
+                                             type))))(la, lb, mtx);          \
     }
 
 #define SSE2NEON_CMP_EQUAL_ORDERED_IMPL(type)                                  \
-    static int _sse2neon_cmp_##type##_equal_ordered(__m128i a, int la,         \
-                                                    __m128i b, int lb)         \
+    static uint16_t _sse2neon_cmp_##type##_equal_ordered(__m128i a, int la,    \
+                                                         __m128i b, int lb)    \
     {                                                                          \
         __m128i mtx[16];                                                       \
         PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),           \
@@ -7872,29 +8015,34 @@ static const uint8_t ALIGN_STRUCT(16) _sse2neon_cmpestr_mask8b[16] = {
             SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), la, lb, mtx);       \
     }
 
-static int _sse2neon_aggregate_equal_any_8x16(int la, int lb, __m128i mtx[16])
+static uint16_t _sse2neon_aggregate_equal_any_8x16(int la,
+                                                   int lb,
+                                                   __m128i mtx[16])
 {
-    int res = 0;
+    uint16_t res = 0;
     int m = (1 << la) - 1;
     uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
-    uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask);
-    uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask);
+    uint8x8_t t_lo = vtst_u8(vdup_n_u8((uint8_t) (m & 0xff)), vec_mask);
+    uint8x8_t t_hi = vtst_u8(vdup_n_u8((uint8_t) (m >> 8)), vec_mask);
     uint8x16_t vec = vcombine_u8(t_lo, t_hi);
     for (int j = 0; j < lb; j++) {
         mtx[j] = vreinterpretq_m128i_u8(
             vandq_u8(vec, vreinterpretq_u8_m128i(mtx[j])));
         mtx[j] = vreinterpretq_m128i_u8(
             vshrq_n_u8(vreinterpretq_u8_m128i(mtx[j]), 7));
-        int tmp = _sse2neon_vaddvq_u8(vreinterpretq_u8_m128i(mtx[j])) ? 1 : 0;
+        uint16_t tmp =
+            _sse2neon_vaddvq_u8(vreinterpretq_u8_m128i(mtx[j])) ? 1 : 0;
         res |= (tmp << j);
     }
     return res;
 }
 
-static int _sse2neon_aggregate_equal_any_16x8(int la, int lb, __m128i mtx[16])
+static uint16_t _sse2neon_aggregate_equal_any_16x8(int la,
+                                                   int lb,
+                                                   __m128i mtx[16])
 {
-    int res = 0;
-    int m = (1 << la) - 1;
+    uint16_t res = 0;
+    uint16_t m = (uint16_t) (1 << la) - 1;
     uint16x8_t vec =
         vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b));
     for (int j = 0; j < lb; j++) {
@@ -7902,7 +8050,8 @@ static int _sse2neon_aggregate_equal_any_16x8(int la, int lb, __m128i mtx[16])
             vandq_u16(vec, vreinterpretq_u16_m128i(mtx[j])));
         mtx[j] = vreinterpretq_m128i_u16(
             vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 15));
-        int tmp = _sse2neon_vaddvq_u16(vreinterpretq_u16_m128i(mtx[j])) ? 1 : 0;
+        uint16_t tmp =
+            _sse2neon_vaddvq_u16(vreinterpretq_u16_m128i(mtx[j])) ? 1 : 0;
         res |= (tmp << j);
     }
     return res;
@@ -7916,10 +8065,10 @@ static int _sse2neon_aggregate_equal_any_16x8(int la, int lb, __m128i mtx[16])
 
 SSE2NEON_GENERATE_CMP_EQUAL_ANY(SSE2NEON_CMP_EQUAL_ANY_)
 
-static int _sse2neon_aggregate_ranges_16x8(int la, int lb, __m128i mtx[16])
+static uint16_t _sse2neon_aggregate_ranges_16x8(int la, int lb, __m128i mtx[16])
 {
-    int res = 0;
-    int m = (1 << la) - 1;
+    uint16_t res = 0;
+    uint16_t m = (uint16_t) (1 << la) - 1;
     uint16x8_t vec =
         vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b));
     for (int j = 0; j < lb; j++) {
@@ -7931,24 +8080,24 @@ static int _sse2neon_aggregate_ranges_16x8(int la, int lb, __m128i mtx[16])
             vshrq_n_u32(vreinterpretq_u32_m128i(mtx[j]), 16));
         uint32x4_t vec_res = vandq_u32(vreinterpretq_u32_m128i(mtx[j]),
                                        vreinterpretq_u32_m128i(tmp));
-#if defined(__aarch64__) || defined(_M_ARM64)
-        int t = vaddvq_u32(vec_res) ? 1 : 0;
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+        uint16_t t = vaddvq_u32(vec_res) ? 1 : 0;
 #else
         uint64x2_t sumh = vpaddlq_u32(vec_res);
-        int t = vgetq_lane_u64(sumh, 0) + vgetq_lane_u64(sumh, 1);
+        uint16_t t = vgetq_lane_u64(sumh, 0) + vgetq_lane_u64(sumh, 1);
 #endif
         res |= (t << j);
     }
     return res;
 }
 
-static int _sse2neon_aggregate_ranges_8x16(int la, int lb, __m128i mtx[16])
+static uint16_t _sse2neon_aggregate_ranges_8x16(int la, int lb, __m128i mtx[16])
 {
-    int res = 0;
-    int m = (1 << la) - 1;
+    uint16_t res = 0;
+    uint16_t m = (uint16_t) ((1 << la) - 1);
     uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
-    uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask);
-    uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask);
+    uint8x8_t t_lo = vtst_u8(vdup_n_u8((uint8_t) (m & 0xff)), vec_mask);
+    uint8x8_t t_hi = vtst_u8(vdup_n_u8((uint8_t) (m >> 8)), vec_mask);
     uint8x16_t vec = vcombine_u8(t_lo, t_hi);
     for (int j = 0; j < lb; j++) {
         mtx[j] = vreinterpretq_m128i_u8(
@@ -7959,7 +8108,7 @@ static int _sse2neon_aggregate_ranges_8x16(int la, int lb, __m128i mtx[16])
             vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 8));
         uint16x8_t vec_res = vandq_u16(vreinterpretq_u16_m128i(mtx[j]),
                                        vreinterpretq_u16_m128i(tmp));
-        int t = _sse2neon_vaddvq_u16(vec_res) ? 1 : 0;
+        uint16_t t = _sse2neon_vaddvq_u16(vec_res) ? 1 : 0;
         res |= (t << j);
     }
     return res;
@@ -7981,22 +8130,25 @@ SSE2NEON_GENERATE_CMP_RANGES(SSE2NEON_CMP_RANGES_)
 #undef SSE2NEON_CMP_RANGES_IS_BYTE
 #undef SSE2NEON_CMP_RANGES_IS_WORD
 
-static int _sse2neon_cmp_byte_equal_each(__m128i a, int la, __m128i b, int lb)
+static uint16_t _sse2neon_cmp_byte_equal_each(__m128i a,
+                                              int la,
+                                              __m128i b,
+                                              int lb)
 {
     uint8x16_t mtx =
         vceqq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b));
-    int m0 = (la < lb) ? 0 : ((1 << la) - (1 << lb));
-    int m1 = 0x10000 - (1 << la);
-    int tb = 0x10000 - (1 << lb);
+    uint16_t m0 = (la < lb) ? 0 : (uint16_t) ((1 << la) - (1 << lb));
+    uint16_t m1 = (uint16_t) (0x10000 - (1 << la));
+    uint16_t tb = (uint16_t) (0x10000 - (1 << lb));
     uint8x8_t vec_mask, vec0_lo, vec0_hi, vec1_lo, vec1_hi;
     uint8x8_t tmp_lo, tmp_hi, res_lo, res_hi;
     vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
-    vec0_lo = vtst_u8(vdup_n_u8(m0), vec_mask);
-    vec0_hi = vtst_u8(vdup_n_u8(m0 >> 8), vec_mask);
-    vec1_lo = vtst_u8(vdup_n_u8(m1), vec_mask);
-    vec1_hi = vtst_u8(vdup_n_u8(m1 >> 8), vec_mask);
-    tmp_lo = vtst_u8(vdup_n_u8(tb), vec_mask);
-    tmp_hi = vtst_u8(vdup_n_u8(tb >> 8), vec_mask);
+    vec0_lo = vtst_u8(vdup_n_u8((uint8_t) m0), vec_mask);
+    vec0_hi = vtst_u8(vdup_n_u8((uint8_t) (m0 >> 8)), vec_mask);
+    vec1_lo = vtst_u8(vdup_n_u8((uint8_t) m1), vec_mask);
+    vec1_hi = vtst_u8(vdup_n_u8((uint8_t) (m1 >> 8)), vec_mask);
+    tmp_lo = vtst_u8(vdup_n_u8((uint8_t) tb), vec_mask);
+    tmp_hi = vtst_u8(vdup_n_u8((uint8_t) (tb >> 8)), vec_mask);
 
     res_lo = vbsl_u8(vec0_lo, vdup_n_u8(0), vget_low_u8(mtx));
     res_hi = vbsl_u8(vec0_hi, vdup_n_u8(0), vget_high_u8(mtx));
@@ -8005,17 +8157,20 @@ static int _sse2neon_cmp_byte_equal_each(__m128i a, int la, __m128i b, int lb)
     res_lo = vand_u8(res_lo, vec_mask);
     res_hi = vand_u8(res_hi, vec_mask);
 
-    int res = _sse2neon_vaddv_u8(res_lo) + (_sse2neon_vaddv_u8(res_hi) << 8);
-    return res;
+    return _sse2neon_vaddv_u8(res_lo) +
+           (uint16_t) (_sse2neon_vaddv_u8(res_hi) << 8);
 }
 
-static int _sse2neon_cmp_word_equal_each(__m128i a, int la, __m128i b, int lb)
+static uint16_t _sse2neon_cmp_word_equal_each(__m128i a,
+                                              int la,
+                                              __m128i b,
+                                              int lb)
 {
     uint16x8_t mtx =
         vceqq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b));
-    int m0 = (la < lb) ? 0 : ((1 << la) - (1 << lb));
-    int m1 = 0x100 - (1 << la);
-    int tb = 0x100 - (1 << lb);
+    uint16_t m0 = (uint16_t) ((la < lb) ? 0 : ((1 << la) - (1 << lb)));
+    uint16_t m1 = (uint16_t) (0x100 - (1 << la));
+    uint16_t tb = (uint16_t) (0x100 - (1 << lb));
     uint16x8_t vec_mask = vld1q_u16(_sse2neon_cmpestr_mask16b);
     uint16x8_t vec0 = vtstq_u16(vdupq_n_u16(m0), vec_mask);
     uint16x8_t vec1 = vtstq_u16(vdupq_n_u16(m1), vec_mask);
@@ -8030,18 +8185,22 @@ static int _sse2neon_cmp_word_equal_each(__m128i a, int la, __m128i b, int lb)
 #define SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UWORD 0
 
 #define SSE2NEON_AGGREGATE_EQUAL_ORDER_IMPL(size, number_of_lanes, data_type)  \
-    static int _sse2neon_aggregate_equal_ordered_##size##x##number_of_lanes(   \
-        int bound, int la, int lb, __m128i mtx[16])                            \
+    static uint16_t                                                            \
+        _sse2neon_aggregate_equal_ordered_##size##x##number_of_lanes(          \
+            int bound, int la, int lb, __m128i mtx[16])                        \
     {                                                                          \
-        int res = 0;                                                           \
-        int m1 = SSE2NEON_IIF(data_type)(0x10000, 0x100) - (1 << la);          \
+        uint16_t res = 0;                                                      \
+        uint16_t m1 =                                                          \
+            (uint16_t) (SSE2NEON_IIF(data_type)(0x10000, 0x100) - (1 << la));  \
         uint##size##x8_t vec_mask = SSE2NEON_IIF(data_type)(                   \
             vld1_u##size(_sse2neon_cmpestr_mask##size##b),                     \
             vld1q_u##size(_sse2neon_cmpestr_mask##size##b));                   \
         uint##size##x##number_of_lanes##_t vec1 = SSE2NEON_IIF(data_type)(     \
-            vcombine_u##size(vtst_u##size(vdup_n_u##size(m1), vec_mask),       \
-                             vtst_u##size(vdup_n_u##size(m1 >> 8), vec_mask)), \
-            vtstq_u##size(vdupq_n_u##size(m1), vec_mask));                     \
+            vcombine_u##size(                                                  \
+                vtst_u##size(vdup_n_u##size((uint##size##_t) m1), vec_mask),   \
+                vtst_u##size(vdup_n_u##size((uint##size##_t)(m1 >> 8)),        \
+                             vec_mask)),                                       \
+            vtstq_u##size(vdupq_n_u##size((uint##size##_t) m1), vec_mask));    \
         uint##size##x##number_of_lanes##_t vec_minusone = vdupq_n_u##size(-1); \
         uint##size##x##number_of_lanes##_t vec_zero = vdupq_n_u##size(0);      \
         for (int j = 0; j < lb; j++) {                                         \
@@ -8058,7 +8217,7 @@ static int _sse2neon_cmp_word_equal_each(__m128i a, int la, __m128i b, int lb)
             int val = 1;                                                       \
             for (int j = 0, k = i; j < bound - i && k < bound; j++, k++)       \
                 val &= ptr[k * bound + j];                                     \
-            res += val << i;                                                   \
+            res += (uint16_t) (val << i);                                      \
         }                                                                      \
         return res;                                                            \
     }
@@ -8105,14 +8264,17 @@ enum {
     SSE2NEON_CMPESTR_LIST
 #undef _
 };
-typedef int (*cmpestr_func_t)(__m128i a, int la, __m128i b, int lb);
+typedef uint16_t (*cmpestr_func_t)(__m128i a, int la, __m128i b, int lb);
 static cmpestr_func_t _sse2neon_cmpfunc_table[] = {
 #define _(name, func_suffix) _sse2neon_##func_suffix,
     SSE2NEON_CMPESTR_LIST
 #undef _
 };
 
-FORCE_INLINE int _sse2neon_sido_negative(int res, int lb, int imm8, int bound)
+FORCE_INLINE uint16_t _sse2neon_sido_negative(int res,
+                                              int lb,
+                                              int imm8,
+                                              int bound)
 {
     switch (imm8 & 0x30) {
     case _SIDD_NEGATIVE_POLARITY:
@@ -8125,7 +8287,7 @@ FORCE_INLINE int _sse2neon_sido_negative(int res, int lb, int imm8, int bound)
         break;
     }
 
-    return res & ((bound == 8) ? 0xFF : 0xFFFF);
+    return (uint16_t) (res & ((bound == 8) ? 0xFF : 0xFFFF));
 }
 
 FORCE_INLINE int _sse2neon_clz(unsigned int x)
@@ -8174,7 +8336,7 @@ FORCE_INLINE int _sse2neon_ctzll(unsigned long long x)
 #define SSE2NEON_MIN(x, y) (x) < (y) ? (x) : (y)
 
 #define SSE2NEON_CMPSTR_SET_UPPER(var, imm) \
-    const int var = (imm & 0x01) ? 8 : 16
+    const int var = ((imm) & 0x01) ? 8 : 16
 
 #define SSE2NEON_CMPESTRX_LEN_PAIR(a, b, la, lb) \
     int tmp1 = la ^ (la >> 31);                  \
@@ -8189,28 +8351,28 @@ FORCE_INLINE int _sse2neon_ctzll(unsigned long long x)
 // As the only difference of PCMPESTR* and PCMPISTR* is the way to calculate the
 // length of string, we use SSE2NEON_CMP{I,E}STRX_GET_LEN to get the length of
 // string a and b.
-#define SSE2NEON_COMP_AGG(a, b, la, lb, imm8, IE)                  \
-    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);                        \
-    SSE2NEON_##IE##_LEN_PAIR(a, b, la, lb);                        \
-    int r2 = (_sse2neon_cmpfunc_table[imm8 & 0x0f])(a, la, b, lb); \
+#define SSE2NEON_COMP_AGG(a, b, la, lb, imm8, IE)                         \
+    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);                               \
+    SSE2NEON_##IE##_LEN_PAIR(a, b, la, lb);                               \
+    uint16_t r2 = (_sse2neon_cmpfunc_table[(imm8) & 0x0f])(a, la, b, lb); \
     r2 = _sse2neon_sido_negative(r2, lb, imm8, bound)
 
-#define SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8)          \
-    return (r2 == 0) ? bound                                     \
-                     : ((imm8 & 0x40) ? (31 - _sse2neon_clz(r2)) \
-                                      : _sse2neon_ctz(r2))
+#define SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8)            \
+    return (r2 == 0) ? bound                                       \
+                     : (((imm8) & 0x40) ? (31 - _sse2neon_clz(r2)) \
+                                        : _sse2neon_ctz(r2))
 
 #define SSE2NEON_CMPSTR_GENERATE_MASK(dst)                                     \
     __m128i dst = vreinterpretq_m128i_u8(vdupq_n_u8(0));                       \
-    if (imm8 & 0x40) {                                                         \
+    if ((imm8) & 0x40) {                                                       \
         if (bound == 8) {                                                      \
             uint16x8_t tmp = vtstq_u16(vdupq_n_u16(r2),                        \
                                        vld1q_u16(_sse2neon_cmpestr_mask16b));  \
             dst = vreinterpretq_m128i_u16(vbslq_u16(                           \
                 tmp, vdupq_n_u16(-1), vreinterpretq_u16_m128i(dst)));          \
         } else {                                                               \
-            uint8x16_t vec_r2 =                                                \
-                vcombine_u8(vdup_n_u8(r2), vdup_n_u8(r2 >> 8));                \
+            uint8x16_t vec_r2 = vcombine_u8(vdup_n_u8((uint8_t) r2),           \
+                                            vdup_n_u8((uint8_t) (r2 >> 8)));   \
             uint8x16_t tmp =                                                   \
                 vtstq_u8(vec_r2, vld1q_u8(_sse2neon_cmpestr_mask8b));          \
             dst = vreinterpretq_m128i_u8(                                      \
@@ -8221,8 +8383,8 @@ FORCE_INLINE int _sse2neon_ctzll(unsigned long long x)
             dst = vreinterpretq_m128i_u16(                                     \
                 vsetq_lane_u16(r2 & 0xffff, vreinterpretq_u16_m128i(dst), 0)); \
         } else {                                                               \
-            dst = vreinterpretq_m128i_u8(                                      \
-                vsetq_lane_u8(r2 & 0xff, vreinterpretq_u8_m128i(dst), 0));     \
+            dst = vreinterpretq_m128i_u8(vsetq_lane_u8(                        \
+                (uint8_t) (r2 & 0xff), vreinterpretq_u8_m128i(dst), 0));       \
         }                                                                      \
     }                                                                          \
     return dst
@@ -8325,7 +8487,7 @@ FORCE_INLINE int _mm_cmpestrz(__m128i a,
 
 #define SSE2NEON_CMPISTRX_LENGTH(str, len, imm8)                         \
     do {                                                                 \
-        if (imm8 & 0x01) {                                               \
+        if ((imm8) & 0x01) {                                             \
             uint16x8_t equal_mask_##str =                                \
                 vceqq_u16(vreinterpretq_u16_m128i(str), vdupq_n_u16(0)); \
             uint8x8_t res_##str = vshrn_n_u16(equal_mask_##str, 4);      \
@@ -8423,7 +8585,7 @@ FORCE_INLINE int _mm_cmpistrz(__m128i a, __m128i b, const int imm8)
 // in b for greater than.
 FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     return vreinterpretq_m128i_u64(
         vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
 #else
@@ -8443,11 +8605,11 @@ FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v)
                          : [c] "+r"(crc)
                          : [v] "r"(v));
 #elif ((__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)) || \
-    (defined(_M_ARM64) && !defined(__clang__))
+    ((defined(_M_ARM64) || defined(_M_ARM64EC)) && !defined(__clang__))
     crc = __crc32ch(crc, v);
 #else
-    crc = _mm_crc32_u8(crc, v & 0xff);
-    crc = _mm_crc32_u8(crc, (v >> 8) & 0xff);
+    crc = _mm_crc32_u8(crc, (uint8_t) (v & 0xff));
+    crc = _mm_crc32_u8(crc, (uint8_t) ((v >> 8) & 0xff));
 #endif
     return crc;
 }
@@ -8462,11 +8624,11 @@ FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v)
                          : [c] "+r"(crc)
                          : [v] "r"(v));
 #elif ((__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)) || \
-    (defined(_M_ARM64) && !defined(__clang__))
+    ((defined(_M_ARM64) || defined(_M_ARM64EC)) && !defined(__clang__))
     crc = __crc32cw(crc, v);
 #else
-    crc = _mm_crc32_u16(crc, v & 0xffff);
-    crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff);
+    crc = _mm_crc32_u16(crc, (uint16_t) (v & 0xffff));
+    crc = _mm_crc32_u16(crc, (uint16_t) ((v >> 16) & 0xffff));
 #endif
     return crc;
 }
@@ -8480,11 +8642,11 @@ FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v)
     __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t"
                          : [c] "+r"(crc)
                          : [v] "r"(v));
-#elif (defined(_M_ARM64) && !defined(__clang__))
+#elif ((defined(_M_ARM64) || defined(_M_ARM64EC)) && !defined(__clang__))
     crc = __crc32cd((uint32_t) crc, v);
 #else
-    crc = _mm_crc32_u32((uint32_t) (crc), v & 0xffffffff);
-    crc = _mm_crc32_u32((uint32_t) (crc), (v >> 32) & 0xffffffff);
+    crc = _mm_crc32_u32((uint32_t) (crc), (uint32_t) (v & 0xffffffff));
+    crc = _mm_crc32_u32((uint32_t) (crc), (uint32_t) ((v >> 32) & 0xffffffff));
 #endif
     return crc;
 }
@@ -8499,7 +8661,7 @@ FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
                          : [c] "+r"(crc)
                          : [v] "r"(v));
 #elif ((__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)) || \
-    (defined(_M_ARM64) && !defined(__clang__))
+    ((defined(_M_ARM64) || defined(_M_ARM64EC)) && !defined(__clang__))
     crc = __crc32cb(crc, v);
 #else
     crc ^= v;
@@ -8530,7 +8692,7 @@ FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
     crc = vgetq_lane_u32(vreinterpretq_u32_u64(tmp), 1);
 #else  // Fall back to the generic table lookup approach
     // Adapted from: https://create.stephan-brumme.com/crc32/
-    // Apply half-byte comparision algorithm for the best ratio between
+    // Apply half-byte comparison algorithm for the best ratio between
     // performance and lookup table.
 
     // The lookup table just needs to store every 16th entry
@@ -8550,7 +8712,8 @@ FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
 
 /* AES */
 
-#if !defined(__ARM_FEATURE_CRYPTO) && (!defined(_M_ARM64) || defined(__clang__))
+#if !defined(__ARM_FEATURE_CRYPTO) && \
+    ((!defined(_M_ARM64) && !defined(_M_ARM64EC)) || defined(__clang__))
 /* clang-format off */
 #define SSE2NEON_AES_SBOX(w)                                           \
     {                                                                  \
@@ -8641,7 +8804,7 @@ static const uint8_t _sse2neon_rsbox[256] = SSE2NEON_AES_RSBOX(SSE2NEON_AES_H0);
 #undef SSE2NEON_AES_H0
 
 /* x_time function and matrix multiply function */
-#if !defined(__aarch64__) && !defined(_M_ARM64)
+#if !defined(__aarch64__)
 #define SSE2NEON_XT(x) (((x) << 1) ^ ((((x) >> 7) & 1) * 0x1b))
 #define SSE2NEON_MULTIPLY(x, y)                                  \
     (((y & 1) * x) ^ ((y >> 1 & 1) * SSE2NEON_XT(x)) ^           \
@@ -8657,7 +8820,7 @@ static const uint8_t _sse2neon_rsbox[256] = SSE2NEON_AES_RSBOX(SSE2NEON_AES_H0);
 // for more information.
 FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i RoundKey)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__)
     static const uint8_t shift_rows[] = {
         0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
         0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb,
@@ -8697,9 +8860,9 @@ FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i RoundKey)
 #define SSE2NEON_AES_B2W(b0, b1, b2, b3)                 \
     (((uint32_t) (b3) << 24) | ((uint32_t) (b2) << 16) | \
      ((uint32_t) (b1) << 8) | (uint32_t) (b0))
-// muliplying 'x' by 2 in GF(2^8)
+// multiplying 'x' by 2 in GF(2^8)
 #define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */))
-// muliplying 'x' by 3 in GF(2^8)
+// multiplying 'x' by 3 in GF(2^8)
 #define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x)
 #define SSE2NEON_AES_U0(p) \
     SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p))
@@ -8784,7 +8947,7 @@ FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey)
     v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w);
 
     w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) &
-                                 0x1b);  // muliplying 'v' by 2 in GF(2^8)
+                                 0x1b);  // multiplying 'v' by 2 in GF(2^8)
     w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
     w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
 
@@ -8816,7 +8979,8 @@ FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey)
                   SSE2NEON_MULTIPLY(g, 0x09) ^ SSE2NEON_MULTIPLY(h, 0x0e);
     }
 
-    return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)) ^ RoundKey;
+    return _mm_xor_si128(vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)),
+                         RoundKey);
 #endif
 }
 
@@ -8866,7 +9030,7 @@ FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
         _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 11)],
     };
 
-    return vreinterpretq_m128i_u8(vld1q_u8(v)) ^ RoundKey;
+    return _mm_xor_si128(vreinterpretq_m128i_u8(vld1q_u8(v)), RoundKey);
 #endif
 }
 
@@ -8904,7 +9068,8 @@ FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey)
         v[((i / 4) + (i % 4)) % 4][i % 4] = _sse2neon_rsbox[_a[i]];
     }
 
-    return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)) ^ RoundKey;
+    return _mm_xor_si128(vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)),
+                         RoundKey);
 #endif
 }
 
@@ -9129,14 +9294,14 @@ FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode(void)
 {
     union {
         fpcr_bitfield field;
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
         uint64_t value;
 #else
         uint32_t value;
 #endif
     } r;
 
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     r.value = _sse2neon_get_fpcr();
 #else
     __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
@@ -9150,7 +9315,7 @@ FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode(void)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u32
 FORCE_INLINE int _mm_popcnt_u32(unsigned int a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
 #if __has_builtin(__builtin_popcount)
     return __builtin_popcount(a);
 #elif defined(_MSC_VER)
@@ -9179,7 +9344,7 @@ FORCE_INLINE int _mm_popcnt_u32(unsigned int a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u64
 FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
 #if __has_builtin(__builtin_popcountll)
     return __builtin_popcountll(a);
 #elif defined(_MSC_VER)
@@ -9210,14 +9375,14 @@ FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag)
     // regardless of the value of the FZ bit.
     union {
         fpcr_bitfield field;
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
         uint64_t value;
 #else
         uint32_t value;
 #endif
     } r;
 
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     r.value = _sse2neon_get_fpcr();
 #else
     __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
@@ -9225,10 +9390,10 @@ FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag)
 
     r.field.bit24 = (flag & _MM_DENORMALS_ZERO_MASK) == _MM_DENORMALS_ZERO_ON;
 
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     _sse2neon_set_fpcr(r.value);
 #else
-    __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r));        /* write */
+    __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
 #endif
 }
 
@@ -9236,7 +9401,7 @@ FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=rdtsc
 FORCE_INLINE uint64_t _rdtsc(void)
 {
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     uint64_t val;
 
     /* According to ARM DDI 0487F.c, from Armv8.0 to Armv8.5 inclusive, the

From 609e73af1d7e16f556e4f55ec6d1041221f35dc3 Mon Sep 17 00:00:00 2001
From: lizzie <lizzie@eden-emu.dev>
Date: Wed, 23 Jul 2025 22:33:46 +0100
Subject: [PATCH 38/38] [sse2neon] update to stable

---
 externals/sse2neon/sse2neon.h | 749 ++++++++++++++++------------------
 1 file changed, 358 insertions(+), 391 deletions(-)

diff --git a/externals/sse2neon/sse2neon.h b/externals/sse2neon/sse2neon.h
index 4626e923fd..79b90fe864 100755
--- a/externals/sse2neon/sse2neon.h
+++ b/externals/sse2neon/sse2neon.h
@@ -1,6 +1,3 @@
-// SPDX-FileCopyrightText: Copyright 2015-2024 SSE2NEON Contributors
-// SPDX-License-Identifier: MIT
-
 #ifndef SSE2NEON_H
 #define SSE2NEON_H
 
@@ -131,17 +128,17 @@
 #include <stdlib.h>
 #include <string.h>
 
-FORCE_INLINE double sse2neon_recast_u64_f64(uint64_t val)
+FORCE_INLINE double sse2neon_recast_u64_f64(uint64_t u64)
 {
-    double tmp;
-    memcpy(&tmp, &val, sizeof(uint64_t));
-    return tmp;
+    double f64;
+    memcpy(&f64, &u64, sizeof(uint64_t));
+    return f64;
 }
-FORCE_INLINE int64_t sse2neon_recast_f64_s64(double val)
+FORCE_INLINE int64_t sse2neon_recast_f64_s64(double f64)
 {
-    int64_t tmp;
-    memcpy(&tmp, &val, sizeof(uint64_t));
-    return tmp;
+    int64_t i64;
+    memcpy(&i64, &f64, sizeof(uint64_t));
+    return i64;
 }
 
 #if defined(_WIN32) && !defined(__MINGW32__)
@@ -151,9 +148,6 @@ FORCE_INLINE int64_t sse2neon_recast_f64_s64(double val)
 
 /* If using MSVC */
 #ifdef _MSC_VER
-#if defined(_M_ARM64EC)
-#define _DISABLE_SOFTINTRIN_ 1
-#endif
 #include <intrin.h>
 #if SSE2NEON_INCLUDE_WINDOWS_H
 #include <processthreadsapi.h>
@@ -169,7 +163,7 @@ FORCE_INLINE int64_t sse2neon_recast_f64_s64(double val)
 #endif
 
 #if (defined(_M_AMD64) || defined(__x86_64__)) || \
-    (defined(_M_ARM64) || defined(_M_ARM64EC) || defined(__arm64__))
+    (defined(_M_ARM64) || defined(__arm64__))
 #define SSE2NEON_HAS_BITSCAN64
 #endif
 #endif
@@ -252,7 +246,7 @@ FORCE_INLINE void _sse2neon_smp_mb(void)
 #pragma GCC push_options
 #pragma GCC target("fpu=neon")
 #endif
-#elif defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#elif defined(__aarch64__) || defined(_M_ARM64)
 #if !defined(__clang__) && !defined(_MSC_VER)
 #pragma GCC push_options
 #pragma GCC target("+simd")
@@ -273,8 +267,7 @@ FORCE_INLINE void _sse2neon_smp_mb(void)
 #endif
 
 #include <arm_neon.h>
-#if (!defined(__aarch64__) && !defined(_M_ARM64) && !defined(_M_ARM64EC)) && \
-    (__ARM_ARCH == 8)
+#if (!defined(__aarch64__) && !defined(_M_ARM64)) && (__ARM_ARCH == 8)
 #if defined __has_include && __has_include(<arm_acle.h>)
 #include <arm_acle.h>
 #endif
@@ -292,7 +285,7 @@ FORCE_INLINE void _sse2neon_smp_mb(void)
 #endif
 
 /* Rounding functions require either Aarch64 instructions or libm fallback */
-#if !defined(__aarch64__) && !defined(_M_ARM64) && !defined(_M_ARM64EC)
+#if !defined(__aarch64__) && !defined(_M_ARM64)
 #include <math.h>
 #endif
 
@@ -301,7 +294,7 @@ FORCE_INLINE void _sse2neon_smp_mb(void)
  * To write or access to these registers in user mode,
  * we have to perform syscall instead.
  */
-#if !defined(__aarch64__) && !defined(_M_ARM64) && !defined(_M_ARM64EC)
+#if (!defined(__aarch64__) && !defined(_M_ARM64))
 #include <sys/time.h>
 #endif
 
@@ -410,7 +403,7 @@ typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */
 // On ARM 32-bit architecture, the float64x2_t is not supported.
 // The data type __m128d should be represented in a different way for related
 // intrinsic conversion.
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
 typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */
 #else
 typedef float32x4_t __m128d;
@@ -511,7 +504,7 @@ typedef int64_t ALIGN_STRUCT(1) unaligned_int64_t;
 
 #define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x)
 
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
 #define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x)
 #define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x)
 
@@ -643,7 +636,7 @@ FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
 }
 #endif
 
-#if !defined(__aarch64__) && !defined(_M_ARM64) && !defined(_M_ARM64EC)
+#if !defined(__aarch64__) && !defined(_M_ARM64)
 /* emulate vaddv u8 variant */
 FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8)
 {
@@ -658,7 +651,7 @@ FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8)
 }
 #endif
 
-#if !defined(__aarch64__) && !defined(_M_ARM64) && !defined(_M_ARM64EC)
+#if !defined(__aarch64__) && !defined(_M_ARM64)
 /* emulate vaddvq u8 variant */
 FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a)
 {
@@ -676,7 +669,7 @@ FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a)
 }
 #endif
 
-#if !defined(__aarch64__) && !defined(_M_ARM64) && !defined(_M_ARM64EC)
+#if !defined(__aarch64__) && !defined(_M_ARM64)
 /* emulate vaddvq u16 variant */
 FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a)
 {
@@ -731,13 +724,6 @@ FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a)
  */
 
 /* Constants for use with _mm_prefetch. */
-#if defined(_M_ARM64EC)
-/* winnt.h already defines these constants as macros, so undefine them first. */
-#undef _MM_HINT_NTA
-#undef _MM_HINT_T0
-#undef _MM_HINT_T1
-#undef _MM_HINT_T2
-#endif
 enum _mm_hint {
     _MM_HINT_NTA = 0, /* load data to L1 and L2 cache, mark it as NTA */
     _MM_HINT_T0 = 1,  /* load data to L1 and L2 cache */
@@ -753,7 +739,7 @@ typedef struct {
     uint8_t bit23 : 1;
     uint8_t bit24 : 1;
     uint8_t res2 : 7;
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     uint32_t res3;
 #endif
 } fpcr_bitfield;
@@ -897,8 +883,8 @@ FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b)
 // supported by WoA has crypto extensions. If this changes in the future,
 // this can be verified via the runtime-only method of:
 // IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE)
-#if ((defined(_M_ARM64) || defined(_M_ARM64EC)) && !defined(__clang__)) || \
-    (defined(__ARM_FEATURE_CRYPTO) &&                                      \
+#if (defined(_M_ARM64) && !defined(__clang__)) || \
+    (defined(__ARM_FEATURE_CRYPTO) &&             \
      (defined(__aarch64__) || __has_builtin(__builtin_arm_crypto_vmullp64)))
 // Wraps vmull_p64
 FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
@@ -1023,8 +1009,8 @@ static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
 //   __m128i _mm_shuffle_epi32_default(__m128i a,
 //                                     __constrange(0, 255) int imm) {
 //       __m128i ret;
-//       ret[0] = a[(imm)        & 0x3];   ret[1] = a[((imm) >> 2) & 0x3];
-//       ret[2] = a[((imm) >> 4) & 0x03];  ret[3] = a[((imm) >> 6) & 0x03];
+//       ret[0] = a[imm        & 0x3];   ret[1] = a[(imm >> 2) & 0x3];
+//       ret[2] = a[(imm >> 4) & 0x03];  ret[3] = a[(imm >> 6) & 0x03];
 //       return ret;
 //   }
 #define _mm_shuffle_epi32_default(a, imm)                                   \
@@ -1122,7 +1108,7 @@ FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
     return vreinterpretq_m128i_s32(vcombine_s32(a32, a33));
 }
 
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
 #define _mm_shuffle_epi32_splat(a, imm) \
     vreinterpretq_m128i_s32(vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm)))
 #else
@@ -1139,8 +1125,8 @@ FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
 //   __m128 _mm_shuffle_ps_default(__m128 a, __m128 b,
 //                                 __constrange(0, 255) int imm) {
 //       __m128 ret;
-//       ret[0] = a[(imm)        & 0x3];   ret[1] = a[((imm) >> 2) & 0x3];
-//       ret[2] = b[((imm) >> 4) & 0x03];  ret[3] = b[((imm) >> 6) & 0x03];
+//       ret[0] = a[imm        & 0x3];   ret[1] = a[(imm >> 2) & 0x3];
+//       ret[2] = b[(imm >> 4) & 0x03];  ret[3] = b[(imm >> 6) & 0x03];
 //       return ret;
 //   }
 //
@@ -1562,7 +1548,7 @@ FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ps2pi
 FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a)
 {
-#if (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) || \
+#if (defined(__aarch64__) || defined(_M_ARM64)) || \
     defined(__ARM_FEATURE_DIRECTED_ROUNDING)
     return vreinterpret_m64_s32(
         vget_low_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a)))));
@@ -1587,7 +1573,7 @@ FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ss2si
 FORCE_INLINE int _mm_cvt_ss2si(__m128 a)
 {
-#if (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) || \
+#if (defined(__aarch64__) || defined(_M_ARM64)) || \
     defined(__ARM_FEATURE_DIRECTED_ROUNDING)
     return vgetq_lane_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a))),
                           0);
@@ -1718,7 +1704,7 @@ FORCE_INLINE float _mm_cvtss_f32(__m128 a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si64
 FORCE_INLINE int64_t _mm_cvtss_si64(__m128 a)
 {
-#if (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) || \
+#if (defined(__aarch64__) || defined(_M_ARM64)) || \
     defined(__ARM_FEATURE_DIRECTED_ROUNDING)
     return (int64_t) vgetq_lane_f32(vrndiq_f32(vreinterpretq_f32_m128(a)), 0);
 #else
@@ -1771,7 +1757,7 @@ FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ps
 FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128_f32(
         vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
 #else
@@ -1845,14 +1831,14 @@ FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode(void)
 {
     union {
         fpcr_bitfield field;
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
         uint64_t value;
 #else
         uint32_t value;
 #endif
     } r;
 
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     r.value = _sse2neon_get_fpcr();
 #else
     __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
@@ -1991,18 +1977,20 @@ FORCE_INLINE __m128i _mm_loadu_si64(const void *p)
 #if !defined(SSE2NEON_ALLOC_DEFINED)
 FORCE_INLINE void *_mm_malloc(size_t size, size_t align)
 {
-#if defined(_WIN32)
-    return _aligned_malloc(size, align);
-#else
     void *ptr;
     if (align == 1)
         return malloc(size);
     if (align == 2 || (sizeof(void *) == 8 && align == 4))
         align = sizeof(void *);
+#if defined(_WIN32) /* NOTE(review): align == 1 returned malloc() above */
+    ptr = _aligned_malloc(size, align); /* TODO confirm _mm_free pairs this */
+    if (ptr)
+        return ptr;
+#else
     if (!posix_memalign(&ptr, align, size))
         return ptr;
-    return NULL;
 #endif
+    return NULL; /* reached when either aligned allocator fails */
 }
 #endif
 
@@ -2166,7 +2154,7 @@ FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)
 FORCE_INLINE int _mm_movemask_pi8(__m64 a)
 {
     uint8x8_t input = vreinterpret_u8_m64(a);
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     static const int8_t shift[8] = {0, 1, 2, 3, 4, 5, 6, 7};
     uint8x8_t tmp = vshr_n_u8(input, 7);
     return vaddv_u8(vshl_u8(tmp, vld1_s8(shift)));
@@ -2187,7 +2175,7 @@ FORCE_INLINE int _mm_movemask_pi8(__m64 a)
 FORCE_INLINE int _mm_movemask_ps(__m128 a)
 {
     uint32x4_t input = vreinterpretq_u32_m128(a);
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     static const int32_t shift[4] = {0, 1, 2, 3};
     uint32x4_t tmp = vshrq_n_u32(input, 31);
     return vaddvq_u32(vshlq_u32(tmp, vld1q_s32(shift)));
@@ -2421,7 +2409,7 @@ FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
     uint64x1_t t = vpaddl_u32(vpaddl_u16(
         vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)))));
     return vreinterpret_m64_u16(
-        vset_lane_u16((uint16_t) vget_lane_u64(t, 0), vdup_n_u16(0), 0));
+        vset_lane_u16((int) vget_lane_u64(t, 0), vdup_n_u16(0), 0));
 }
 
 // Macro: Set the flush zero bits of the MXCSR control and status register to
@@ -2434,14 +2422,14 @@ FORCE_INLINE void _sse2neon_mm_set_flush_zero_mode(unsigned int flag)
     // regardless of the value of the FZ bit.
     union {
         fpcr_bitfield field;
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
         uint64_t value;
 #else
         uint32_t value;
 #endif
     } r;
 
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     r.value = _sse2neon_get_fpcr();
 #else
     __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
@@ -2449,7 +2437,7 @@ FORCE_INLINE void _sse2neon_mm_set_flush_zero_mode(unsigned int flag)
 
     r.field.bit24 = (flag & _MM_FLUSH_ZERO_MASK) == _MM_FLUSH_ZERO_ON;
 
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     _sse2neon_set_fpcr(r.value);
 #else
     __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
@@ -2555,10 +2543,10 @@ FORCE_INLINE __m128 _mm_setzero_ps(void)
 // in dst.
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi16
 #ifdef _sse2neon_shuffle
-#define _mm_shuffle_pi16(a, imm)                                         \
-    vreinterpret_m64_s16(vshuffle_s16(                                   \
-        vreinterpret_s16_m64(a), vreinterpret_s16_m64(a), ((imm) & 0x3), \
-        (((imm) >> 2) & 0x3), (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3)))
+#define _mm_shuffle_pi16(a, imm) /* (imm): arg may be an expression */ \
+    vreinterpret_m64_s16(vshuffle_s16(                                   \
+        vreinterpret_s16_m64(a), vreinterpret_s16_m64(a), ((imm) & 0x3), \
+        (((imm) >> 2) & 0x3), (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3)))
 #else
 #define _mm_shuffle_pi16(a, imm)                                              \
     _sse2neon_define1(                                                        \
@@ -2689,8 +2677,7 @@ FORCE_INLINE void _mm_lfence(void)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ps
 FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
 {
-#if (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) && \
-    !SSE2NEON_PRECISE_SQRT
+#if (defined(__aarch64__) || defined(_M_ARM64)) && !SSE2NEON_PRECISE_SQRT
     return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in)));
 #else
     float32x4_t recip = vrsqrteq_f32(vreinterpretq_f32_m128(in));
@@ -2919,7 +2906,7 @@ FORCE_INLINE __m128 _mm_undefined_ps(void)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_ps
 FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128_f32(
         vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
 #else
@@ -2935,7 +2922,7 @@ FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_ps
 FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128_f32(
         vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
 #else
@@ -2994,7 +2981,7 @@ FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_pd
 FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(
         vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
@@ -3019,7 +3006,7 @@ FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_sd
 FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return _mm_move_sd(a, _mm_add_pd(a, b));
 #else
     double a0, a1, b0;
@@ -3180,7 +3167,7 @@ FORCE_INLINE __m128i _mm_castps_si128(__m128 a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_pd
 FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(vreinterpretq_f64_m128i(a));
 #else
     return vreinterpretq_m128d_f32(vreinterpretq_f32_m128i(a));
@@ -3252,7 +3239,7 @@ FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_pd
 FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_u64(
         vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
@@ -3278,7 +3265,7 @@ FORCE_INLINE __m128d _mm_cmpeq_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_pd
 FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_u64(
         vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
@@ -3304,7 +3291,7 @@ FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_sd
 FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return _mm_move_sd(a, _mm_cmpge_pd(a, b));
 #else
     // expand "_mm_cmpge_pd()" to reduce unnecessary operations
@@ -3352,7 +3339,7 @@ FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_pd
 FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_u64(
         vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
@@ -3378,7 +3365,7 @@ FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_sd
 FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return _mm_move_sd(a, _mm_cmpgt_pd(a, b));
 #else
     // expand "_mm_cmpge_pd()" to reduce unnecessary operations
@@ -3399,7 +3386,7 @@ FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_pd
 FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_u64(
         vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
@@ -3425,7 +3412,7 @@ FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_sd
 FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return _mm_move_sd(a, _mm_cmple_pd(a, b));
 #else
     // expand "_mm_cmpge_pd()" to reduce unnecessary operations
@@ -3476,7 +3463,7 @@ FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_pd
 FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_u64(
         vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
@@ -3502,7 +3489,7 @@ FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_sd
 FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return _mm_move_sd(a, _mm_cmplt_pd(a, b));
 #else
     double a0, b0;
@@ -3522,7 +3509,7 @@ FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_pd
 FORCE_INLINE __m128d _mm_cmpneq_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_s32(vmvnq_s32(vreinterpretq_s32_u64(
         vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)))));
 #else
@@ -3548,7 +3535,7 @@ FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_pd
 FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_u64(veorq_u64(
         vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
         vdupq_n_u64(UINT64_MAX)));
@@ -3583,7 +3570,7 @@ FORCE_INLINE __m128d _mm_cmpnge_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_cmpngt_pd
 FORCE_INLINE __m128d _mm_cmpngt_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_u64(veorq_u64(
         vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
         vdupq_n_u64(UINT64_MAX)));
@@ -3618,7 +3605,7 @@ FORCE_INLINE __m128d _mm_cmpngt_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_pd
 FORCE_INLINE __m128d _mm_cmpnle_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_u64(veorq_u64(
         vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
         vdupq_n_u64(UINT64_MAX)));
@@ -3653,7 +3640,7 @@ FORCE_INLINE __m128d _mm_cmpnle_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_pd
 FORCE_INLINE __m128d _mm_cmpnlt_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_u64(veorq_u64(
         vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
         vdupq_n_u64(UINT64_MAX)));
@@ -3688,7 +3675,7 @@ FORCE_INLINE __m128d _mm_cmpnlt_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_pd
 FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     // Excluding NaNs, any two floating point numbers can be compared.
     uint64x2_t not_nan_a =
         vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
@@ -3718,7 +3705,7 @@ FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_sd
 FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return _mm_move_sd(a, _mm_cmpord_pd(a, b));
 #else
     double a0, b0;
@@ -3738,7 +3725,7 @@ FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_pd
 FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     // Two NaNs are not equal in comparison operation.
     uint64x2_t not_nan_a =
         vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
@@ -3769,7 +3756,7 @@ FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_sd
 FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return _mm_move_sd(a, _mm_cmpunord_pd(a, b));
 #else
     double a0, b0;
@@ -3789,7 +3776,7 @@ FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_sd
 FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vgetq_lane_u64(vcgeq_f64(a, b), 0) & 0x1;
 #else
     double a0, b0;
@@ -3804,7 +3791,7 @@ FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_sd
 FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vgetq_lane_u64(vcgtq_f64(a, b), 0) & 0x1;
 #else
     double a0, b0;
@@ -3820,7 +3807,7 @@ FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_sd
 FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vgetq_lane_u64(vcleq_f64(a, b), 0) & 0x1;
 #else
     double a0, b0;
@@ -3836,7 +3823,7 @@ FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_sd
 FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vgetq_lane_u64(vcltq_f64(a, b), 0) & 0x1;
 #else
     double a0, b0;
@@ -3852,7 +3839,7 @@ FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_sd
 FORCE_INLINE int _mm_comieq_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vgetq_lane_u64(vceqq_f64(a, b), 0) & 0x1;
 #else
     uint32x4_t a_not_nan =
@@ -3881,7 +3868,7 @@ FORCE_INLINE int _mm_comineq_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_pd
 FORCE_INLINE __m128d _mm_cvtepi32_pd(__m128i a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(
         vcvtq_f64_s64(vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a)))));
 #else
@@ -3942,7 +3929,7 @@ FORCE_INLINE __m64 _mm_cvtpd_pi32(__m128d a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_ps
 FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a));
     return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0)));
 #else
@@ -3958,7 +3945,7 @@ FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_pd
 FORCE_INLINE __m128d _mm_cvtpi32_pd(__m64 a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(
         vcvtq_f64_s64(vmovl_s32(vreinterpret_s32_m64(a))));
 #else
@@ -3977,7 +3964,7 @@ FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)
 {
 #if defined(__ARM_FEATURE_FRINT)
     return vreinterpretq_m128i_s32(vcvtq_s32_f32(vrnd32xq_f32(a)));
-#elif (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) || \
+#elif (defined(__aarch64__) || defined(_M_ARM64)) || \
     defined(__ARM_FEATURE_DIRECTED_ROUNDING)
     switch (_MM_GET_ROUNDING_MODE()) {
     case _MM_ROUND_NEAREST:
@@ -4031,7 +4018,7 @@ FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pd
 FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(
         vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a))));
 #else
@@ -4045,7 +4032,7 @@ FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_f64
 FORCE_INLINE double _mm_cvtsd_f64(__m128d a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0);
 #else
     double _a =
@@ -4059,7 +4046,7 @@ FORCE_INLINE double _mm_cvtsd_f64(__m128d a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si32
 FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return (int32_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
 #else
     __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
@@ -4074,7 +4061,7 @@ FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64
 FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return (int64_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
 #else
     __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
@@ -4096,7 +4083,7 @@ FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_ss
 FORCE_INLINE __m128 _mm_cvtsd_ss(__m128 a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128_f32(vsetq_lane_f32(
         vget_lane_f32(vcvt_f32_f64(vreinterpretq_f64_m128d(b)), 0),
         vreinterpretq_f32_m128(a), 0));
@@ -4132,7 +4119,7 @@ FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_sd
 FORCE_INLINE __m128d _mm_cvtsi32_sd(__m128d a, int32_t b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(
         vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
 #else
@@ -4160,7 +4147,7 @@ FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_sd
 FORCE_INLINE __m128d _mm_cvtsi64_sd(__m128d a, int64_t b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(
         vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
 #else
@@ -4197,7 +4184,7 @@ FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)
 FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b)
 {
     double d = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(
         vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0));
 #else
@@ -4252,7 +4239,7 @@ FORCE_INLINE int32_t _mm_cvttsd_si32(__m128d a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64
 FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0);
 #else
     double _a =
@@ -4271,7 +4258,7 @@ FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_pd
 FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(
         vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
@@ -4297,7 +4284,7 @@ FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_sd
 FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     float64x2_t tmp =
         vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b));
     return vreinterpretq_m128d_f64(
@@ -4329,7 +4316,7 @@ FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd
 FORCE_INLINE __m128d _mm_load_pd(const double *p)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(vld1q_f64(p));
 #else
     const float *fp = (const float *) p;
@@ -4349,7 +4336,7 @@ FORCE_INLINE __m128d _mm_load_pd(const double *p)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_sd
 FORCE_INLINE __m128d _mm_load_sd(const double *p)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0));
 #else
     const float *fp = (const float *) p;
@@ -4371,7 +4358,7 @@ FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_pd
 FORCE_INLINE __m128d _mm_load1_pd(const double *p)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(vld1q_dup_f64(p));
 #else
     return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *) p));
@@ -4384,7 +4371,7 @@ FORCE_INLINE __m128d _mm_load1_pd(const double *p)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pd
 FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(
         vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p)));
 #else
@@ -4410,7 +4397,7 @@ FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pd
 FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(
         vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a))));
 #else
@@ -4426,7 +4413,7 @@ FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_pd
 FORCE_INLINE __m128d _mm_loadr_pd(const double *p)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     float64x2_t v = vld1q_f64(p);
     return vreinterpretq_m128d_f64(vextq_f64(v, v, 1));
 #else
@@ -4466,7 +4453,7 @@ FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
 {
     int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
                               vget_low_s16(vreinterpretq_s16_m128i(b)));
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     int32x4_t high =
         vmull_high_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b));
 
@@ -4520,7 +4507,7 @@ FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pd
 FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
 #if SSE2NEON_PRECISE_MINMAX
     float64x2_t _a = vreinterpretq_f64_m128d(a);
     float64x2_t _b = vreinterpretq_f64_m128d(b);
@@ -4552,7 +4539,7 @@ FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_sd
 FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return _mm_move_sd(a, _mm_max_pd(a, b));
 #else
     double a0, a1, b0;
@@ -4587,7 +4574,7 @@ FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pd
 FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
 #if SSE2NEON_PRECISE_MINMAX
     float64x2_t _a = vreinterpretq_f64_m128d(a);
     float64x2_t _b = vreinterpretq_f64_m128d(b);
@@ -4618,7 +4605,7 @@ FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_sd
 FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return _mm_move_sd(a, _mm_min_pd(a, b));
 #else
     double a0, a1, b0;
@@ -4776,7 +4763,7 @@ FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_pd
 FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(
         vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
@@ -4843,7 +4830,7 @@ FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b)
     uint16x4_t a3210 = vget_low_u16(vreinterpretq_u16_m128i(a));
     uint16x4_t b3210 = vget_low_u16(vreinterpretq_u16_m128i(b));
     uint32x4_t ab3210 = vmull_u16(a3210, b3210);
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     uint32x4_t ab7654 =
         vmull_high_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b));
     uint16x8_t r = vuzp2q_u16(vreinterpretq_u16_u32(ab3210),
@@ -5013,7 +5000,7 @@ FORCE_INLINE __m128i _mm_set_epi8(signed char b15,
 FORCE_INLINE __m128d _mm_set_pd(double e1, double e0)
 {
     double ALIGN_STRUCT(16) data[2] = {e0, e1};
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data));
 #else
     return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) data));
@@ -5030,7 +5017,7 @@ FORCE_INLINE __m128d _mm_set_pd(double e1, double e0)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_sd
 FORCE_INLINE __m128d _mm_set_sd(double a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(vsetq_lane_f64(a, vdupq_n_f64(0), 0));
 #else
     return _mm_set_pd(0, a);
@@ -5077,7 +5064,7 @@ FORCE_INLINE __m128i _mm_set1_epi8(signed char w)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_pd
 FORCE_INLINE __m128d _mm_set1_pd(double d)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(vdupq_n_f64(d));
 #else
     int64_t _d = sse2neon_recast_f64_s64(d);
@@ -5154,7 +5141,7 @@ FORCE_INLINE __m128d _mm_setr_pd(double e1, double e0)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_pd
 FORCE_INLINE __m128d _mm_setzero_pd(void)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(vdupq_n_f64(0));
 #else
     return vreinterpretq_m128d_f32(vdupq_n_f32(0));
@@ -5241,12 +5228,12 @@ FORCE_INLINE __m128i _mm_setzero_si128(void)
 #define _mm_shuffle_pd(a, b, imm8)                                            \
     vreinterpretq_m128d_s64(                                                  \
         vshuffleq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b), \
-                      (imm8) & 0x1, (((imm8) & 0x2) >> 1) + 2))
+                      (imm8) & 0x1, (((imm8) & 0x2) >> 1) + 2))
 #else
-#define _mm_shuffle_pd(a, b, imm8)                                       \
-    _mm_castsi128_pd(_mm_set_epi64x(                                     \
-        vgetq_lane_s64(vreinterpretq_s64_m128d(b), ((imm8) & 0x2) >> 1), \
-        vgetq_lane_s64(vreinterpretq_s64_m128d(a), (imm8) & 0x1)))
+#define _mm_shuffle_pd(a, b, imm8)                                       \
+    _mm_castsi128_pd(_mm_set_epi64x(                                     \
+        vgetq_lane_s64(vreinterpretq_s64_m128d(b), ((imm8) & 0x2) >> 1), \
+        vgetq_lane_s64(vreinterpretq_s64_m128d(a), (imm8) & 0x1)))
 #endif
 
 // FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a,
@@ -5327,7 +5314,7 @@ FORCE_INLINE __m128i _mm_slli_epi16(__m128i a, int imm)
     if (_sse2neon_unlikely(imm & ~15))
         return _mm_setzero_si128();
     return vreinterpretq_m128i_s16(
-        vshlq_s16(vreinterpretq_s16_m128i(a), vdupq_n_s16((int16_t) imm)));
+        vshlq_s16(vreinterpretq_s16_m128i(a), vdupq_n_s16(imm)));
 }
 
 // Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and
@@ -5355,13 +5342,13 @@ FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm)
 // Shift a left by imm8 bytes while shifting in zeros, and store the results in
 // dst.
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_si128
-#define _mm_slli_si128(a, imm)                                                \
-    _sse2neon_define1(                                                        \
-        __m128i, a, int8x16_t ret;                                            \
-        if (_sse2neon_unlikely((imm) == 0)) ret = vreinterpretq_s8_m128i(_a); \
-        else if (_sse2neon_unlikely((imm) & ~15)) ret = vdupq_n_s8(0);        \
-        else ret = vextq_s8(vdupq_n_s8(0), vreinterpretq_s8_m128i(_a),        \
-                            (((imm) <= 0 || (imm) > 15) ? 0 : (16 - (imm)))); \
+#define _mm_slli_si128(a, imm)                                                \
+    _sse2neon_define1(                                                        \
+        __m128i, a, int8x16_t ret;                                            \
+        if (_sse2neon_unlikely((imm) == 0)) ret = vreinterpretq_s8_m128i(_a); \
+        else if (_sse2neon_unlikely((imm) & ~15)) ret = vdupq_n_s8(0);        \
+        else ret = vextq_s8(vdupq_n_s8(0), vreinterpretq_s8_m128i(_a),        \
+                            (((imm) <= 0 || (imm) > 15) ? 0 : (16 - (imm)))); \
         _sse2neon_return(vreinterpretq_m128i_s8(ret));)
 
 // Compute the square root of packed double-precision (64-bit) floating-point
@@ -5369,7 +5356,7 @@ FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_pd
 FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(vsqrtq_f64(vreinterpretq_f64_m128d(a)));
 #else
     double a0, a1;
@@ -5387,7 +5374,7 @@ FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_sd
 FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return _mm_move_sd(a, _mm_sqrt_pd(b));
 #else
     double _a, _b;
@@ -5406,7 +5393,7 @@ FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count)
     if (_sse2neon_unlikely(c & ~15))
         return _mm_cmplt_epi16(a, _mm_setzero_si128());
     return vreinterpretq_m128i_s16(
-        vshlq_s16((int16x8_t) a, vdupq_n_s16((int16_t) -c)));
+        vshlq_s16((int16x8_t) a, vdupq_n_s16((int) -c)));
 }
 
 // Shift packed 32-bit integers in a right by count while shifting in sign bits,
@@ -5426,7 +5413,7 @@ FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi16
 FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm)
 {
-    const int16_t count = (imm & ~15) ? 15 : (int16_t) imm;
+    const int count = (imm & ~15) ? 15 : imm;
     return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count));
 }
 
@@ -5488,13 +5475,13 @@ FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
 // Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and
 // store the results in dst.
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi16
-#define _mm_srli_epi16(a, imm)                                                 \
-    _sse2neon_define0(                                                         \
-        __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) & ~15)) {        \
-            ret = _mm_setzero_si128();                                         \
-        } else {                                                               \
-            ret = vreinterpretq_m128i_u16(vshlq_u16(                           \
-                vreinterpretq_u16_m128i(_a), vdupq_n_s16((int16_t) - (imm)))); \
+#define _mm_srli_epi16(a, imm)                                                \
+    _sse2neon_define0(                                                        \
+        __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) & ~15)) {       \
+            ret = _mm_setzero_si128();                                        \
+        } else {                                                              \
+            ret = vreinterpretq_m128i_u16(                                    \
+                vshlq_u16(vreinterpretq_u16_m128i(_a), vdupq_n_s16(-(imm)))); \
         } _sse2neon_return(ret);)
 
 // Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and
@@ -5530,7 +5517,7 @@ FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
         __m128i, a, int8x16_t ret;                                     \
         if (_sse2neon_unlikely((imm) & ~15)) ret = vdupq_n_s8(0);      \
         else ret = vextq_s8(vreinterpretq_s8_m128i(_a), vdupq_n_s8(0), \
-                            ((imm) > 15 ? 0 : (imm)));                 \
+                            ((imm) > 15 ? 0 : (imm)));                 \
         _sse2neon_return(vreinterpretq_m128i_s8(ret));)
 
 // Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
@@ -5539,7 +5526,7 @@ FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd
 FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a));
 #else
     vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(a));
@@ -5552,7 +5539,7 @@ FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd1
 FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     float64x1_t a_low = vget_low_f64(vreinterpretq_f64_m128d(a));
     vst1q_f64((float64_t *) mem_addr,
               vreinterpretq_f64_m128d(vcombine_f64(a_low, a_low)));
@@ -5568,7 +5555,7 @@ FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_store_sd
 FORCE_INLINE void _mm_store_sd(double *mem_addr, __m128d a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
 #else
     vst1_u64((uint64_t *) mem_addr, vget_low_u64(vreinterpretq_u64_m128d(a)));
@@ -5594,7 +5581,7 @@ FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pd
 FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     vst1_f64((float64_t *) mem_addr, vget_high_f64(vreinterpretq_f64_m128d(a)));
 #else
     vst1_f32((float32_t *) mem_addr, vget_high_f32(vreinterpretq_f32_m128d(a)));
@@ -5613,7 +5600,7 @@ FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pd
 FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
 #else
     vst1_f32((float32_t *) mem_addr, vget_low_f32(vreinterpretq_f32_m128d(a)));
@@ -5664,7 +5651,7 @@ FORCE_INLINE void _mm_stream_pd(double *p, __m128d a)
 {
 #if __has_builtin(__builtin_nontemporal_store)
     __builtin_nontemporal_store(a, (__m128d *) p);
-#elif defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#elif defined(__aarch64__) || defined(_M_ARM64)
     vst1q_f64(p, vreinterpretq_f64_m128d(a));
 #else
     vst1q_s64((int64_t *) p, vreinterpretq_s64_m128d(a));
@@ -5744,7 +5731,7 @@ FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b)
 //  https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_sub_pd
 FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(
         vsubq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
@@ -5847,7 +5834,7 @@ FORCE_INLINE __m128d _mm_undefined_pd(void)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi16
 FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128i_s16(
         vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
 #else
@@ -5863,7 +5850,7 @@ FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi32
 FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128i_s32(
         vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
 #else
@@ -5879,7 +5866,7 @@ FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi64
 FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128i_s64(
         vzip2q_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
 #else
@@ -5894,7 +5881,7 @@ FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi8
 FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128i_s8(
         vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
 #else
@@ -5912,7 +5899,7 @@ FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_pd
 FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(
         vzip2q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
@@ -5927,7 +5914,7 @@ FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi16
 FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128i_s16(
         vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
 #else
@@ -5943,7 +5930,7 @@ FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi32
 FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128i_s32(
         vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
 #else
@@ -5959,7 +5946,7 @@ FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi64
 FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128i_s64(
         vzip1q_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
 #else
@@ -5974,7 +5961,7 @@ FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi8
 FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128i_s8(
         vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
 #else
@@ -5990,7 +5977,7 @@ FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_pd
 FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(
         vzip1q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
@@ -6027,7 +6014,7 @@ FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
 FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b)
 {
     _sse2neon_const __m128d mask = _mm_set_pd(1.0f, -1.0f);
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(vfmaq_f64(vreinterpretq_f64_m128d(a),
                                              vreinterpretq_f64_m128d(b),
                                              vreinterpretq_f64_m128d(mask)));
@@ -6043,7 +6030,7 @@ FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b)
 FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b)
 {
     _sse2neon_const __m128 mask = _mm_setr_ps(-1.0f, 1.0f, -1.0f, 1.0f);
-#if (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) || \
+#if (defined(__aarch64__) || defined(_M_ARM64)) || \
     defined(__ARM_FEATURE_FMA) /* VFPv4+ */
     return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(a),
                                             vreinterpretq_f32_m128(mask),
@@ -6058,7 +6045,7 @@ FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pd
 FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(
         vpaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
@@ -6080,7 +6067,7 @@ FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_ps
 FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128_f32(
         vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
 #else
@@ -6098,7 +6085,7 @@ FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pd
 FORCE_INLINE __m128d _mm_hsub_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     float64x2_t _a = vreinterpretq_f64_m128d(a);
     float64x2_t _b = vreinterpretq_f64_m128d(b);
     return vreinterpretq_m128d_f64(
@@ -6124,7 +6111,7 @@ FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)
 {
     float32x4_t a = vreinterpretq_f32_m128(_a);
     float32x4_t b = vreinterpretq_f32_m128(_b);
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128_f32(
         vsubq_f32(vuzp1q_f32(a, b), vuzp2q_f32(a, b)));
 #else
@@ -6149,7 +6136,7 @@ FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movedup_pd
 FORCE_INLINE __m128d _mm_movedup_pd(__m128d a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(
         vdupq_laneq_f64(vreinterpretq_f64_m128d(a), 0));
 #else
@@ -6163,7 +6150,7 @@ FORCE_INLINE __m128d _mm_movedup_pd(__m128d a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehdup_ps
 FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128_f32(
         vtrn2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)));
 #elif defined(_sse2neon_shuffle)
@@ -6182,7 +6169,7 @@ FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_moveldup_ps
 FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128_f32(
         vtrn1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)));
 #elif defined(_sse2neon_shuffle)
@@ -6250,32 +6237,32 @@ FORCE_INLINE __m64 _mm_abs_pi8(__m64 a)
 // the result right by imm8 bytes, and store the low 16 bytes in dst.
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_epi8
 #if defined(__GNUC__) && !defined(__clang__)
-#define _mm_alignr_epi8(a, b, imm)                                 \
-    __extension__({                                                \
-        uint8x16_t _a = vreinterpretq_u8_m128i(a);                 \
-        uint8x16_t _b = vreinterpretq_u8_m128i(b);                 \
-        __m128i ret;                                               \
-        if (_sse2neon_unlikely((imm) & ~31))                       \
-            ret = vreinterpretq_m128i_u8(vdupq_n_u8(0));           \
-        else if ((imm) >= 16)                                      \
-            ret = _mm_srli_si128(a, (imm) >= 16 ? (imm) - 16 : 0); \
-        else                                                       \
-            ret = vreinterpretq_m128i_u8(                          \
-                vextq_u8(_b, _a, (imm) < 16 ? (imm) : 0));         \
-        ret;                                                       \
+#define _mm_alignr_epi8(a, b, imm)                                 \
+    __extension__({                                                \
+        uint8x16_t _a = vreinterpretq_u8_m128i(a);                 \
+        uint8x16_t _b = vreinterpretq_u8_m128i(b);                 \
+        __m128i ret;                                               \
+        if (_sse2neon_unlikely((imm) & ~31))                       \
+            ret = vreinterpretq_m128i_u8(vdupq_n_u8(0));           \
+        else if ((imm) >= 16)                                      \
+            ret = _mm_srli_si128(a, (imm) >= 16 ? (imm) - 16 : 0); \
+        else                                                       \
+            ret = vreinterpretq_m128i_u8(                          \
+                vextq_u8(_b, _a, (imm) < 16 ? (imm) : 0));         \
+        ret;                                                       \
     })
 
 #else
-#define _mm_alignr_epi8(a, b, imm)                                  \
-    _sse2neon_define2(                                              \
-        __m128i, a, b, uint8x16_t __a = vreinterpretq_u8_m128i(_a); \
-        uint8x16_t __b = vreinterpretq_u8_m128i(_b); __m128i ret;   \
-        if (_sse2neon_unlikely((imm) & ~31)) ret =                  \
-            vreinterpretq_m128i_u8(vdupq_n_u8(0));                  \
-        else if ((imm) >= 16) ret =                                 \
-            _mm_srli_si128(_a, (imm) >= 16 ? (imm) - 16 : 0);       \
-        else ret = vreinterpretq_m128i_u8(                          \
-            vextq_u8(__b, __a, (imm) < 16 ? (imm) : 0));            \
+#define _mm_alignr_epi8(a, b, imm)                                  \
+    _sse2neon_define2(                                              \
+        __m128i, a, b, uint8x16_t __a = vreinterpretq_u8_m128i(_a); \
+        uint8x16_t __b = vreinterpretq_u8_m128i(_b); __m128i ret;   \
+        if (_sse2neon_unlikely((imm) & ~31)) ret =                  \
+            vreinterpretq_m128i_u8(vdupq_n_u8(0));                  \
+        else if ((imm) >= 16) ret =                                 \
+            _mm_srli_si128(_a, (imm) >= 16 ? (imm) - 16 : 0);       \
+        else ret = vreinterpretq_m128i_u8(                          \
+            vextq_u8(__b, __a, (imm) < 16 ? (imm) : 0));            \
         _sse2neon_return(ret);)
 
 #endif
@@ -6310,7 +6297,7 @@ FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b)
 {
     int16x8_t a = vreinterpretq_s16_m128i(_a);
     int16x8_t b = vreinterpretq_s16_m128i(_b);
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128i_s16(vpaddq_s16(a, b));
 #else
     return vreinterpretq_m128i_s16(
@@ -6326,7 +6313,7 @@ FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b)
 {
     int32x4_t a = vreinterpretq_s32_m128i(_a);
     int32x4_t b = vreinterpretq_s32_m128i(_b);
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128i_s32(vpaddq_s32(a, b));
 #else
     return vreinterpretq_m128i_s32(
@@ -6358,7 +6345,7 @@ FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_epi16
 FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     int16x8_t a = vreinterpretq_s16_m128i(_a);
     int16x8_t b = vreinterpretq_s16_m128i(_b);
     return vreinterpretq_s64_s16(
@@ -6383,7 +6370,7 @@ FORCE_INLINE __m64 _mm_hadds_pi16(__m64 _a, __m64 _b)
 {
     int16x4_t a = vreinterpret_s16_m64(_a);
     int16x4_t b = vreinterpret_s16_m64(_b);
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpret_s64_s16(vqadd_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
 #else
     int16x4x2_t res = vuzp_s16(a, b);
@@ -6398,7 +6385,7 @@ FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)
 {
     int16x8_t a = vreinterpretq_s16_m128i(_a);
     int16x8_t b = vreinterpretq_s16_m128i(_b);
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128i_s16(
         vsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
 #else
@@ -6414,7 +6401,7 @@ FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b)
 {
     int32x4_t a = vreinterpretq_s32_m128i(_a);
     int32x4_t b = vreinterpretq_s32_m128i(_b);
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128i_s32(
         vsubq_s32(vuzp1q_s32(a, b), vuzp2q_s32(a, b)));
 #else
@@ -6430,7 +6417,7 @@ FORCE_INLINE __m64 _mm_hsub_pi16(__m64 _a, __m64 _b)
 {
     int16x4_t a = vreinterpret_s16_m64(_a);
     int16x4_t b = vreinterpret_s16_m64(_b);
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpret_m64_s16(vsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
 #else
     int16x4x2_t c = vuzp_s16(a, b);
@@ -6445,7 +6432,7 @@ FORCE_INLINE __m64 _mm_hsub_pi32(__m64 _a, __m64 _b)
 {
     int32x2_t a = vreinterpret_s32_m64(_a);
     int32x2_t b = vreinterpret_s32_m64(_b);
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpret_m64_s32(vsub_s32(vuzp1_s32(a, b), vuzp2_s32(a, b)));
 #else
     int32x2x2_t c = vuzp_s32(a, b);
@@ -6460,7 +6447,7 @@ FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b)
 {
     int16x8_t a = vreinterpretq_s16_m128i(_a);
     int16x8_t b = vreinterpretq_s16_m128i(_b);
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128i_s16(
         vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
 #else
@@ -6476,7 +6463,7 @@ FORCE_INLINE __m64 _mm_hsubs_pi16(__m64 _a, __m64 _b)
 {
     int16x4_t a = vreinterpret_s16_m64(_a);
     int16x4_t b = vreinterpret_s16_m64(_b);
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpret_m64_s16(vqsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
 #else
     int16x4x2_t c = vuzp_s16(a, b);
@@ -6491,7 +6478,7 @@ FORCE_INLINE __m64 _mm_hsubs_pi16(__m64 _a, __m64 _b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_epi16
 FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     uint8x16_t a = vreinterpretq_u8_m128i(_a);
     int8x16_t b = vreinterpretq_s8_m128i(_b);
     int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
@@ -6595,7 +6582,7 @@ FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
     uint8x16_t idx = vreinterpretq_u8_m128i(b);  // input b
     uint8x16_t idx_masked =
         vandq_u8(idx, vdupq_n_u8(0x8F));  // avoid using meaningless bits
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked));
 #elif defined(__GNUC__)
     int8x16_t ret;
@@ -6641,7 +6628,7 @@ FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)
     // (b < 0) ? 0xFFFF : 0
     uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15));
     // (b == 0) ? 0xFFFF : 0
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b));
 #else
     int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0)));
@@ -6670,7 +6657,7 @@ FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b)
     uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31));
 
     // (b == 0) ? 0xFFFFFFFF : 0
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b));
 #else
     int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0)));
@@ -6699,7 +6686,7 @@ FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b)
     uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7));
 
     // (b == 0) ? 0xFF : 0
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b));
 #else
     int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0)));
@@ -6728,7 +6715,7 @@ FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b)
     uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15));
 
     // (b == 0) ? 0xFFFF : 0
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b));
 #else
     int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0)));
@@ -6757,7 +6744,7 @@ FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b)
     uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31));
 
     // (b == 0) ? 0xFFFFFFFF : 0
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b));
 #else
     int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0)));
@@ -6786,7 +6773,7 @@ FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
     uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7));
 
     // (b == 0) ? 0xFF : 0
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b));
 #else
     int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0)));
@@ -6844,9 +6831,11 @@ FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_ps
 FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8)
 {
-    const uint32_t ALIGN_STRUCT(16) data[4] = {
-        (imm8 & (1 << 0)) ? UINT32_MAX : 0, (imm8 & (1 << 1)) ? UINT32_MAX : 0,
-        (imm8 & (1 << 2)) ? UINT32_MAX : 0, (imm8 & (1 << 3)) ? UINT32_MAX : 0};
+    const uint32_t
+        ALIGN_STRUCT(16) data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0,
+                                    ((imm8) & (1 << 1)) ? UINT32_MAX : 0,
+                                    ((imm8) & (1 << 2)) ? UINT32_MAX : 0,
+                                    ((imm8) & (1 << 3)) ? UINT32_MAX : 0};
     uint32x4_t mask = vld1q_u32(data);
     float32x4_t a = vreinterpretq_f32_m128(_a);
     float32x4_t b = vreinterpretq_f32_m128(_b);
@@ -6873,7 +6862,7 @@ FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask)
 {
     uint64x2_t mask =
         vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_m128d(_mask), 63));
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     float64x2_t a = vreinterpretq_f64_m128d(_a);
     float64x2_t b = vreinterpretq_f64_m128d(_b);
     return vreinterpretq_m128d_f64(vbslq_f64(mask, b, a));
@@ -6903,7 +6892,7 @@ FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_pd
 FORCE_INLINE __m128d _mm_ceil_pd(__m128d a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(vrndpq_f64(vreinterpretq_f64_m128d(a)));
 #else
     double a0, a1;
@@ -6919,7 +6908,7 @@ FORCE_INLINE __m128d _mm_ceil_pd(__m128d a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ps
 FORCE_INLINE __m128 _mm_ceil_ps(__m128 a)
 {
-#if (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) || \
+#if (defined(__aarch64__) || defined(_M_ARM64)) || \
     defined(__ARM_FEATURE_DIRECTED_ROUNDING)
     return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a)));
 #else
@@ -6952,7 +6941,7 @@ FORCE_INLINE __m128 _mm_ceil_ss(__m128 a, __m128 b)
 // in dst
 FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128i_u64(
         vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b)));
 #else
@@ -7109,7 +7098,7 @@ FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm)
         _mm_castsi128_pd(_mm_set_epi64x(bit5Mask, bit4Mask));
     __m128d tmp = _mm_and_pd(mul, mulMask);
 #else
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     double d0 = (imm & 0x10) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0) *
                                    vgetq_lane_f64(vreinterpretq_f64_m128d(b), 0)
                              : 0;
@@ -7131,7 +7120,7 @@ FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm)
     __m128d tmp = _mm_set_pd(d1, d0);
 #endif
     // Sum the products
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     double sum = vpaddd_f64(vreinterpretq_f64_m128d(tmp));
 #else
     double _tmp0 = sse2neon_recast_u64_f64(
@@ -7155,7 +7144,7 @@ FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
 {
     float32x4_t elementwise_prod = _mm_mul_ps(a, b);
 
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     /* shortcuts */
     if (imm == 0xFF) {
         return _mm_set1_ps(vaddvq_f32(elementwise_prod));
@@ -7225,7 +7214,7 @@ FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_pd
 FORCE_INLINE __m128d _mm_floor_pd(__m128d a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(vrndmq_f64(vreinterpretq_f64_m128d(a)));
 #else
     double a0, a1;
@@ -7241,7 +7230,7 @@ FORCE_INLINE __m128d _mm_floor_pd(__m128d a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ps
 FORCE_INLINE __m128 _mm_floor_ps(__m128 a)
 {
-#if (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) || \
+#if (defined(__aarch64__) || defined(_M_ARM64)) || \
     defined(__ARM_FEATURE_DIRECTED_ROUNDING)
     return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a)));
 #else
@@ -7300,24 +7289,24 @@ FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b)
 // element from b into tmp using the control in imm8. Store tmp to dst using
 // the mask in imm8 (elements are zeroed out when the corresponding bit is set).
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=insert_ps
-#define _mm_insert_ps(a, b, imm8)                                              \
-    _sse2neon_define2(                                                         \
-        __m128, a, b,                                                          \
-        float32x4_t tmp1 =                                                     \
-            vsetq_lane_f32(vgetq_lane_f32(_b, ((imm8) >> 6) & 0x3),            \
-                           vreinterpretq_f32_m128(_a), 0);                     \
-        float32x4_t tmp2 =                                                     \
-            vsetq_lane_f32(vgetq_lane_f32(tmp1, 0),                            \
-                           vreinterpretq_f32_m128(_a), (((imm8) >> 4) & 0x3)); \
-        const uint32_t data[4] =                                               \
-            _sse2neon_init(((imm8) & (1 << 0)) ? UINT32_MAX : 0,               \
-                           ((imm8) & (1 << 1)) ? UINT32_MAX : 0,               \
-                           ((imm8) & (1 << 2)) ? UINT32_MAX : 0,               \
-                           ((imm8) & (1 << 3)) ? UINT32_MAX : 0);              \
-        uint32x4_t mask = vld1q_u32(data);                                     \
-        float32x4_t all_zeros = vdupq_n_f32(0);                                \
-                                                                               \
-        _sse2neon_return(vreinterpretq_m128_f32(                               \
+#define _mm_insert_ps(a, b, imm8)                                              \
+    _sse2neon_define2(                                                         \
+        __m128, a, b,                                                          \
+        float32x4_t tmp1 =                                                     \
+            vsetq_lane_f32(vgetq_lane_f32(_b, ((imm8) >> 6) & 0x3),            \
+                           vreinterpretq_f32_m128(_a), 0);                     \
+        float32x4_t tmp2 =                                                     \
+            vsetq_lane_f32(vgetq_lane_f32(tmp1, 0),                            \
+                           vreinterpretq_f32_m128(_a), (((imm8) >> 4) & 0x3)); \
+        const uint32_t data[4] =                                               \
+            _sse2neon_init(((imm8) & (1 << 0)) ? UINT32_MAX : 0,               \
+                           ((imm8) & (1 << 1)) ? UINT32_MAX : 0,               \
+                           ((imm8) & (1 << 2)) ? UINT32_MAX : 0,               \
+                           ((imm8) & (1 << 3)) ? UINT32_MAX : 0);              \
+        uint32x4_t mask = vld1q_u32(data);                                     \
+        float32x4_t all_zeros = vdupq_n_f32(0);                                \
+        /* Zero every lane whose bit in imm8[3:0] is set. */                   \
+        _sse2neon_return(vreinterpretq_m128_f32(                               \
             vbslq_f32(mask, all_zeros, vreinterpretq_f32_m128(tmp2))));)
 
 // Compare packed signed 32-bit integers in a and b, and store packed maximum
@@ -7399,7 +7388,7 @@ FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a)
 {
     __m128i dst;
     uint16_t min, idx = 0;
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     // Find the minimum value
     min = vminvq_u16(vreinterpretq_u16_m128i(a));
 
@@ -7502,7 +7491,7 @@ FORCE_INLINE __m128i _mm_mpsadbw_epu8(__m128i a, __m128i b, const int imm)
     c26 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_2), low_b));
     uint8x16_t _a_3 = vextq_u8(_a, _a, 3);
     c37 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_3), low_b));
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     // |0|4|2|6|
     c04 = vpaddq_s16(c04, c26);
     // |1|5|3|7|
@@ -7562,7 +7551,7 @@ FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_pd
 FORCE_INLINE __m128d _mm_round_pd(__m128d a, int rounding)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     switch (rounding) {
     case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
         return vreinterpretq_m128d_f64(vrndnq_f64(vreinterpretq_f64_m128d(a)));
@@ -7631,7 +7620,7 @@ FORCE_INLINE __m128d _mm_round_pd(__m128d a, int rounding)
 // software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps
 FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding)
 {
-#if (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) || \
+#if (defined(__aarch64__) || defined(_M_ARM64)) || \
     defined(__ARM_FEATURE_DIRECTED_ROUNDING)
     switch (rounding) {
     case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
@@ -7778,9 +7767,9 @@ FORCE_INLINE int _mm_test_mix_ones_zeros(__m128i a, __m128i mask)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_si128
 FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b)
 {
-    int64x2_t s64_vec =
+    int64x2_t s64 =  // b AND NOT a: bits set in b that are clear in a
         vbicq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a));
-    return !(vgetq_lane_s64(s64_vec, 0) | vgetq_lane_s64(s64_vec, 1));
+    return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));  // 1 iff both 64-bit lanes are zero
 }
 
 // Compute the bitwise AND of 128 bits (representing integer data) in a and b,
@@ -7798,9 +7787,9 @@ FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_si128
 FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b)
 {
-    int64x2_t s64_vec =
+    int64x2_t s64 =  // a AND b: bits set in both operands
         vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b));
-    return !(vgetq_lane_s64(s64_vec, 0) | vgetq_lane_s64(s64_vec, 1));
+    return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));  // 1 iff both 64-bit lanes are zero
 }
 
 /* SSE4.2 */
@@ -7968,40 +7957,40 @@ static const uint8_t ALIGN_STRUCT(16) _sse2neon_cmpestr_mask8b[16] = {
                                       SSE2NEON_CAT(u, size)))                \
     } while (0)
 
-#define SSE2NEON_CMP_EQUAL_ANY_IMPL(type)                               \
-    static uint16_t _sse2neon_cmp_##type##_equal_any(__m128i a, int la, \
-                                                     __m128i b, int lb) \
-    {                                                                   \
-        __m128i mtx[16];                                                \
-        PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),    \
-                   SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type));      \
-        return SSE2NEON_CAT(                                            \
-            _sse2neon_aggregate_equal_any_,                             \
-            SSE2NEON_CAT(                                               \
-                SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),                  \
-                SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, \
-                                             type))))(la, lb, mtx);     \
+#define SSE2NEON_CMP_EQUAL_ANY_IMPL(type)                                     \
+    static int _sse2neon_cmp_##type##_equal_any(__m128i a, int la, __m128i b, \
+                                                int lb)                       \
+    {                                                                         \
+        __m128i mtx[16];                                                      \
+        PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),          \
+                   SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type));            \
+        return SSE2NEON_CAT(                                                  \
+            _sse2neon_aggregate_equal_any_,                                   \
+            SSE2NEON_CAT(                                                     \
+                SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),                        \
+                SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_,       \
+                                             type))))(la, lb, mtx);           \
     }
 
-#define SSE2NEON_CMP_RANGES_IMPL(type, data_type, us, byte_or_word)          \
-    static uint16_t _sse2neon_cmp_##us##type##_ranges(__m128i a, int la,     \
-                                                      __m128i b, int lb)     \
-    {                                                                        \
-        __m128i mtx[16];                                                     \
-        PCMPSTR_RANGES(                                                      \
-            a, b, mtx, data_type, us, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \
-            SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), byte_or_word);    \
-        return SSE2NEON_CAT(                                                 \
-            _sse2neon_aggregate_ranges_,                                     \
-            SSE2NEON_CAT(                                                    \
-                SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),                       \
-                SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_,      \
-                                             type))))(la, lb, mtx);          \
+#define SSE2NEON_CMP_RANGES_IMPL(type, data_type, us, byte_or_word)            \
+    static int _sse2neon_cmp_##us##type##_ranges(__m128i a, int la, __m128i b, \
+                                                 int lb)                       \
+    {                                                                          \
+        __m128i mtx[16];                                                       \
+        PCMPSTR_RANGES(                                                        \
+            a, b, mtx, data_type, us, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),   \
+            SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), byte_or_word);      \
+        return SSE2NEON_CAT(                                                   \
+            _sse2neon_aggregate_ranges_,                                       \
+            SSE2NEON_CAT(                                                      \
+                SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),                         \
+                SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_,        \
+                                             type))))(la, lb, mtx);            \
     }
 
 #define SSE2NEON_CMP_EQUAL_ORDERED_IMPL(type)                                  \
-    static uint16_t _sse2neon_cmp_##type##_equal_ordered(__m128i a, int la,    \
-                                                         __m128i b, int lb)    \
+    static int _sse2neon_cmp_##type##_equal_ordered(__m128i a, int la,         \
+                                                    __m128i b, int lb)         \
     {                                                                          \
         __m128i mtx[16];                                                       \
         PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),           \
@@ -8015,34 +8004,29 @@ static const uint8_t ALIGN_STRUCT(16) _sse2neon_cmpestr_mask8b[16] = {
             SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), la, lb, mtx);       \
     }
 
-static uint16_t _sse2neon_aggregate_equal_any_8x16(int la,
-                                                   int lb,
-                                                   __m128i mtx[16])
+static int _sse2neon_aggregate_equal_any_8x16(int la, int lb, __m128i mtx[16])
 {
-    uint16_t res = 0;
+    int res = 0;
     int m = (1 << la) - 1;
     uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
-    uint8x8_t t_lo = vtst_u8(vdup_n_u8((uint8_t) (m & 0xff)), vec_mask);
-    uint8x8_t t_hi = vtst_u8(vdup_n_u8((uint8_t) (m >> 8)), vec_mask);
+    uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask);
+    uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask);
     uint8x16_t vec = vcombine_u8(t_lo, t_hi);
     for (int j = 0; j < lb; j++) {
         mtx[j] = vreinterpretq_m128i_u8(
             vandq_u8(vec, vreinterpretq_u8_m128i(mtx[j])));
         mtx[j] = vreinterpretq_m128i_u8(
             vshrq_n_u8(vreinterpretq_u8_m128i(mtx[j]), 7));
-        uint16_t tmp =
-            _sse2neon_vaddvq_u8(vreinterpretq_u8_m128i(mtx[j])) ? 1 : 0;
+        int tmp = _sse2neon_vaddvq_u8(vreinterpretq_u8_m128i(mtx[j])) ? 1 : 0;
         res |= (tmp << j);
     }
     return res;
 }
 
-static uint16_t _sse2neon_aggregate_equal_any_16x8(int la,
-                                                   int lb,
-                                                   __m128i mtx[16])
+static int _sse2neon_aggregate_equal_any_16x8(int la, int lb, __m128i mtx[16])
 {
-    uint16_t res = 0;
-    uint16_t m = (uint16_t) (1 << la) - 1;
+    int res = 0;
+    int m = (1 << la) - 1;
     uint16x8_t vec =
         vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b));
     for (int j = 0; j < lb; j++) {
@@ -8050,8 +8034,7 @@ static uint16_t _sse2neon_aggregate_equal_any_16x8(int la,
             vandq_u16(vec, vreinterpretq_u16_m128i(mtx[j])));
         mtx[j] = vreinterpretq_m128i_u16(
             vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 15));
-        uint16_t tmp =
-            _sse2neon_vaddvq_u16(vreinterpretq_u16_m128i(mtx[j])) ? 1 : 0;
+        int tmp = _sse2neon_vaddvq_u16(vreinterpretq_u16_m128i(mtx[j])) ? 1 : 0;
         res |= (tmp << j);
     }
     return res;
@@ -8065,10 +8048,10 @@ static uint16_t _sse2neon_aggregate_equal_any_16x8(int la,
 
 SSE2NEON_GENERATE_CMP_EQUAL_ANY(SSE2NEON_CMP_EQUAL_ANY_)
 
-static uint16_t _sse2neon_aggregate_ranges_16x8(int la, int lb, __m128i mtx[16])
+static int _sse2neon_aggregate_ranges_16x8(int la, int lb, __m128i mtx[16])
 {
-    uint16_t res = 0;
-    uint16_t m = (uint16_t) (1 << la) - 1;
+    int res = 0;
+    int m = (1 << la) - 1;
     uint16x8_t vec =
         vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b));
     for (int j = 0; j < lb; j++) {
@@ -8080,24 +8063,24 @@ static uint16_t _sse2neon_aggregate_ranges_16x8(int la, int lb, __m128i mtx[16])
             vshrq_n_u32(vreinterpretq_u32_m128i(mtx[j]), 16));
         uint32x4_t vec_res = vandq_u32(vreinterpretq_u32_m128i(mtx[j]),
                                        vreinterpretq_u32_m128i(tmp));
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
-        uint16_t t = vaddvq_u32(vec_res) ? 1 : 0;
+#if defined(__aarch64__) || defined(_M_ARM64)
+        int t = vaddvq_u32(vec_res) ? 1 : 0;
 #else
         uint64x2_t sumh = vpaddlq_u32(vec_res);
-        uint16_t t = vgetq_lane_u64(sumh, 0) + vgetq_lane_u64(sumh, 1);
+        int t = vgetq_lane_u64(sumh, 0) + vgetq_lane_u64(sumh, 1);
 #endif
         res |= (t << j);
     }
     return res;
 }
 
-static uint16_t _sse2neon_aggregate_ranges_8x16(int la, int lb, __m128i mtx[16])
+static int _sse2neon_aggregate_ranges_8x16(int la, int lb, __m128i mtx[16])
 {
-    uint16_t res = 0;
-    uint16_t m = (uint16_t) ((1 << la) - 1);
+    int res = 0;
+    int m = (1 << la) - 1;
     uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
-    uint8x8_t t_lo = vtst_u8(vdup_n_u8((uint8_t) (m & 0xff)), vec_mask);
-    uint8x8_t t_hi = vtst_u8(vdup_n_u8((uint8_t) (m >> 8)), vec_mask);
+    uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask);
+    uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask);
     uint8x16_t vec = vcombine_u8(t_lo, t_hi);
     for (int j = 0; j < lb; j++) {
         mtx[j] = vreinterpretq_m128i_u8(
@@ -8108,7 +8091,7 @@ static uint16_t _sse2neon_aggregate_ranges_8x16(int la, int lb, __m128i mtx[16])
             vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 8));
         uint16x8_t vec_res = vandq_u16(vreinterpretq_u16_m128i(mtx[j]),
                                        vreinterpretq_u16_m128i(tmp));
-        uint16_t t = _sse2neon_vaddvq_u16(vec_res) ? 1 : 0;
+        int t = _sse2neon_vaddvq_u16(vec_res) ? 1 : 0;
         res |= (t << j);
     }
     return res;
@@ -8130,25 +8113,22 @@ SSE2NEON_GENERATE_CMP_RANGES(SSE2NEON_CMP_RANGES_)
 #undef SSE2NEON_CMP_RANGES_IS_BYTE
 #undef SSE2NEON_CMP_RANGES_IS_WORD
 
-static uint16_t _sse2neon_cmp_byte_equal_each(__m128i a,
-                                              int la,
-                                              __m128i b,
-                                              int lb)
+static int _sse2neon_cmp_byte_equal_each(__m128i a, int la, __m128i b, int lb)
 {
     uint8x16_t mtx =
         vceqq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b));
-    uint16_t m0 = (la < lb) ? 0 : (uint16_t) ((1 << la) - (1 << lb));
-    uint16_t m1 = (uint16_t) (0x10000 - (1 << la));
-    uint16_t tb = (uint16_t) (0x10000 - (1 << lb));
+    int m0 = (la < lb) ? 0 : ((1 << la) - (1 << lb));
+    int m1 = 0x10000 - (1 << la);
+    int tb = 0x10000 - (1 << lb);
     uint8x8_t vec_mask, vec0_lo, vec0_hi, vec1_lo, vec1_hi;
     uint8x8_t tmp_lo, tmp_hi, res_lo, res_hi;
     vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
-    vec0_lo = vtst_u8(vdup_n_u8((uint8_t) m0), vec_mask);
-    vec0_hi = vtst_u8(vdup_n_u8((uint8_t) (m0 >> 8)), vec_mask);
-    vec1_lo = vtst_u8(vdup_n_u8((uint8_t) m1), vec_mask);
-    vec1_hi = vtst_u8(vdup_n_u8((uint8_t) (m1 >> 8)), vec_mask);
-    tmp_lo = vtst_u8(vdup_n_u8((uint8_t) tb), vec_mask);
-    tmp_hi = vtst_u8(vdup_n_u8((uint8_t) (tb >> 8)), vec_mask);
+    vec0_lo = vtst_u8(vdup_n_u8(m0), vec_mask);
+    vec0_hi = vtst_u8(vdup_n_u8(m0 >> 8), vec_mask);
+    vec1_lo = vtst_u8(vdup_n_u8(m1), vec_mask);
+    vec1_hi = vtst_u8(vdup_n_u8(m1 >> 8), vec_mask);
+    tmp_lo = vtst_u8(vdup_n_u8(tb), vec_mask);
+    tmp_hi = vtst_u8(vdup_n_u8(tb >> 8), vec_mask);
 
     res_lo = vbsl_u8(vec0_lo, vdup_n_u8(0), vget_low_u8(mtx));
     res_hi = vbsl_u8(vec0_hi, vdup_n_u8(0), vget_high_u8(mtx));
@@ -8157,20 +8137,17 @@ static uint16_t _sse2neon_cmp_byte_equal_each(__m128i a,
     res_lo = vand_u8(res_lo, vec_mask);
     res_hi = vand_u8(res_hi, vec_mask);
 
-    return _sse2neon_vaddv_u8(res_lo) +
-           (uint16_t) (_sse2neon_vaddv_u8(res_hi) << 8);
+    int res = _sse2neon_vaddv_u8(res_lo) + (_sse2neon_vaddv_u8(res_hi) << 8);
+    return res;
 }
 
-static uint16_t _sse2neon_cmp_word_equal_each(__m128i a,
-                                              int la,
-                                              __m128i b,
-                                              int lb)
+static int _sse2neon_cmp_word_equal_each(__m128i a, int la, __m128i b, int lb)
 {
     uint16x8_t mtx =
         vceqq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b));
-    uint16_t m0 = (uint16_t) ((la < lb) ? 0 : ((1 << la) - (1 << lb)));
-    uint16_t m1 = (uint16_t) (0x100 - (1 << la));
-    uint16_t tb = (uint16_t) (0x100 - (1 << lb));
+    int m0 = (la < lb) ? 0 : ((1 << la) - (1 << lb));
+    int m1 = 0x100 - (1 << la);
+    int tb = 0x100 - (1 << lb);
     uint16x8_t vec_mask = vld1q_u16(_sse2neon_cmpestr_mask16b);
     uint16x8_t vec0 = vtstq_u16(vdupq_n_u16(m0), vec_mask);
     uint16x8_t vec1 = vtstq_u16(vdupq_n_u16(m1), vec_mask);
@@ -8185,22 +8162,18 @@ static uint16_t _sse2neon_cmp_word_equal_each(__m128i a,
 #define SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UWORD 0
 
 #define SSE2NEON_AGGREGATE_EQUAL_ORDER_IMPL(size, number_of_lanes, data_type)  \
-    static uint16_t                                                            \
-        _sse2neon_aggregate_equal_ordered_##size##x##number_of_lanes(          \
-            int bound, int la, int lb, __m128i mtx[16])                        \
+    static int _sse2neon_aggregate_equal_ordered_##size##x##number_of_lanes(   \
+        int bound, int la, int lb, __m128i mtx[16])                            \
     {                                                                          \
-        uint16_t res = 0;                                                      \
-        uint16_t m1 =                                                          \
-            (uint16_t) (SSE2NEON_IIF(data_type)(0x10000, 0x100) - (1 << la));  \
+        int res = 0;                                                           \
+        int m1 = SSE2NEON_IIF(data_type)(0x10000, 0x100) - (1 << la);          \
         uint##size##x8_t vec_mask = SSE2NEON_IIF(data_type)(                   \
             vld1_u##size(_sse2neon_cmpestr_mask##size##b),                     \
             vld1q_u##size(_sse2neon_cmpestr_mask##size##b));                   \
         uint##size##x##number_of_lanes##_t vec1 = SSE2NEON_IIF(data_type)(     \
-            vcombine_u##size(                                                  \
-                vtst_u##size(vdup_n_u##size((uint##size##_t) m1), vec_mask),   \
-                vtst_u##size(vdup_n_u##size((uint##size##_t)(m1 >> 8)),        \
-                             vec_mask)),                                       \
-            vtstq_u##size(vdupq_n_u##size((uint##size##_t) m1), vec_mask));    \
+            vcombine_u##size(vtst_u##size(vdup_n_u##size(m1), vec_mask),       \
+                             vtst_u##size(vdup_n_u##size(m1 >> 8), vec_mask)), \
+            vtstq_u##size(vdupq_n_u##size(m1), vec_mask));                     \
         uint##size##x##number_of_lanes##_t vec_minusone = vdupq_n_u##size(-1); \
         uint##size##x##number_of_lanes##_t vec_zero = vdupq_n_u##size(0);      \
         for (int j = 0; j < lb; j++) {                                         \
@@ -8217,7 +8190,7 @@ static uint16_t _sse2neon_cmp_word_equal_each(__m128i a,
             int val = 1;                                                       \
             for (int j = 0, k = i; j < bound - i && k < bound; j++, k++)       \
                 val &= ptr[k * bound + j];                                     \
-            res += (uint16_t) (val << i);                                      \
+            res += val << i;                                                   \
         }                                                                      \
         return res;                                                            \
     }
@@ -8264,17 +8237,14 @@ enum {
     SSE2NEON_CMPESTR_LIST
 #undef _
 };
-typedef uint16_t (*cmpestr_func_t)(__m128i a, int la, __m128i b, int lb);
+typedef int (*cmpestr_func_t)(__m128i a, int la, __m128i b, int lb);
 static cmpestr_func_t _sse2neon_cmpfunc_table[] = {
 #define _(name, func_suffix) _sse2neon_##func_suffix,
     SSE2NEON_CMPESTR_LIST
 #undef _
 };
 
-FORCE_INLINE uint16_t _sse2neon_sido_negative(int res,
-                                              int lb,
-                                              int imm8,
-                                              int bound)
+FORCE_INLINE int _sse2neon_sido_negative(int res, int lb, int imm8, int bound)
 {
     switch (imm8 & 0x30) {
     case _SIDD_NEGATIVE_POLARITY:
@@ -8287,7 +8257,7 @@ FORCE_INLINE uint16_t _sse2neon_sido_negative(int res,
         break;
     }
 
-    return (uint16_t) (res & ((bound == 8) ? 0xFF : 0xFFFF));
+    return res & ((bound == 8) ? 0xFF : 0xFFFF);
 }
 
 FORCE_INLINE int _sse2neon_clz(unsigned int x)
@@ -8336,7 +8306,7 @@ FORCE_INLINE int _sse2neon_ctzll(unsigned long long x)
 #define SSE2NEON_MIN(x, y) (x) < (y) ? (x) : (y)
 
 #define SSE2NEON_CMPSTR_SET_UPPER(var, imm) \
-    const int var = ((imm) & 0x01) ? 8 : 16
+    const int var = (imm & 0x01) ? 8 : 16
 
 #define SSE2NEON_CMPESTRX_LEN_PAIR(a, b, la, lb) \
     int tmp1 = la ^ (la >> 31);                  \
@@ -8351,28 +8321,28 @@ FORCE_INLINE int _sse2neon_ctzll(unsigned long long x)
 // As the only difference of PCMPESTR* and PCMPISTR* is the way to calculate the
 // length of string, we use SSE2NEON_CMP{I,E}STRX_GET_LEN to get the length of
 // string a and b.
-#define SSE2NEON_COMP_AGG(a, b, la, lb, imm8, IE)                         \
-    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);                               \
-    SSE2NEON_##IE##_LEN_PAIR(a, b, la, lb);                               \
-    uint16_t r2 = (_sse2neon_cmpfunc_table[(imm8) & 0x0f])(a, la, b, lb); \
+#define SSE2NEON_COMP_AGG(a, b, la, lb, imm8, IE)                  \
+    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);                        \
+    SSE2NEON_##IE##_LEN_PAIR(a, b, la, lb);                        \
+    int r2 = (_sse2neon_cmpfunc_table[imm8 & 0x0f])(a, la, b, lb); \
     r2 = _sse2neon_sido_negative(r2, lb, imm8, bound)
 
-#define SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8)            \
-    return (r2 == 0) ? bound                                       \
-                     : (((imm8) & 0x40) ? (31 - _sse2neon_clz(r2)) \
-                                        : _sse2neon_ctz(r2))
+#define SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8)          \
+    return (r2 == 0) ? bound                                     \
+                     : ((imm8 & 0x40) ? (31 - _sse2neon_clz(r2)) \
+                                      : _sse2neon_ctz(r2))
 
 #define SSE2NEON_CMPSTR_GENERATE_MASK(dst)                                     \
     __m128i dst = vreinterpretq_m128i_u8(vdupq_n_u8(0));                       \
-    if ((imm8) & 0x40) {                                                       \
+    if (imm8 & 0x40) {                                                         \
         if (bound == 8) {                                                      \
             uint16x8_t tmp = vtstq_u16(vdupq_n_u16(r2),                        \
                                        vld1q_u16(_sse2neon_cmpestr_mask16b));  \
             dst = vreinterpretq_m128i_u16(vbslq_u16(                           \
                 tmp, vdupq_n_u16(-1), vreinterpretq_u16_m128i(dst)));          \
         } else {                                                               \
-            uint8x16_t vec_r2 = vcombine_u8(vdup_n_u8((uint8_t) r2),           \
-                                            vdup_n_u8((uint8_t) (r2 >> 8)));   \
+            uint8x16_t vec_r2 =                                                \
+                vcombine_u8(vdup_n_u8(r2), vdup_n_u8(r2 >> 8));                \
             uint8x16_t tmp =                                                   \
                 vtstq_u8(vec_r2, vld1q_u8(_sse2neon_cmpestr_mask8b));          \
             dst = vreinterpretq_m128i_u8(                                      \
@@ -8383,8 +8353,8 @@ FORCE_INLINE int _sse2neon_ctzll(unsigned long long x)
             dst = vreinterpretq_m128i_u16(                                     \
                 vsetq_lane_u16(r2 & 0xffff, vreinterpretq_u16_m128i(dst), 0)); \
         } else {                                                               \
-            dst = vreinterpretq_m128i_u8(vsetq_lane_u8(                        \
-                (uint8_t) (r2 & 0xff), vreinterpretq_u8_m128i(dst), 0));       \
+            dst = vreinterpretq_m128i_u8(                                      \
+                vsetq_lane_u8(r2 & 0xff, vreinterpretq_u8_m128i(dst), 0));     \
         }                                                                      \
     }                                                                          \
     return dst
@@ -8487,7 +8457,7 @@ FORCE_INLINE int _mm_cmpestrz(__m128i a,
 
 #define SSE2NEON_CMPISTRX_LENGTH(str, len, imm8)                         \
     do {                                                                 \
-        if ((imm8) & 0x01) {                                             \
+        if (imm8 & 0x01) {                                               \
             uint16x8_t equal_mask_##str =                                \
                 vceqq_u16(vreinterpretq_u16_m128i(str), vdupq_n_u16(0)); \
             uint8x8_t res_##str = vshrn_n_u16(equal_mask_##str, 4);      \
@@ -8585,7 +8555,7 @@ FORCE_INLINE int _mm_cmpistrz(__m128i a, __m128i b, const int imm8)
 // in b for greater than.
 FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128i_u64(
         vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
 #else
@@ -8605,11 +8575,11 @@ FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v)
                          : [c] "+r"(crc)
                          : [v] "r"(v));
 #elif ((__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)) || \
-    ((defined(_M_ARM64) || defined(_M_ARM64EC)) && !defined(__clang__))
+    (defined(_M_ARM64) && !defined(__clang__))
     crc = __crc32ch(crc, v);
 #else
-    crc = _mm_crc32_u8(crc, (uint8_t) (v & 0xff));
-    crc = _mm_crc32_u8(crc, (uint8_t) ((v >> 8) & 0xff));
+    crc = _mm_crc32_u8(crc, v & 0xff);
+    crc = _mm_crc32_u8(crc, (v >> 8) & 0xff);
 #endif
     return crc;
 }
@@ -8624,11 +8594,11 @@ FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v)
                          : [c] "+r"(crc)
                          : [v] "r"(v));
 #elif ((__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)) || \
-    ((defined(_M_ARM64) || defined(_M_ARM64EC)) && !defined(__clang__))
+    (defined(_M_ARM64) && !defined(__clang__))
     crc = __crc32cw(crc, v);
 #else
-    crc = _mm_crc32_u16(crc, (uint16_t) (v & 0xffff));
-    crc = _mm_crc32_u16(crc, (uint16_t) ((v >> 16) & 0xffff));
+    crc = _mm_crc32_u16(crc, v & 0xffff);
+    crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff);
 #endif
     return crc;
 }
@@ -8642,11 +8612,11 @@ FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v)
     __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t"
                          : [c] "+r"(crc)
                          : [v] "r"(v));
-#elif ((defined(_M_ARM64) || defined(_M_ARM64EC)) && !defined(__clang__))
+#elif (defined(_M_ARM64) && !defined(__clang__))
     crc = __crc32cd((uint32_t) crc, v);
 #else
-    crc = _mm_crc32_u32((uint32_t) (crc), (uint32_t) (v & 0xffffffff));
-    crc = _mm_crc32_u32((uint32_t) (crc), (uint32_t) ((v >> 32) & 0xffffffff));
+    crc = _mm_crc32_u32((uint32_t) (crc), v & 0xffffffff);
+    crc = _mm_crc32_u32((uint32_t) (crc), (v >> 32) & 0xffffffff);
 #endif
     return crc;
 }
@@ -8661,7 +8631,7 @@ FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
                          : [c] "+r"(crc)
                          : [v] "r"(v));
 #elif ((__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)) || \
-    ((defined(_M_ARM64) || defined(_M_ARM64EC)) && !defined(__clang__))
+    (defined(_M_ARM64) && !defined(__clang__))
     crc = __crc32cb(crc, v);
 #else
     crc ^= v;
@@ -8712,8 +8682,7 @@ FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
 
 /* AES */
 
-#if !defined(__ARM_FEATURE_CRYPTO) && \
-    ((!defined(_M_ARM64) && !defined(_M_ARM64EC)) || defined(__clang__))
+#if !defined(__ARM_FEATURE_CRYPTO) && (!defined(_M_ARM64) || defined(__clang__))
 /* clang-format off */
 #define SSE2NEON_AES_SBOX(w)                                           \
     {                                                                  \
@@ -8804,7 +8773,7 @@ static const uint8_t _sse2neon_rsbox[256] = SSE2NEON_AES_RSBOX(SSE2NEON_AES_H0);
 #undef SSE2NEON_AES_H0
 
 /* x_time function and matrix multiply function */
-#if !defined(__aarch64__)
+#if !defined(__aarch64__) && !defined(_M_ARM64)
 #define SSE2NEON_XT(x) (((x) << 1) ^ ((((x) >> 7) & 1) * 0x1b))
 #define SSE2NEON_MULTIPLY(x, y)                                  \
     (((y & 1) * x) ^ ((y >> 1 & 1) * SSE2NEON_XT(x)) ^           \
@@ -8820,7 +8789,7 @@ static const uint8_t _sse2neon_rsbox[256] = SSE2NEON_AES_RSBOX(SSE2NEON_AES_H0);
 // for more information.
 FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i RoundKey)
 {
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
     static const uint8_t shift_rows[] = {
         0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
         0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb,
@@ -8979,8 +8948,7 @@ FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey)
                   SSE2NEON_MULTIPLY(g, 0x09) ^ SSE2NEON_MULTIPLY(h, 0x0e);
     }
 
-    return _mm_xor_si128(vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)),
-                         RoundKey);
+    return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)) ^ RoundKey;
 #endif
 }
 
@@ -9030,7 +8998,7 @@ FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
         _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 11)],
     };
 
-    return _mm_xor_si128(vreinterpretq_m128i_u8(vld1q_u8(v)), RoundKey);
+    return vreinterpretq_m128i_u8(vld1q_u8(v)) ^ RoundKey;
 #endif
 }
 
@@ -9068,8 +9036,7 @@ FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey)
         v[((i / 4) + (i % 4)) % 4][i % 4] = _sse2neon_rsbox[_a[i]];
     }
 
-    return _mm_xor_si128(vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)),
-                         RoundKey);
+    return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)) ^ RoundKey;
 #endif
 }
 
@@ -9294,14 +9261,14 @@ FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode(void)
 {
     union {
         fpcr_bitfield field;
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
         uint64_t value;
 #else
         uint32_t value;
 #endif
     } r;
 
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     r.value = _sse2neon_get_fpcr();
 #else
     __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
@@ -9315,7 +9282,7 @@ FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode(void)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u32
 FORCE_INLINE int _mm_popcnt_u32(unsigned int a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
 #if __has_builtin(__builtin_popcount)
     return __builtin_popcount(a);
 #elif defined(_MSC_VER)
@@ -9344,7 +9311,7 @@ FORCE_INLINE int _mm_popcnt_u32(unsigned int a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u64
 FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
 #if __has_builtin(__builtin_popcountll)
     return __builtin_popcountll(a);
 #elif defined(_MSC_VER)
@@ -9375,14 +9342,14 @@ FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag)
     // regardless of the value of the FZ bit.
     union {
         fpcr_bitfield field;
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
         uint64_t value;
 #else
         uint32_t value;
 #endif
     } r;
 
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     r.value = _sse2neon_get_fpcr();
 #else
     __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
@@ -9390,7 +9357,7 @@ FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag)
 
     r.field.bit24 = (flag & _MM_DENORMALS_ZERO_MASK) == _MM_DENORMALS_ZERO_ON;
 
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     _sse2neon_set_fpcr(r.value);
 #else
     __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
@@ -9401,7 +9368,7 @@ FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=rdtsc
 FORCE_INLINE uint64_t _rdtsc(void)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     uint64_t val;
 
     /* According to ARM DDI 0487F.c, from Armv8.0 to Armv8.5 inclusive, the