From 29a1299fb1fd3a3bac6755f93bbf54621e36251b Mon Sep 17 00:00:00 2001
From: lizzie
Date: Wed, 23 Jul 2025 22:33:46 +0100
Subject: [PATCH] [sse2neon] update to stable

---
 externals/sse2neon/sse2neon.h | 749 ++++++++++++++++------------------
 1 file changed, 358 insertions(+), 391 deletions(-)

diff --git a/externals/sse2neon/sse2neon.h b/externals/sse2neon/sse2neon.h
index 4626e923fd..79b90fe864 100755
--- a/externals/sse2neon/sse2neon.h
+++ b/externals/sse2neon/sse2neon.h
@@ -1,6 +1,3 @@
-// SPDX-FileCopyrightText: Copyright 2015-2024 SSE2NEON Contributors
-// SPDX-License-Identifier: MIT
-
 #ifndef SSE2NEON_H
 #define SSE2NEON_H

@@ -131,17 +128,17 @@
 #include <stdint.h>
 #include <stdlib.h>

-FORCE_INLINE double sse2neon_recast_u64_f64(uint64_t val)
+FORCE_INLINE double sse2neon_recast_u64_f64(uint64_t u64)
 {
-    double tmp;
-    memcpy(&tmp, &val, sizeof(uint64_t));
-    return tmp;
+    double f64;
+    memcpy(&f64, &u64, sizeof(uint64_t));
+    return f64;
 }
-FORCE_INLINE int64_t sse2neon_recast_f64_s64(double val)
+FORCE_INLINE int64_t sse2neon_recast_f64_s64(double f64)
 {
-    int64_t tmp;
-    memcpy(&tmp, &val, sizeof(uint64_t));
-    return tmp;
+    int64_t i64;
+    memcpy(&i64, &f64, sizeof(uint64_t));
+    return i64;
 }

 #if defined(_WIN32) && !defined(__MINGW32__)
@@ -151,9 +148,6 @@ FORCE_INLINE int64_t sse2neon_recast_f64_s64(double val)
 /* If using MSVC */
 #ifdef _MSC_VER
-#if defined(_M_ARM64EC)
-#define _DISABLE_SOFTINTRIN_ 1
-#endif
 #include <intrin.h>
 #if SSE2NEON_INCLUDE_WINDOWS_H
 #include <windows.h>
 #endif
@@ -169,7 +163,7 @@ FORCE_INLINE int64_t sse2neon_recast_f64_s64(double val)
 #endif

 #if (defined(_M_AMD64) || defined(__x86_64__)) || \
-    (defined(_M_ARM64) || defined(_M_ARM64EC) || defined(__arm64__))
+    (defined(_M_ARM64) || defined(__arm64__))
 #define SSE2NEON_HAS_BITSCAN64
 #endif
 #endif
@@ -252,7 +246,7 @@ FORCE_INLINE void _sse2neon_smp_mb(void)
 #pragma GCC push_options
 #pragma GCC target("fpu=neon")
 #endif
-#elif defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#elif defined(__aarch64__) || defined(_M_ARM64)
 #if !defined(__clang__) && !defined(_MSC_VER)
 #pragma GCC push_options
 #pragma GCC target("+simd")
@@ -273,8 +267,7 @@ FORCE_INLINE void _sse2neon_smp_mb(void)
 #endif

 #include <arm_neon.h>
-#if (!defined(__aarch64__) && !defined(_M_ARM64) && !defined(_M_ARM64EC)) && \
-    (__ARM_ARCH == 8)
+#if (!defined(__aarch64__) && !defined(_M_ARM64)) && (__ARM_ARCH == 8)
 #if defined __has_include && __has_include(<arm_acle.h>)
 #include <arm_acle.h>
 #endif
@@ -292,7 +285,7 @@ FORCE_INLINE void _sse2neon_smp_mb(void)
 #endif

 /* Rounding functions require either Aarch64 instructions or libm fallback */
-#if !defined(__aarch64__) && !defined(_M_ARM64) && !defined(_M_ARM64EC)
+#if !defined(__aarch64__) && !defined(_M_ARM64)
 #include <math.h>
 #endif

@@ -301,7 +294,7 @@ FORCE_INLINE void _sse2neon_smp_mb(void)
  * To write or access to these registers in user mode,
  * we have to perform syscall instead.
  */
-#if !defined(__aarch64__) && !defined(_M_ARM64) && !defined(_M_ARM64EC)
+#if (!defined(__aarch64__) && !defined(_M_ARM64))
 #include <sys/time.h>
 #endif

@@ -410,7 +403,7 @@ typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */
 // On ARM 32-bit architecture, the float64x2_t is not supported.
 // The data type __m128d should be represented in a different way for related
 // intrinsic conversion.
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
 typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */
 #else
 typedef float32x4_t __m128d;
@@ -511,7 +504,7 @@ typedef int64_t ALIGN_STRUCT(1) unaligned_int64_t;

 #define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x)

-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
 #define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x)
 #define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x)

@@ -643,7 +636,7 @@ FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
 }
 #endif

-#if !defined(__aarch64__) && !defined(_M_ARM64) && !defined(_M_ARM64EC)
+#if !defined(__aarch64__) && !defined(_M_ARM64)
 /* emulate vaddv u8 variant */
 FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8)
 {
@@ -658,7 +651,7 @@ FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8)
 }
 #endif

-#if !defined(__aarch64__) && !defined(_M_ARM64) && !defined(_M_ARM64EC)
+#if !defined(__aarch64__) && !defined(_M_ARM64)
 /* emulate vaddvq u8 variant */
 FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a)
 {
@@ -676,7 +669,7 @@ FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a)
 }
 #endif

-#if !defined(__aarch64__) && !defined(_M_ARM64) && !defined(_M_ARM64EC)
+#if !defined(__aarch64__) && !defined(_M_ARM64)
 /* emulate vaddvq u16 variant */
 FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a)
 {
@@ -731,13 +724,6 @@ FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a)
  */

 /* Constants for use with _mm_prefetch. */
-#if defined(_M_ARM64EC)
-/* winnt.h already defines these constants as macros, so undefine them first. */
-#undef _MM_HINT_NTA
-#undef _MM_HINT_T0
-#undef _MM_HINT_T1
-#undef _MM_HINT_T2
-#endif
 enum _mm_hint {
     _MM_HINT_NTA = 0, /* load data to L1 and L2 cache, mark it as NTA */
     _MM_HINT_T0 = 1,  /* load data to L1 and L2 cache */
@@ -753,7 +739,7 @@ typedef struct {
     uint8_t bit23 : 1;
     uint8_t bit24 : 1;
     uint8_t res2 : 7;
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     uint32_t res3;
 #endif
 } fpcr_bitfield;
@@ -897,8 +883,8 @@ FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b)
 // supported by WoA has crypto extensions. If this changes in the future,
 // this can be verified via the runtime-only method of:
 // IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE)
-#if ((defined(_M_ARM64) || defined(_M_ARM64EC)) && !defined(__clang__)) || \
-    (defined(__ARM_FEATURE_CRYPTO) && \
+#if (defined(_M_ARM64) && !defined(__clang__)) || \
+    (defined(__ARM_FEATURE_CRYPTO) && \
     (defined(__aarch64__) || __has_builtin(__builtin_arm_crypto_vmullp64)))
 // Wraps vmull_p64
 FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
 {
@@ -1023,8 +1009,8 @@ static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
 // __m128i _mm_shuffle_epi32_default(__m128i a,
 //                                   __constrange(0, 255) int imm) {
 //     __m128i ret;
-//     ret[0] = a[(imm) & 0x3]; ret[1] = a[((imm) >> 2) & 0x3];
-//     ret[2] = a[((imm) >> 4) & 0x03]; ret[3] = a[((imm) >> 6) & 0x03];
+//     ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3];
+//     ret[2] = a[(imm >> 4) & 0x03]; ret[3] = a[(imm >> 6) & 0x03];
 //     return ret;
 // }
 #define _mm_shuffle_epi32_default(a, imm) \
@@ -1122,7 +1108,7 @@ FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
     return vreinterpretq_m128i_s32(vcombine_s32(a32, a33));
 }

-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
 #define _mm_shuffle_epi32_splat(a, imm) \
     vreinterpretq_m128i_s32(vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm)))
 #else
@@ -1139,8 +1125,8 @@ FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
 // __m128 _mm_shuffle_ps_default(__m128 a, __m128 b,
 //                               __constrange(0, 255) int imm) {
 //     __m128 ret;
-//     ret[0] = a[(imm) & 0x3]; ret[1] = a[((imm) >> 2) & 0x3];
-//     ret[2] = b[((imm) >> 4) & 0x03]; ret[3] = b[((imm) >> 6) & 0x03];
+//     ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3];
+//     ret[2] = b[(imm >> 4) & 0x03]; ret[3] = b[(imm >> 6) & 0x03];
 //     return ret;
 // }
 //
@@ -1562,7 +1548,7 @@ FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ps2pi
 FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a)
 {
-#if (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) || \
+#if (defined(__aarch64__) || defined(_M_ARM64)) || \
     defined(__ARM_FEATURE_DIRECTED_ROUNDING)
     return vreinterpret_m64_s32(
         vget_low_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a)))));
@@ -1587,7 +1573,7 @@ FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ss2si
 FORCE_INLINE int _mm_cvt_ss2si(__m128 a)
 {
-#if (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) || \
+#if (defined(__aarch64__) || defined(_M_ARM64)) || \
     defined(__ARM_FEATURE_DIRECTED_ROUNDING)
     return vgetq_lane_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a))),
                           0);
@@ -1718,7 +1704,7 @@ FORCE_INLINE float _mm_cvtss_f32(__m128 a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si64
 FORCE_INLINE int64_t _mm_cvtss_si64(__m128 a)
 {
-#if (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) || \
+#if (defined(__aarch64__) || defined(_M_ARM64)) || \
     defined(__ARM_FEATURE_DIRECTED_ROUNDING)
     return (int64_t) vgetq_lane_f32(vrndiq_f32(vreinterpretq_f32_m128(a)), 0);
 #else
@@ -1771,7 +1757,7 @@ FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ps
 FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128_f32(
         vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
 #else
@@ -1845,14 +1831,14 @@ FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode(void)
 {
     union {
         fpcr_bitfield field;
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
         uint64_t value;
 #else
         uint32_t value;
 #endif
     } r;

-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     r.value = _sse2neon_get_fpcr();
 #else
     __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
@@ -1991,18 +1977,20 @@ FORCE_INLINE __m128i _mm_loadu_si64(const void *p)
 #if !defined(SSE2NEON_ALLOC_DEFINED)
 FORCE_INLINE void *_mm_malloc(size_t size, size_t align)
 {
-#if defined(_WIN32)
-    return _aligned_malloc(size, align);
-#else
     void *ptr;
     if (align == 1)
         return malloc(size);
     if (align == 2 || (sizeof(void *) == 8 && align == 4))
         align = sizeof(void *);
+#if defined(_WIN32)
+    ptr = _aligned_malloc(size, align);
+    if (ptr)
+        return ptr;
+#else
     if (!posix_memalign(&ptr, align, size))
         return ptr;
-    return NULL;
 #endif
+    return NULL;
 }
 #endif
@@ -2166,7 +2154,7 @@ FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)
 FORCE_INLINE int _mm_movemask_pi8(__m64 a)
 {
     uint8x8_t input = vreinterpret_u8_m64(a);
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     static const int8_t shift[8] = {0, 1, 2, 3, 4, 5, 6, 7};
     uint8x8_t tmp = vshr_n_u8(input, 7);
     return vaddv_u8(vshl_u8(tmp, vld1_s8(shift)));
@@ -2187,7 +2175,7 @@ FORCE_INLINE int _mm_movemask_pi8(__m64 a)
 FORCE_INLINE int _mm_movemask_ps(__m128 a)
 {
     uint32x4_t input = vreinterpretq_u32_m128(a);
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     static const int32_t shift[4] = {0, 1, 2, 3};
     uint32x4_t tmp = vshrq_n_u32(input, 31);
     return vaddvq_u32(vshlq_u32(tmp, vld1q_s32(shift)));
@@ -2421,7 +2409,7 @@ FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
     uint64x1_t t = vpaddl_u32(vpaddl_u16(
         vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)))));
     return vreinterpret_m64_u16(
-        vset_lane_u16((uint16_t) vget_lane_u64(t, 0), vdup_n_u16(0), 0));
+        vset_lane_u16((int) vget_lane_u64(t, 0), vdup_n_u16(0), 0));
 }

 // Macro: Set the flush zero bits of the MXCSR control and status register to
@@ -2434,14 +2422,14 @@ FORCE_INLINE void _sse2neon_mm_set_flush_zero_mode(unsigned int flag)
     // regardless of the value of the FZ bit.
     union {
         fpcr_bitfield field;
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
         uint64_t value;
 #else
         uint32_t value;
 #endif
     } r;

-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     r.value = _sse2neon_get_fpcr();
 #else
     __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
@@ -2449,7 +2437,7 @@ FORCE_INLINE void _sse2neon_mm_set_flush_zero_mode(unsigned int flag)

     r.field.bit24 = (flag & _MM_FLUSH_ZERO_MASK) == _MM_FLUSH_ZERO_ON;

-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     _sse2neon_set_fpcr(r.value);
 #else
     __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
@@ -2555,10 +2543,10 @@ FORCE_INLINE __m128 _mm_setzero_ps(void)
 // in dst.
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi16
 #ifdef _sse2neon_shuffle
-#define _mm_shuffle_pi16(a, imm) \
-    vreinterpret_m64_s16(vshuffle_s16( \
-        vreinterpret_s16_m64(a), vreinterpret_s16_m64(a), ((imm) & 0x3), \
-        (((imm) >> 2) & 0x3), (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3)))
+#define _mm_shuffle_pi16(a, imm) \
+    vreinterpret_m64_s16(vshuffle_s16( \
+        vreinterpret_s16_m64(a), vreinterpret_s16_m64(a), (imm & 0x3), \
+        ((imm >> 2) & 0x3), ((imm >> 4) & 0x3), ((imm >> 6) & 0x3)))
 #else
 #define _mm_shuffle_pi16(a, imm) \
     _sse2neon_define1( \
@@ -2689,8 +2677,7 @@ FORCE_INLINE void _mm_lfence(void)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ps
 FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
 {
-#if (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) && \
-    !SSE2NEON_PRECISE_SQRT
+#if (defined(__aarch64__) || defined(_M_ARM64)) && !SSE2NEON_PRECISE_SQRT
     return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in)));
 #else
     float32x4_t recip = vrsqrteq_f32(vreinterpretq_f32_m128(in));
@@ -2919,7 +2906,7 @@ FORCE_INLINE __m128 _mm_undefined_ps(void)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_ps
 FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128_f32(
         vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
 #else
@@ -2935,7 +2922,7 @@ FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_ps
 FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128_f32(
         vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
 #else
@@ -2994,7 +2981,7 @@ FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_pd
 FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(
         vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
@@ -3019,7 +3006,7 @@ FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_sd
 FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return _mm_move_sd(a, _mm_add_pd(a, b));
 #else
     double a0, a1, b0;
@@ -3180,7 +3167,7 @@ FORCE_INLINE __m128i _mm_castps_si128(__m128 a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_pd
 FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(vreinterpretq_f64_m128i(a));
 #else
     return vreinterpretq_m128d_f32(vreinterpretq_f32_m128i(a));
@@ -3252,7 +3239,7 @@ FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_pd
 FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_u64(
         vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
@@ -3278,7 +3265,7 @@ FORCE_INLINE __m128d _mm_cmpeq_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_pd
 FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_u64(
         vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
@@ -3304,7 +3291,7 @@ FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_sd
 FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return _mm_move_sd(a, _mm_cmpge_pd(a, b));
 #else
     // expand "_mm_cmpge_pd()" to reduce unnecessary operations
@@ -3352,7 +3339,7 @@ FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_pd
 FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_u64(
         vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
@@ -3378,7 +3365,7 @@ FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_sd
 FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return _mm_move_sd(a, _mm_cmpgt_pd(a, b));
 #else
     // expand "_mm_cmpge_pd()" to reduce unnecessary operations
@@ -3399,7 +3386,7 @@ FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_pd
 FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_u64(
         vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
@@ -3425,7 +3412,7 @@ FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_sd
 FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return _mm_move_sd(a, _mm_cmple_pd(a, b));
 #else
     // expand "_mm_cmpge_pd()" to reduce unnecessary operations
@@ -3476,7 +3463,7 @@ FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_pd
 FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_u64(
         vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
@@ -3502,7 +3489,7 @@ FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_sd
 FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return _mm_move_sd(a, _mm_cmplt_pd(a, b));
 #else
     double a0, b0;
@@ -3522,7 +3509,7 @@ FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_pd
 FORCE_INLINE __m128d _mm_cmpneq_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_s32(vmvnq_s32(vreinterpretq_s32_u64(
         vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)))));
 #else
@@ -3548,7 +3535,7 @@ FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_pd
 FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_u64(veorq_u64(
         vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
         vdupq_n_u64(UINT64_MAX)));
@@ -3583,7 +3570,7 @@ FORCE_INLINE __m128d _mm_cmpnge_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_cmpngt_pd
 FORCE_INLINE __m128d _mm_cmpngt_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_u64(veorq_u64(
         vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
         vdupq_n_u64(UINT64_MAX)));
@@ -3618,7 +3605,7 @@ FORCE_INLINE __m128d _mm_cmpngt_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_pd
 FORCE_INLINE __m128d _mm_cmpnle_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_u64(veorq_u64(
         vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
         vdupq_n_u64(UINT64_MAX)));
@@ -3653,7 +3640,7 @@ FORCE_INLINE __m128d _mm_cmpnle_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_pd
 FORCE_INLINE __m128d _mm_cmpnlt_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_u64(veorq_u64(
         vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
         vdupq_n_u64(UINT64_MAX)));
@@ -3688,7 +3675,7 @@ FORCE_INLINE __m128d _mm_cmpnlt_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_pd
 FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     // Excluding NaNs, any two floating point numbers can be compared.
     uint64x2_t not_nan_a =
         vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
@@ -3718,7 +3705,7 @@ FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_sd
 FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return _mm_move_sd(a, _mm_cmpord_pd(a, b));
 #else
     double a0, b0;
@@ -3738,7 +3725,7 @@ FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_pd
 FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     // Two NaNs are not equal in comparison operation.
     uint64x2_t not_nan_a =
         vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
@@ -3769,7 +3756,7 @@ FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_sd
 FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return _mm_move_sd(a, _mm_cmpunord_pd(a, b));
 #else
     double a0, b0;
@@ -3789,7 +3776,7 @@ FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_sd
 FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vgetq_lane_u64(vcgeq_f64(a, b), 0) & 0x1;
 #else
     double a0, b0;
@@ -3804,7 +3791,7 @@ FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_sd
 FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vgetq_lane_u64(vcgtq_f64(a, b), 0) & 0x1;
 #else
     double a0, b0;
@@ -3820,7 +3807,7 @@ FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_sd
 FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vgetq_lane_u64(vcleq_f64(a, b), 0) & 0x1;
 #else
     double a0, b0;
@@ -3836,7 +3823,7 @@ FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_sd
 FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vgetq_lane_u64(vcltq_f64(a, b), 0) & 0x1;
 #else
     double a0, b0;
@@ -3852,7 +3839,7 @@ FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_sd
 FORCE_INLINE int _mm_comieq_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vgetq_lane_u64(vceqq_f64(a, b), 0) & 0x1;
 #else
     uint32x4_t a_not_nan =
@@ -3881,7 +3868,7 @@ FORCE_INLINE int _mm_comineq_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_pd
 FORCE_INLINE __m128d _mm_cvtepi32_pd(__m128i a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(
         vcvtq_f64_s64(vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a)))));
 #else
@@ -3942,7 +3929,7 @@ FORCE_INLINE __m64 _mm_cvtpd_pi32(__m128d a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_ps
 FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a));
     return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0)));
 #else
@@ -3958,7 +3945,7 @@ FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_pd
 FORCE_INLINE __m128d _mm_cvtpi32_pd(__m64 a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(
         vcvtq_f64_s64(vmovl_s32(vreinterpret_s32_m64(a))));
 #else
@@ -3977,7 +3964,7 @@ FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)
 {
 #if defined(__ARM_FEATURE_FRINT)
     return vreinterpretq_m128i_s32(vcvtq_s32_f32(vrnd32xq_f32(a)));
-#elif (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) || \
+#elif (defined(__aarch64__) || defined(_M_ARM64)) || \
     defined(__ARM_FEATURE_DIRECTED_ROUNDING)
     switch (_MM_GET_ROUNDING_MODE()) {
     case _MM_ROUND_NEAREST:
@@ -4031,7 +4018,7 @@ FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pd
 FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(
         vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a))));
 #else
@@ -4045,7 +4032,7 @@ FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_f64
 FORCE_INLINE double _mm_cvtsd_f64(__m128d a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0);
 #else
     double _a =
@@ -4059,7 +4046,7 @@ FORCE_INLINE double _mm_cvtsd_f64(__m128d a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si32
 FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return (int32_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
 #else
     __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
@@ -4074,7 +4061,7 @@ FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64
 FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return (int64_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
 #else
     __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
@@ -4096,7 +4083,7 @@ FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_ss
 FORCE_INLINE __m128 _mm_cvtsd_ss(__m128 a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128_f32(vsetq_lane_f32(
         vget_lane_f32(vcvt_f32_f64(vreinterpretq_f64_m128d(b)), 0),
         vreinterpretq_f32_m128(a), 0));
@@ -4132,7 +4119,7 @@ FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_sd
 FORCE_INLINE __m128d _mm_cvtsi32_sd(__m128d a, int32_t b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(
         vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
 #else
@@ -4160,7 +4147,7 @@ FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_sd
 FORCE_INLINE __m128d _mm_cvtsi64_sd(__m128d a, int64_t b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(
         vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
 #else
@@ -4197,7 +4184,7 @@ FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)
 FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b)
 {
     double d = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(
         vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0));
 #else
@@ -4252,7 +4239,7 @@ FORCE_INLINE int32_t _mm_cvttsd_si32(__m128d a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64
 FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0);
 #else
     double _a =
@@ -4271,7 +4258,7 @@ FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_pd
 FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(
         vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
@@ -4297,7 +4284,7 @@ FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_sd
 FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     float64x2_t tmp =
         vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b));
     return vreinterpretq_m128d_f64(
@@ -4329,7 +4316,7 @@ FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd
 FORCE_INLINE __m128d _mm_load_pd(const double *p)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(vld1q_f64(p));
 #else
     const float *fp = (const float *) p;
@@ -4349,7 +4336,7 @@ FORCE_INLINE __m128d _mm_load_pd(const double *p)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_sd
 FORCE_INLINE __m128d _mm_load_sd(const double *p)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0));
 #else
     const float *fp = (const float *) p;
@@ -4371,7 +4358,7 @@ FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_pd
 FORCE_INLINE __m128d _mm_load1_pd(const double *p)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(vld1q_dup_f64(p));
 #else
     return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *) p));
@@ -4384,7 +4371,7 @@ FORCE_INLINE __m128d _mm_load1_pd(const double *p)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pd
 FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(
         vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p)));
 #else
@@ -4410,7 +4397,7 @@ FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pd
 FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(
         vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a))));
 #else
@@ -4426,7 +4413,7 @@ FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_pd
 FORCE_INLINE __m128d _mm_loadr_pd(const double *p)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     float64x2_t v = vld1q_f64(p);
     return vreinterpretq_m128d_f64(vextq_f64(v, v, 1));
 #else
@@ -4466,7 +4453,7 @@ FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
 {
     int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
                               vget_low_s16(vreinterpretq_s16_m128i(b)));
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     int32x4_t high =
         vmull_high_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b));
@@ -4520,7 +4507,7 @@ FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pd
 FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
 #if SSE2NEON_PRECISE_MINMAX
     float64x2_t _a = vreinterpretq_f64_m128d(a);
     float64x2_t _b = vreinterpretq_f64_m128d(b);
@@ -4552,7 +4539,7 @@ FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_sd
 FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return _mm_move_sd(a, _mm_max_pd(a, b));
 #else
     double a0, a1, b0;
@@ -4587,7 +4574,7 @@ FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pd
 FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
 #if SSE2NEON_PRECISE_MINMAX
     float64x2_t _a = vreinterpretq_f64_m128d(a);
     float64x2_t _b = vreinterpretq_f64_m128d(b);
@@ -4618,7 +4605,7 @@ FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_sd
 FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return _mm_move_sd(a, _mm_min_pd(a, b));
 #else
     double a0, a1, b0;
@@ -4776,7 +4763,7 @@ FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_pd
 FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(
         vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
@@ -4843,7 +4830,7 @@ FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b)
     uint16x4_t a3210 = vget_low_u16(vreinterpretq_u16_m128i(a));
     uint16x4_t b3210 = vget_low_u16(vreinterpretq_u16_m128i(b));
     uint32x4_t ab3210 = vmull_u16(a3210, b3210);
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     uint32x4_t ab7654 =
         vmull_high_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b));
     uint16x8_t r = vuzp2q_u16(vreinterpretq_u16_u32(ab3210),
@@ -5013,7 +5000,7 @@ FORCE_INLINE __m128i _mm_set_epi8(signed char b15,
 FORCE_INLINE __m128d _mm_set_pd(double e1, double e0)
 {
     double ALIGN_STRUCT(16) data[2] = {e0, e1};
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data));
 #else
     return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) data));
@@ -5030,7 +5017,7 @@ FORCE_INLINE __m128d _mm_set_pd(double e1, double e0)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_sd
 FORCE_INLINE __m128d _mm_set_sd(double a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(vsetq_lane_f64(a, vdupq_n_f64(0), 0));
 #else
     return _mm_set_pd(0, a);
@@ -5077,7 +5064,7 @@ FORCE_INLINE __m128i _mm_set1_epi8(signed char w)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_pd
 FORCE_INLINE __m128d _mm_set1_pd(double d)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(vdupq_n_f64(d));
 #else
     int64_t _d = sse2neon_recast_f64_s64(d);
@@ -5154,7 +5141,7 @@ FORCE_INLINE __m128d _mm_setr_pd(double e1, double e0)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_pd
 FORCE_INLINE __m128d _mm_setzero_pd(void)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(vdupq_n_f64(0));
 #else
     return vreinterpretq_m128d_f32(vdupq_n_f32(0));
@@ -5241,12 +5228,12 @@ FORCE_INLINE __m128i _mm_setzero_si128(void)
 #define _mm_shuffle_pd(a, b, imm8) \
     vreinterpretq_m128d_s64( \
         vshuffleq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b), \
-                      (imm8) & 0x1, (((imm8) & 0x2) >> 1) + 2))
+                      imm8 & 0x1, ((imm8 & 0x2) >> 1) + 2))
 #else
-#define _mm_shuffle_pd(a, b, imm8) \
-    _mm_castsi128_pd(_mm_set_epi64x( \
-        vgetq_lane_s64(vreinterpretq_s64_m128d(b), ((imm8) & 0x2) >> 1), \
-        vgetq_lane_s64(vreinterpretq_s64_m128d(a), (imm8) & 0x1)))
+#define _mm_shuffle_pd(a, b, imm8) \
+    _mm_castsi128_pd(_mm_set_epi64x( \
+        vgetq_lane_s64(vreinterpretq_s64_m128d(b), (imm8 & 0x2) >> 1), \
+        vgetq_lane_s64(vreinterpretq_s64_m128d(a), imm8 & 0x1)))
 #endif

 // FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a,
@@ -5327,7 +5314,7 @@ FORCE_INLINE __m128i _mm_slli_epi16(__m128i a, int imm)
     if (_sse2neon_unlikely(imm & ~15))
         return _mm_setzero_si128();
     return vreinterpretq_m128i_s16(
-        vshlq_s16(vreinterpretq_s16_m128i(a), vdupq_n_s16((int16_t) imm)));
+        vshlq_s16(vreinterpretq_s16_m128i(a), vdupq_n_s16(imm)));
 }

 // Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and
@@ -5355,13 +5342,13 @@ FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm)
 // Shift a left by imm8 bytes while shifting in zeros, and store the results in
 // dst.
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_si128
-#define _mm_slli_si128(a, imm) \
-    _sse2neon_define1( \
-        __m128i, a, int8x16_t ret; \
-        if (_sse2neon_unlikely((imm) == 0)) ret = vreinterpretq_s8_m128i(_a); \
-        else if (_sse2neon_unlikely((imm) & ~15)) ret = vdupq_n_s8(0); \
-        else ret = vextq_s8(vdupq_n_s8(0), vreinterpretq_s8_m128i(_a), \
-                            (((imm) <= 0 || (imm) > 15) ? 0 : (16 - (imm)))); \
+#define _mm_slli_si128(a, imm) \
+    _sse2neon_define1( \
+        __m128i, a, int8x16_t ret; \
+        if (_sse2neon_unlikely(imm == 0)) ret = vreinterpretq_s8_m128i(_a); \
+        else if (_sse2neon_unlikely((imm) & ~15)) ret = vdupq_n_s8(0); \
+        else ret = vextq_s8(vdupq_n_s8(0), vreinterpretq_s8_m128i(_a), \
+                            ((imm <= 0 || imm > 15) ? 0 : (16 - imm))); \
         _sse2neon_return(vreinterpretq_m128i_s8(ret));)

 // Compute the square root of packed double-precision (64-bit) floating-point
@@ -5369,7 +5356,7 @@ FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_pd
 FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(vsqrtq_f64(vreinterpretq_f64_m128d(a)));
 #else
     double a0, a1;
@@ -5387,7 +5374,7 @@ FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_sd
 FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return _mm_move_sd(a, _mm_sqrt_pd(b));
 #else
     double _a, _b;
@@ -5406,7 +5393,7 @@ FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count)
     if (_sse2neon_unlikely(c & ~15))
         return _mm_cmplt_epi16(a, _mm_setzero_si128());
     return vreinterpretq_m128i_s16(
-        vshlq_s16((int16x8_t) a, vdupq_n_s16((int16_t) -c)));
+        vshlq_s16((int16x8_t) a, vdupq_n_s16((int) -c)));
 }

 // Shift packed 32-bit integers in a right by count while shifting in sign bits,
@@ -5426,7 +5413,7 @@ FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi16
 FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm)
 {
-    const int16_t count = (imm & ~15) ? 15 : (int16_t) imm;
+    const int count = (imm & ~15) ? 15 : imm;
     return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count));
 }

@@ -5488,13 +5475,13 @@ FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
 // Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and
 // store the results in dst.
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi16
-#define _mm_srli_epi16(a, imm) \
-    _sse2neon_define0( \
-        __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) & ~15)) { \
-            ret = _mm_setzero_si128(); \
-        } else { \
-            ret = vreinterpretq_m128i_u16(vshlq_u16( \
-                vreinterpretq_u16_m128i(_a), vdupq_n_s16((int16_t) - (imm)))); \
+#define _mm_srli_epi16(a, imm) \
+    _sse2neon_define0( \
+        __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) & ~15)) { \
+            ret = _mm_setzero_si128(); \
+        } else { \
+            ret = vreinterpretq_m128i_u16( \
+                vshlq_u16(vreinterpretq_u16_m128i(_a), vdupq_n_s16(-(imm)))); \
         } _sse2neon_return(ret);)

 // Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and
@@ -5530,7 +5517,7 @@ FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
         __m128i, a, int8x16_t ret; \
         if (_sse2neon_unlikely((imm) & ~15)) ret = vdupq_n_s8(0); \
         else ret = vextq_s8(vreinterpretq_s8_m128i(_a), vdupq_n_s8(0), \
-                            ((imm) > 15 ? 0 : (imm))); \
+                            (imm > 15 ? 0 : imm)); \
         _sse2neon_return(vreinterpretq_m128i_s8(ret));)

 // Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
@@ -5539,7 +5526,7 @@ FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd
 FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a));
 #else
     vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(a));
@@ -5552,7 +5539,7 @@ FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd1
 FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     float64x1_t a_low = vget_low_f64(vreinterpretq_f64_m128d(a));
     vst1q_f64((float64_t *) mem_addr,
               vreinterpretq_f64_m128d(vcombine_f64(a_low, a_low)));
@@ -5568,7 +5555,7 @@ FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_store_sd
 FORCE_INLINE void _mm_store_sd(double *mem_addr, __m128d a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
 #else
     vst1_u64((uint64_t *) mem_addr, vget_low_u64(vreinterpretq_u64_m128d(a)));
@@ -5594,7 +5581,7 @@ FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pd
 FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     vst1_f64((float64_t *) mem_addr, vget_high_f64(vreinterpretq_f64_m128d(a)));
 #else
     vst1_f32((float32_t *) mem_addr, vget_high_f32(vreinterpretq_f32_m128d(a)));
@@ -5613,7 +5600,7 @@ FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pd
 FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
 #else
     vst1_f32((float32_t *) mem_addr, vget_low_f32(vreinterpretq_f32_m128d(a)));
@@ -5664,7 +5651,7 @@ FORCE_INLINE void _mm_stream_pd(double *p, __m128d a)
 {
 #if __has_builtin(__builtin_nontemporal_store)
     __builtin_nontemporal_store(a, (__m128d *) p);
-#elif defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#elif defined(__aarch64__) || defined(_M_ARM64)
     vst1q_f64(p, vreinterpretq_f64_m128d(a));
 #else
     vst1q_s64((int64_t *) p, vreinterpretq_s64_m128d(a));
@@ -5744,7 +5731,7 @@ FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_sub_pd
 FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(
         vsubq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
@@ -5847,7 +5834,7 @@ FORCE_INLINE __m128d _mm_undefined_pd(void)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi16
 FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128i_s16(
         vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
 #else
@@ -5863,7 +5850,7 @@ FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi32
 FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128i_s32(
         vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
 #else
@@ -5879,7 +5866,7 @@ FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi64
 FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128i_s64(
         vzip2q_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
 #else
@@ -5894,7 +5881,7 @@ FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi8
 FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128i_s8(
         vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
 #else
@@ -5912,7 +5899,7 @@ FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_pd
 FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(
         vzip2q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
@@ -5927,7 +5914,7 @@ FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi16
 FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128i_s16(
         vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
 #else
@@ -5943,7 +5930,7 @@ FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi32
 FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128i_s32(
         vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
 #else
@@ -5959,7 +5946,7 @@ FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi64
 FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128i_s64(
         vzip1q_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
 #else
@@ -5974,7 +5961,7 @@ FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi8
 FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128i_s8(
         vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
 #else
@@ -5990,7 +5977,7 @@ FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_pd
 FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(
         vzip1q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
@@ -6027,7 +6014,7 @@ FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
 FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b)
 {
     _sse2neon_const __m128d mask = _mm_set_pd(1.0f, -1.0f);
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(vfmaq_f64(vreinterpretq_f64_m128d(a),
                                              vreinterpretq_f64_m128d(b),
                                              vreinterpretq_f64_m128d(mask)));
@@ -6043,7 +6030,7 @@ FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b)
 FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b)
 {
     _sse2neon_const __m128 mask = _mm_setr_ps(-1.0f, 1.0f, -1.0f, 1.0f);
-#if (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) || \
+#if (defined(__aarch64__) || defined(_M_ARM64)) || \
     defined(__ARM_FEATURE_FMA) /* VFPv4+ */
     return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(a),
                                             vreinterpretq_f32_m128(mask),
@@ -6058,7 +6045,7 @@ FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pd
 FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(
         vpaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
@@ -6080,7 +6067,7 @@ FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_ps
 FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128_f32(
         vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
 #else
@@ -6098,7 +6085,7 @@ FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pd
 FORCE_INLINE __m128d _mm_hsub_pd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     float64x2_t _a = vreinterpretq_f64_m128d(a);
     float64x2_t _b = vreinterpretq_f64_m128d(b);
     return vreinterpretq_m128d_f64(
@@ -6124,7 +6111,7 @@ FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)
 {
     float32x4_t a = vreinterpretq_f32_m128(_a);
     float32x4_t b = vreinterpretq_f32_m128(_b);
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128_f32(
         vsubq_f32(vuzp1q_f32(a, b), vuzp2q_f32(a, b)));
 #else
@@ -6149,7 +6136,7 @@ FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movedup_pd
 FORCE_INLINE __m128d _mm_movedup_pd(__m128d a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(
         vdupq_laneq_f64(vreinterpretq_f64_m128d(a), 0));
 #else
@@ -6163,7 +6150,7 @@ FORCE_INLINE __m128d _mm_movedup_pd(__m128d a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehdup_ps
 FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128_f32(
         vtrn2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)));
 #elif defined(_sse2neon_shuffle)
@@ -6182,7 +6169,7 @@ FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_moveldup_ps
 FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128_f32(
         vtrn1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)));
 #elif defined(_sse2neon_shuffle)
@@ -6250,32 +6237,32 @@ FORCE_INLINE __m64 _mm_abs_pi8(__m64 a)
 // the result right by imm8 bytes, and store the low 16 bytes in dst.
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_epi8
 #if defined(__GNUC__) && !defined(__clang__)
-#define _mm_alignr_epi8(a, b, imm) \
-    __extension__({ \
-        uint8x16_t _a = vreinterpretq_u8_m128i(a); \
-        uint8x16_t _b = vreinterpretq_u8_m128i(b); \
-        __m128i ret; \
-        if (_sse2neon_unlikely((imm) & ~31)) \
-            ret = vreinterpretq_m128i_u8(vdupq_n_u8(0)); \
-        else if ((imm) >= 16) \
-            ret = _mm_srli_si128(a, (imm) >= 16 ? (imm) - 16 : 0); \
-        else \
-            ret = vreinterpretq_m128i_u8( \
-                vextq_u8(_b, _a, (imm) < 16 ? (imm) : 0)); \
-        ret; \
+#define _mm_alignr_epi8(a, b, imm) \
+    __extension__({ \
+        uint8x16_t _a = vreinterpretq_u8_m128i(a); \
+        uint8x16_t _b = vreinterpretq_u8_m128i(b); \
+        __m128i ret; \
+        if (_sse2neon_unlikely((imm) & ~31)) \
+            ret = vreinterpretq_m128i_u8(vdupq_n_u8(0)); \
+        else if (imm >= 16) \
+            ret = _mm_srli_si128(a, imm >= 16 ? imm - 16 : 0); \
+        else \
+            ret = \
+                vreinterpretq_m128i_u8(vextq_u8(_b, _a, imm < 16 ? imm : 0)); \
+        ret; \
     })
 #else
-#define _mm_alignr_epi8(a, b, imm) \
-    _sse2neon_define2( \
-        __m128i, a, b, uint8x16_t __a = vreinterpretq_u8_m128i(_a); \
-        uint8x16_t __b = vreinterpretq_u8_m128i(_b); __m128i ret; \
-        if (_sse2neon_unlikely((imm) & ~31)) ret = \
-            vreinterpretq_m128i_u8(vdupq_n_u8(0)); \
-        else if ((imm) >= 16) ret = \
-            _mm_srli_si128(_a, (imm) >= 16 ? (imm) - 16 : 0); \
-        else ret = vreinterpretq_m128i_u8( \
-            vextq_u8(__b, __a, (imm) < 16 ? (imm) : 0)); \
+#define _mm_alignr_epi8(a, b, imm) \
+    _sse2neon_define2( \
+        __m128i, a, b, uint8x16_t __a = vreinterpretq_u8_m128i(_a); \
+        uint8x16_t __b = vreinterpretq_u8_m128i(_b); __m128i ret; \
+        if (_sse2neon_unlikely((imm) & ~31)) ret = \
+            vreinterpretq_m128i_u8(vdupq_n_u8(0)); \
+        else if (imm >= 16) ret = \
+            _mm_srli_si128(_a, imm >= 16 ? imm - 16 : 0); \
+        else ret = \
+            vreinterpretq_m128i_u8(vextq_u8(__b, __a, imm < 16 ? imm : 0)); \
         _sse2neon_return(ret);)
 #endif

@@ -6310,7 +6297,7 @@ FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b)
 {
     int16x8_t a = vreinterpretq_s16_m128i(_a);
     int16x8_t b = vreinterpretq_s16_m128i(_b);
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128i_s16(vpaddq_s16(a, b));
 #else
     return vreinterpretq_m128i_s16(
@@ -6326,7 +6313,7 @@ FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b)
 {
     int32x4_t a = vreinterpretq_s32_m128i(_a);
    int32x4_t b = vreinterpretq_s32_m128i(_b);
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128i_s32(vpaddq_s32(a, b));
 #else
     return vreinterpretq_m128i_s32(
@@ -6358,7 +6345,7 @@ FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_epi16
 FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     int16x8_t a = vreinterpretq_s16_m128i(_a);
     int16x8_t b = vreinterpretq_s16_m128i(_b);
     return vreinterpretq_s64_s16(
@@ -6383,7 +6370,7 @@ FORCE_INLINE __m64 _mm_hadds_pi16(__m64 _a, __m64 _b)
 {
     int16x4_t a = vreinterpret_s16_m64(_a);
     int16x4_t b = vreinterpret_s16_m64(_b);
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpret_s64_s16(vqadd_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
 #else
     int16x4x2_t res = vuzp_s16(a, b);
@@ -6398,7 +6385,7 @@ FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)
 {
     int16x8_t a = vreinterpretq_s16_m128i(_a);
     int16x8_t b = vreinterpretq_s16_m128i(_b);
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128i_s16(
         vsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
 #else
@@ -6414,7 +6401,7 @@ FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b)
 {
     int32x4_t a = vreinterpretq_s32_m128i(_a);
     int32x4_t b = vreinterpretq_s32_m128i(_b);
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128i_s32(
         vsubq_s32(vuzp1q_s32(a, b), vuzp2q_s32(a, b)));
 #else
@@ -6430,7 +6417,7 @@ FORCE_INLINE __m64 _mm_hsub_pi16(__m64 _a, __m64 _b)
 {
     int16x4_t a = vreinterpret_s16_m64(_a);
     int16x4_t b = vreinterpret_s16_m64(_b);
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpret_m64_s16(vsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
 #else
     int16x4x2_t c = vuzp_s16(a, b);
@@ -6445,7 +6432,7 @@ FORCE_INLINE __m64 _mm_hsub_pi32(__m64 _a, __m64 _b)
 {
     int32x2_t a = vreinterpret_s32_m64(_a);
     int32x2_t b = vreinterpret_s32_m64(_b);
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpret_m64_s32(vsub_s32(vuzp1_s32(a, b), vuzp2_s32(a, b)));
 #else
     int32x2x2_t c = vuzp_s32(a, b);
@@ -6460,7 +6447,7 @@ FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b)
 {
     int16x8_t a = vreinterpretq_s16_m128i(_a);
     int16x8_t b = vreinterpretq_s16_m128i(_b);
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128i_s16(
         vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
 #else
@@ -6476,7 +6463,7 @@ FORCE_INLINE __m64 _mm_hsubs_pi16(__m64 _a, __m64 _b)
 {
     int16x4_t a = vreinterpret_s16_m64(_a);
     int16x4_t b = vreinterpret_s16_m64(_b);
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpret_m64_s16(vqsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
 #else
     int16x4x2_t c = vuzp_s16(a, b);
@@ -6491,7 +6478,7 @@ FORCE_INLINE __m64 _mm_hsubs_pi16(__m64 _a, __m64 _b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_epi16
 FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b)
 {
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     uint8x16_t a = vreinterpretq_u8_m128i(_a);
     int8x16_t b = vreinterpretq_s8_m128i(_b);
     int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
@@ -6595,7 +6582,7 @@ FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
     uint8x16_t idx = vreinterpretq_u8_m128i(b);  // input b
     uint8x16_t idx_masked =
         vandq_u8(idx, vdupq_n_u8(0x8F));  // avoid using meaningless bits
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked));
 #elif defined(__GNUC__)
     int8x16_t ret;
@@ -6641,7 +6628,7 @@ FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)
     // (b < 0) ? 0xFFFF : 0
     uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15));
     // (b == 0) ? 0xFFFF : 0
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b));
 #else
     int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0)));
@@ -6670,7 +6657,7 @@ FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b)
     uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31));
     // (b == 0) ? 0xFFFFFFFF : 0
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b));
 #else
     int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0)));
@@ -6699,7 +6686,7 @@ FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b)
     uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7));
     // (b == 0) ? 0xFF : 0
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b));
 #else
     int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0)));
@@ -6728,7 +6715,7 @@ FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b)
     uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15));
     // (b == 0) ? 0xFFFF : 0
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(__aarch64__) || defined(_M_ARM64)
     int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b));
 #else
     int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0)));
@@ -6757,7 +6744,7 @@ FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b)
     uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31));
     // (b == 0) ?
0xFFFFFFFF : 0 -#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC) +#if defined(__aarch64__) || defined(_M_ARM64) int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b)); #else int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0))); @@ -6786,7 +6773,7 @@ FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b) uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7)); // (b == 0) ? 0xFF : 0 -#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC) +#if defined(__aarch64__) || defined(_M_ARM64) int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b)); #else int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0))); @@ -6844,9 +6831,11 @@ FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_ps FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8) { - const uint32_t ALIGN_STRUCT(16) data[4] = { - (imm8 & (1 << 0)) ? UINT32_MAX : 0, (imm8 & (1 << 1)) ? UINT32_MAX : 0, - (imm8 & (1 << 2)) ? UINT32_MAX : 0, (imm8 & (1 << 3)) ? UINT32_MAX : 0}; + const uint32_t + ALIGN_STRUCT(16) data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0, + ((imm8) & (1 << 1)) ? UINT32_MAX : 0, + ((imm8) & (1 << 2)) ? UINT32_MAX : 0, + ((imm8) & (1 << 3)) ? UINT32_MAX : 0}; uint32x4_t mask = vld1q_u32(data); float32x4_t a = vreinterpretq_f32_m128(_a); float32x4_t b = vreinterpretq_f32_m128(_b); @@ -6873,7 +6862,7 @@ FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask) { uint64x2_t mask = vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_m128d(_mask), 63)); -#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC) +#if defined(__aarch64__) || defined(_M_ARM64) float64x2_t a = vreinterpretq_f64_m128d(_a); float64x2_t b = vreinterpretq_f64_m128d(_b); return vreinterpretq_m128d_f64(vbslq_f64(mask, b, a)); @@ -6903,7 +6892,7 @@ FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_pd FORCE_INLINE __m128d _mm_ceil_pd(__m128d a) { -#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC) +#if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128d_f64(vrndpq_f64(vreinterpretq_f64_m128d(a))); #else double a0, a1; @@ -6919,7 +6908,7 @@ FORCE_INLINE __m128d _mm_ceil_pd(__m128d a) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ps FORCE_INLINE __m128 _mm_ceil_ps(__m128 a) { -#if (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) || \ +#if (defined(__aarch64__) || defined(_M_ARM64)) || \ defined(__ARM_FEATURE_DIRECTED_ROUNDING) return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a))); #else @@ -6952,7 +6941,7 @@ FORCE_INLINE __m128 _mm_ceil_ss(__m128 a, __m128 b) // in dst FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b) { -#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC) +#if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128i_u64( vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b))); #else @@ -7109,7 +7098,7 @@ FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm) _mm_castsi128_pd(_mm_set_epi64x(bit5Mask, bit4Mask)); __m128d tmp = _mm_and_pd(mul, mulMask); #else -#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC) +#if defined(__aarch64__) || defined(_M_ARM64) double d0 = (imm & 0x10) ? 
vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0) * vgetq_lane_f64(vreinterpretq_f64_m128d(b), 0) : 0; @@ -7131,7 +7120,7 @@ FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm) __m128d tmp = _mm_set_pd(d1, d0); #endif // Sum the products -#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC) +#if defined(__aarch64__) || defined(_M_ARM64) double sum = vpaddd_f64(vreinterpretq_f64_m128d(tmp)); #else double _tmp0 = sse2neon_recast_u64_f64( @@ -7155,7 +7144,7 @@ FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm) { float32x4_t elementwise_prod = _mm_mul_ps(a, b); -#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC) +#if defined(__aarch64__) || defined(_M_ARM64) /* shortcuts */ if (imm == 0xFF) { return _mm_set1_ps(vaddvq_f32(elementwise_prod)); @@ -7225,7 +7214,7 @@ FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_pd FORCE_INLINE __m128d _mm_floor_pd(__m128d a) { -#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC) +#if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128d_f64(vrndmq_f64(vreinterpretq_f64_m128d(a))); #else double a0, a1; @@ -7241,7 +7230,7 @@ FORCE_INLINE __m128d _mm_floor_pd(__m128d a) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ps FORCE_INLINE __m128 _mm_floor_ps(__m128 a) { -#if (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) || \ +#if (defined(__aarch64__) || defined(_M_ARM64)) || \ defined(__ARM_FEATURE_DIRECTED_ROUNDING) return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a))); #else @@ -7300,24 +7289,24 @@ FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b) // element from b into tmp using the control in imm8. Store tmp to dst using // the mask in imm8 (elements are zeroed out when the corresponding bit is set). // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=insert_ps -#define _mm_insert_ps(a, b, imm8) \ - _sse2neon_define2( \ - __m128, a, b, \ - float32x4_t tmp1 = \ - vsetq_lane_f32(vgetq_lane_f32(_b, ((imm8) >> 6) & 0x3), \ - vreinterpretq_f32_m128(_a), 0); \ - float32x4_t tmp2 = \ - vsetq_lane_f32(vgetq_lane_f32(tmp1, 0), \ - vreinterpretq_f32_m128(_a), (((imm8) >> 4) & 0x3)); \ - const uint32_t data[4] = \ - _sse2neon_init(((imm8) & (1 << 0)) ? UINT32_MAX : 0, \ - ((imm8) & (1 << 1)) ? UINT32_MAX : 0, \ - ((imm8) & (1 << 2)) ? UINT32_MAX : 0, \ - ((imm8) & (1 << 3)) ? UINT32_MAX : 0); \ - uint32x4_t mask = vld1q_u32(data); \ - float32x4_t all_zeros = vdupq_n_f32(0); \ - \ - _sse2neon_return(vreinterpretq_m128_f32( \ +#define _mm_insert_ps(a, b, imm8) \ + _sse2neon_define2( \ + __m128, a, b, \ + float32x4_t tmp1 = \ + vsetq_lane_f32(vgetq_lane_f32(_b, (imm8 >> 6) & 0x3), \ + vreinterpretq_f32_m128(_a), 0); \ + float32x4_t tmp2 = \ + vsetq_lane_f32(vgetq_lane_f32(tmp1, 0), \ + vreinterpretq_f32_m128(_a), ((imm8 >> 4) & 0x3)); \ + const uint32_t data[4] = \ + _sse2neon_init(((imm8) & (1 << 0)) ? UINT32_MAX : 0, \ + ((imm8) & (1 << 1)) ? UINT32_MAX : 0, \ + ((imm8) & (1 << 2)) ? UINT32_MAX : 0, \ + ((imm8) & (1 << 3)) ? 
UINT32_MAX : 0); \ + uint32x4_t mask = vld1q_u32(data); \ + float32x4_t all_zeros = vdupq_n_f32(0); \ + \ + _sse2neon_return(vreinterpretq_m128_f32( \ vbslq_f32(mask, all_zeros, vreinterpretq_f32_m128(tmp2))));) // Compare packed signed 32-bit integers in a and b, and store packed maximum @@ -7399,7 +7388,7 @@ FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a) { __m128i dst; uint16_t min, idx = 0; -#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC) +#if defined(__aarch64__) || defined(_M_ARM64) // Find the minimum value min = vminvq_u16(vreinterpretq_u16_m128i(a)); @@ -7502,7 +7491,7 @@ FORCE_INLINE __m128i _mm_mpsadbw_epu8(__m128i a, __m128i b, const int imm) c26 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_2), low_b)); uint8x16_t _a_3 = vextq_u8(_a, _a, 3); c37 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_3), low_b)); -#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC) +#if defined(__aarch64__) || defined(_M_ARM64) // |0|4|2|6| c04 = vpaddq_s16(c04, c26); // |1|5|3|7| @@ -7562,7 +7551,7 @@ FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_pd FORCE_INLINE __m128d _mm_round_pd(__m128d a, int rounding) { -#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC) +#if defined(__aarch64__) || defined(_M_ARM64) switch (rounding) { case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC): return vreinterpretq_m128d_f64(vrndnq_f64(vreinterpretq_f64_m128d(a))); @@ -7631,7 +7620,7 @@ FORCE_INLINE __m128d _mm_round_pd(__m128d a, int rounding) // software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding) { -#if (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) || \ +#if (defined(__aarch64__) || defined(_M_ARM64)) || \ defined(__ARM_FEATURE_DIRECTED_ROUNDING) switch (rounding) { case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC): @@ -7778,9 +7767,9 @@ FORCE_INLINE int _mm_test_mix_ones_zeros(__m128i a, __m128i mask) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_si128 FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b) { - int64x2_t s64_vec = + int64x2_t s64 = vbicq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a)); - return !(vgetq_lane_s64(s64_vec, 0) | vgetq_lane_s64(s64_vec, 1)); + return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1)); } // Compute the bitwise AND of 128 bits (representing integer data) in a and b, @@ -7798,9 +7787,9 @@ FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_si128 FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b) { - int64x2_t s64_vec = + int64x2_t s64 = vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)); - return !(vgetq_lane_s64(s64_vec, 0) | vgetq_lane_s64(s64_vec, 1)); + return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1)); } /* SSE4.2 */ @@ -7968,40 +7957,40 @@ static const uint8_t ALIGN_STRUCT(16) _sse2neon_cmpestr_mask8b[16] = { SSE2NEON_CAT(u, size))) \ } while (0) -#define SSE2NEON_CMP_EQUAL_ANY_IMPL(type) \ - static uint16_t _sse2neon_cmp_##type##_equal_any(__m128i a, int la, \ - __m128i b, int lb) \ - { \ - __m128i mtx[16]; \ - PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \ - SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type)); \ - return SSE2NEON_CAT( \ - _sse2neon_aggregate_equal_any_, \ - SSE2NEON_CAT( \ - 
SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \ - SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, \ - type))))(la, lb, mtx); \ +#define SSE2NEON_CMP_EQUAL_ANY_IMPL(type) \ + static int _sse2neon_cmp_##type##_equal_any(__m128i a, int la, __m128i b, \ + int lb) \ + { \ + __m128i mtx[16]; \ + PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \ + SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type)); \ + return SSE2NEON_CAT( \ + _sse2neon_aggregate_equal_any_, \ + SSE2NEON_CAT( \ + SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \ + SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, \ + type))))(la, lb, mtx); \ } -#define SSE2NEON_CMP_RANGES_IMPL(type, data_type, us, byte_or_word) \ - static uint16_t _sse2neon_cmp_##us##type##_ranges(__m128i a, int la, \ - __m128i b, int lb) \ - { \ - __m128i mtx[16]; \ - PCMPSTR_RANGES( \ - a, b, mtx, data_type, us, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \ - SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), byte_or_word); \ - return SSE2NEON_CAT( \ - _sse2neon_aggregate_ranges_, \ - SSE2NEON_CAT( \ - SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \ - SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, \ - type))))(la, lb, mtx); \ +#define SSE2NEON_CMP_RANGES_IMPL(type, data_type, us, byte_or_word) \ + static int _sse2neon_cmp_##us##type##_ranges(__m128i a, int la, __m128i b, \ + int lb) \ + { \ + __m128i mtx[16]; \ + PCMPSTR_RANGES( \ + a, b, mtx, data_type, us, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \ + SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), byte_or_word); \ + return SSE2NEON_CAT( \ + _sse2neon_aggregate_ranges_, \ + SSE2NEON_CAT( \ + SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \ + SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, \ + type))))(la, lb, mtx); \ } #define SSE2NEON_CMP_EQUAL_ORDERED_IMPL(type) \ - static uint16_t _sse2neon_cmp_##type##_equal_ordered(__m128i a, int la, \ - __m128i b, int lb) \ + static int _sse2neon_cmp_##type##_equal_ordered(__m128i a, int la, \ + __m128i b, int lb) \ { \ __m128i mtx[16]; \ PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \ @@ -8015,34 +8004,29 @@ static const uint8_t ALIGN_STRUCT(16) _sse2neon_cmpestr_mask8b[16] = { SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), la, lb, mtx); \ } -static uint16_t _sse2neon_aggregate_equal_any_8x16(int la, - int lb, - __m128i mtx[16]) +static int _sse2neon_aggregate_equal_any_8x16(int la, int lb, __m128i mtx[16]) { - uint16_t res = 0; + int res = 0; int m = (1 << la) - 1; uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b); - uint8x8_t t_lo = vtst_u8(vdup_n_u8((uint8_t) (m & 0xff)), vec_mask); - uint8x8_t t_hi = vtst_u8(vdup_n_u8((uint8_t) (m >> 8)), vec_mask); + uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask); + uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask); uint8x16_t vec = vcombine_u8(t_lo, t_hi); for (int j = 0; j < lb; j++) { mtx[j] = vreinterpretq_m128i_u8( vandq_u8(vec, vreinterpretq_u8_m128i(mtx[j]))); mtx[j] = vreinterpretq_m128i_u8( vshrq_n_u8(vreinterpretq_u8_m128i(mtx[j]), 7)); - uint16_t tmp = - _sse2neon_vaddvq_u8(vreinterpretq_u8_m128i(mtx[j])) ? 1 : 0; + int tmp = _sse2neon_vaddvq_u8(vreinterpretq_u8_m128i(mtx[j])) ? 
1 : 0; res |= (tmp << j); } return res; } -static uint16_t _sse2neon_aggregate_equal_any_16x8(int la, - int lb, - __m128i mtx[16]) +static int _sse2neon_aggregate_equal_any_16x8(int la, int lb, __m128i mtx[16]) { - uint16_t res = 0; - uint16_t m = (uint16_t) (1 << la) - 1; + int res = 0; + int m = (1 << la) - 1; uint16x8_t vec = vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b)); for (int j = 0; j < lb; j++) { @@ -8050,8 +8034,7 @@ static uint16_t _sse2neon_aggregate_equal_any_16x8(int la, vandq_u16(vec, vreinterpretq_u16_m128i(mtx[j]))); mtx[j] = vreinterpretq_m128i_u16( vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 15)); - uint16_t tmp = - _sse2neon_vaddvq_u16(vreinterpretq_u16_m128i(mtx[j])) ? 1 : 0; + int tmp = _sse2neon_vaddvq_u16(vreinterpretq_u16_m128i(mtx[j])) ? 1 : 0; res |= (tmp << j); } return res; @@ -8065,10 +8048,10 @@ static uint16_t _sse2neon_aggregate_equal_any_16x8(int la, SSE2NEON_GENERATE_CMP_EQUAL_ANY(SSE2NEON_CMP_EQUAL_ANY_) -static uint16_t _sse2neon_aggregate_ranges_16x8(int la, int lb, __m128i mtx[16]) +static int _sse2neon_aggregate_ranges_16x8(int la, int lb, __m128i mtx[16]) { - uint16_t res = 0; - uint16_t m = (uint16_t) (1 << la) - 1; + int res = 0; + int m = (1 << la) - 1; uint16x8_t vec = vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b)); for (int j = 0; j < lb; j++) { @@ -8080,24 +8063,24 @@ static uint16_t _sse2neon_aggregate_ranges_16x8(int la, int lb, __m128i mtx[16]) vshrq_n_u32(vreinterpretq_u32_m128i(mtx[j]), 16)); uint32x4_t vec_res = vandq_u32(vreinterpretq_u32_m128i(mtx[j]), vreinterpretq_u32_m128i(tmp)); -#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC) - uint16_t t = vaddvq_u32(vec_res) ? 1 : 0; +#if defined(__aarch64__) || defined(_M_ARM64) + int t = vaddvq_u32(vec_res) ? 1 : 0; #else uint64x2_t sumh = vpaddlq_u32(vec_res); - uint16_t t = vgetq_lane_u64(sumh, 0) + vgetq_lane_u64(sumh, 1); + int t = vgetq_lane_u64(sumh, 0) + vgetq_lane_u64(sumh, 1); #endif res |= (t << j); } return res; } -static uint16_t _sse2neon_aggregate_ranges_8x16(int la, int lb, __m128i mtx[16]) +static int _sse2neon_aggregate_ranges_8x16(int la, int lb, __m128i mtx[16]) { - uint16_t res = 0; - uint16_t m = (uint16_t) ((1 << la) - 1); + int res = 0; + int m = (1 << la) - 1; uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b); - uint8x8_t t_lo = vtst_u8(vdup_n_u8((uint8_t) (m & 0xff)), vec_mask); - uint8x8_t t_hi = vtst_u8(vdup_n_u8((uint8_t) (m >> 8)), vec_mask); + uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask); + uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask); uint8x16_t vec = vcombine_u8(t_lo, t_hi); for (int j = 0; j < lb; j++) { mtx[j] = vreinterpretq_m128i_u8( @@ -8108,7 +8091,7 @@ static uint16_t _sse2neon_aggregate_ranges_8x16(int la, int lb, __m128i mtx[16]) vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 8)); uint16x8_t vec_res = vandq_u16(vreinterpretq_u16_m128i(mtx[j]), vreinterpretq_u16_m128i(tmp)); - uint16_t t = _sse2neon_vaddvq_u16(vec_res) ? 1 : 0; + int t = _sse2neon_vaddvq_u16(vec_res) ? 1 : 0; res |= (t << j); } return res; @@ -8130,25 +8113,22 @@ SSE2NEON_GENERATE_CMP_RANGES(SSE2NEON_CMP_RANGES_) #undef SSE2NEON_CMP_RANGES_IS_BYTE #undef SSE2NEON_CMP_RANGES_IS_WORD -static uint16_t _sse2neon_cmp_byte_equal_each(__m128i a, - int la, - __m128i b, - int lb) +static int _sse2neon_cmp_byte_equal_each(__m128i a, int la, __m128i b, int lb) { uint8x16_t mtx = vceqq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)); - uint16_t m0 = (la < lb) ? 
0 : (uint16_t) ((1 << la) - (1 << lb)); - uint16_t m1 = (uint16_t) (0x10000 - (1 << la)); - uint16_t tb = (uint16_t) (0x10000 - (1 << lb)); + int m0 = (la < lb) ? 0 : ((1 << la) - (1 << lb)); + int m1 = 0x10000 - (1 << la); + int tb = 0x10000 - (1 << lb); uint8x8_t vec_mask, vec0_lo, vec0_hi, vec1_lo, vec1_hi; uint8x8_t tmp_lo, tmp_hi, res_lo, res_hi; vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b); - vec0_lo = vtst_u8(vdup_n_u8((uint8_t) m0), vec_mask); - vec0_hi = vtst_u8(vdup_n_u8((uint8_t) (m0 >> 8)), vec_mask); - vec1_lo = vtst_u8(vdup_n_u8((uint8_t) m1), vec_mask); - vec1_hi = vtst_u8(vdup_n_u8((uint8_t) (m1 >> 8)), vec_mask); - tmp_lo = vtst_u8(vdup_n_u8((uint8_t) tb), vec_mask); - tmp_hi = vtst_u8(vdup_n_u8((uint8_t) (tb >> 8)), vec_mask); + vec0_lo = vtst_u8(vdup_n_u8(m0), vec_mask); + vec0_hi = vtst_u8(vdup_n_u8(m0 >> 8), vec_mask); + vec1_lo = vtst_u8(vdup_n_u8(m1), vec_mask); + vec1_hi = vtst_u8(vdup_n_u8(m1 >> 8), vec_mask); + tmp_lo = vtst_u8(vdup_n_u8(tb), vec_mask); + tmp_hi = vtst_u8(vdup_n_u8(tb >> 8), vec_mask); res_lo = vbsl_u8(vec0_lo, vdup_n_u8(0), vget_low_u8(mtx)); res_hi = vbsl_u8(vec0_hi, vdup_n_u8(0), vget_high_u8(mtx)); @@ -8157,20 +8137,17 @@ static uint16_t _sse2neon_cmp_byte_equal_each(__m128i a, res_lo = vand_u8(res_lo, vec_mask); res_hi = vand_u8(res_hi, vec_mask); - return _sse2neon_vaddv_u8(res_lo) + - (uint16_t) (_sse2neon_vaddv_u8(res_hi) << 8); + int res = _sse2neon_vaddv_u8(res_lo) + (_sse2neon_vaddv_u8(res_hi) << 8); + return res; } -static uint16_t _sse2neon_cmp_word_equal_each(__m128i a, - int la, - __m128i b, - int lb) +static int _sse2neon_cmp_word_equal_each(__m128i a, int la, __m128i b, int lb) { uint16x8_t mtx = vceqq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)); - uint16_t m0 = (uint16_t) ((la < lb) ? 0 : ((1 << la) - (1 << lb))); - uint16_t m1 = (uint16_t) (0x100 - (1 << la)); - uint16_t tb = (uint16_t) (0x100 - (1 << lb)); + int m0 = (la < lb) ? 
0 : ((1 << la) - (1 << lb)); + int m1 = 0x100 - (1 << la); + int tb = 0x100 - (1 << lb); uint16x8_t vec_mask = vld1q_u16(_sse2neon_cmpestr_mask16b); uint16x8_t vec0 = vtstq_u16(vdupq_n_u16(m0), vec_mask); uint16x8_t vec1 = vtstq_u16(vdupq_n_u16(m1), vec_mask); @@ -8185,22 +8162,18 @@ static uint16_t _sse2neon_cmp_word_equal_each(__m128i a, #define SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UWORD 0 #define SSE2NEON_AGGREGATE_EQUAL_ORDER_IMPL(size, number_of_lanes, data_type) \ - static uint16_t \ - _sse2neon_aggregate_equal_ordered_##size##x##number_of_lanes( \ - int bound, int la, int lb, __m128i mtx[16]) \ + static int _sse2neon_aggregate_equal_ordered_##size##x##number_of_lanes( \ + int bound, int la, int lb, __m128i mtx[16]) \ { \ - uint16_t res = 0; \ - uint16_t m1 = \ - (uint16_t) (SSE2NEON_IIF(data_type)(0x10000, 0x100) - (1 << la)); \ + int res = 0; \ + int m1 = SSE2NEON_IIF(data_type)(0x10000, 0x100) - (1 << la); \ uint##size##x8_t vec_mask = SSE2NEON_IIF(data_type)( \ vld1_u##size(_sse2neon_cmpestr_mask##size##b), \ vld1q_u##size(_sse2neon_cmpestr_mask##size##b)); \ uint##size##x##number_of_lanes##_t vec1 = SSE2NEON_IIF(data_type)( \ - vcombine_u##size( \ - vtst_u##size(vdup_n_u##size((uint##size##_t) m1), vec_mask), \ - vtst_u##size(vdup_n_u##size((uint##size##_t)(m1 >> 8)), \ - vec_mask)), \ - vtstq_u##size(vdupq_n_u##size((uint##size##_t) m1), vec_mask)); \ + vcombine_u##size(vtst_u##size(vdup_n_u##size(m1), vec_mask), \ + vtst_u##size(vdup_n_u##size(m1 >> 8), vec_mask)), \ + vtstq_u##size(vdupq_n_u##size(m1), vec_mask)); \ uint##size##x##number_of_lanes##_t vec_minusone = vdupq_n_u##size(-1); \ uint##size##x##number_of_lanes##_t vec_zero = vdupq_n_u##size(0); \ for (int j = 0; j < lb; j++) { \ @@ -8217,7 +8190,7 @@ static uint16_t _sse2neon_cmp_word_equal_each(__m128i a, int val = 1; \ for (int j = 0, k = i; j < bound - i && k < bound; j++, k++) \ val &= ptr[k * bound + j]; \ - res += (uint16_t) (val << i); \ + res += val << i; \ } \ return res; \ } @@ -8264,17 +8237,14 @@ enum { SSE2NEON_CMPESTR_LIST #undef _ }; -typedef uint16_t (*cmpestr_func_t)(__m128i a, int la, __m128i b, int lb); +typedef int (*cmpestr_func_t)(__m128i a, int la, __m128i b, int lb); static cmpestr_func_t _sse2neon_cmpfunc_table[] = { #define _(name, func_suffix) _sse2neon_##func_suffix, SSE2NEON_CMPESTR_LIST #undef _ }; -FORCE_INLINE uint16_t _sse2neon_sido_negative(int res, - int lb, - int imm8, - int bound) +FORCE_INLINE int _sse2neon_sido_negative(int res, int lb, int imm8, int bound) { switch (imm8 & 0x30) { case _SIDD_NEGATIVE_POLARITY: @@ -8287,7 +8257,7 @@ FORCE_INLINE uint16_t _sse2neon_sido_negative(int res, break; } - return (uint16_t) (res & ((bound == 8) ? 0xFF : 0xFFFF)); + return res & ((bound == 8) ? 0xFF : 0xFFFF); } FORCE_INLINE int _sse2neon_clz(unsigned int x) @@ -8336,7 +8306,7 @@ FORCE_INLINE int _sse2neon_ctzll(unsigned long long x) #define SSE2NEON_MIN(x, y) (x) < (y) ? (x) : (y) #define SSE2NEON_CMPSTR_SET_UPPER(var, imm) \ - const int var = ((imm) & 0x01) ? 8 : 16 + const int var = (imm & 0x01) ? 8 : 16 #define SSE2NEON_CMPESTRX_LEN_PAIR(a, b, la, lb) \ int tmp1 = la ^ (la >> 31); \ @@ -8351,28 +8321,28 @@ FORCE_INLINE int _sse2neon_ctzll(unsigned long long x) // As the only difference of PCMPESTR* and PCMPISTR* is the way to calculate the // length of string, we use SSE2NEON_CMP{I,E}STRX_GET_LEN to get the length of // string a and b. 
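For readers unfamiliar with the PCMPESTR*/PCMPISTR* family these tables emulate, here is a minimal usage sketch — illustrative only, not part of this patch; find_first_digit and its test data are hypothetical — showing how the implicit-length path above is reached through the public SSE4.2 API:

    #include "sse2neon.h"

    /* Return the index (0..15) of the first byte among the 16 bytes at p
     * that is an ASCII digit, or 16 if none; p must have 16 readable bytes. */
    static int find_first_digit(const char *p)
    {
        const char digits[16] = "0123456789"; /* NUL-padded set, implicit length */
        __m128i a = _mm_loadu_si128((const __m128i *) digits);
        __m128i b = _mm_loadu_si128((const __m128i *) p);
        /* _SIDD_CMP_EQUAL_ANY selects the "equal any" aggregation
         * implemented by _sse2neon_aggregate_equal_any_8x16 above. */
        return _mm_cmpistri(a, b, _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY);
    }
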
-#define SSE2NEON_COMP_AGG(a, b, la, lb, imm8, IE) \ - SSE2NEON_CMPSTR_SET_UPPER(bound, imm8); \ - SSE2NEON_##IE##_LEN_PAIR(a, b, la, lb); \ - uint16_t r2 = (_sse2neon_cmpfunc_table[(imm8) & 0x0f])(a, la, b, lb); \ +#define SSE2NEON_COMP_AGG(a, b, la, lb, imm8, IE) \ + SSE2NEON_CMPSTR_SET_UPPER(bound, imm8); \ + SSE2NEON_##IE##_LEN_PAIR(a, b, la, lb); \ + int r2 = (_sse2neon_cmpfunc_table[imm8 & 0x0f])(a, la, b, lb); \ r2 = _sse2neon_sido_negative(r2, lb, imm8, bound) -#define SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8) \ - return (r2 == 0) ? bound \ - : (((imm8) & 0x40) ? (31 - _sse2neon_clz(r2)) \ - : _sse2neon_ctz(r2)) +#define SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8) \ + return (r2 == 0) ? bound \ + : ((imm8 & 0x40) ? (31 - _sse2neon_clz(r2)) \ + : _sse2neon_ctz(r2)) #define SSE2NEON_CMPSTR_GENERATE_MASK(dst) \ __m128i dst = vreinterpretq_m128i_u8(vdupq_n_u8(0)); \ - if ((imm8) & 0x40) { \ + if (imm8 & 0x40) { \ if (bound == 8) { \ uint16x8_t tmp = vtstq_u16(vdupq_n_u16(r2), \ vld1q_u16(_sse2neon_cmpestr_mask16b)); \ dst = vreinterpretq_m128i_u16(vbslq_u16( \ tmp, vdupq_n_u16(-1), vreinterpretq_u16_m128i(dst))); \ } else { \ - uint8x16_t vec_r2 = vcombine_u8(vdup_n_u8((uint8_t) r2), \ - vdup_n_u8((uint8_t) (r2 >> 8))); \ + uint8x16_t vec_r2 = \ + vcombine_u8(vdup_n_u8(r2), vdup_n_u8(r2 >> 8)); \ uint8x16_t tmp = \ vtstq_u8(vec_r2, vld1q_u8(_sse2neon_cmpestr_mask8b)); \ dst = vreinterpretq_m128i_u8( \ @@ -8383,8 +8353,8 @@ FORCE_INLINE int _sse2neon_ctzll(unsigned long long x) dst = vreinterpretq_m128i_u16( \ vsetq_lane_u16(r2 & 0xffff, vreinterpretq_u16_m128i(dst), 0)); \ } else { \ - dst = vreinterpretq_m128i_u8(vsetq_lane_u8( \ - (uint8_t) (r2 & 0xff), vreinterpretq_u8_m128i(dst), 0)); \ + dst = vreinterpretq_m128i_u8( \ + vsetq_lane_u8(r2 & 0xff, vreinterpretq_u8_m128i(dst), 0)); \ } \ } \ return dst @@ -8487,7 +8457,7 @@ FORCE_INLINE int _mm_cmpestrz(__m128i a, #define SSE2NEON_CMPISTRX_LENGTH(str, len, imm8) \ do { \ - if ((imm8) & 0x01) { \ + if (imm8 & 0x01) { \ uint16x8_t equal_mask_##str = \ vceqq_u16(vreinterpretq_u16_m128i(str), vdupq_n_u16(0)); \ uint8x8_t res_##str = vshrn_n_u16(equal_mask_##str, 4); \ @@ -8585,7 +8555,7 @@ FORCE_INLINE int _mm_cmpistrz(__m128i a, __m128i b, const int imm8) // in b for greater than. 
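The imm8 & 0x40 branch in SSE2NEON_CMPSTR_GENERATE_INDEX above corresponds to _SIDD_MOST_SIGNIFICANT. A sketch of the explicit-length variant scanning from the high end — illustrative only; last_match_index is a hypothetical helper, not part of the patch:

    /* Index of the highest lane of b (given lb valid lanes) matching any of
     * the la valid lanes of a; returns 16 (the bound) when nothing matches. */
    static int last_match_index(__m128i a, int la, __m128i b, int lb)
    {
        return _mm_cmpestri(a, la, b, lb,
                            _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY |
                                _SIDD_MOST_SIGNIFICANT);
    }
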
FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b) { -#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC) +#if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128i_u64( vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); #else @@ -8605,11 +8575,11 @@ FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v) : [c] "+r"(crc) : [v] "r"(v)); #elif ((__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)) || \ - ((defined(_M_ARM64) || defined(_M_ARM64EC)) && !defined(__clang__)) + (defined(_M_ARM64) && !defined(__clang__)) crc = __crc32ch(crc, v); #else - crc = _mm_crc32_u8(crc, (uint8_t) (v & 0xff)); - crc = _mm_crc32_u8(crc, (uint8_t) ((v >> 8) & 0xff)); + crc = _mm_crc32_u8(crc, v & 0xff); + crc = _mm_crc32_u8(crc, (v >> 8) & 0xff); #endif return crc; } @@ -8624,11 +8594,11 @@ FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v) : [c] "+r"(crc) : [v] "r"(v)); #elif ((__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)) || \ - ((defined(_M_ARM64) || defined(_M_ARM64EC)) && !defined(__clang__)) + (defined(_M_ARM64) && !defined(__clang__)) crc = __crc32cw(crc, v); #else - crc = _mm_crc32_u16(crc, (uint16_t) (v & 0xffff)); - crc = _mm_crc32_u16(crc, (uint16_t) ((v >> 16) & 0xffff)); + crc = _mm_crc32_u16(crc, v & 0xffff); + crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff); #endif return crc; } @@ -8642,11 +8612,11 @@ FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v) __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t" : [c] "+r"(crc) : [v] "r"(v)); -#elif ((defined(_M_ARM64) || defined(_M_ARM64EC)) && !defined(__clang__)) +#elif (defined(_M_ARM64) && !defined(__clang__)) crc = __crc32cd((uint32_t) crc, v); #else - crc = _mm_crc32_u32((uint32_t) (crc), (uint32_t) (v & 0xffffffff)); - crc = _mm_crc32_u32((uint32_t) (crc), (uint32_t) ((v >> 32) & 0xffffffff)); + crc = _mm_crc32_u32((uint32_t) (crc), v & 0xffffffff); + crc = _mm_crc32_u32((uint32_t) (crc), (v >> 32) & 0xffffffff); #endif return crc; } @@ -8661,7 +8631,7 @@ FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v) : [c] "+r"(crc) : [v] "r"(v)); #elif ((__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)) || \ - ((defined(_M_ARM64) || defined(_M_ARM64EC)) && !defined(__clang__)) + (defined(_M_ARM64) && !defined(__clang__)) crc = __crc32cb(crc, v); #else crc ^= v; @@ -8712,8 +8682,7 @@ FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v) /* AES */ -#if !defined(__ARM_FEATURE_CRYPTO) && \ - ((!defined(_M_ARM64) && !defined(_M_ARM64EC)) || defined(__clang__)) +#if !defined(__ARM_FEATURE_CRYPTO) && (!defined(_M_ARM64) || defined(__clang__)) /* clang-format off */ #define SSE2NEON_AES_SBOX(w) \ { \ @@ -8804,7 +8773,7 @@ static const uint8_t _sse2neon_rsbox[256] = SSE2NEON_AES_RSBOX(SSE2NEON_AES_H0); #undef SSE2NEON_AES_H0 /* x_time function and matrix multiply function */ -#if !defined(__aarch64__) +#if !defined(__aarch64__) && !defined(_M_ARM64) #define SSE2NEON_XT(x) (((x) << 1) ^ ((((x) >> 7) & 1) * 0x1b)) #define SSE2NEON_MULTIPLY(x, y) \ (((y & 1) * x) ^ ((y >> 1 & 1) * SSE2NEON_XT(x)) ^ \ @@ -8820,7 +8789,7 @@ static const uint8_t _sse2neon_rsbox[256] = SSE2NEON_AES_RSBOX(SSE2NEON_AES_H0); // for more information. 
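The four _mm_crc32_u* implementations above chain naturally. A sketch of hashing a buffer 8 bytes at a time — illustrative only, not part of the patch; crc32c is a hypothetical helper using the conventional ~0 init and final-XOR framing:

    #include <stdint.h>
    #include <string.h>

    static uint32_t crc32c(const void *buf, size_t len)
    {
        const uint8_t *p = (const uint8_t *) buf;
        uint64_t crc = 0xFFFFFFFFu;
        while (len >= 8) { /* wide path: one CRC32CX step per 8 bytes */
            uint64_t v;
            memcpy(&v, p, 8);
            crc = _mm_crc32_u64(crc, v);
            p += 8;
            len -= 8;
        }
        while (len--) /* byte-at-a-time tail */
            crc = _mm_crc32_u8((uint32_t) crc, *p++);
        return (uint32_t) crc ^ 0xFFFFFFFFu;
    }
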
FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i RoundKey) { -#if defined(__aarch64__) +#if defined(__aarch64__) || defined(_M_ARM64) static const uint8_t shift_rows[] = { 0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3, 0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb, @@ -8979,8 +8948,7 @@ FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey) SSE2NEON_MULTIPLY(g, 0x09) ^ SSE2NEON_MULTIPLY(h, 0x0e); } - return _mm_xor_si128(vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)), - RoundKey); + return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)) ^ RoundKey; #endif } @@ -9030,7 +8998,7 @@ FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey) _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 11)], }; - return _mm_xor_si128(vreinterpretq_m128i_u8(vld1q_u8(v)), RoundKey); + return vreinterpretq_m128i_u8(vld1q_u8(v)) ^ RoundKey; #endif } @@ -9068,8 +9036,7 @@ FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey) v[((i / 4) + (i % 4)) % 4][i % 4] = _sse2neon_rsbox[_a[i]]; } - return _mm_xor_si128(vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)), - RoundKey); + return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)) ^ RoundKey; #endif } @@ -9294,14 +9261,14 @@ FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode(void) { union { fpcr_bitfield field; -#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC) +#if defined(__aarch64__) || defined(_M_ARM64) uint64_t value; #else uint32_t value; #endif } r; -#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC) +#if defined(__aarch64__) || defined(_M_ARM64) r.value = _sse2neon_get_fpcr(); #else __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ @@ -9315,7 +9282,7 @@ FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode(void) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u32 FORCE_INLINE int _mm_popcnt_u32(unsigned int a) { -#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC) +#if defined(__aarch64__) || defined(_M_ARM64) #if __has_builtin(__builtin_popcount) return __builtin_popcount(a); #elif defined(_MSC_VER) @@ -9344,7 +9311,7 @@ FORCE_INLINE int _mm_popcnt_u32(unsigned int a) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u64 FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a) { -#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC) +#if defined(__aarch64__) || defined(_M_ARM64) #if __has_builtin(__builtin_popcountll) return __builtin_popcountll(a); #elif defined(_MSC_VER) @@ -9375,14 +9342,14 @@ FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag) // regardless of the value of the FZ bit. 
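/* Editor's sketch -- illustrative only, not part of this patch: the AES
 * round helpers above compose into a full AES-128 block encryption given
 * 11 pre-expanded round keys rk[0..10] (key expansion not shown here):
 *
 *     __m128i aes128_encrypt_block(__m128i blk, const __m128i rk[11])
 *     {
 *         blk = _mm_xor_si128(blk, rk[0]);          // initial AddRoundKey
 *         for (int i = 1; i < 10; i++)
 *             blk = _mm_aesenc_si128(blk, rk[i]);   // 9 full rounds
 *         return _mm_aesenclast_si128(blk, rk[10]); // final round, no MixColumns
 *     }
 */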
union { fpcr_bitfield field; -#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC) +#if defined(__aarch64__) || defined(_M_ARM64) uint64_t value; #else uint32_t value; #endif } r; -#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC) +#if defined(__aarch64__) || defined(_M_ARM64) r.value = _sse2neon_get_fpcr(); #else __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ @@ -9390,7 +9357,7 @@ FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag) r.field.bit24 = (flag & _MM_DENORMALS_ZERO_MASK) == _MM_DENORMALS_ZERO_ON; -#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC) +#if defined(__aarch64__) || defined(_M_ARM64) _sse2neon_set_fpcr(r.value); #else __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */ @@ -9401,7 +9368,7 @@ FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=rdtsc FORCE_INLINE uint64_t _rdtsc(void) { -#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC) +#if defined(__aarch64__) || defined(_M_ARM64) uint64_t val; /* According to ARM DDI 0487F.c, from Armv8.0 to Armv8.5 inclusive, the