diff --git a/externals/nx_tzdb/CMakeLists.txt b/externals/nx_tzdb/CMakeLists.txt index 2d6b2fcc66..35d3e6d2a8 100644 --- a/externals/nx_tzdb/CMakeLists.txt +++ b/externals/nx_tzdb/CMakeLists.txt @@ -1,3 +1,6 @@ +# SPDX-FileCopyrightText: 2025 Eden Emulator Project +# SPDX-License-Identifier: GPL-3.0-or-later + # SPDX-FileCopyrightText: 2023 yuzu Emulator Project # SPDX-License-Identifier: GPL-2.0-or-later @@ -15,35 +18,58 @@ find_program(DATE_PROG date) set(CAN_BUILD_NX_TZDB true) -if (NOT GIT) - set(CAN_BUILD_NX_TZDB false) -endif() -if (NOT GNU_MAKE) - set(CAN_BUILD_NX_TZDB false) -endif() -if (NOT DATE_PROG) - set(CAN_BUILD_NX_TZDB false) -endif() -if (CMAKE_SYSTEM_NAME STREQUAL "Windows" OR ANDROID) +if (NOT (GIT AND GNU_MAKE AND DATE_PROG) OR CMAKE_SYSTEM_NAME STREQUAL "Windows" OR ANDROID) # tzdb_to_nx currently requires a posix-compliant host # MinGW and Android are handled here due to the executable format being different from the host system # TODO (lat9nq): cross-compiling support + set(CAN_BUILD_NX_TZDB false) endif() -set(NX_TZDB_VERSION "250725") -set(NX_TZDB_ROMFS_DIR "${CPM_SOURCE_CACHE}/nx_tzdb") - -if ((NOT CAN_BUILD_NX_TZDB OR YUZU_DOWNLOAD_TIME_ZONE_DATA) AND NOT EXISTS ${NX_TZDB_ROMFS_DIR}/${NX_TZDB_VERSION}) - message(STATUS "Downloading time zone data...") - AddJsonPackage(tzdb) -elseif (CAN_BUILD_NX_TZDB AND NOT YUZU_DOWNLOAD_TIME_ZONE_DATA) - # TODO(crueter): this sucked to do with cpm, see if i can get it to work again +if (CAN_BUILD_NX_TZDB AND NOT YUZU_DOWNLOAD_TIME_ZONE_DATA) message(FATAL_ERROR "Building tzdb is currently unsupported. Check back later.") add_subdirectory(tzdb_to_nx) add_dependencies(nx_tzdb x80e) - set(NX_TZDB_ROMFS_DIR "${NX_TZDB_DIR}") + set(NX_TZDB_BASE_DIR "${NX_TZDB_DIR}") + set(NX_TZDB_TZ_DIR "${NX_TZDB_BASE_DIR}/zoneinfo") +endif() + +# TODO(crueter): This is a terrible solution, but MSVC fails to link without it +# Need to investigate further but I still can't reproduce... +if (MSVC) + set(NX_TZDB_VERSION "250725") + set(NX_TZDB_ARCHIVE "${CPM_SOURCE_CACHE}/nx_tzdb/${NX_TZDB_VERSION}.zip") + + set(NX_TZDB_BASE_DIR "${CPM_SOURCE_CACHE}/nx_tzdb/tz") + set(NX_TZDB_TZ_DIR "${NX_TZDB_BASE_DIR}/zoneinfo") + + set(NX_TZDB_DOWNLOAD_URL "https://github.com/crueter/tzdb_to_nx/releases/download/${NX_TZDB_VERSION}/${NX_TZDB_VERSION}.zip") + + message(STATUS "Downloading time zone data from ${NX_TZDB_DOWNLOAD_URL}...") + file(DOWNLOAD ${NX_TZDB_DOWNLOAD_URL} ${NX_TZDB_ARCHIVE} + STATUS NX_TZDB_DOWNLOAD_STATUS) + + list(GET NX_TZDB_DOWNLOAD_STATUS 0 NX_TZDB_DOWNLOAD_STATUS_CODE) + if (NOT NX_TZDB_DOWNLOAD_STATUS_CODE EQUAL 0) + message(FATAL_ERROR "Time zone data download failed (status code ${NX_TZDB_DOWNLOAD_STATUS_CODE})") + endif() + + file(ARCHIVE_EXTRACT + INPUT + ${NX_TZDB_ARCHIVE} + DESTINATION + ${NX_TZDB_BASE_DIR}) +else() + message(STATUS "Downloading time zone data...") + AddJsonPackage(tzdb) + + target_include_directories(nx_tzdb + INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/include + INTERFACE ${NX_TZDB_INCLUDE_DIR}) + + set(NX_TZDB_BASE_DIR "${CPM_SOURCE_CACHE}/nx_tzdb") + set(NX_TZDB_TZ_DIR "${nx_tzdb_SOURCE_DIR}") endif() target_include_directories(nx_tzdb @@ -68,25 +94,25 @@ function(CreateHeader ZONE_PATH HEADER_NAME) target_sources(nx_tzdb PRIVATE ${HEADER_PATH}) endfunction() -CreateHeader(${NX_TZDB_ROMFS_DIR} base) -CreateHeader(${NX_TZDB_ROMFS_DIR}/${NX_TZDB_VERSION} zoneinfo) -CreateHeader(${NX_TZDB_ROMFS_DIR}/${NX_TZDB_VERSION}/Africa africa) -CreateHeader(${NX_TZDB_ROMFS_DIR}/${NX_TZDB_VERSION}/America america) -CreateHeader(${NX_TZDB_ROMFS_DIR}/${NX_TZDB_VERSION}/America/Argentina america_argentina) -CreateHeader(${NX_TZDB_ROMFS_DIR}/${NX_TZDB_VERSION}/America/Indiana america_indiana) -CreateHeader(${NX_TZDB_ROMFS_DIR}/${NX_TZDB_VERSION}/America/Kentucky america_kentucky) -CreateHeader(${NX_TZDB_ROMFS_DIR}/${NX_TZDB_VERSION}/America/North_Dakota america_north_dakota) -CreateHeader(${NX_TZDB_ROMFS_DIR}/${NX_TZDB_VERSION}/Antarctica antarctica) -CreateHeader(${NX_TZDB_ROMFS_DIR}/${NX_TZDB_VERSION}/Arctic arctic) -CreateHeader(${NX_TZDB_ROMFS_DIR}/${NX_TZDB_VERSION}/Asia asia) -CreateHeader(${NX_TZDB_ROMFS_DIR}/${NX_TZDB_VERSION}/Atlantic atlantic) -CreateHeader(${NX_TZDB_ROMFS_DIR}/${NX_TZDB_VERSION}/Australia australia) -CreateHeader(${NX_TZDB_ROMFS_DIR}/${NX_TZDB_VERSION}/Brazil brazil) -CreateHeader(${NX_TZDB_ROMFS_DIR}/${NX_TZDB_VERSION}/Canada canada) -CreateHeader(${NX_TZDB_ROMFS_DIR}/${NX_TZDB_VERSION}/Chile chile) -CreateHeader(${NX_TZDB_ROMFS_DIR}/${NX_TZDB_VERSION}/Etc etc) -CreateHeader(${NX_TZDB_ROMFS_DIR}/${NX_TZDB_VERSION}/Europe europe) -CreateHeader(${NX_TZDB_ROMFS_DIR}/${NX_TZDB_VERSION}/Indian indian) -CreateHeader(${NX_TZDB_ROMFS_DIR}/${NX_TZDB_VERSION}/Mexico mexico) -CreateHeader(${NX_TZDB_ROMFS_DIR}/${NX_TZDB_VERSION}/Pacific pacific) -CreateHeader(${NX_TZDB_ROMFS_DIR}/${NX_TZDB_VERSION}/US us) +CreateHeader(${NX_TZDB_BASE_DIR} base) +CreateHeader(${NX_TZDB_TZ_DIR} zoneinfo) +CreateHeader(${NX_TZDB_TZ_DIR}/Africa africa) +CreateHeader(${NX_TZDB_TZ_DIR}/America america) +CreateHeader(${NX_TZDB_TZ_DIR}/America/Argentina america_argentina) +CreateHeader(${NX_TZDB_TZ_DIR}/America/Indiana america_indiana) +CreateHeader(${NX_TZDB_TZ_DIR}/America/Kentucky america_kentucky) +CreateHeader(${NX_TZDB_TZ_DIR}/America/North_Dakota america_north_dakota) +CreateHeader(${NX_TZDB_TZ_DIR}/Antarctica antarctica) +CreateHeader(${NX_TZDB_TZ_DIR}/Arctic arctic) +CreateHeader(${NX_TZDB_TZ_DIR}/Asia asia) +CreateHeader(${NX_TZDB_TZ_DIR}/Atlantic atlantic) +CreateHeader(${NX_TZDB_TZ_DIR}/Australia australia) +CreateHeader(${NX_TZDB_TZ_DIR}/Brazil brazil) +CreateHeader(${NX_TZDB_TZ_DIR}/Canada canada) +CreateHeader(${NX_TZDB_TZ_DIR}/Chile chile) +CreateHeader(${NX_TZDB_TZ_DIR}/Etc etc) +CreateHeader(${NX_TZDB_TZ_DIR}/Europe europe) +CreateHeader(${NX_TZDB_TZ_DIR}/Indian indian) +CreateHeader(${NX_TZDB_TZ_DIR}/Mexico mexico) +CreateHeader(${NX_TZDB_TZ_DIR}/Pacific pacific) +CreateHeader(${NX_TZDB_TZ_DIR}/US us) diff --git a/externals/sse2neon/sse2neon.h b/externals/sse2neon/sse2neon.h index 66b93c1c74..79b90fe864 100755 --- a/externals/sse2neon/sse2neon.h +++ b/externals/sse2neon/sse2neon.h @@ -1,6 +1,3 @@ -// SPDX-FileCopyrightText: Copyright 2015-2024 SSE2NEON Contributors -// SPDX-License-Identifier: MIT - #ifndef SSE2NEON_H #define SSE2NEON_H @@ -54,6 +51,7 @@ // Cuda Chen // Aymen Qader // Anthony Roberts +// Sean Luchen /* Tunable configurations */ @@ -65,7 +63,7 @@ #ifndef SSE2NEON_PRECISE_MINMAX #define SSE2NEON_PRECISE_MINMAX (0) #endif -/* _mm_rcp_ps and _mm_div_ps */ +/* _mm_rcp_ps */ #ifndef SSE2NEON_PRECISE_DIV #define SSE2NEON_PRECISE_DIV (0) #endif @@ -113,6 +111,11 @@ #warning "GCC versions earlier than 10 are not supported." #endif +#if defined(__OPTIMIZE__) && !defined(SSE2NEON_SUPPRESS_WARNINGS) +#warning \ + "Report any potential compiler optimization issues when using SSE2NEON. See the 'Optimization' section at https://github.com/DLTcollab/sse2neon." +#endif + /* C language does not allow initializing a variable with a function call. */ #ifdef __cplusplus #define _sse2neon_const static const @@ -120,13 +123,26 @@ #define _sse2neon_const const #endif +#include #include #include +#include -#if defined(_WIN32) -/* Definitions for _mm_{malloc,free} are provided by - * from both MinGW-w64 and MSVC. - */ +FORCE_INLINE double sse2neon_recast_u64_f64(uint64_t u64) +{ + double f64; + memcpy(&f64, &u64, sizeof(uint64_t)); + return f64; +} +FORCE_INLINE int64_t sse2neon_recast_f64_s64(double f64) +{ + int64_t i64; + memcpy(&i64, &f64, sizeof(uint64_t)); + return i64; +} + +#if defined(_WIN32) && !defined(__MINGW32__) +/* Definitions for _mm_{malloc,free} are provided by from MSVC. */ #define SSE2NEON_ALLOC_DEFINED #endif @@ -183,7 +199,7 @@ } /* Compiler barrier */ -#if defined(_MSC_VER) +#if defined(_MSC_VER) && !defined(__clang__) #define SSE2NEON_BARRIER() _ReadWriteBarrier() #else #define SSE2NEON_BARRIER() \ @@ -244,7 +260,9 @@ FORCE_INLINE void _sse2neon_smp_mb(void) #pragma GCC push_options #endif #else -#error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A." +#error \ + "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A \ +(you could try setting target explicitly with -march or -mcpu)" #endif #endif @@ -315,6 +333,15 @@ FORCE_INLINE void _sse2neon_smp_mb(void) #define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \ (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0))) +/** + * MACRO for shuffle parameter for _mm_shuffle_pd(). + * Argument fp1 is a digit[01] that represents the fp from argument "b" + * of mm_shuffle_pd that will be placed in fp1 of result. + * fp0 is a digit[01] that represents the fp from argument "a" of mm_shuffle_pd + * that will be placed in fp0 of result. + */ +#define _MM_SHUFFLE2(fp1, fp0) (((fp1) << 1) | (fp0)) + #if __has_builtin(__builtin_shufflevector) #define _sse2neon_shuffle(type, a, b, ...) \ __builtin_shufflevector(a, b, __VA_ARGS__) @@ -383,6 +410,11 @@ typedef float32x4_t __m128d; #endif typedef int64x2_t __m128i; /* 128-bit vector containing integers */ +// Some intrinsics operate on unaligned data types. +typedef int16_t ALIGN_STRUCT(1) unaligned_int16_t; +typedef int32_t ALIGN_STRUCT(1) unaligned_int32_t; +typedef int64_t ALIGN_STRUCT(1) unaligned_int64_t; + // __int64 is defined in the Intrinsics Guide which maps to different datatype // in different data model #if !(defined(_WIN32) || defined(_WIN64) || defined(__int64)) @@ -859,7 +891,7 @@ FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b) { poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0); poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0); -#if defined(_MSC_VER) +#if defined(_MSC_VER) && !defined(__clang__) __n64 a1 = {a}, b1 = {b}; return vreinterpretq_u64_p128(vmull_p64(a1, b1)); #else @@ -1725,7 +1757,7 @@ FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ps FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b) { -#if (defined(__aarch64__) || defined(_M_ARM64)) && !SSE2NEON_PRECISE_DIV +#if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128_f32( vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); #else @@ -1763,14 +1795,18 @@ FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b) #if !defined(SSE2NEON_ALLOC_DEFINED) FORCE_INLINE void _mm_free(void *addr) { +#if defined(_WIN32) + _aligned_free(addr); +#else free(addr); +#endif } #endif FORCE_INLINE uint64_t _sse2neon_get_fpcr(void) { uint64_t value; -#if defined(_MSC_VER) +#if defined(_MSC_VER) && !defined(__clang__) value = _ReadStatusReg(ARM64_FPCR); #else __asm__ __volatile__("mrs %0, FPCR" : "=r"(value)); /* read */ @@ -1780,10 +1816,10 @@ FORCE_INLINE uint64_t _sse2neon_get_fpcr(void) FORCE_INLINE void _sse2neon_set_fpcr(uint64_t value) { -#if defined(_MSC_VER) +#if defined(_MSC_VER) && !defined(__clang__) _WriteStatusReg(ARM64_FPCR, value); #else - __asm__ __volatile__("msr FPCR, %0" ::"r"(value)); /* write */ + __asm__ __volatile__("msr FPCR, %0" ::"r"(value)); /* write */ #endif } @@ -1817,25 +1853,20 @@ FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode(void) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_ROUNDING_MODE FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void) { - union { - fpcr_bitfield field; -#if defined(__aarch64__) || defined(_M_ARM64) - uint64_t value; -#else - uint32_t value; -#endif - } r; - -#if defined(__aarch64__) || defined(_M_ARM64) - r.value = _sse2neon_get_fpcr(); -#else - __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ -#endif - - if (r.field.bit22) { - return r.field.bit23 ? _MM_ROUND_TOWARD_ZERO : _MM_ROUND_UP; - } else { - return r.field.bit23 ? _MM_ROUND_DOWN : _MM_ROUND_NEAREST; + switch (fegetround()) { + case FE_TONEAREST: + return _MM_ROUND_NEAREST; + case FE_DOWNWARD: + return _MM_ROUND_DOWN; + case FE_UPWARD: + return _MM_ROUND_UP; + case FE_TOWARDZERO: + return _MM_ROUND_TOWARD_ZERO; + default: + // fegetround() must return _MM_ROUND_NEAREST, _MM_ROUND_DOWN, + // _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO on success. all the other error + // cases we treat them as FE_TOWARDZERO (truncate). + return _MM_ROUND_TOWARD_ZERO; } } @@ -1928,7 +1959,7 @@ FORCE_INLINE __m128 _mm_loadu_ps(const float *p) FORCE_INLINE __m128i _mm_loadu_si16(const void *p) { return vreinterpretq_m128i_s16( - vsetq_lane_s16(*(const int16_t *) p, vdupq_n_s16(0), 0)); + vsetq_lane_s16(*(const unaligned_int16_t *) p, vdupq_n_s16(0), 0)); } // Load unaligned 64-bit integer from memory into the first element of dst. @@ -1936,7 +1967,7 @@ FORCE_INLINE __m128i _mm_loadu_si16(const void *p) FORCE_INLINE __m128i _mm_loadu_si64(const void *p) { return vreinterpretq_m128i_s64( - vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0))); + vsetq_lane_s64(*(const unaligned_int64_t *) p, vdupq_n_s64(0), 0)); } // Allocate size bytes of memory, aligned to the alignment specified in align, @@ -1951,8 +1982,14 @@ FORCE_INLINE void *_mm_malloc(size_t size, size_t align) return malloc(size); if (align == 2 || (sizeof(void *) == 8 && align == 4)) align = sizeof(void *); +#if defined(_WIN32) + ptr = _aligned_malloc(size, align); + if (ptr) + return ptr; +#else if (!posix_memalign(&ptr, align, size)) return ptr; +#endif return NULL; } #endif @@ -2249,7 +2286,7 @@ FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b) FORCE_INLINE void _mm_prefetch(char const *p, int i) { (void) i; -#if defined(_MSC_VER) +#if defined(_MSC_VER) && !defined(__clang__) switch (i) { case _MM_HINT_NTA: __prefetch2(p, 1); @@ -2403,7 +2440,7 @@ FORCE_INLINE void _sse2neon_mm_set_flush_zero_mode(unsigned int flag) #if defined(__aarch64__) || defined(_M_ARM64) _sse2neon_set_fpcr(r.value); #else - __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */ + __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */ #endif } @@ -2431,44 +2468,26 @@ FORCE_INLINE __m128 _mm_set_ps1(float _w) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding) { - union { - fpcr_bitfield field; -#if defined(__aarch64__) || defined(_M_ARM64) - uint64_t value; -#else - uint32_t value; -#endif - } r; - -#if defined(__aarch64__) || defined(_M_ARM64) - r.value = _sse2neon_get_fpcr(); -#else - __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ -#endif - switch (rounding) { - case _MM_ROUND_TOWARD_ZERO: - r.field.bit22 = 1; - r.field.bit23 = 1; + case _MM_ROUND_NEAREST: + rounding = FE_TONEAREST; break; case _MM_ROUND_DOWN: - r.field.bit22 = 0; - r.field.bit23 = 1; + rounding = FE_DOWNWARD; break; case _MM_ROUND_UP: - r.field.bit22 = 1; - r.field.bit23 = 0; + rounding = FE_UPWARD; break; - default: //_MM_ROUND_NEAREST - r.field.bit22 = 0; - r.field.bit23 = 0; + case _MM_ROUND_TOWARD_ZERO: + rounding = FE_TOWARDZERO; + break; + default: + // rounding must be _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, + // _MM_ROUND_TOWARD_ZERO. all the other invalid values we treat them as + // FE_TOWARDZERO (truncate). + rounding = FE_TOWARDZERO; } - -#if defined(__aarch64__) || defined(_M_ARM64) - _sse2neon_set_fpcr(r.value); -#else - __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */ -#endif + fesetround(rounding); } // Copy single-precision (32-bit) floating-point element a to the lower element @@ -2966,11 +2985,17 @@ FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b) return vreinterpretq_m128d_f64( vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else - double *da = (double *) &a; - double *db = (double *) &b; + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); double c[2]; - c[0] = da[0] + db[0]; - c[1] = da[1] + db[1]; + c[0] = a0 + b0; + c[1] = a1 + b1; return vld1q_f32((float32_t *) c); #endif } @@ -2984,11 +3009,13 @@ FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b) #if defined(__aarch64__) || defined(_M_ARM64) return _mm_move_sd(a, _mm_add_pd(a, b)); #else - double *da = (double *) &a; - double *db = (double *) &b; + double a0, a1, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); double c[2]; - c[0] = da[0] + db[0]; - c[1] = da[1]; + c[0] = a0 + b0; + c[1] = a1; return vld1q_f32((float32_t *) c); #endif } @@ -3242,13 +3269,17 @@ FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b) return vreinterpretq_m128d_u64( vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); uint64_t d[2]; - d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); - d[1] = (*(double *) &a1) >= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0); + d[0] = a0 >= b0 ? ~UINT64_C(0) : UINT64_C(0); + d[1] = a1 >= b1 ? ~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -3264,11 +3295,12 @@ FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b) return _mm_move_sd(a, _mm_cmpge_pd(a, b)); #else // expand "_mm_cmpge_pd()" to reduce unnecessary operations - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + double a0, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); uint64_t d[2]; - d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); + d[0] = a0 >= b0 ? ~UINT64_C(0) : UINT64_C(0); d[1] = a1; return vreinterpretq_m128d_u64(vld1q_u64(d)); @@ -3311,13 +3343,17 @@ FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b) return vreinterpretq_m128d_u64( vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); uint64_t d[2]; - d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); - d[1] = (*(double *) &a1) > (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0); + d[0] = a0 > b0 ? ~UINT64_C(0) : UINT64_C(0); + d[1] = a1 > b1 ? ~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -3333,11 +3369,12 @@ FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b) return _mm_move_sd(a, _mm_cmpgt_pd(a, b)); #else // expand "_mm_cmpge_pd()" to reduce unnecessary operations - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + double a0, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); uint64_t d[2]; - d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); + d[0] = a0 > b0 ? ~UINT64_C(0) : UINT64_C(0); d[1] = a1; return vreinterpretq_m128d_u64(vld1q_u64(d)); @@ -3353,13 +3390,17 @@ FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b) return vreinterpretq_m128d_u64( vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); uint64_t d[2]; - d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); - d[1] = (*(double *) &a1) <= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0); + d[0] = a0 <= b0 ? ~UINT64_C(0) : UINT64_C(0); + d[1] = a1 <= b1 ? ~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -3375,11 +3416,12 @@ FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b) return _mm_move_sd(a, _mm_cmple_pd(a, b)); #else // expand "_mm_cmpge_pd()" to reduce unnecessary operations - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + double a0, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); uint64_t d[2]; - d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); + d[0] = a0 <= b0 ? ~UINT64_C(0) : UINT64_C(0); d[1] = a1; return vreinterpretq_m128d_u64(vld1q_u64(d)); @@ -3425,13 +3467,17 @@ FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b) return vreinterpretq_m128d_u64( vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); uint64_t d[2]; - d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); - d[1] = (*(double *) &a1) < (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0); + d[0] = a0 < b0 ? ~UINT64_C(0) : UINT64_C(0); + d[1] = a1 < b1 ? ~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -3446,11 +3492,12 @@ FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b) #if defined(__aarch64__) || defined(_M_ARM64) return _mm_move_sd(a, _mm_cmplt_pd(a, b)); #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + double a0, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); uint64_t d[2]; - d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); + d[0] = a0 < b0 ? ~UINT64_C(0) : UINT64_C(0); d[1] = a1; return vreinterpretq_m128d_u64(vld1q_u64(d)); @@ -3493,15 +3540,17 @@ FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b) vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)), vdupq_n_u64(UINT64_MAX))); #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); uint64_t d[2]; - d[0] = - !((*(double *) &a0) >= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0); - d[1] = - !((*(double *) &a1) >= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0); + d[0] = !(a0 >= b0) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = !(a1 >= b1) ? ~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -3526,15 +3575,17 @@ FORCE_INLINE __m128d _mm_cmpngt_pd(__m128d a, __m128d b) vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)), vdupq_n_u64(UINT64_MAX))); #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); uint64_t d[2]; - d[0] = - !((*(double *) &a0) > (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0); - d[1] = - !((*(double *) &a1) > (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0); + d[0] = !(a0 > b0) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = !(a1 > b1) ? ~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -3559,15 +3610,17 @@ FORCE_INLINE __m128d _mm_cmpnle_pd(__m128d a, __m128d b) vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)), vdupq_n_u64(UINT64_MAX))); #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); uint64_t d[2]; - d[0] = - !((*(double *) &a0) <= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0); - d[1] = - !((*(double *) &a1) <= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0); + d[0] = !(a0 <= b0) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = !(a1 <= b1) ? ~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -3592,15 +3645,17 @@ FORCE_INLINE __m128d _mm_cmpnlt_pd(__m128d a, __m128d b) vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)), vdupq_n_u64(UINT64_MAX))); #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); uint64_t d[2]; - d[0] = - !((*(double *) &a0) < (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0); - d[1] = - !((*(double *) &a1) < (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0); + d[0] = !(a0 < b0) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = !(a1 < b1) ? ~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -3628,19 +3683,17 @@ FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b) vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b)); return vreinterpretq_m128d_u64(vandq_u64(not_nan_a, not_nan_b)); #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); uint64_t d[2]; - d[0] = ((*(double *) &a0) == (*(double *) &a0) && - (*(double *) &b0) == (*(double *) &b0)) - ? ~UINT64_C(0) - : UINT64_C(0); - d[1] = ((*(double *) &a1) == (*(double *) &a1) && - (*(double *) &b1) == (*(double *) &b1)) - ? ~UINT64_C(0) - : UINT64_C(0); + d[0] = (a0 == a0 && b0 == b0) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = (a1 == a1 && b1 == b1) ? ~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -3655,14 +3708,12 @@ FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b) #if defined(__aarch64__) || defined(_M_ARM64) return _mm_move_sd(a, _mm_cmpord_pd(a, b)); #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); + double a0, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); uint64_t d[2]; - d[0] = ((*(double *) &a0) == (*(double *) &a0) && - (*(double *) &b0) == (*(double *) &b0)) - ? ~UINT64_C(0) - : UINT64_C(0); + d[0] = (a0 == a0 && b0 == b0) ? ~UINT64_C(0) : UINT64_C(0); d[1] = a1; return vreinterpretq_m128d_u64(vld1q_u64(d)); @@ -3683,19 +3734,17 @@ FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b) return vreinterpretq_m128d_s32( vmvnq_s32(vreinterpretq_s32_u64(vandq_u64(not_nan_a, not_nan_b)))); #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); uint64_t d[2]; - d[0] = ((*(double *) &a0) == (*(double *) &a0) && - (*(double *) &b0) == (*(double *) &b0)) - ? UINT64_C(0) - : ~UINT64_C(0); - d[1] = ((*(double *) &a1) == (*(double *) &a1) && - (*(double *) &b1) == (*(double *) &b1)) - ? UINT64_C(0) - : ~UINT64_C(0); + d[0] = (a0 == a0 && b0 == b0) ? UINT64_C(0) : ~UINT64_C(0); + d[1] = (a1 == a1 && b1 == b1) ? UINT64_C(0) : ~UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -3710,14 +3759,12 @@ FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b) #if defined(__aarch64__) || defined(_M_ARM64) return _mm_move_sd(a, _mm_cmpunord_pd(a, b)); #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); + double a0, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); uint64_t d[2]; - d[0] = ((*(double *) &a0) == (*(double *) &a0) && - (*(double *) &b0) == (*(double *) &b0)) - ? UINT64_C(0) - : ~UINT64_C(0); + d[0] = (a0 == a0 && b0 == b0) ? UINT64_C(0) : ~UINT64_C(0); d[1] = a1; return vreinterpretq_m128d_u64(vld1q_u64(d)); @@ -3732,10 +3779,10 @@ FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b) #if defined(__aarch64__) || defined(_M_ARM64) return vgetq_lane_u64(vcgeq_f64(a, b), 0) & 0x1; #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - - return (*(double *) &a0 >= *(double *) &b0); + double a0, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + return a0 >= b0; #endif } @@ -3747,10 +3794,11 @@ FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b) #if defined(__aarch64__) || defined(_M_ARM64) return vgetq_lane_u64(vcgtq_f64(a, b), 0) & 0x1; #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + double a0, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); - return (*(double *) &a0 > *(double *) &b0); + return a0 > b0; #endif } @@ -3762,10 +3810,11 @@ FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b) #if defined(__aarch64__) || defined(_M_ARM64) return vgetq_lane_u64(vcleq_f64(a, b), 0) & 0x1; #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + double a0, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); - return (*(double *) &a0 <= *(double *) &b0); + return a0 <= b0; #endif } @@ -3777,10 +3826,11 @@ FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b) #if defined(__aarch64__) || defined(_M_ARM64) return vgetq_lane_u64(vcltq_f64(a, b), 0) & 0x1; #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + double a0, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); - return (*(double *) &a0 < *(double *) &b0); + return a0 < b0; #endif } @@ -3849,8 +3899,11 @@ FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a) vcombine_s32(vmovn_s64(integers), vdup_n_s32(0))); #else __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); - double d0 = ((double *) &rnd)[0]; - double d1 = ((double *) &rnd)[1]; + double d0, d1; + d0 = sse2neon_recast_u64_f64( + vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0)); + d1 = sse2neon_recast_u64_f64( + vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 1)); return _mm_set_epi32(0, 0, (int32_t) d1, (int32_t) d0); #endif } @@ -3861,8 +3914,11 @@ FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a) FORCE_INLINE __m64 _mm_cvtpd_pi32(__m128d a) { __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); - double d0 = ((double *) &rnd)[0]; - double d1 = ((double *) &rnd)[1]; + double d0, d1; + d0 = sse2neon_recast_u64_f64( + vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0)); + d1 = sse2neon_recast_u64_f64( + vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 1)); int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) d0, (int32_t) d1}; return vreinterpret_m64_s32(vld1_s32(data)); } @@ -3877,9 +3933,10 @@ FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a) float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a)); return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0))); #else - float a0 = (float) ((double *) &a)[0]; - float a1 = (float) ((double *) &a)[1]; - return _mm_set_ps(0, 0, a1, a0); + double a0, a1; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + return _mm_set_ps(0, 0, (float) a1, (float) a0); #endif } @@ -3978,7 +4035,9 @@ FORCE_INLINE double _mm_cvtsd_f64(__m128d a) #if defined(__aarch64__) || defined(_M_ARM64) return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0); #else - return ((double *) &a)[0]; + double _a = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + return _a; #endif } @@ -3991,7 +4050,8 @@ FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a) return (int32_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0); #else __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); - double ret = ((double *) &rnd)[0]; + double ret = sse2neon_recast_u64_f64( + vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0)); return (int32_t) ret; #endif } @@ -4005,7 +4065,8 @@ FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a) return (int64_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0); #else __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); - double ret = ((double *) &rnd)[0]; + double ret = sse2neon_recast_u64_f64( + vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0)); return (int64_t) ret; #endif } @@ -4027,8 +4088,10 @@ FORCE_INLINE __m128 _mm_cvtsd_ss(__m128 a, __m128d b) vget_lane_f32(vcvt_f32_f64(vreinterpretq_f64_m128d(b)), 0), vreinterpretq_f32_m128(a), 0)); #else - return vreinterpretq_m128_f32(vsetq_lane_f32((float) ((double *) &b)[0], - vreinterpretq_f32_m128(a), 0)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + return vreinterpretq_m128_f32( + vsetq_lane_f32((float) b0, vreinterpretq_f32_m128(a), 0)); #endif } @@ -4060,9 +4123,9 @@ FORCE_INLINE __m128d _mm_cvtsi32_sd(__m128d a, int32_t b) return vreinterpretq_m128d_f64( vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0)); #else - double bf = (double) b; + int64_t _b = sse2neon_recast_f64_s64((double) b); return vreinterpretq_m128d_s64( - vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0)); + vsetq_lane_s64(_b, vreinterpretq_s64_m128d(a), 0)); #endif } @@ -4088,9 +4151,9 @@ FORCE_INLINE __m128d _mm_cvtsi64_sd(__m128d a, int64_t b) return vreinterpretq_m128d_f64( vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0)); #else - double bf = (double) b; + int64_t _b = sse2neon_recast_f64_s64((double) b); return vreinterpretq_m128d_s64( - vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0)); + vsetq_lane_s64(_b, vreinterpretq_s64_m128d(a), 0)); #endif } @@ -4125,8 +4188,8 @@ FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b) return vreinterpretq_m128d_f64( vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0)); #else - return vreinterpretq_m128d_s64( - vsetq_lane_s64(*(int64_t *) &d, vreinterpretq_s64_m128d(a), 0)); + return vreinterpretq_m128d_s64(vsetq_lane_s64( + sse2neon_recast_f64_s64(d), vreinterpretq_s64_m128d(a), 0)); #endif } @@ -4135,8 +4198,9 @@ FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_epi32 FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a) { - double a0 = ((double *) &a)[0]; - double a1 = ((double *) &a)[1]; + double a0, a1; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); return _mm_set_epi32(0, 0, (int32_t) a1, (int32_t) a0); } @@ -4145,8 +4209,9 @@ FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_pi32 FORCE_INLINE __m64 _mm_cvttpd_pi32(__m128d a) { - double a0 = ((double *) &a)[0]; - double a1 = ((double *) &a)[1]; + double a0, a1; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) a0, (int32_t) a1}; return vreinterpret_m64_s32(vld1_s32(data)); } @@ -4164,8 +4229,9 @@ FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si32 FORCE_INLINE int32_t _mm_cvttsd_si32(__m128d a) { - double ret = *((double *) &a); - return (int32_t) ret; + double _a = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + return (int32_t) _a; } // Convert the lower double-precision (64-bit) floating-point element in a to a @@ -4176,8 +4242,9 @@ FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a) #if defined(__aarch64__) || defined(_M_ARM64) return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0); #else - double ret = *((double *) &a); - return (int64_t) ret; + double _a = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + return (int64_t) _a; #endif } @@ -4195,11 +4262,17 @@ FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b) return vreinterpretq_m128d_f64( vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else - double *da = (double *) &a; - double *db = (double *) &b; + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); double c[2]; - c[0] = da[0] / db[0]; - c[1] = da[1] / db[1]; + c[0] = a0 / b0; + c[1] = a1 / b1; return vld1q_f32((float32_t *) c); #endif } @@ -4361,7 +4434,7 @@ FORCE_INLINE __m128d _mm_loadu_pd(const double *p) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si128 FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p) { - return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p)); + return vreinterpretq_m128i_s32(vld1q_s32((const unaligned_int32_t *) p)); } // Load unaligned 32-bit integer from memory into the first element of dst. @@ -4369,7 +4442,7 @@ FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p) FORCE_INLINE __m128i _mm_loadu_si32(const void *p) { return vreinterpretq_m128i_s32( - vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0)); + vsetq_lane_s32(*(const unaligned_int32_t *) p, vdupq_n_s32(0), 0)); } // Multiply packed signed 16-bit integers in a and b, producing intermediate @@ -4444,15 +4517,19 @@ FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b) vmaxq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #endif #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); - uint64_t d[2]; - d[0] = (*(double *) &a0) > (*(double *) &b0) ? a0 : b0; - d[1] = (*(double *) &a1) > (*(double *) &b1) ? a1 : b1; + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); + int64_t d[2]; + d[0] = a0 > b0 ? sse2neon_recast_f64_s64(a0) : sse2neon_recast_f64_s64(b0); + d[1] = a1 > b1 ? sse2neon_recast_f64_s64(a1) : sse2neon_recast_f64_s64(b1); - return vreinterpretq_m128d_u64(vld1q_u64(d)); + return vreinterpretq_m128d_s64(vld1q_s64(d)); #endif } @@ -4465,9 +4542,11 @@ FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b) #if defined(__aarch64__) || defined(_M_ARM64) return _mm_move_sd(a, _mm_max_pd(a, b)); #else - double *da = (double *) &a; - double *db = (double *) &b; - double c[2] = {da[0] > db[0] ? da[0] : db[0], da[1]}; + double a0, a1, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double c[2] = {a0 > b0 ? a0 : b0, a1}; return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c)); #endif } @@ -4505,14 +4584,18 @@ FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b) vminq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #endif #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); - uint64_t d[2]; - d[0] = (*(double *) &a0) < (*(double *) &b0) ? a0 : b0; - d[1] = (*(double *) &a1) < (*(double *) &b1) ? a1 : b1; - return vreinterpretq_m128d_u64(vld1q_u64(d)); + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); + int64_t d[2]; + d[0] = a0 < b0 ? sse2neon_recast_f64_s64(a0) : sse2neon_recast_f64_s64(b0); + d[1] = a1 < b1 ? sse2neon_recast_f64_s64(a1) : sse2neon_recast_f64_s64(b1); + return vreinterpretq_m128d_s64(vld1q_s64(d)); #endif } @@ -4525,9 +4608,11 @@ FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b) #if defined(__aarch64__) || defined(_M_ARM64) return _mm_move_sd(a, _mm_min_pd(a, b)); #else - double *da = (double *) &a; - double *db = (double *) &b; - double c[2] = {da[0] < db[0] ? da[0] : db[0], da[1]}; + double a0, a1, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double c[2] = {a0 < b0 ? a0 : b0, a1}; return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c)); #endif } @@ -4682,11 +4767,17 @@ FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b) return vreinterpretq_m128d_f64( vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else - double *da = (double *) &a; - double *db = (double *) &b; + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); double c[2]; - c[0] = da[0] * db[0]; - c[1] = da[1] * db[1]; + c[0] = a0 * b0; + c[1] = a1 * b1; return vld1q_f32((float32_t *) c); #endif } @@ -4820,7 +4911,7 @@ FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_pause FORCE_INLINE void _mm_pause(void) { -#if defined(_MSC_VER) +#if defined(_MSC_VER) && !defined(__clang__) __isb(_ARM64_BARRIER_SY); #else __asm__ __volatile__("isb\n"); @@ -4895,11 +4986,11 @@ FORCE_INLINE __m128i _mm_set_epi8(signed char b15, signed char b1, signed char b0) { - int8_t ALIGN_STRUCT(16) - data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3, - (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7, - (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11, - (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15}; + int8_t ALIGN_STRUCT(16) data[16] = { + (int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3, + (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7, + (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11, + (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15}; return (__m128i) vld1q_s8(data); } @@ -4976,7 +5067,8 @@ FORCE_INLINE __m128d _mm_set1_pd(double d) #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128d_f64(vdupq_n_f64(d)); #else - return vreinterpretq_m128d_s64(vdupq_n_s64(*(int64_t *) &d)); + int64_t _d = sse2neon_recast_f64_s64(d); + return vreinterpretq_m128d_s64(vdupq_n_s64(_d)); #endif } @@ -5029,11 +5121,11 @@ FORCE_INLINE __m128i _mm_setr_epi8(signed char b0, signed char b14, signed char b15) { - int8_t ALIGN_STRUCT(16) - data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3, - (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7, - (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11, - (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15}; + int8_t ALIGN_STRUCT(16) data[16] = { + (int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3, + (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7, + (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11, + (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15}; return (__m128i) vld1q_s8(data); } @@ -5267,9 +5359,12 @@ FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a) #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128d_f64(vsqrtq_f64(vreinterpretq_f64_m128d(a))); #else - double a0 = sqrt(((double *) &a)[0]); - double a1 = sqrt(((double *) &a)[1]); - return _mm_set_pd(a1, a0); + double a0, a1; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double _a0 = sqrt(a0); + double _a1 = sqrt(a1); + return _mm_set_pd(_a1, _a0); #endif } @@ -5282,7 +5377,10 @@ FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b) #if defined(__aarch64__) || defined(_M_ARM64) return _mm_move_sd(a, _mm_sqrt_pd(b)); #else - return _mm_set_pd(((double *) &a)[1], sqrt(((double *) &b)[0])); + double _a, _b; + _a = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + _b = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + return _mm_set_pd(_a, sqrt(_b)); #endif } @@ -5637,11 +5735,17 @@ FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b) return vreinterpretq_m128d_f64( vsubq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else - double *da = (double *) &a; - double *db = (double *) &b; + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); double c[2]; - c[0] = da[0] - db[0]; - c[1] = da[1] - db[1]; + c[0] = a0 - b0; + c[1] = a1 - b1; return vld1q_f32((float32_t *) c); #endif } @@ -5716,7 +5820,7 @@ FORCE_INLINE __m128d _mm_undefined_pd(void) #pragma GCC diagnostic ignored "-Wuninitialized" #endif __m128d a; -#if defined(_MSC_VER) +#if defined(_MSC_VER) && !defined(__clang__) a = _mm_setzero_pd(); #endif return a; @@ -5945,9 +6049,15 @@ FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b) return vreinterpretq_m128d_f64( vpaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else - double *da = (double *) &a; - double *db = (double *) &b; - double c[] = {da[0] + da[1], db[0] + db[1]}; + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); + double c[] = {a0 + a1, b0 + b1}; return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c)); #endif } @@ -5973,17 +6083,23 @@ FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b) // Horizontally subtract adjacent pairs of double-precision (64-bit) // floating-point elements in a and b, and pack the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pd -FORCE_INLINE __m128d _mm_hsub_pd(__m128d _a, __m128d _b) +FORCE_INLINE __m128d _mm_hsub_pd(__m128d a, __m128d b) { #if defined(__aarch64__) || defined(_M_ARM64) - float64x2_t a = vreinterpretq_f64_m128d(_a); - float64x2_t b = vreinterpretq_f64_m128d(_b); + float64x2_t _a = vreinterpretq_f64_m128d(a); + float64x2_t _b = vreinterpretq_f64_m128d(b); return vreinterpretq_m128d_f64( - vsubq_f64(vuzp1q_f64(a, b), vuzp2q_f64(a, b))); + vsubq_f64(vuzp1q_f64(_a, _b), vuzp2q_f64(_a, _b))); #else - double *da = (double *) &_a; - double *db = (double *) &_b; - double c[] = {da[0] - da[1], db[0] - db[1]}; + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); + double c[] = {a0 - a1, b0 - b1}; return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c)); #endif } @@ -6162,7 +6278,7 @@ FORCE_INLINE __m64 _mm_abs_pi8(__m64 a) uint8x8_t tmp_low; \ uint8x8_t tmp_high; \ if ((imm) >= 8) { \ - const int idx = (imm) -8; \ + const int idx = (imm) - 8; \ tmp_low = vreinterpret_u8_m64(_a); \ tmp_high = vdup_n_u8(0); \ ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \ @@ -6683,14 +6799,14 @@ FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b) _sse2neon_define2( \ __m128i, a, b, \ const uint16_t _mask[8] = \ - _sse2neon_init(((imm) & (1 << 0)) ? (uint16_t) -1 : 0x0, \ - ((imm) & (1 << 1)) ? (uint16_t) -1 : 0x0, \ - ((imm) & (1 << 2)) ? (uint16_t) -1 : 0x0, \ - ((imm) & (1 << 3)) ? (uint16_t) -1 : 0x0, \ - ((imm) & (1 << 4)) ? (uint16_t) -1 : 0x0, \ - ((imm) & (1 << 5)) ? (uint16_t) -1 : 0x0, \ - ((imm) & (1 << 6)) ? (uint16_t) -1 : 0x0, \ - ((imm) & (1 << 7)) ? (uint16_t) -1 : 0x0); \ + _sse2neon_init(((imm) & (1 << 0)) ? (uint16_t) - 1 : 0x0, \ + ((imm) & (1 << 1)) ? (uint16_t) - 1 : 0x0, \ + ((imm) & (1 << 2)) ? (uint16_t) - 1 : 0x0, \ + ((imm) & (1 << 3)) ? (uint16_t) - 1 : 0x0, \ + ((imm) & (1 << 4)) ? (uint16_t) - 1 : 0x0, \ + ((imm) & (1 << 5)) ? (uint16_t) - 1 : 0x0, \ + ((imm) & (1 << 6)) ? (uint16_t) - 1 : 0x0, \ + ((imm) & (1 << 7)) ? (uint16_t) - 1 : 0x0); \ uint16x8_t _mask_vec = vld1q_u16(_mask); \ uint16x8_t __a = vreinterpretq_u16_m128i(_a); \ uint16x8_t __b = vreinterpretq_u16_m128i(_b); _sse2neon_return( \ @@ -6715,11 +6831,11 @@ FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_ps FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8) { - const uint32_t ALIGN_STRUCT(16) - data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0, - ((imm8) & (1 << 1)) ? UINT32_MAX : 0, - ((imm8) & (1 << 2)) ? UINT32_MAX : 0, - ((imm8) & (1 << 3)) ? UINT32_MAX : 0}; + const uint32_t + ALIGN_STRUCT(16) data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0, + ((imm8) & (1 << 1)) ? UINT32_MAX : 0, + ((imm8) & (1 << 2)) ? UINT32_MAX : 0, + ((imm8) & (1 << 3)) ? UINT32_MAX : 0}; uint32x4_t mask = vld1q_u32(data); float32x4_t a = vreinterpretq_f32_m128(_a); float32x4_t b = vreinterpretq_f32_m128(_b); @@ -6779,8 +6895,10 @@ FORCE_INLINE __m128d _mm_ceil_pd(__m128d a) #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128d_f64(vrndpq_f64(vreinterpretq_f64_m128d(a))); #else - double *f = (double *) &a; - return _mm_set_pd(ceil(f[1]), ceil(f[0])); + double a0, a1; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + return _mm_set_pd(ceil(a1), ceil(a0)); #endif } @@ -6988,8 +7106,16 @@ FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm) vgetq_lane_f64(vreinterpretq_f64_m128d(b), 1) : 0; #else - double d0 = (imm & 0x10) ? ((double *) &a)[0] * ((double *) &b)[0] : 0; - double d1 = (imm & 0x20) ? ((double *) &a)[1] * ((double *) &b)[1] : 0; + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); + double d0 = (imm & 0x10) ? a0 * b0 : 0; + double d1 = (imm & 0x20) ? a1 * b1 : 0; #endif __m128d tmp = _mm_set_pd(d1, d0); #endif @@ -6997,7 +7123,11 @@ FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm) #if defined(__aarch64__) || defined(_M_ARM64) double sum = vpaddd_f64(vreinterpretq_f64_m128d(tmp)); #else - double sum = *((double *) &tmp) + *(((double *) &tmp) + 1); + double _tmp0 = sse2neon_recast_u64_f64( + vgetq_lane_u64(vreinterpretq_u64_m128d(tmp), 0)); + double _tmp1 = sse2neon_recast_u64_f64( + vgetq_lane_u64(vreinterpretq_u64_m128d(tmp), 1)); + double sum = _tmp0 + _tmp1; #endif // Conditionally store the sum const __m128d sumMask = @@ -7087,8 +7217,10 @@ FORCE_INLINE __m128d _mm_floor_pd(__m128d a) #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128d_f64(vrndmq_f64(vreinterpretq_f64_m128d(a))); #else - double *f = (double *) &a; - return _mm_set_pd(floor(f[1]), floor(f[0])); + double a0, a1; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + return _mm_set_pd(floor(a1), floor(a0)); #endif } @@ -7621,7 +7753,7 @@ FORCE_INLINE int _mm_test_mix_ones_zeros(__m128i a, __m128i mask) uint64x2_t zeros = vbicq_u64(m, v); // If both 128-bit variables are populated (non-zero) then return 1. - // For comparision purposes, first compact each var down to 32-bits. + // For comparison purposes, first compact each var down to 32-bits. uint32x2_t reduced = vpmax_u32(vqmovn_u64(ones), vqmovn_u64(zeros)); // if folding minimum is non-zero then both vars must be non-zero @@ -8130,7 +8262,7 @@ FORCE_INLINE int _sse2neon_sido_negative(int res, int lb, int imm8, int bound) FORCE_INLINE int _sse2neon_clz(unsigned int x) { -#ifdef _MSC_VER +#if defined(_MSC_VER) && !defined(__clang__) unsigned long cnt = 0; if (_BitScanReverse(&cnt, x)) return 31 - cnt; @@ -8142,7 +8274,7 @@ FORCE_INLINE int _sse2neon_clz(unsigned int x) FORCE_INLINE int _sse2neon_ctz(unsigned int x) { -#ifdef _MSC_VER +#if defined(_MSC_VER) && !defined(__clang__) unsigned long cnt = 0; if (_BitScanForward(&cnt, x)) return cnt; @@ -8530,7 +8662,7 @@ FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v) crc = vgetq_lane_u32(vreinterpretq_u32_u64(tmp), 1); #else // Fall back to the generic table lookup approach // Adapted from: https://create.stephan-brumme.com/crc32/ - // Apply half-byte comparision algorithm for the best ratio between + // Apply half-byte comparison algorithm for the best ratio between // performance and lookup table. // The lookup table just needs to store every 16th entry @@ -8697,9 +8829,9 @@ FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i RoundKey) #define SSE2NEON_AES_B2W(b0, b1, b2, b3) \ (((uint32_t) (b3) << 24) | ((uint32_t) (b2) << 16) | \ ((uint32_t) (b1) << 8) | (uint32_t) (b0)) -// muliplying 'x' by 2 in GF(2^8) +// multiplying 'x' by 2 in GF(2^8) #define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */)) -// muliplying 'x' by 3 in GF(2^8) +// multiplying 'x' by 3 in GF(2^8) #define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x) #define SSE2NEON_AES_U0(p) \ SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p)) @@ -8784,7 +8916,7 @@ FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey) v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w); w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & - 0x1b); // muliplying 'v' by 2 in GF(2^8) + 0x1b); // multiplying 'v' by 2 in GF(2^8) w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v); w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8)); @@ -9058,7 +9190,7 @@ FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon) // AESE does ShiftRows and SubBytes on A uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0)); -#ifndef _MSC_VER +#if !defined(_MSC_VER) || defined(__clang__) uint8x16_t dest = { // Undo ShiftRows step from AESE and extract X1 and X3 u8[0x4], u8[0x1], u8[0xE], u8[0xB], // SubBytes(X1) @@ -9228,7 +9360,7 @@ FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag) #if defined(__aarch64__) || defined(_M_ARM64) _sse2neon_set_fpcr(r.value); #else - __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */ + __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */ #endif } @@ -9245,7 +9377,7 @@ FORCE_INLINE uint64_t _rdtsc(void) * bits wide and it is attributed with the flag 'cap_user_time_short' * is true. */ -#if defined(_MSC_VER) +#if defined(_MSC_VER) && !defined(__clang__) val = _ReadStatusReg(ARM64_SYSREG(3, 3, 14, 0, 2)); #else __asm__ __volatile__("mrs %0, cntvct_el0" : "=r"(val)); diff --git a/src/common/ring_buffer.h b/src/common/ring_buffer.h index 14f6eceeb8..86de96b43e 100644 --- a/src/common/ring_buffer.h +++ b/src/common/ring_buffer.h @@ -15,6 +15,7 @@ #include #include #include +#include namespace Common {