diff --git a/simde/x86/sse.h b/simde/x86/sse.h index b547b93dd..1258ab30d 100644 --- a/simde/x86/sse.h +++ b/simde/x86/sse.h @@ -3119,6 +3119,15 @@ simde_mm_max_ps (simde__m128 a, simde__m128 b) { r_.altivec_f32 = vec_sel(b_.altivec_f32, a_.altivec_f32, vec_cmpgt(a_.altivec_f32, b_.altivec_f32)); #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) && defined(SIMDE_FAST_NANS) r_.lsx_f32 = __lsx_vfmax_s(a_.lsx_f32, b_.lsx_f32); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + uint32_t SIMDE_VECTOR(16) m = HEDLEY_REINTERPRET_CAST(__typeof__(m), a_.f32 > b_.f32); + r_.f32 = + HEDLEY_REINTERPRET_CAST( + __typeof__(r_.f32), + ( (HEDLEY_REINTERPRET_CAST(__typeof__(m), a_.f32) & m) | + (HEDLEY_REINTERPRET_CAST(__typeof__(m), b_.f32) & ~m) + ) + ); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { @@ -3236,17 +3245,19 @@ simde_mm_min_ps (simde__m128 a, simde__m128 b) { a_ = simde__m128_to_private(a), b_ = simde__m128_to_private(b); - #if defined(SIMDE_FAST_NANS) && defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_FAST_NANS) r_.neon_f32 = vminq_f32(a_.neon_f32, b_.neon_f32); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_f32 = vbslq_f32(vcltq_f32(a_.neon_f32, b_.neon_f32), a_.neon_f32, b_.neon_f32); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) && defined(SIMDE_FAST_NANS) + r_.wasm_v128 = wasm_f32x4_min(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f32x4_pmin(b_.wasm_v128, a_.wasm_v128); + r_.wasm_v128 = wasm_v128_bitselect(a_.wasm_v128, b_.wasm_v128, wasm_f32x4_lt(a_.wasm_v128, b_.wasm_v128)); + #elif (defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE)) && defined(SIMDE_FAST_NANS) + r_.altivec_f32 = vec_min(a_.altivec_f32, b_.altivec_f32); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE) - #if defined(SIMDE_FAST_NANS) - r_.altivec_f32 = vec_min(a_.altivec_f32, b_.altivec_f32); - #else - r_.altivec_f32 = vec_sel(b_.altivec_f32, a_.altivec_f32, vec_cmpgt(b_.altivec_f32, a_.altivec_f32)); - #endif - #elif defined(SIMDE_FAST_NANS) && defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.altivec_f32 = vec_sel(b_.altivec_f32, a_.altivec_f32, vec_cmplt(a_.altivec_f32, b_.altivec_f32)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) && defined(SIMDE_FAST_NANS) r_.lsx_f32 = __lsx_vfmin_s(a_.lsx_f32, b_.lsx_f32); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) uint32_t SIMDE_VECTOR(16) m = HEDLEY_REINTERPRET_CAST(__typeof__(m), a_.f32 < b_.f32); diff --git a/simde/x86/sse2.h b/simde/x86/sse2.h index 89bf9dbd1..8f646cab5 100644 --- a/simde/x86/sse2.h +++ b/simde/x86/sse2.h @@ -424,6 +424,137 @@ simde__m128d_to_private(simde__m128d v) { SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, v2f64, lsx, f64) #endif /* defined(SIMDE_ARM_NEON_A32V7_NATIVE) */ +SIMDE_FUNCTION_ATTRIBUTES +simde__m128d +simde_x_mm_round_pd (simde__m128d a, int rounding, int lax_rounding) + SIMDE_REQUIRE_CONSTANT_RANGE(rounding, 0, 15) + SIMDE_REQUIRE_CONSTANT_RANGE(lax_rounding, 0, 1) { + simde__m128d_private + r_, + a_ = simde__m128d_to_private(a); + + (void) lax_rounding; + + /* For architectures which lack a current direction SIMD instruction. + * + * Note that NEON actually has a current rounding mode instruction, + * but in ARMv8+ the rounding mode is ignored and nearest is always + * used, so we treat ARMv7 as having a rounding mode but ARMv8 as + * not. 
*/ + #if \ + defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || \ + defined(SIMDE_ARM_NEON_A32V8) + if ((rounding & 7) == SIMDE_MM_FROUND_CUR_DIRECTION) + rounding = HEDLEY_STATIC_CAST(int, SIMDE_MM_GET_ROUNDING_MODE()) << 13; + #endif + + switch (rounding & ~SIMDE_MM_FROUND_NO_EXC) { + case SIMDE_MM_FROUND_CUR_DIRECTION: + #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) + r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_round(a_.altivec_f64)); + #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) + r_.neon_f64 = vrndiq_f64(a_.neon_f64); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_f64x2_nearest(a_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_f64 = __lsx_vfrintrne_d(a_.lsx_f64); + #elif defined(simde_math_nearbyint) + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { + r_.f64[i] = simde_math_nearbyint(a_.f64[i]); + } + #else + HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd()); + #endif + break; + + case SIMDE_MM_FROUND_TO_NEAREST_INT: + #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) + r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_rint(a_.altivec_f64)); + #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) + r_.neon_f64 = vrndnq_f64(a_.neon_f64); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_f64x2_nearest(a_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_f64 = __lsx_vfrintrne_d(a_.lsx_f64); + #elif defined(simde_math_roundeven) + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { + r_.f64[i] = simde_math_roundeven(a_.f64[i]); + } + #else + HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd()); + #endif + break; + + case SIMDE_MM_FROUND_TO_NEG_INF: + #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) + r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_floor(a_.altivec_f64)); + #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) + r_.neon_f64 = vrndmq_f64(a_.neon_f64); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_f64x2_floor(a_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_f64 = __lsx_vfrintrm_d(a_.lsx_f64); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { + r_.f64[i] = simde_math_floor(a_.f64[i]); + } + #endif + break; + + case SIMDE_MM_FROUND_TO_POS_INF: + #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) + r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_ceil(a_.altivec_f64)); + #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) + r_.neon_f64 = vrndpq_f64(a_.neon_f64); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_f64x2_ceil(a_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_f64 = __lsx_vfrintrp_d(a_.lsx_f64); + #elif defined(simde_math_ceil) + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { + r_.f64[i] = simde_math_ceil(a_.f64[i]); + } + #else + HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd()); + #endif + break; + + case SIMDE_MM_FROUND_TO_ZERO: + #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) + r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_trunc(a_.altivec_f64)); + #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) + r_.neon_f64 = vrndq_f64(a_.neon_f64); + #elif 
defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_f64x2_trunc(a_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_f64 = __lsx_vfrintrz_d(a_.lsx_f64); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { + r_.f64[i] = simde_math_trunc(a_.f64[i]); + } + #endif + break; + + default: + HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd()); + } + + return simde__m128d_from_private(r_); +} +#if defined(SIMDE_X86_SSE4_1_NATIVE) + #define simde_mm_round_pd(a, rounding) _mm_round_pd((a), (rounding)) +#else + #define simde_mm_round_pd(a, rounding) simde_x_mm_round_pd((a), (rounding), 0) +#endif +#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) + #define _mm_round_pd(a, rounding) simde_mm_round_pd((a), (rounding)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_set_pd (simde_float64 e1, simde_float64 e0) { @@ -3051,8 +3182,9 @@ simde_mm_cvtpd_pi32 (simde__m128d a) { return _mm_cvtpd_pi32(a); #else simde__m64_private r_; - simde__m128d_private a_ = simde__m128d_to_private(a); + simde__m128d_private a_; + a_ = simde__m128d_to_private(simde_x_mm_round_pd(a, SIMDE_MM_FROUND_TO_NEAREST_INT, 1)); SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { simde_float64 v = simde_math_round(a_.f64[i]); @@ -4622,14 +4754,29 @@ simde_mm_min_pd (simde__m128d a, simde__m128d b) { a_ = simde__m128d_to_private(a), b_ = simde__m128d_to_private(b); - #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_f64 = vec_min(a_.altivec_f64, b_.altivec_f64); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_FAST_NANS) r_.neon_f64 = vminq_f64(a_.neon_f64, b_.neon_f64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) + #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) + r_.neon_f64 = vbslq_f64(vcltq_f64(a_.neon_f64, b_.neon_f64), a_.neon_f64, b_.neon_f64); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) && defined(SIMDE_FAST_NANS) r_.wasm_v128 = wasm_f64x2_min(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_v128_bitselect(a_.wasm_v128, b_.wasm_v128, wasm_f64x2_lt(a_.wasm_v128, b_.wasm_v128)); + #elif (defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)) && defined(SIMDE_FAST_NANS) + r_.altivec_f64 = vec_min(a_.altivec_f64, b_.altivec_f64); + #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) + r_.altivec_f64 = vec_sel(b_.altivec_f64, a_.altivec_f64, vec_cmplt(a_.altivec_f64, b_.altivec_f64)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) && defined(SIMDE_FAST_NANS) r_.lsx_f64 = __lsx_vfmin_d(a_.lsx_f64, b_.lsx_f64); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + uint64_t SIMDE_VECTOR(16) m = HEDLEY_REINTERPRET_CAST(__typeof__(m), a_.f64 < b_.f64); + r_.f64 = + HEDLEY_REINTERPRET_CAST( + __typeof__(r_.f64), + ( (HEDLEY_REINTERPRET_CAST(__typeof__(m), a_.f64) & m) | + (HEDLEY_REINTERPRET_CAST(__typeof__(m), b_.f64) & ~m) + ) + ); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { @@ -4753,14 +4900,29 @@ simde_mm_max_pd (simde__m128d a, simde__m128d b) { a_ = simde__m128d_to_private(a), b_ = simde__m128d_to_private(b); - #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_f64 = vec_max(a_.altivec_f64, b_.altivec_f64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f64x2_max(a_.wasm_v128, 
b_.wasm_v128); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_FAST_NANS) r_.neon_f64 = vmaxq_f64(a_.neon_f64, b_.neon_f64); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) + r_.neon_f64 = vbslq_f64(vcgtq_f64(a_.neon_f64, b_.neon_f64), a_.neon_f64, b_.neon_f64); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) && defined(SIMDE_FAST_NANS) + r_.wasm_v128 = wasm_f64x2_max(a_.wasm_v128, b_.wasm_v128); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_v128_bitselect(a_.wasm_v128, b_.wasm_v128, wasm_f64x2_gt(a_.wasm_v128, b_.wasm_v128)); + #elif (defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)) && defined(SIMDE_FAST_NANS) + r_.altivec_f64 = vec_max(a_.altivec_f64, b_.altivec_f64); + #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) + r_.altivec_f64 = vec_sel(b_.altivec_f64, a_.altivec_f64, vec_cmpgt(a_.altivec_f64, b_.altivec_f64)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) && defined(SIMDE_FAST_NANS) r_.lsx_f64 = __lsx_vfmax_d(a_.lsx_f64, b_.lsx_f64); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + uint64_t SIMDE_VECTOR(16) m = HEDLEY_REINTERPRET_CAST(__typeof__(m), a_.f64 > b_.f64); + r_.f64 = + HEDLEY_REINTERPRET_CAST( + __typeof__(r_.f64), + ( (HEDLEY_REINTERPRET_CAST(__typeof__(m), a_.f64) & m) | + (HEDLEY_REINTERPRET_CAST(__typeof__(m), b_.f64) & ~m) + ) + ); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { diff --git a/simde/x86/sse4.1.h b/simde/x86/sse4.1.h index a5cabf7d0..b3697dfb8 100644 --- a/simde/x86/sse4.1.h +++ b/simde/x86/sse4.1.h @@ -416,126 +416,6 @@ simde_mm_blendv_ps (simde__m128 a, simde__m128 b, simde__m128 mask) { #define _mm_blendv_ps(a, b, mask) simde_mm_blendv_ps(a, b, mask) #endif -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_round_pd (simde__m128d a, int rounding) - SIMDE_REQUIRE_CONSTANT_RANGE(rounding, 0, 15) { - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a); - - /* For architectures which lack a current direction SIMD instruction. 
*/ - #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - if ((rounding & 7) == SIMDE_MM_FROUND_CUR_DIRECTION) - rounding = HEDLEY_STATIC_CAST(int, SIMDE_MM_GET_ROUNDING_MODE()) << 13; - #endif - - switch (rounding & ~SIMDE_MM_FROUND_NO_EXC) { - case SIMDE_MM_FROUND_CUR_DIRECTION: - #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_round(a_.altivec_f64)); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vrndiq_f64(a_.neon_f64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f64x2_nearest(a_.wasm_v128); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_f64 = __lsx_vfrintrne_d(a_.lsx_f64); - #elif defined(simde_math_nearbyint) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_nearbyint(a_.f64[i]); - } - #else - HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd()); - #endif - break; - - case SIMDE_MM_FROUND_TO_NEAREST_INT: - #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_round(a_.altivec_f64)); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vrndaq_f64(a_.neon_f64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f64x2_nearest(a_.wasm_v128); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_f64 = __lsx_vfrintrne_d(a_.lsx_f64); - #elif defined(simde_math_roundeven) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_roundeven(a_.f64[i]); - } - #else - HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd()); - #endif - break; - - case SIMDE_MM_FROUND_TO_NEG_INF: - #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_floor(a_.altivec_f64)); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vrndmq_f64(a_.neon_f64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f64x2_floor(a_.wasm_v128); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_f64 = __lsx_vfrintrm_d(a_.lsx_f64); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_floor(a_.f64[i]); - } - #endif - break; - - case SIMDE_MM_FROUND_TO_POS_INF: - #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_ceil(a_.altivec_f64)); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vrndpq_f64(a_.neon_f64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f64x2_ceil(a_.wasm_v128); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_f64 = __lsx_vfrintrp_d(a_.lsx_f64); - #elif defined(simde_math_ceil) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_ceil(a_.f64[i]); - } - #else - HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd()); - #endif - break; - - case SIMDE_MM_FROUND_TO_ZERO: - #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_trunc(a_.altivec_f64)); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vrndq_f64(a_.neon_f64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = 
wasm_f64x2_trunc(a_.wasm_v128); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_f64 = __lsx_vfrintrz_d(a_.lsx_f64); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_trunc(a_.f64[i]); - } - #endif - break; - - default: - HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd()); - } - - return simde__m128d_from_private(r_); -} -#if defined(SIMDE_X86_SSE4_1_NATIVE) - #define simde_mm_round_pd(a, rounding) _mm_round_pd(a, rounding) -#endif -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_round_pd - #define _mm_round_pd(a, rounding) simde_mm_round_pd(a, rounding) -#endif - SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_ceil_pd (simde__m128d a) { diff --git a/test/x86/sse2.c b/test/x86/sse2.c index 4844c4123..eeecabdf1 100644 --- a/test/x86/sse2.c +++ b/test/x86/sse2.c @@ -3270,6 +3270,16 @@ test_simde_mm_cvtpd_epi32 (SIMDE_MUNIT_TEST_ARGS) { { { HEDLEY_STATIC_CAST(simde_float64, HEDLEY_STATIC_CAST(int64_t, INT32_MIN) - 1), HEDLEY_STATIC_CAST(int64_t, INT32_MIN) + 100 }, { INT32_MIN, -INT32_C( 2147483548), INT32_C( 0), INT32_C( 0) } }, #endif + #if !defined(SIMDE_FAST_ROUND_TIES) + { { SIMDE_FLOAT64_C(-1.50), SIMDE_FLOAT64_C(1.50) }, + { -INT32_C( 2), INT32_C( 2) } }, + { { SIMDE_FLOAT64_C(-2.50), SIMDE_FLOAT64_C(2.50) }, + { -INT32_C( 2), INT32_C( 2) } }, + { { SIMDE_FLOAT64_C(-3.50), SIMDE_FLOAT64_C(3.50) }, + { -INT32_C( 4), INT32_C( 4) } }, + { { SIMDE_FLOAT64_C(-4.50), SIMDE_FLOAT64_C(4.50) }, + { -INT32_C( 4), INT32_C( 4) } }, + #endif { { SIMDE_FLOAT64_C( -220.31), SIMDE_FLOAT64_C( 685.08) }, { -INT32_C( 220), INT32_C( 685), INT32_C( 0), INT32_C( 0) } }, { { SIMDE_FLOAT64_C( -164.88), SIMDE_FLOAT64_C( 725.51) }, @@ -3309,6 +3319,16 @@ test_simde_mm_cvtpd_pi32 (SIMDE_MUNIT_TEST_ARGS) { { { HEDLEY_STATIC_CAST(simde_float64, HEDLEY_STATIC_CAST(int64_t, INT32_MIN) - 1), HEDLEY_STATIC_CAST(int64_t, INT32_MIN) + 100 }, { INT32_MIN, -INT32_C( 2147483548) } }, #endif + #if !defined(SIMDE_FAST_ROUND_TIES) + { { SIMDE_FLOAT64_C(-1.50), SIMDE_FLOAT64_C(1.50) }, + { -INT32_C( 2), INT32_C( 2) } }, + { { SIMDE_FLOAT64_C(-2.50), SIMDE_FLOAT64_C(2.50) }, + { -INT32_C( 2), INT32_C( 2) } }, + { { SIMDE_FLOAT64_C(-3.50), SIMDE_FLOAT64_C(3.50) }, + { -INT32_C( 4), INT32_C( 4) } }, + { { SIMDE_FLOAT64_C(-4.50), SIMDE_FLOAT64_C(4.50) }, + { -INT32_C( 4), INT32_C( 4) } }, + #endif { { SIMDE_FLOAT64_C( -220.31), SIMDE_FLOAT64_C( 685.08) }, { -INT32_C( 220), INT32_C( 685) } }, { { SIMDE_FLOAT64_C( -164.88), SIMDE_FLOAT64_C( 725.51) }, @@ -4981,7 +5001,15 @@ test_simde_mm_min_pd(SIMDE_MUNIT_TEST_ARGS) { simde__m128d a; simde__m128d b; simde__m128d r; - } test_vec[10] = { + } test_vec[] = { + #if !defined(SIMDE_FAST_NANS) && !defined(SIMDE_ARCH_WASM_RELAXED_SIMD) + { simde_mm_set_pd(SIMDE_FLOAT64_C(SIMDE_MATH_NAN), SIMDE_FLOAT64_C( -480.90)), + simde_mm_set_pd(SIMDE_FLOAT64_C( 319.11), SIMDE_FLOAT64_C( SIMDE_MATH_NAN)), + simde_mm_set_pd(SIMDE_FLOAT64_C( 319.11), SIMDE_FLOAT64_C( SIMDE_MATH_NAN)) }, + { simde_mm_set_pd(SIMDE_FLOAT64_C(SIMDE_MATH_NAN), SIMDE_FLOAT64_C( SIMDE_MATH_NAN)), + simde_mm_set_pd(SIMDE_FLOAT64_C(SIMDE_MATH_NAN), SIMDE_FLOAT64_C(SIMDE_MATH_INFINITY)), + simde_mm_set_pd(SIMDE_FLOAT64_C(SIMDE_MATH_NAN), SIMDE_FLOAT64_C(SIMDE_MATH_INFINITY)) }, + #endif { simde_mm_set_pd(SIMDE_FLOAT64_C( -927.67), SIMDE_FLOAT64_C( -514.32)), simde_mm_set_pd(SIMDE_FLOAT64_C( 342.71), SIMDE_FLOAT64_C( 927.58)), simde_mm_set_pd(SIMDE_FLOAT64_C( -927.67), SIMDE_FLOAT64_C( -514.32)) }, @@ -5199,7 +5227,15 @@ 
test_simde_mm_max_pd(SIMDE_MUNIT_TEST_ARGS) { simde__m128d a; simde__m128d b; simde__m128d r; - } test_vec[8] = { + } test_vec[] = { + #if !defined(SIMDE_FAST_NANS) && !defined(SIMDE_ARCH_WASM_RELAXED_SIMD) + { simde_mm_set_pd(SIMDE_FLOAT64_C(SIMDE_MATH_NAN), SIMDE_FLOAT64_C( -480.90)), + simde_mm_set_pd(SIMDE_FLOAT64_C( 319.11), SIMDE_FLOAT64_C( SIMDE_MATH_NAN)), + simde_mm_set_pd(SIMDE_FLOAT64_C( 319.11), SIMDE_FLOAT64_C( SIMDE_MATH_NAN)) }, + { simde_mm_set_pd(SIMDE_FLOAT64_C(SIMDE_MATH_NAN), SIMDE_FLOAT64_C( SIMDE_MATH_NAN)), + simde_mm_set_pd(SIMDE_FLOAT64_C(SIMDE_MATH_NAN), SIMDE_FLOAT64_C(SIMDE_MATH_INFINITY)), + simde_mm_set_pd(SIMDE_FLOAT64_C(SIMDE_MATH_NAN), SIMDE_FLOAT64_C(SIMDE_MATH_INFINITY)) }, + #endif { simde_mm_set_pd(SIMDE_FLOAT64_C( -303.58), SIMDE_FLOAT64_C( -480.90)), simde_mm_set_pd(SIMDE_FLOAT64_C( 319.11), SIMDE_FLOAT64_C( 666.53)), simde_mm_set_pd(SIMDE_FLOAT64_C( 319.11), SIMDE_FLOAT64_C( 666.53)) }, diff --git a/test/x86/sse4.1.c b/test/x86/sse4.1.c index 1342b8a21..0af67520b 100644 --- a/test/x86/sse4.1.c +++ b/test/x86/sse4.1.c @@ -2859,6 +2859,54 @@ test_simde_mm_packus_epi32(SIMDE_MUNIT_TEST_ARGS) { return 0; } +static int +test_simde_mm_round_pd_nearest (SIMDE_MUNIT_TEST_ARGS) { +#if 1 + static const struct { + const simde_float64 a[2]; + const simde_float64 r[2]; + } test_vec[] = { + { { SIMDE_FLOAT64_C( -569.50), SIMDE_FLOAT64_C( -418.50) }, + { SIMDE_FLOAT64_C( -570.00), SIMDE_FLOAT64_C( -418.00) } }, + { { SIMDE_FLOAT64_C( -678.00), SIMDE_FLOAT64_C( -253.50) }, + { SIMDE_FLOAT64_C( -678.00), SIMDE_FLOAT64_C( -254.00) } }, + { { SIMDE_FLOAT64_C( 654.23), SIMDE_FLOAT64_C( 246.50) }, + { SIMDE_FLOAT64_C( 654.00), SIMDE_FLOAT64_C( 246.00) } }, + { { SIMDE_FLOAT64_C( 505.25), SIMDE_FLOAT64_C( 625.50) }, + { SIMDE_FLOAT64_C( 505.00), SIMDE_FLOAT64_C( 626.00) } }, + { { SIMDE_FLOAT64_C( -983.50), SIMDE_FLOAT64_C( 733.52) }, + { SIMDE_FLOAT64_C( -984.00), SIMDE_FLOAT64_C( 734.00) } }, + { { SIMDE_FLOAT64_C( 669.50), SIMDE_FLOAT64_C( 205.50) }, + { SIMDE_FLOAT64_C( 670.00), SIMDE_FLOAT64_C( 206.00) } }, + { { SIMDE_FLOAT64_C( 634.50), SIMDE_FLOAT64_C( 152.54) }, + { SIMDE_FLOAT64_C( 634.00), SIMDE_FLOAT64_C( 153.00) } }, + { { SIMDE_FLOAT64_C( -142.50), SIMDE_FLOAT64_C( 450.50) }, + { SIMDE_FLOAT64_C( -142.00), SIMDE_FLOAT64_C( 450.00) } }, + }; + + for (size_t i = 0 ; i < (sizeof(test_vec) / sizeof(test_vec[0])) ; i++) { + simde__m128d a = simde_mm_loadu_pd(test_vec[i].a); + simde__m128d r = simde_mm_round_pd(a, SIMDE_MM_FROUND_TO_NEAREST_INT); + simde_test_x86_assert_equal_f64x2(r, simde_mm_loadu_pd(test_vec[i].r), 1); + } + + return 0; +#else + fputc('\n', stdout); + simde_float64 values[8 * 2 * sizeof(simde__m128d)]; + simde_test_x86_random_f64x2_full(8, 2, values, -1000.0f, 1000.0f, SIMDE_TEST_VEC_FLOAT_ROUND); + + for (size_t i = 0 ; i < 8 ; i++) { + simde__m128d a = simde_test_x86_random_extract_f64x2(i, 1, 0, values); + simde__m128d r = simde_mm_round_pd(a, SIMDE_MM_FROUND_TO_NEAREST_INT); + + simde_test_x86_write_f64x2(2, a, SIMDE_TEST_VEC_POS_FIRST); + simde_test_x86_write_f64x2(2, r, SIMDE_TEST_VEC_POS_LAST); + } + return 1; +#endif +} + static int test_simde_mm_round_pd(SIMDE_MUNIT_TEST_ARGS) { const struct { @@ -3666,6 +3714,8 @@ SIMDE_TEST_FUNC_LIST_BEGIN SIMDE_TEST_FUNC_LIST_ENTRY(mm_round_sd) SIMDE_TEST_FUNC_LIST_ENTRY(mm_round_ss) + SIMDE_TEST_FUNC_LIST_ENTRY(mm_round_pd_nearest) + SIMDE_TEST_FUNC_LIST_ENTRY(mm_round_ps_nearest) SIMDE_TEST_FUNC_LIST_ENTRY(mm_round_ps_ninf) SIMDE_TEST_FUNC_LIST_ENTRY(mm_round_ps_pinf)
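
Note (not part of the patch): the non-SIMDE_FAST_NANS paths added above — vbslq_f32/vbslq_f64, vec_sel, wasm_v128_bitselect, and the SIMDE_VECTOR_SUBSCRIPT_OPS mask-and-blend — all reproduce the strict x86 MINPS/MINPD/MAXPS/MAXPD lane rule: the result is (a < b) ? a : b (or > for max), so the second operand is returned whenever the comparison is false, which covers equal inputs and any NaN operand. A minimal scalar sketch of that rule, checked against the new NaN test vectors in test/x86/sse2.c; the helper names are illustrative only and not part of SIMDe:

    #include <math.h>
    #include <stdio.h>

    /* Strict SSE/SSE2 min/max lane semantics: when the comparison is
     * false (equal inputs, or either operand is NaN), the second
     * operand is returned. */
    static double strict_sse_min(double a, double b) { return (a < b) ? a : b; }
    static double strict_sse_max(double a, double b) { return (a > b) ? a : b; }

    int main(void) {
      /* Mirrors the added test vectors: min(NaN, 319.11) -> 319.11,
       * min(-480.90, NaN) -> NaN; likewise for max. */
      printf("%f %f\n", strict_sse_min(NAN, 319.11), strict_sse_min(-480.90, NAN));
      printf("%f %f\n", strict_sse_max(NAN, 319.11), strict_sse_max(-480.90, NAN));
      return 0;
    }

This is why the blend-based fallbacks keep the b operand on the "comparison false" side of the select, and why the new vectors are guarded by !defined(SIMDE_FAST_NANS): with SIMDE_FAST_NANS the native min/max instructions are allowed to differ on NaN inputs.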