diff --git a/crates/core_arch/src/x86/adx.rs b/crates/core_arch/src/x86/adx.rs index 5ba7664616..9ce65b7643 100644 --- a/crates/core_arch/src/x86/adx.rs +++ b/crates/core_arch/src/x86/adx.rs @@ -5,8 +5,6 @@ use stdarch_test::assert_instr; unsafe extern "unadjusted" { #[link_name = "llvm.x86.addcarry.32"] fn llvm_addcarry_u32(a: u8, b: u32, c: u32) -> (u8, u32); - #[link_name = "llvm.x86.addcarryx.u32"] - fn llvm_addcarryx_u32(a: u8, b: u32, c: u32, d: *mut u32) -> u8; #[link_name = "llvm.x86.subborrow.32"] fn llvm_subborrow_u32(a: u8, b: u32, c: u32) -> (u8, u32); } @@ -35,7 +33,7 @@ pub unsafe fn _addcarry_u32(c_in: u8, a: u32, b: u32, out: &mut u32) -> u8 { #[cfg_attr(test, assert_instr(adc))] #[stable(feature = "simd_x86_adx", since = "1.33.0")] pub unsafe fn _addcarryx_u32(c_in: u8, a: u32, b: u32, out: &mut u32) -> u8 { - llvm_addcarryx_u32(c_in, a, b, out as *mut _) + _addcarry_u32(c_in, a, b, out) } /// Adds unsigned 32-bit integers `a` and `b` with unsigned 8-bit carry-in `c_in` diff --git a/crates/core_arch/src/x86/avx.rs b/crates/core_arch/src/x86/avx.rs index 24e0cf6ba1..c1bb897ce0 100644 --- a/crates/core_arch/src/x86/avx.rs +++ b/crates/core_arch/src/x86/avx.rs @@ -587,7 +587,11 @@ pub fn _mm256_dp_ps(a: __m256, b: __m256) -> __m256 { #[cfg_attr(test, assert_instr(vhaddpd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub fn _mm256_hadd_pd(a: __m256d, b: __m256d) -> __m256d { - unsafe { vhaddpd(a, b) } + unsafe { + let even = simd_shuffle!(a, b, [0, 4, 2, 6]); + let odd = simd_shuffle!(a, b, [1, 5, 3, 7]); + simd_add(even, odd) + } } /// Horizontal addition of adjacent pairs in the two packed vectors @@ -602,7 +606,11 @@ pub fn _mm256_hadd_pd(a: __m256d, b: __m256d) -> __m256d { #[cfg_attr(test, assert_instr(vhaddps))] #[stable(feature = "simd_x86", since = "1.27.0")] pub fn _mm256_hadd_ps(a: __m256, b: __m256) -> __m256 { - unsafe { vhaddps(a, b) } + unsafe { + let even = simd_shuffle!(a, b, [0, 2, 8, 10, 4, 6, 12, 14]); + let odd = simd_shuffle!(a, b, [1, 3, 9, 11, 5, 7, 13, 15]); + simd_add(even, odd) + } } /// Horizontal subtraction of adjacent pairs in the two packed vectors @@ -616,7 +624,11 @@ pub fn _mm256_hadd_ps(a: __m256, b: __m256) -> __m256 { #[cfg_attr(test, assert_instr(vhsubpd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub fn _mm256_hsub_pd(a: __m256d, b: __m256d) -> __m256d { - unsafe { vhsubpd(a, b) } + unsafe { + let even = simd_shuffle!(a, b, [0, 4, 2, 6]); + let odd = simd_shuffle!(a, b, [1, 5, 3, 7]); + simd_sub(even, odd) + } } /// Horizontal subtraction of adjacent pairs in the two packed vectors @@ -631,7 +643,11 @@ pub fn _mm256_hsub_pd(a: __m256d, b: __m256d) -> __m256d { #[cfg_attr(test, assert_instr(vhsubps))] #[stable(feature = "simd_x86", since = "1.27.0")] pub fn _mm256_hsub_ps(a: __m256, b: __m256) -> __m256 { - unsafe { vhsubps(a, b) } + unsafe { + let even = simd_shuffle!(a, b, [0, 2, 8, 10, 4, 6, 12, 14]); + let odd = simd_shuffle!(a, b, [1, 3, 9, 11, 5, 7, 13, 15]); + simd_sub(even, odd) + } } /// Computes the bitwise XOR of packed double-precision (64-bit) floating-point @@ -1218,7 +1234,10 @@ pub fn _mm_permute_pd(a: __m128d) -> __m128d { #[stable(feature = "simd_x86", since = "1.27.0")] pub fn _mm256_permute2f128_ps(a: __m256, b: __m256) -> __m256 { static_assert_uimm_bits!(IMM8, 8); - unsafe { vperm2f128ps256(a, b, IMM8 as i8) } + _mm256_castsi256_ps(_mm256_permute2f128_si256::( + _mm256_castps_si256(a), + _mm256_castps_si256(b), + )) } /// Shuffles 256 bits (composed of 4 packed double-precision (64-bit) @@ -1232,7 
+1251,10 @@ pub fn _mm256_permute2f128_ps(a: __m256, b: __m256) -> __m256 { #[stable(feature = "simd_x86", since = "1.27.0")] pub fn _mm256_permute2f128_pd(a: __m256d, b: __m256d) -> __m256d { static_assert_uimm_bits!(IMM8, 8); - unsafe { vperm2f128pd256(a, b, IMM8 as i8) } + _mm256_castsi256_pd(_mm256_permute2f128_si256::( + _mm256_castpd_si256(a), + _mm256_castpd_si256(b), + )) } /// Shuffles 128-bits (composed of integer data) selected by `imm8` @@ -1246,7 +1268,35 @@ pub fn _mm256_permute2f128_pd(a: __m256d, b: __m256d) -> __m256 #[stable(feature = "simd_x86", since = "1.27.0")] pub fn _mm256_permute2f128_si256(a: __m256i, b: __m256i) -> __m256i { static_assert_uimm_bits!(IMM8, 8); - unsafe { transmute(vperm2f128si256(a.as_i32x8(), b.as_i32x8(), IMM8 as i8)) } + const fn idx(imm8: i32, pos: u32) -> u32 { + let part = if pos < 2 { + imm8 & 0xf + } else { + (imm8 & 0xf0) >> 4 + }; + 2 * (part as u32 & 0b11) + (pos & 1) + } + const fn idx0(imm8: i32, pos: u32) -> u32 { + let part = if pos < 2 { + imm8 & 0xf + } else { + (imm8 & 0xf0) >> 4 + }; + if part & 0b1000 != 0 { 4 } else { pos } + } + unsafe { + let r = simd_shuffle!( + a.as_i64x4(), + b.as_i64x4(), + [idx(IMM8, 0), idx(IMM8, 1), idx(IMM8, 2), idx(IMM8, 3)] + ); + let r: i64x4 = simd_shuffle!( + r, + i64x4::ZERO, + [idx0(IMM8, 0), idx0(IMM8, 1), idx0(IMM8, 2), idx0(IMM8, 3)] + ); + r.as_m256i() + } } /// Broadcasts a single-precision (32-bit) floating-point element from memory @@ -1933,7 +1983,10 @@ pub fn _mm256_unpacklo_ps(a: __m256, b: __m256) -> __m256 { #[cfg_attr(test, assert_instr(vptest))] #[stable(feature = "simd_x86", since = "1.27.0")] pub fn _mm256_testz_si256(a: __m256i, b: __m256i) -> i32 { - unsafe { ptestz256(a.as_i64x4(), b.as_i64x4()) } + unsafe { + let r = simd_and(a.as_i64x4(), b.as_i64x4()); + (0i64 == simd_reduce_or(r)) as i32 + } } /// Computes the bitwise AND of 256 bits (representing integer data) in `a` and @@ -1947,7 +2000,10 @@ pub fn _mm256_testz_si256(a: __m256i, b: __m256i) -> i32 { #[cfg_attr(test, assert_instr(vptest))] #[stable(feature = "simd_x86", since = "1.27.0")] pub fn _mm256_testc_si256(a: __m256i, b: __m256i) -> i32 { - unsafe { ptestc256(a.as_i64x4(), b.as_i64x4()) } + unsafe { + let r = simd_and(simd_xor(a.as_i64x4(), i64x4::splat(!0)), b.as_i64x4()); + (0i64 == simd_reduce_or(r)) as i32 + } } /// Computes the bitwise AND of 256 bits (representing integer data) in `a` and @@ -2031,7 +2087,10 @@ pub fn _mm256_testnzc_pd(a: __m256d, b: __m256d) -> i32 { #[cfg_attr(test, assert_instr(vtestpd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub fn _mm_testz_pd(a: __m128d, b: __m128d) -> i32 { - unsafe { vtestzpd(a, b) } + unsafe { + let r: i64x2 = simd_lt(transmute(_mm_and_pd(a, b)), i64x2::ZERO); + (0i64 == simd_reduce_or(r)) as i32 + } } /// Computes the bitwise AND of 128 bits (representing double-precision (64-bit) @@ -2048,7 +2107,10 @@ pub fn _mm_testz_pd(a: __m128d, b: __m128d) -> i32 { #[cfg_attr(test, assert_instr(vtestpd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub fn _mm_testc_pd(a: __m128d, b: __m128d) -> i32 { - unsafe { vtestcpd(a, b) } + unsafe { + let r: i64x2 = simd_lt(transmute(_mm_andnot_pd(a, b)), i64x2::ZERO); + (0i64 == simd_reduce_or(r)) as i32 + } } /// Computes the bitwise AND of 128 bits (representing double-precision (64-bit) @@ -2135,7 +2197,10 @@ pub fn _mm256_testnzc_ps(a: __m256, b: __m256) -> i32 { #[cfg_attr(test, assert_instr(vtestps))] #[stable(feature = "simd_x86", since = "1.27.0")] pub fn _mm_testz_ps(a: __m128, b: __m128) -> i32 { - 
unsafe { vtestzps(a, b) } + unsafe { + let r: i32x4 = simd_lt(transmute(_mm_and_ps(a, b)), i32x4::ZERO); + (0i32 == simd_reduce_or(r)) as i32 + } } /// Computes the bitwise AND of 128 bits (representing single-precision (32-bit) @@ -2152,7 +2217,10 @@ pub fn _mm_testz_ps(a: __m128, b: __m128) -> i32 { #[cfg_attr(test, assert_instr(vtestps))] #[stable(feature = "simd_x86", since = "1.27.0")] pub fn _mm_testc_ps(a: __m128, b: __m128) -> i32 { - unsafe { vtestcps(a, b) } + unsafe { + let r: i32x4 = simd_lt(transmute(_mm_andnot_ps(a, b)), i32x4::ZERO); + (0i32 == simd_reduce_or(r)) as i32 + } } /// Computes the bitwise AND of 128 bits (representing single-precision (32-bit) @@ -3044,14 +3112,6 @@ unsafe extern "C" { fn roundps256(a: __m256, b: i32) -> __m256; #[link_name = "llvm.x86.avx.dp.ps.256"] fn vdpps(a: __m256, b: __m256, imm8: i8) -> __m256; - #[link_name = "llvm.x86.avx.hadd.pd.256"] - fn vhaddpd(a: __m256d, b: __m256d) -> __m256d; - #[link_name = "llvm.x86.avx.hadd.ps.256"] - fn vhaddps(a: __m256, b: __m256) -> __m256; - #[link_name = "llvm.x86.avx.hsub.pd.256"] - fn vhsubpd(a: __m256d, b: __m256d) -> __m256d; - #[link_name = "llvm.x86.avx.hsub.ps.256"] - fn vhsubps(a: __m256, b: __m256) -> __m256; #[link_name = "llvm.x86.sse2.cmp.pd"] fn vcmppd(a: __m128d, b: __m128d, imm8: i8) -> __m128d; #[link_name = "llvm.x86.avx.cmp.pd.256"] @@ -3084,12 +3144,6 @@ unsafe extern "C" { fn vpermilpd256(a: __m256d, b: i64x4) -> __m256d; #[link_name = "llvm.x86.avx.vpermilvar.pd"] fn vpermilpd(a: __m128d, b: i64x2) -> __m128d; - #[link_name = "llvm.x86.avx.vperm2f128.ps.256"] - fn vperm2f128ps256(a: __m256, b: __m256, imm8: i8) -> __m256; - #[link_name = "llvm.x86.avx.vperm2f128.pd.256"] - fn vperm2f128pd256(a: __m256d, b: __m256d, imm8: i8) -> __m256d; - #[link_name = "llvm.x86.avx.vperm2f128.si.256"] - fn vperm2f128si256(a: i32x8, b: i32x8, imm8: i8) -> i32x8; #[link_name = "llvm.x86.avx.maskload.pd.256"] fn maskloadpd256(mem_addr: *const i8, mask: i64x4) -> __m256d; #[link_name = "llvm.x86.avx.maskstore.pd.256"] @@ -3112,10 +3166,6 @@ unsafe extern "C" { fn vrcpps(a: __m256) -> __m256; #[link_name = "llvm.x86.avx.rsqrt.ps.256"] fn vrsqrtps(a: __m256) -> __m256; - #[link_name = "llvm.x86.avx.ptestz.256"] - fn ptestz256(a: i64x4, b: i64x4) -> i32; - #[link_name = "llvm.x86.avx.ptestc.256"] - fn ptestc256(a: i64x4, b: i64x4) -> i32; #[link_name = "llvm.x86.avx.ptestnzc.256"] fn ptestnzc256(a: i64x4, b: i64x4) -> i32; #[link_name = "llvm.x86.avx.vtestz.pd.256"] @@ -3124,10 +3174,6 @@ unsafe extern "C" { fn vtestcpd256(a: __m256d, b: __m256d) -> i32; #[link_name = "llvm.x86.avx.vtestnzc.pd.256"] fn vtestnzcpd256(a: __m256d, b: __m256d) -> i32; - #[link_name = "llvm.x86.avx.vtestz.pd"] - fn vtestzpd(a: __m128d, b: __m128d) -> i32; - #[link_name = "llvm.x86.avx.vtestc.pd"] - fn vtestcpd(a: __m128d, b: __m128d) -> i32; #[link_name = "llvm.x86.avx.vtestnzc.pd"] fn vtestnzcpd(a: __m128d, b: __m128d) -> i32; #[link_name = "llvm.x86.avx.vtestz.ps.256"] @@ -3136,10 +3182,6 @@ unsafe extern "C" { fn vtestcps256(a: __m256, b: __m256) -> i32; #[link_name = "llvm.x86.avx.vtestnzc.ps.256"] fn vtestnzcps256(a: __m256, b: __m256) -> i32; - #[link_name = "llvm.x86.avx.vtestz.ps"] - fn vtestzps(a: __m128, b: __m128) -> i32; - #[link_name = "llvm.x86.avx.vtestc.ps"] - fn vtestcps(a: __m128, b: __m128) -> i32; #[link_name = "llvm.x86.avx.vtestnzc.ps"] fn vtestnzcps(a: __m128, b: __m128) -> i32; #[link_name = "llvm.x86.avx.min.ps.256"] diff --git a/crates/core_arch/src/x86/avx2.rs b/crates/core_arch/src/x86/avx2.rs index 
739de2b341..8be6629f79 100644 --- a/crates/core_arch/src/x86/avx2.rs +++ b/crates/core_arch/src/x86/avx2.rs @@ -891,7 +891,21 @@ pub fn _mm256_extracti128_si256(a: __m256i) -> __m128i { #[cfg_attr(test, assert_instr(vphaddw))] #[stable(feature = "simd_x86", since = "1.27.0")] pub fn _mm256_hadd_epi16(a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(phaddw(a.as_i16x16(), b.as_i16x16())) } + let a = a.as_i16x16(); + let b = b.as_i16x16(); + unsafe { + let even: i16x16 = simd_shuffle!( + a, + b, + [0, 2, 4, 6, 16, 18, 20, 22, 8, 10, 12, 14, 24, 26, 28, 30] + ); + let odd: i16x16 = simd_shuffle!( + a, + b, + [1, 3, 5, 7, 17, 19, 21, 23, 9, 11, 13, 15, 25, 27, 29, 31] + ); + simd_add(even, odd).as_m256i() + } } /// Horizontally adds adjacent pairs of 32-bit integers in `a` and `b`. @@ -902,7 +916,13 @@ pub fn _mm256_hadd_epi16(a: __m256i, b: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vphaddd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub fn _mm256_hadd_epi32(a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(phaddd(a.as_i32x8(), b.as_i32x8())) } + let a = a.as_i32x8(); + let b = b.as_i32x8(); + unsafe { + let even: i32x8 = simd_shuffle!(a, b, [0, 2, 8, 10, 4, 6, 12, 14]); + let odd: i32x8 = simd_shuffle!(a, b, [1, 3, 9, 11, 5, 7, 13, 15]); + simd_add(even, odd).as_m256i() + } } /// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b` @@ -925,7 +945,21 @@ pub fn _mm256_hadds_epi16(a: __m256i, b: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vphsubw))] #[stable(feature = "simd_x86", since = "1.27.0")] pub fn _mm256_hsub_epi16(a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(phsubw(a.as_i16x16(), b.as_i16x16())) } + let a = a.as_i16x16(); + let b = b.as_i16x16(); + unsafe { + let even: i16x16 = simd_shuffle!( + a, + b, + [0, 2, 4, 6, 16, 18, 20, 22, 8, 10, 12, 14, 24, 26, 28, 30] + ); + let odd: i16x16 = simd_shuffle!( + a, + b, + [1, 3, 5, 7, 17, 19, 21, 23, 9, 11, 13, 15, 25, 27, 29, 31] + ); + simd_sub(even, odd).as_m256i() + } } /// Horizontally subtract adjacent pairs of 32-bit integers in `a` and `b`. 
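// Illustrative scalar sketch (not part of the diff above): the horizontal
// add/sub rewrites replace the vphadd/vphsub intrinsics with an even/odd
// simd_shuffle! over the concatenation of `a` and `b`, followed by a
// lane-wise add or sub. The model below mirrors the _mm256_hadd_epi32 index
// pattern [0, 2, 8, 10, 4, 6, 12, 14] / [1, 3, 9, 11, 5, 7, 13, 15], where
// indices 0..8 pick from `a` and 8..16 pick from `b`; `hadd_epi32_model` is
// an ad-hoc name for this sketch, not a stdarch item.
fn hadd_epi32_model(a: [i32; 8], b: [i32; 8]) -> [i32; 8] {
    const EVEN: [usize; 8] = [0, 2, 8, 10, 4, 6, 12, 14];
    const ODD: [usize; 8] = [1, 3, 9, 11, 5, 7, 13, 15];
    let concat: Vec<i32> = a.iter().chain(b.iter()).copied().collect();
    let mut out = [0i32; 8];
    for i in 0..8 {
        // Each output element is the sum of one adjacent (even, odd) pair.
        out[i] = concat[EVEN[i]].wrapping_add(concat[ODD[i]]);
    }
    out
}

fn main() {
    let a = [1, 2, 3, 4, 5, 6, 7, 8];
    let b = [10, 20, 30, 40, 50, 60, 70, 80];
    // Per 128-bit lane: pair sums from `a`, then pair sums from `b`.
    assert_eq!(hadd_epi32_model(a, b), [3, 7, 30, 70, 11, 15, 110, 150]);
}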
@@ -936,7 +970,13 @@ pub fn _mm256_hsub_epi16(a: __m256i, b: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vphsubd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub fn _mm256_hsub_epi32(a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(phsubd(a.as_i32x8(), b.as_i32x8())) } + let a = a.as_i32x8(); + let b = b.as_i32x8(); + unsafe { + let even: i32x8 = simd_shuffle!(a, b, [0, 2, 8, 10, 4, 6, 12, 14]); + let odd: i32x8 = simd_shuffle!(a, b, [1, 3, 9, 11, 5, 7, 13, 15]); + simd_sub(even, odd).as_m256i() + } } /// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b` @@ -1714,7 +1754,12 @@ pub fn _mm256_inserti128_si256(a: __m256i, b: __m128i) -> __m25 #[cfg_attr(test, assert_instr(vpmaddwd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub fn _mm256_madd_epi16(a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(pmaddwd(a.as_i16x16(), b.as_i16x16())) } + unsafe { + let r: i32x16 = simd_mul(simd_cast(a.as_i16x16()), simd_cast(b.as_i16x16())); + let even: i32x8 = simd_shuffle!(r, r, [0, 2, 4, 6, 8, 10, 12, 14]); + let odd: i32x8 = simd_shuffle!(r, r, [1, 3, 5, 7, 9, 11, 13, 15]); + simd_add(even, odd).as_m256i() + } } /// Vertically multiplies each unsigned 8-bit integer from `a` with the @@ -2285,7 +2330,7 @@ pub fn _mm256_permute4x64_epi64(a: __m256i) -> __m256i { #[stable(feature = "simd_x86", since = "1.27.0")] pub fn _mm256_permute2x128_si256(a: __m256i, b: __m256i) -> __m256i { static_assert_uimm_bits!(IMM8, 8); - unsafe { transmute(vperm2i128(a.as_i64x4(), b.as_i64x4(), IMM8 as i8)) } + _mm256_permute2f128_si256::(a, b) } /// Shuffles 64-bit floating-point elements in `a` across lanes using the @@ -2733,7 +2778,7 @@ pub fn _mm256_bslli_epi128(a: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vpsllvd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub fn _mm_sllv_epi32(a: __m128i, count: __m128i) -> __m128i { - unsafe { transmute(psllvd(a.as_i32x4(), count.as_i32x4())) } + unsafe { transmute(simd_shl(a.as_u32x4(), count.as_u32x4())) } } /// Shifts packed 32-bit integers in `a` left by the amount @@ -2746,7 +2791,7 @@ pub fn _mm_sllv_epi32(a: __m128i, count: __m128i) -> __m128i { #[cfg_attr(test, assert_instr(vpsllvd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub fn _mm256_sllv_epi32(a: __m256i, count: __m256i) -> __m256i { - unsafe { transmute(psllvd256(a.as_i32x8(), count.as_i32x8())) } + unsafe { transmute(simd_shl(a.as_u32x8(), count.as_u32x8())) } } /// Shifts packed 64-bit integers in `a` left by the amount @@ -2759,7 +2804,7 @@ pub fn _mm256_sllv_epi32(a: __m256i, count: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vpsllvq))] #[stable(feature = "simd_x86", since = "1.27.0")] pub fn _mm_sllv_epi64(a: __m128i, count: __m128i) -> __m128i { - unsafe { transmute(psllvq(a.as_i64x2(), count.as_i64x2())) } + unsafe { transmute(simd_shl(a.as_u64x2(), count.as_u64x2())) } } /// Shifts packed 64-bit integers in `a` left by the amount @@ -2772,7 +2817,7 @@ pub fn _mm_sllv_epi64(a: __m128i, count: __m128i) -> __m128i { #[cfg_attr(test, assert_instr(vpsllvq))] #[stable(feature = "simd_x86", since = "1.27.0")] pub fn _mm256_sllv_epi64(a: __m256i, count: __m256i) -> __m256i { - unsafe { transmute(psllvq256(a.as_i64x4(), count.as_i64x4())) } + unsafe { transmute(simd_shl(a.as_u64x4(), count.as_u64x4())) } } /// Shifts packed 16-bit integers in `a` right by `count` while @@ -2836,7 +2881,7 @@ pub fn _mm256_srai_epi32(a: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vpsravd))] #[stable(feature = "simd_x86", since = 
"1.27.0")] pub fn _mm_srav_epi32(a: __m128i, count: __m128i) -> __m128i { - unsafe { transmute(psravd(a.as_i32x4(), count.as_i32x4())) } + unsafe { transmute(simd_shr(a.as_i32x4(), count.as_i32x4())) } } /// Shifts packed 32-bit integers in `a` right by the amount specified by the @@ -2848,7 +2893,7 @@ pub fn _mm_srav_epi32(a: __m128i, count: __m128i) -> __m128i { #[cfg_attr(test, assert_instr(vpsravd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub fn _mm256_srav_epi32(a: __m256i, count: __m256i) -> __m256i { - unsafe { transmute(psravd256(a.as_i32x8(), count.as_i32x8())) } + unsafe { transmute(simd_shr(a.as_i32x8(), count.as_i32x8())) } } /// Shifts 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros. @@ -3031,7 +3076,7 @@ pub fn _mm256_srli_epi64(a: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vpsrlvd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub fn _mm_srlv_epi32(a: __m128i, count: __m128i) -> __m128i { - unsafe { transmute(psrlvd(a.as_i32x4(), count.as_i32x4())) } + unsafe { transmute(simd_shr(a.as_u32x4(), count.as_u32x4())) } } /// Shifts packed 32-bit integers in `a` right by the amount specified by @@ -3043,7 +3088,7 @@ pub fn _mm_srlv_epi32(a: __m128i, count: __m128i) -> __m128i { #[cfg_attr(test, assert_instr(vpsrlvd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub fn _mm256_srlv_epi32(a: __m256i, count: __m256i) -> __m256i { - unsafe { transmute(psrlvd256(a.as_i32x8(), count.as_i32x8())) } + unsafe { transmute(simd_shr(a.as_u32x8(), count.as_u32x8())) } } /// Shifts packed 64-bit integers in `a` right by the amount specified by @@ -3055,7 +3100,7 @@ pub fn _mm256_srlv_epi32(a: __m256i, count: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vpsrlvq))] #[stable(feature = "simd_x86", since = "1.27.0")] pub fn _mm_srlv_epi64(a: __m128i, count: __m128i) -> __m128i { - unsafe { transmute(psrlvq(a.as_i64x2(), count.as_i64x2())) } + unsafe { transmute(simd_shr(a.as_u64x2(), count.as_u64x2())) } } /// Shifts packed 64-bit integers in `a` right by the amount specified by @@ -3067,7 +3112,7 @@ pub fn _mm_srlv_epi64(a: __m128i, count: __m128i) -> __m128i { #[cfg_attr(test, assert_instr(vpsrlvq))] #[stable(feature = "simd_x86", since = "1.27.0")] pub fn _mm256_srlv_epi64(a: __m256i, count: __m256i) -> __m256i { - unsafe { transmute(psrlvq256(a.as_i64x4(), count.as_i64x4())) } + unsafe { transmute(simd_shr(a.as_u64x4(), count.as_u64x4())) } } /// Load 256-bits of integer data from memory into dst using a non-temporal memory hint. 
mem_addr @@ -3594,20 +3639,10 @@ pub fn _mm256_extract_epi16(a: __m256i) -> i32 { #[allow(improper_ctypes)] unsafe extern "C" { - #[link_name = "llvm.x86.avx2.phadd.w"] - fn phaddw(a: i16x16, b: i16x16) -> i16x16; - #[link_name = "llvm.x86.avx2.phadd.d"] - fn phaddd(a: i32x8, b: i32x8) -> i32x8; #[link_name = "llvm.x86.avx2.phadd.sw"] fn phaddsw(a: i16x16, b: i16x16) -> i16x16; - #[link_name = "llvm.x86.avx2.phsub.w"] - fn phsubw(a: i16x16, b: i16x16) -> i16x16; - #[link_name = "llvm.x86.avx2.phsub.d"] - fn phsubd(a: i32x8, b: i32x8) -> i32x8; #[link_name = "llvm.x86.avx2.phsub.sw"] fn phsubsw(a: i16x16, b: i16x16) -> i16x16; - #[link_name = "llvm.x86.avx2.pmadd.wd"] - fn pmaddwd(a: i16x16, b: i16x16) -> i32x8; #[link_name = "llvm.x86.avx2.pmadd.ub.sw"] fn pmaddubsw(a: u8x32, b: u8x32) -> i16x16; #[link_name = "llvm.x86.avx2.maskload.d"] @@ -3652,44 +3687,22 @@ unsafe extern "C" { fn pslld(a: i32x8, count: i32x4) -> i32x8; #[link_name = "llvm.x86.avx2.psll.q"] fn psllq(a: i64x4, count: i64x2) -> i64x4; - #[link_name = "llvm.x86.avx2.psllv.d"] - fn psllvd(a: i32x4, count: i32x4) -> i32x4; - #[link_name = "llvm.x86.avx2.psllv.d.256"] - fn psllvd256(a: i32x8, count: i32x8) -> i32x8; - #[link_name = "llvm.x86.avx2.psllv.q"] - fn psllvq(a: i64x2, count: i64x2) -> i64x2; - #[link_name = "llvm.x86.avx2.psllv.q.256"] - fn psllvq256(a: i64x4, count: i64x4) -> i64x4; #[link_name = "llvm.x86.avx2.psra.w"] fn psraw(a: i16x16, count: i16x8) -> i16x16; #[link_name = "llvm.x86.avx2.psra.d"] fn psrad(a: i32x8, count: i32x4) -> i32x8; - #[link_name = "llvm.x86.avx2.psrav.d"] - fn psravd(a: i32x4, count: i32x4) -> i32x4; - #[link_name = "llvm.x86.avx2.psrav.d.256"] - fn psravd256(a: i32x8, count: i32x8) -> i32x8; #[link_name = "llvm.x86.avx2.psrl.w"] fn psrlw(a: i16x16, count: i16x8) -> i16x16; #[link_name = "llvm.x86.avx2.psrl.d"] fn psrld(a: i32x8, count: i32x4) -> i32x8; #[link_name = "llvm.x86.avx2.psrl.q"] fn psrlq(a: i64x4, count: i64x2) -> i64x4; - #[link_name = "llvm.x86.avx2.psrlv.d"] - fn psrlvd(a: i32x4, count: i32x4) -> i32x4; - #[link_name = "llvm.x86.avx2.psrlv.d.256"] - fn psrlvd256(a: i32x8, count: i32x8) -> i32x8; - #[link_name = "llvm.x86.avx2.psrlv.q"] - fn psrlvq(a: i64x2, count: i64x2) -> i64x2; - #[link_name = "llvm.x86.avx2.psrlv.q.256"] - fn psrlvq256(a: i64x4, count: i64x4) -> i64x4; #[link_name = "llvm.x86.avx2.pshuf.b"] fn pshufb(a: u8x32, b: u8x32) -> u8x32; #[link_name = "llvm.x86.avx2.permd"] fn permd(a: u32x8, b: u32x8) -> u32x8; #[link_name = "llvm.x86.avx2.permps"] fn permps(a: __m256, b: i32x8) -> __m256; - #[link_name = "llvm.x86.avx2.vperm2i128"] - fn vperm2i128(a: i64x4, b: i64x4, imm8: i8) -> i64x4; #[link_name = "llvm.x86.avx2.gather.d.d"] fn pgatherdd(src: i32x4, slice: *const i8, offsets: i32x4, mask: i32x4, scale: i8) -> i32x4; #[link_name = "llvm.x86.avx2.gather.d.d.256"] diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index 8139b8cd6f..1771f19659 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -5835,7 +5835,20 @@ pub unsafe fn _mm_mask_storeu_epi8(mem_addr: *mut i8, mask: __mmask16, a: __m128 #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpmaddwd))] pub fn _mm512_madd_epi16(a: __m512i, b: __m512i) -> __m512i { - unsafe { transmute(vpmaddwd(a.as_i16x32(), b.as_i16x32())) } + unsafe { + let r: i32x32 = simd_mul(simd_cast(a.as_i16x32()), simd_cast(b.as_i16x32())); + let even: i32x16 = simd_shuffle!( + r, + r, + [0, 2, 4, 6, 8, 10, 12, 14, 
16, 18, 20, 22, 24, 26, 28, 30] + ); + let odd: i32x16 = simd_shuffle!( + r, + r, + [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31] + ); + simd_add(even, odd).as_m512i() + } } /// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -6839,7 +6852,7 @@ pub fn _mm_maskz_slli_epi16(k: __mmask8, a: __m128i) -> __m128i #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpsllvw))] pub fn _mm512_sllv_epi16(a: __m512i, count: __m512i) -> __m512i { - unsafe { transmute(vpsllvw(a.as_i16x32(), count.as_i16x32())) } + unsafe { transmute(simd_shl(a.as_u16x32(), count.as_u16x32())) } } /// Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -6878,7 +6891,7 @@ pub fn _mm512_maskz_sllv_epi16(k: __mmask32, a: __m512i, count: __m512i) -> __m5 #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpsllvw))] pub fn _mm256_sllv_epi16(a: __m256i, count: __m256i) -> __m256i { - unsafe { transmute(vpsllvw256(a.as_i16x16(), count.as_i16x16())) } + unsafe { transmute(simd_shl(a.as_u16x16(), count.as_u16x16())) } } /// Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -6917,7 +6930,7 @@ pub fn _mm256_maskz_sllv_epi16(k: __mmask16, a: __m256i, count: __m256i) -> __m2 #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpsllvw))] pub fn _mm_sllv_epi16(a: __m128i, count: __m128i) -> __m128i { - unsafe { transmute(vpsllvw128(a.as_i16x8(), count.as_i16x8())) } + unsafe { transmute(simd_shl(a.as_u16x8(), count.as_u16x8())) } } /// Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -7175,7 +7188,7 @@ pub fn _mm_maskz_srli_epi16(k: __mmask8, a: __m128i) -> __m128i #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpsrlvw))] pub fn _mm512_srlv_epi16(a: __m512i, count: __m512i) -> __m512i { - unsafe { transmute(vpsrlvw(a.as_i16x32(), count.as_i16x32())) } + unsafe { transmute(simd_shr(a.as_u16x32(), count.as_u16x32())) } } /// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
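// Illustrative scalar sketch (not part of the diff above): the
// _mm512_madd_epi16 rewrite widens both inputs to 32 bits, multiplies, and
// then adds even/odd neighbours. Per output element that is
// a[2j]*b[2j] + a[2j+1]*b[2j+1]; each product fits in 32 bits, and the lone
// all-(-32768) case that can overflow the pair sum wraps on the hardware,
// hence wrapping_add below. `madd_epi16_model` is an ad-hoc name for this
// sketch, not a stdarch item.
fn madd_epi16_model(a: &[i16], b: &[i16]) -> Vec<i32> {
    assert_eq!(a.len(), b.len());
    assert_eq!(a.len() % 2, 0);
    a.chunks_exact(2)
        .zip(b.chunks_exact(2))
        .map(|(pa, pb)| {
            let lo = pa[0] as i32 * pb[0] as i32; // widening multiply
            let hi = pa[1] as i32 * pb[1] as i32;
            lo.wrapping_add(hi) // horizontal pair add
        })
        .collect()
}

fn main() {
    let a = [1i16, 2, 3, 4];
    let b = [10i16, 20, 30, 40];
    // (1*10 + 2*20, 3*30 + 4*40)
    assert_eq!(madd_epi16_model(&a, &b), vec![50, 250]);
}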
@@ -7214,7 +7227,7 @@ pub fn _mm512_maskz_srlv_epi16(k: __mmask32, a: __m512i, count: __m512i) -> __m5 #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpsrlvw))] pub fn _mm256_srlv_epi16(a: __m256i, count: __m256i) -> __m256i { - unsafe { transmute(vpsrlvw256(a.as_i16x16(), count.as_i16x16())) } + unsafe { transmute(simd_shr(a.as_u16x16(), count.as_u16x16())) } } /// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -7253,7 +7266,7 @@ pub fn _mm256_maskz_srlv_epi16(k: __mmask16, a: __m256i, count: __m256i) -> __m2 #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpsrlvw))] pub fn _mm_srlv_epi16(a: __m128i, count: __m128i) -> __m128i { - unsafe { transmute(vpsrlvw128(a.as_i16x8(), count.as_i16x8())) } + unsafe { transmute(simd_shr(a.as_u16x8(), count.as_u16x8())) } } /// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -7498,7 +7511,7 @@ pub fn _mm_maskz_srai_epi16(k: __mmask8, a: __m128i) -> __m128i #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpsravw))] pub fn _mm512_srav_epi16(a: __m512i, count: __m512i) -> __m512i { - unsafe { transmute(vpsravw(a.as_i16x32(), count.as_i16x32())) } + unsafe { transmute(simd_shr(a.as_i16x32(), count.as_i16x32())) } } /// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -7537,7 +7550,7 @@ pub fn _mm512_maskz_srav_epi16(k: __mmask32, a: __m512i, count: __m512i) -> __m5 #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpsravw))] pub fn _mm256_srav_epi16(a: __m256i, count: __m256i) -> __m256i { - unsafe { transmute(vpsravw256(a.as_i16x16(), count.as_i16x16())) } + unsafe { transmute(simd_shr(a.as_i16x16(), count.as_i16x16())) } } /// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -7576,7 +7589,7 @@ pub fn _mm256_maskz_srav_epi16(k: __mmask16, a: __m256i, count: __m256i) -> __m2 #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpsravw))] pub fn _mm_srav_epi16(a: __m128i, count: __m128i) -> __m128i { - unsafe { transmute(vpsravw128(a.as_i16x8(), count.as_i16x8())) } + unsafe { transmute(simd_shr(a.as_i16x8(), count.as_i16x8())) } } /// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
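// Illustrative scalar sketch (not part of the diff above): the
// variable-shift rewrites pick the lane signedness to select the shift kind:
// vpsllvw/vpsrlvw become simd_shl/simd_shr on unsigned lanes (zero fill),
// vpsravw becomes simd_shr on signed lanes (sign fill). The model below shows
// that distinction for one 16-bit element, with the architectural handling of
// out-of-range counts (>= 16 gives 0 for logical shifts and all sign bits for
// arithmetic shifts) spelled out explicitly. The *_model names are ad-hoc to
// this sketch, not stdarch items.
fn srlv_epi16_model(a: u16, count: u32) -> u16 {
    // Logical right shift: zero fill; counts >= 16 clear the element.
    if count >= 16 { 0 } else { a >> count }
}

fn srav_epi16_model(a: i16, count: u32) -> i16 {
    // Arithmetic right shift: sign fill; counts >= 16 behave like 15.
    a >> count.min(15)
}

fn main() {
    assert_eq!(srlv_epi16_model(0x8000, 4), 0x0800); // zeros shifted in
    assert_eq!(srav_epi16_model(-32768, 4), -2048); // sign bits shifted in
    assert_eq!(srlv_epi16_model(0x8000, 20), 0);
    assert_eq!(srav_epi16_model(-32768, 20), -1);
}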
@@ -11617,8 +11630,6 @@ unsafe extern "C" { #[link_name = "llvm.x86.avx512.pmul.hr.sw.512"] fn vpmulhrsw(a: i16x32, b: i16x32) -> i16x32; - #[link_name = "llvm.x86.avx512.pmaddw.d.512"] - fn vpmaddwd(a: i16x32, b: i16x32) -> i32x16; #[link_name = "llvm.x86.avx512.pmaddubs.w.512"] fn vpmaddubsw(a: i8x64, b: i8x64) -> i16x32; @@ -11634,33 +11645,12 @@ unsafe extern "C" { #[link_name = "llvm.x86.avx512.psll.w.512"] fn vpsllw(a: i16x32, count: i16x8) -> i16x32; - #[link_name = "llvm.x86.avx512.psllv.w.512"] - fn vpsllvw(a: i16x32, b: i16x32) -> i16x32; - #[link_name = "llvm.x86.avx512.psllv.w.256"] - fn vpsllvw256(a: i16x16, b: i16x16) -> i16x16; - #[link_name = "llvm.x86.avx512.psllv.w.128"] - fn vpsllvw128(a: i16x8, b: i16x8) -> i16x8; - #[link_name = "llvm.x86.avx512.psrl.w.512"] fn vpsrlw(a: i16x32, count: i16x8) -> i16x32; - #[link_name = "llvm.x86.avx512.psrlv.w.512"] - fn vpsrlvw(a: i16x32, b: i16x32) -> i16x32; - #[link_name = "llvm.x86.avx512.psrlv.w.256"] - fn vpsrlvw256(a: i16x16, b: i16x16) -> i16x16; - #[link_name = "llvm.x86.avx512.psrlv.w.128"] - fn vpsrlvw128(a: i16x8, b: i16x8) -> i16x8; - #[link_name = "llvm.x86.avx512.psra.w.512"] fn vpsraw(a: i16x32, count: i16x8) -> i16x32; - #[link_name = "llvm.x86.avx512.psrav.w.512"] - fn vpsravw(a: i16x32, count: i16x32) -> i16x32; - #[link_name = "llvm.x86.avx512.psrav.w.256"] - fn vpsravw256(a: i16x16, count: i16x16) -> i16x16; - #[link_name = "llvm.x86.avx512.psrav.w.128"] - fn vpsravw128(a: i16x8, count: i16x8) -> i16x8; - #[link_name = "llvm.x86.avx512.vpermi2var.hi.512"] fn vpermi2w(a: i16x32, idx: i16x32, b: i16x32) -> i16x32; #[link_name = "llvm.x86.avx512.vpermi2var.hi.256"] diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 52c6a11a43..002534a65d 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -19077,12 +19077,8 @@ pub fn _mm_maskz_expand_pd(k: __mmask8, a: __m128d) -> __m128d { #[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] #[rustc_legacy_const_generics(1)] pub fn _mm512_rol_epi32(a: __m512i) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x16(); - let r = vprold(a, IMM8); - transmute(r) - } + static_assert_uimm_bits!(IMM8, 8); + _mm512_rolv_epi32(a, _mm512_set1_epi32(IMM8)) } /// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -19094,12 +19090,8 @@ pub fn _mm512_rol_epi32(a: __m512i) -> __m512i { #[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] #[rustc_legacy_const_generics(3)] pub fn _mm512_mask_rol_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x16(); - let r = vprold(a, IMM8); - transmute(simd_select_bitmask(k, r, src.as_i32x16())) - } + static_assert_uimm_bits!(IMM8, 8); + _mm512_mask_rolv_epi32(src, k, a, _mm512_set1_epi32(IMM8)) } /// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
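// Illustrative scalar sketch (not part of the diff above): the
// immediate-count rotate rewrites forward to the variable-count rotate with
// the immediate broadcast to every lane (e.g. _mm512_set1_epi32(IMM8)). Per
// lane that is simply a rotate by the count modulo the element width, which
// the model below makes explicit; the *_model names are ad-hoc to this
// sketch, not stdarch items.
fn rol_epi32_model(a: &[u32], imm8: u32) -> Vec<u32> {
    // Immediate form: every lane rotates by the same count.
    a.iter().map(|&x| x.rotate_left(imm8 % 32)).collect()
}

fn rolv_epi32_model(a: &[u32], counts: &[u32]) -> Vec<u32> {
    // Variable form: each lane rotates by its own count.
    a.iter()
        .zip(counts)
        .map(|(&x, &n)| x.rotate_left(n % 32))
        .collect()
}

fn main() {
    let a = [0x8000_0001u32, 0x1234_5678];
    let imm8 = 33; // only the count modulo 32 matters for a 32-bit rotate
    let splat = vec![imm8; a.len()]; // what set1 does, one count per lane
    assert_eq!(rol_epi32_model(&a, imm8), rolv_epi32_model(&a, &splat));
    assert_eq!(rol_epi32_model(&a, imm8)[0], 0x0000_0003);
}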
@@ -19111,12 +19103,8 @@ pub fn _mm512_mask_rol_epi32(src: __m512i, k: __mmask16, a: __m #[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] #[rustc_legacy_const_generics(2)] pub fn _mm512_maskz_rol_epi32(k: __mmask16, a: __m512i) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x16(); - let r = vprold(a, IMM8); - transmute(simd_select_bitmask(k, r, i32x16::ZERO)) - } + static_assert_uimm_bits!(IMM8, 8); + _mm512_maskz_rolv_epi32(k, a, _mm512_set1_epi32(IMM8)) } /// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst. @@ -19128,12 +19116,8 @@ pub fn _mm512_maskz_rol_epi32(k: __mmask16, a: __m512i) -> __m5 #[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] #[rustc_legacy_const_generics(1)] pub fn _mm256_rol_epi32(a: __m256i) -> __m256i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x8(); - let r = vprold256(a, IMM8); - transmute(r) - } + static_assert_uimm_bits!(IMM8, 8); + _mm256_rolv_epi32(a, _mm256_set1_epi32(IMM8)) } /// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -19145,12 +19129,8 @@ pub fn _mm256_rol_epi32(a: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] #[rustc_legacy_const_generics(3)] pub fn _mm256_mask_rol_epi32(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x8(); - let r = vprold256(a, IMM8); - transmute(simd_select_bitmask(k, r, src.as_i32x8())) - } + static_assert_uimm_bits!(IMM8, 8); + _mm256_mask_rolv_epi32(src, k, a, _mm256_set1_epi32(IMM8)) } /// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -19162,12 +19142,8 @@ pub fn _mm256_mask_rol_epi32(src: __m256i, k: __mmask8, a: __m2 #[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] #[rustc_legacy_const_generics(2)] pub fn _mm256_maskz_rol_epi32(k: __mmask8, a: __m256i) -> __m256i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x8(); - let r = vprold256(a, IMM8); - transmute(simd_select_bitmask(k, r, i32x8::ZERO)) - } + static_assert_uimm_bits!(IMM8, 8); + _mm256_maskz_rolv_epi32(k, a, _mm256_set1_epi32(IMM8)) } /// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst. @@ -19179,12 +19155,8 @@ pub fn _mm256_maskz_rol_epi32(k: __mmask8, a: __m256i) -> __m25 #[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] #[rustc_legacy_const_generics(1)] pub fn _mm_rol_epi32(a: __m128i) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x4(); - let r = vprold128(a, IMM8); - transmute(r) - } + static_assert_uimm_bits!(IMM8, 8); + _mm_rolv_epi32(a, _mm_set1_epi32(IMM8)) } /// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -19196,12 +19168,8 @@ pub fn _mm_rol_epi32(a: __m128i) -> __m128i { #[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] #[rustc_legacy_const_generics(3)] pub fn _mm_mask_rol_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x4(); - let r = vprold128(a, IMM8); - transmute(simd_select_bitmask(k, r, src.as_i32x4())) - } + static_assert_uimm_bits!(IMM8, 8); + _mm_mask_rolv_epi32(src, k, a, _mm_set1_epi32(IMM8)) } /// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -19213,12 +19181,8 @@ pub fn _mm_mask_rol_epi32(src: __m128i, k: __mmask8, a: __m128i #[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] #[rustc_legacy_const_generics(2)] pub fn _mm_maskz_rol_epi32(k: __mmask8, a: __m128i) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x4(); - let r = vprold128(a, IMM8); - transmute(simd_select_bitmask(k, r, i32x4::ZERO)) - } + static_assert_uimm_bits!(IMM8, 8); + _mm_maskz_rolv_epi32(k, a, _mm_set1_epi32(IMM8)) } /// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst. @@ -19230,12 +19194,8 @@ pub fn _mm_maskz_rol_epi32(k: __mmask8, a: __m128i) -> __m128i #[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] #[rustc_legacy_const_generics(1)] pub fn _mm512_ror_epi32(a: __m512i) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x16(); - let r = vprord(a, IMM8); - transmute(r) - } + static_assert_uimm_bits!(IMM8, 8); + _mm512_rorv_epi32(a, _mm512_set1_epi32(IMM8)) } /// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -19247,12 +19207,8 @@ pub fn _mm512_ror_epi32(a: __m512i) -> __m512i { #[cfg_attr(test, assert_instr(vprold, IMM8 = 123))] #[rustc_legacy_const_generics(3)] pub fn _mm512_mask_ror_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x16(); - let r = vprord(a, IMM8); - transmute(simd_select_bitmask(k, r, src.as_i32x16())) - } + static_assert_uimm_bits!(IMM8, 8); + _mm512_mask_rorv_epi32(src, k, a, _mm512_set1_epi32(IMM8)) } /// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -19264,12 +19220,8 @@ pub fn _mm512_mask_ror_epi32(src: __m512i, k: __mmask16, a: __m #[cfg_attr(test, assert_instr(vprold, IMM8 = 123))] #[rustc_legacy_const_generics(2)] pub fn _mm512_maskz_ror_epi32(k: __mmask16, a: __m512i) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x16(); - let r = vprord(a, IMM8); - transmute(simd_select_bitmask(k, r, i32x16::ZERO)) - } + static_assert_uimm_bits!(IMM8, 8); + _mm512_maskz_rorv_epi32(k, a, _mm512_set1_epi32(IMM8)) } /// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst. 
@@ -19281,12 +19233,8 @@ pub fn _mm512_maskz_ror_epi32(k: __mmask16, a: __m512i) -> __m5 #[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] #[rustc_legacy_const_generics(1)] pub fn _mm256_ror_epi32(a: __m256i) -> __m256i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x8(); - let r = vprord256(a, IMM8); - transmute(r) - } + static_assert_uimm_bits!(IMM8, 8); + _mm256_rorv_epi32(a, _mm256_set1_epi32(IMM8)) } /// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -19298,12 +19246,8 @@ pub fn _mm256_ror_epi32(a: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vprold, IMM8 = 123))] #[rustc_legacy_const_generics(3)] pub fn _mm256_mask_ror_epi32(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x8(); - let r = vprord256(a, IMM8); - transmute(simd_select_bitmask(k, r, src.as_i32x8())) - } + static_assert_uimm_bits!(IMM8, 8); + _mm256_mask_rorv_epi32(src, k, a, _mm256_set1_epi32(IMM8)) } /// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -19315,12 +19259,8 @@ pub fn _mm256_mask_ror_epi32(src: __m256i, k: __mmask8, a: __m2 #[cfg_attr(test, assert_instr(vprold, IMM8 = 123))] #[rustc_legacy_const_generics(2)] pub fn _mm256_maskz_ror_epi32(k: __mmask8, a: __m256i) -> __m256i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x8(); - let r = vprord256(a, IMM8); - transmute(simd_select_bitmask(k, r, i32x8::ZERO)) - } + static_assert_uimm_bits!(IMM8, 8); + _mm256_maskz_rorv_epi32(k, a, _mm256_set1_epi32(IMM8)) } /// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst. @@ -19332,12 +19272,8 @@ pub fn _mm256_maskz_ror_epi32(k: __mmask8, a: __m256i) -> __m25 #[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] #[rustc_legacy_const_generics(1)] pub fn _mm_ror_epi32(a: __m128i) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x4(); - let r = vprord128(a, IMM8); - transmute(r) - } + static_assert_uimm_bits!(IMM8, 8); + _mm_rorv_epi32(a, _mm_set1_epi32(IMM8)) } /// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -19349,12 +19285,8 @@ pub fn _mm_ror_epi32(a: __m128i) -> __m128i { #[cfg_attr(test, assert_instr(vprold, IMM8 = 123))] #[rustc_legacy_const_generics(3)] pub fn _mm_mask_ror_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x4(); - let r = vprord128(a, IMM8); - transmute(simd_select_bitmask(k, r, src.as_i32x4())) - } + static_assert_uimm_bits!(IMM8, 8); + _mm_mask_rorv_epi32(src, k, a, _mm_set1_epi32(IMM8)) } /// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -19366,12 +19298,8 @@ pub fn _mm_mask_ror_epi32(src: __m128i, k: __mmask8, a: __m128i #[cfg_attr(test, assert_instr(vprold, IMM8 = 123))] #[rustc_legacy_const_generics(2)] pub fn _mm_maskz_ror_epi32(k: __mmask8, a: __m128i) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x4(); - let r = vprord128(a, IMM8); - transmute(simd_select_bitmask(k, r, i32x4::ZERO)) - } + static_assert_uimm_bits!(IMM8, 8); + _mm_maskz_rorv_epi32(k, a, _mm_set1_epi32(IMM8)) } /// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst. @@ -19383,12 +19311,8 @@ pub fn _mm_maskz_ror_epi32(k: __mmask8, a: __m128i) -> __m128i #[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))] #[rustc_legacy_const_generics(1)] pub fn _mm512_rol_epi64(a: __m512i) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x8(); - let r = vprolq(a, IMM8); - transmute(r) - } + static_assert_uimm_bits!(IMM8, 8); + _mm512_rolv_epi64(a, _mm512_set1_epi64(IMM8 as i64)) } /// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -19400,12 +19324,8 @@ pub fn _mm512_rol_epi64(a: __m512i) -> __m512i { #[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))] #[rustc_legacy_const_generics(3)] pub fn _mm512_mask_rol_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x8(); - let r = vprolq(a, IMM8); - transmute(simd_select_bitmask(k, r, src.as_i64x8())) - } + static_assert_uimm_bits!(IMM8, 8); + _mm512_mask_rolv_epi64(src, k, a, _mm512_set1_epi64(IMM8 as i64)) } /// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -19417,12 +19337,8 @@ pub fn _mm512_mask_rol_epi64(src: __m512i, k: __mmask8, a: __m5 #[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))] #[rustc_legacy_const_generics(2)] pub fn _mm512_maskz_rol_epi64(k: __mmask8, a: __m512i) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x8(); - let r = vprolq(a, IMM8); - transmute(simd_select_bitmask(k, r, i64x8::ZERO)) - } + static_assert_uimm_bits!(IMM8, 8); + _mm512_maskz_rolv_epi64(k, a, _mm512_set1_epi64(IMM8 as i64)) } /// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst. @@ -19434,12 +19350,8 @@ pub fn _mm512_maskz_rol_epi64(k: __mmask8, a: __m512i) -> __m51 #[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))] #[rustc_legacy_const_generics(1)] pub fn _mm256_rol_epi64(a: __m256i) -> __m256i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x4(); - let r = vprolq256(a, IMM8); - transmute(r) - } + static_assert_uimm_bits!(IMM8, 8); + _mm256_rolv_epi64(a, _mm256_set1_epi64x(IMM8 as i64)) } /// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -19451,12 +19363,8 @@ pub fn _mm256_rol_epi64(a: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))] #[rustc_legacy_const_generics(3)] pub fn _mm256_mask_rol_epi64(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x4(); - let r = vprolq256(a, IMM8); - transmute(simd_select_bitmask(k, r, src.as_i64x4())) - } + static_assert_uimm_bits!(IMM8, 8); + _mm256_mask_rolv_epi64(src, k, a, _mm256_set1_epi64x(IMM8 as i64)) } /// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -19468,12 +19376,8 @@ pub fn _mm256_mask_rol_epi64(src: __m256i, k: __mmask8, a: __m2 #[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))] #[rustc_legacy_const_generics(2)] pub fn _mm256_maskz_rol_epi64(k: __mmask8, a: __m256i) -> __m256i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x4(); - let r = vprolq256(a, IMM8); - transmute(simd_select_bitmask(k, r, i64x4::ZERO)) - } + static_assert_uimm_bits!(IMM8, 8); + _mm256_maskz_rolv_epi64(k, a, _mm256_set1_epi64x(IMM8 as i64)) } /// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst. @@ -19485,12 +19389,8 @@ pub fn _mm256_maskz_rol_epi64(k: __mmask8, a: __m256i) -> __m25 #[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))] #[rustc_legacy_const_generics(1)] pub fn _mm_rol_epi64(a: __m128i) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x2(); - let r = vprolq128(a, IMM8); - transmute(r) - } + static_assert_uimm_bits!(IMM8, 8); + _mm_rolv_epi64(a, _mm_set1_epi64x(IMM8 as i64)) } /// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -19502,12 +19402,8 @@ pub fn _mm_rol_epi64(a: __m128i) -> __m128i { #[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))] #[rustc_legacy_const_generics(3)] pub fn _mm_mask_rol_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x2(); - let r = vprolq128(a, IMM8); - transmute(simd_select_bitmask(k, r, src.as_i64x2())) - } + static_assert_uimm_bits!(IMM8, 8); + _mm_mask_rolv_epi64(src, k, a, _mm_set1_epi64x(IMM8 as i64)) } /// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -19519,12 +19415,8 @@ pub fn _mm_mask_rol_epi64(src: __m128i, k: __mmask8, a: __m128i #[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))] #[rustc_legacy_const_generics(2)] pub fn _mm_maskz_rol_epi64(k: __mmask8, a: __m128i) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x2(); - let r = vprolq128(a, IMM8); - transmute(simd_select_bitmask(k, r, i64x2::ZERO)) - } + static_assert_uimm_bits!(IMM8, 8); + _mm_maskz_rolv_epi64(k, a, _mm_set1_epi64x(IMM8 as i64)) } /// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst. 
@@ -19536,12 +19428,8 @@ pub fn _mm_maskz_rol_epi64(k: __mmask8, a: __m128i) -> __m128i #[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))] #[rustc_legacy_const_generics(1)] pub fn _mm512_ror_epi64(a: __m512i) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x8(); - let r = vprorq(a, IMM8); - transmute(r) - } + static_assert_uimm_bits!(IMM8, 8); + _mm512_rorv_epi64(a, _mm512_set1_epi64(IMM8 as i64)) } /// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -19553,12 +19441,8 @@ pub fn _mm512_ror_epi64(a: __m512i) -> __m512i { #[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))] #[rustc_legacy_const_generics(3)] pub fn _mm512_mask_ror_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x8(); - let r = vprorq(a, IMM8); - transmute(simd_select_bitmask(k, r, src.as_i64x8())) - } + static_assert_uimm_bits!(IMM8, 8); + _mm512_mask_rorv_epi64(src, k, a, _mm512_set1_epi64(IMM8 as i64)) } /// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -19570,12 +19454,8 @@ pub fn _mm512_mask_ror_epi64(src: __m512i, k: __mmask8, a: __m5 #[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))] #[rustc_legacy_const_generics(2)] pub fn _mm512_maskz_ror_epi64(k: __mmask8, a: __m512i) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x8(); - let r = vprorq(a, IMM8); - transmute(simd_select_bitmask(k, r, i64x8::ZERO)) - } + static_assert_uimm_bits!(IMM8, 8); + _mm512_maskz_rorv_epi64(k, a, _mm512_set1_epi64(IMM8 as i64)) } /// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst. @@ -19587,12 +19467,8 @@ pub fn _mm512_maskz_ror_epi64(k: __mmask8, a: __m512i) -> __m51 #[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))] #[rustc_legacy_const_generics(1)] pub fn _mm256_ror_epi64(a: __m256i) -> __m256i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x4(); - let r = vprorq256(a, IMM8); - transmute(r) - } + static_assert_uimm_bits!(IMM8, 8); + _mm256_rorv_epi64(a, _mm256_set1_epi64x(IMM8 as i64)) } /// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -19604,12 +19480,8 @@ pub fn _mm256_ror_epi64(a: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))] #[rustc_legacy_const_generics(3)] pub fn _mm256_mask_ror_epi64(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x4(); - let r = vprorq256(a, IMM8); - transmute(simd_select_bitmask(k, r, src.as_i64x4())) - } + static_assert_uimm_bits!(IMM8, 8); + _mm256_mask_rorv_epi64(src, k, a, _mm256_set1_epi64x(IMM8 as i64)) } /// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
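// Illustrative scalar sketch (not part of the diff above): the right-rotate
// rewrites are the mirror image of the left-rotate ones; for any count n,
// rotating a 64-bit lane right by n % 64 equals rotating it left by
// (64 - n % 64) % 64. A quick exhaustive check over one lane:
fn main() {
    let x = 0x0123_4567_89ab_cdefu64;
    for n in 0u32..=70 {
        let m = n % 64;
        assert_eq!(x.rotate_right(m), x.rotate_left((64 - m) % 64));
    }
}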
@@ -19621,12 +19493,8 @@ pub fn _mm256_mask_ror_epi64(src: __m256i, k: __mmask8, a: __m2 #[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))] #[rustc_legacy_const_generics(2)] pub fn _mm256_maskz_ror_epi64(k: __mmask8, a: __m256i) -> __m256i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x4(); - let r = vprorq256(a, IMM8); - transmute(simd_select_bitmask(k, r, i64x4::ZERO)) - } + static_assert_uimm_bits!(IMM8, 8); + _mm256_maskz_rorv_epi64(k, a, _mm256_set1_epi64x(IMM8 as i64)) } /// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst. @@ -19638,12 +19506,8 @@ pub fn _mm256_maskz_ror_epi64(k: __mmask8, a: __m256i) -> __m25 #[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))] #[rustc_legacy_const_generics(1)] pub fn _mm_ror_epi64(a: __m128i) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x2(); - let r = vprorq128(a, IMM8); - transmute(r) - } + static_assert_uimm_bits!(IMM8, 8); + _mm_rorv_epi64(a, _mm_set1_epi64x(IMM8 as i64)) } /// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -19655,12 +19519,8 @@ pub fn _mm_ror_epi64(a: __m128i) -> __m128i { #[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))] #[rustc_legacy_const_generics(3)] pub fn _mm_mask_ror_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x2(); - let r = vprorq128(a, IMM8); - transmute(simd_select_bitmask(k, r, src.as_i64x2())) - } + static_assert_uimm_bits!(IMM8, 8); + _mm_mask_rorv_epi64(src, k, a, _mm_set1_epi64x(IMM8 as i64)) } /// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -19672,12 +19532,8 @@ pub fn _mm_mask_ror_epi64(src: __m128i, k: __mmask8, a: __m128i #[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))] #[rustc_legacy_const_generics(2)] pub fn _mm_maskz_ror_epi64(k: __mmask8, a: __m128i) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x2(); - let r = vprorq128(a, IMM8); - transmute(simd_select_bitmask(k, r, i64x2::ZERO)) - } + static_assert_uimm_bits!(IMM8, 8); + _mm_maskz_rorv_epi64(k, a, _mm_set1_epi64x(IMM8 as i64)) } /// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and store the results in dst. @@ -21084,7 +20940,7 @@ pub fn _mm_maskz_srai_epi64(k: __mmask8, a: __m128i) -> __m128i #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpsravd))] pub fn _mm512_srav_epi32(a: __m512i, count: __m512i) -> __m512i { - unsafe { transmute(vpsravd(a.as_i32x16(), count.as_i32x16())) } + unsafe { transmute(simd_shr(a.as_i32x16(), count.as_i32x16())) } } /// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -21179,7 +21035,7 @@ pub fn _mm_maskz_srav_epi32(k: __mmask8, a: __m128i, count: __m128i) -> __m128i #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpsravq))] pub fn _mm512_srav_epi64(a: __m512i, count: __m512i) -> __m512i { - unsafe { transmute(vpsravq(a.as_i64x8(), count.as_i64x8())) } + unsafe { transmute(simd_shr(a.as_i64x8(), count.as_i64x8())) } } /// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -21218,7 +21074,7 @@ pub fn _mm512_maskz_srav_epi64(k: __mmask8, a: __m512i, count: __m512i) -> __m51 #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpsravq))] pub fn _mm256_srav_epi64(a: __m256i, count: __m256i) -> __m256i { - unsafe { transmute(vpsravq256(a.as_i64x4(), count.as_i64x4())) } + unsafe { transmute(simd_shr(a.as_i64x4(), count.as_i64x4())) } } /// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -21257,7 +21113,7 @@ pub fn _mm256_maskz_srav_epi64(k: __mmask8, a: __m256i, count: __m256i) -> __m25 #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpsravq))] pub fn _mm_srav_epi64(a: __m128i, count: __m128i) -> __m128i { - unsafe { transmute(vpsravq128(a.as_i64x2(), count.as_i64x2())) } + unsafe { transmute(simd_shr(a.as_i64x2(), count.as_i64x2())) } } /// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -21296,7 +21152,7 @@ pub fn _mm_maskz_srav_epi64(k: __mmask8, a: __m128i, count: __m128i) -> __m128i #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vprolvd))] pub fn _mm512_rolv_epi32(a: __m512i, b: __m512i) -> __m512i { - unsafe { transmute(vprolvd(a.as_i32x16(), b.as_i32x16())) } + unsafe { transmute(simd_funnel_shl(a.as_u32x16(), a.as_u32x16(), b.as_u32x16())) } } /// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -21335,7 +21191,7 @@ pub fn _mm512_maskz_rolv_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vprolvd))] pub fn _mm256_rolv_epi32(a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(vprolvd256(a.as_i32x8(), b.as_i32x8())) } + unsafe { transmute(simd_funnel_shl(a.as_u32x8(), a.as_u32x8(), b.as_u32x8())) } } /// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -21374,7 +21230,7 @@ pub fn _mm256_maskz_rolv_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vprolvd))] pub fn _mm_rolv_epi32(a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(vprolvd128(a.as_i32x4(), b.as_i32x4())) } + unsafe { transmute(simd_funnel_shl(a.as_u32x4(), a.as_u32x4(), b.as_u32x4())) } } /// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -21413,7 +21269,7 @@ pub fn _mm_maskz_rolv_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vprorvd))] pub fn _mm512_rorv_epi32(a: __m512i, b: __m512i) -> __m512i { - unsafe { transmute(vprorvd(a.as_i32x16(), b.as_i32x16())) } + unsafe { transmute(simd_funnel_shr(a.as_u32x16(), a.as_u32x16(), b.as_u32x16())) } } /// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -21452,7 +21308,7 @@ pub fn _mm512_maskz_rorv_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vprorvd))] pub fn _mm256_rorv_epi32(a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(vprorvd256(a.as_i32x8(), b.as_i32x8())) } + unsafe { transmute(simd_funnel_shr(a.as_u32x8(), a.as_u32x8(), b.as_u32x8())) } } /// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -21491,7 +21347,7 @@ pub fn _mm256_maskz_rorv_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vprorvd))] pub fn _mm_rorv_epi32(a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(vprorvd128(a.as_i32x4(), b.as_i32x4())) } + unsafe { transmute(simd_funnel_shr(a.as_u32x4(), a.as_u32x4(), b.as_u32x4())) } } /// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -21530,7 +21386,7 @@ pub fn _mm_maskz_rorv_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vprolvq))] pub fn _mm512_rolv_epi64(a: __m512i, b: __m512i) -> __m512i { - unsafe { transmute(vprolvq(a.as_i64x8(), b.as_i64x8())) } + unsafe { transmute(simd_funnel_shl(a.as_u64x8(), a.as_u64x8(), b.as_u64x8())) } } /// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -21569,7 +21425,7 @@ pub fn _mm512_maskz_rolv_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vprolvq))] pub fn _mm256_rolv_epi64(a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(vprolvq256(a.as_i64x4(), b.as_i64x4())) } + unsafe { transmute(simd_funnel_shl(a.as_u64x4(), a.as_u64x4(), b.as_u64x4())) } } /// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -21608,7 +21464,7 @@ pub fn _mm256_maskz_rolv_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vprolvq))] pub fn _mm_rolv_epi64(a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(vprolvq128(a.as_i64x2(), b.as_i64x2())) } + unsafe { transmute(simd_funnel_shl(a.as_u64x2(), a.as_u64x2(), b.as_u64x2())) } } /// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -21647,7 +21503,7 @@ pub fn _mm_maskz_rolv_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vprorvq))] pub fn _mm512_rorv_epi64(a: __m512i, b: __m512i) -> __m512i { - unsafe { transmute(vprorvq(a.as_i64x8(), b.as_i64x8())) } + unsafe { transmute(simd_funnel_shr(a.as_u64x8(), a.as_u64x8(), b.as_u64x8())) } } /// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -21686,7 +21542,7 @@ pub fn _mm512_maskz_rorv_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vprorvq))] pub fn _mm256_rorv_epi64(a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(vprorvq256(a.as_i64x4(), b.as_i64x4())) } + unsafe { transmute(simd_funnel_shr(a.as_u64x4(), a.as_u64x4(), b.as_u64x4())) } } /// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -21725,7 +21581,7 @@ pub fn _mm256_maskz_rorv_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vprorvq))] pub fn _mm_rorv_epi64(a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(vprorvq128(a.as_i64x2(), b.as_i64x2())) } + unsafe { transmute(simd_funnel_shr(a.as_u64x2(), a.as_u64x2(), b.as_u64x2())) } } /// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
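// A scalar sketch of why `simd_funnel_shl(a, a, b)` / `simd_funnel_shr(a, a, b)`
// above implement per-lane rotates, assuming the funnel shift concatenates its
// two value operands and keeps one lane-width half: with both halves equal to
// x, fshl(x, x, n) == x.rotate_left(n % width).
fn funnel_shl_u64(hi: u64, lo: u64, n: u32) -> u64 {
    let n = n % 64;
    if n == 0 { hi } else { (hi << n) | (lo >> (64 - n)) }
}

fn main() {
    for &x in &[0u64, 1, 0xdead_beef_f00d_cafe, u64::MAX] {
        for n in 0..64 {
            assert_eq!(funnel_shl_u64(x, x, n), x.rotate_left(n));
        }
    }
}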
@@ -21764,7 +21620,7 @@ pub fn _mm_maskz_rorv_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpsllvd))] pub fn _mm512_sllv_epi32(a: __m512i, count: __m512i) -> __m512i { - unsafe { transmute(vpsllvd(a.as_i32x16(), count.as_i32x16())) } + unsafe { transmute(simd_shl(a.as_u32x16(), count.as_u32x16())) } } /// Shift packed 32-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -21859,7 +21715,7 @@ pub fn _mm_maskz_sllv_epi32(k: __mmask8, a: __m128i, count: __m128i) -> __m128i #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpsrlvd))] pub fn _mm512_srlv_epi32(a: __m512i, count: __m512i) -> __m512i { - unsafe { transmute(vpsrlvd(a.as_i32x16(), count.as_i32x16())) } + unsafe { transmute(simd_shr(a.as_u32x16(), count.as_u32x16())) } } /// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -21954,7 +21810,7 @@ pub fn _mm_maskz_srlv_epi32(k: __mmask8, a: __m128i, count: __m128i) -> __m128i #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpsllvq))] pub fn _mm512_sllv_epi64(a: __m512i, count: __m512i) -> __m512i { - unsafe { transmute(vpsllvq(a.as_i64x8(), count.as_i64x8())) } + unsafe { transmute(simd_shl(a.as_u64x8(), count.as_u64x8())) } } /// Shift packed 64-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -22049,7 +21905,7 @@ pub fn _mm_maskz_sllv_epi64(k: __mmask8, a: __m128i, count: __m128i) -> __m128i #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpsrlvq))] pub fn _mm512_srlv_epi64(a: __m512i, count: __m512i) -> __m512i { - unsafe { transmute(vpsrlvq(a.as_i64x8(), count.as_i64x8())) } + unsafe { transmute(simd_shr(a.as_u64x8(), count.as_u64x8())) } } /// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
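// A scalar sketch of the lane-type convention used by the shift rewrites
// above, assuming Rust's `>>`/`<<` match LLVM's: on unsigned lanes `simd_shr`
// is a logical shift (vpsrlv*), on signed lanes it is an arithmetic shift
// (vpsrav*), and `simd_shl` behaves the same for either signedness (vpsllv*).
fn main() {
    let x: u32 = 0x8000_0004;
    assert_eq!(x >> 2, 0x2000_0001);                     // logical: zeros shift in
    assert_eq!((x as i32) >> 2, 0xE000_0001_u32 as i32); // arithmetic: sign bits shift in
    assert_eq!(x << 2, 0x0000_0010);                     // left shift ignores signedness
}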
@@ -42902,71 +42758,6 @@ unsafe extern "C" { #[link_name = "llvm.x86.avx512.mask.cmp.pd.128"] fn vcmppd128(a: f64x2, b: f64x2, op: i32, m: i8) -> i8; - #[link_name = "llvm.x86.avx512.mask.prol.d.512"] - fn vprold(a: i32x16, i8: i32) -> i32x16; - #[link_name = "llvm.x86.avx512.mask.prol.d.256"] - fn vprold256(a: i32x8, i8: i32) -> i32x8; - #[link_name = "llvm.x86.avx512.mask.prol.d.128"] - fn vprold128(a: i32x4, i8: i32) -> i32x4; - - #[link_name = "llvm.x86.avx512.mask.pror.d.512"] - fn vprord(a: i32x16, i8: i32) -> i32x16; - #[link_name = "llvm.x86.avx512.mask.pror.d.256"] - fn vprord256(a: i32x8, i8: i32) -> i32x8; - #[link_name = "llvm.x86.avx512.mask.pror.d.128"] - fn vprord128(a: i32x4, i8: i32) -> i32x4; - - #[link_name = "llvm.x86.avx512.mask.prol.q.512"] - fn vprolq(a: i64x8, i8: i32) -> i64x8; - #[link_name = "llvm.x86.avx512.mask.prol.q.256"] - fn vprolq256(a: i64x4, i8: i32) -> i64x4; - #[link_name = "llvm.x86.avx512.mask.prol.q.128"] - fn vprolq128(a: i64x2, i8: i32) -> i64x2; - - #[link_name = "llvm.x86.avx512.mask.pror.q.512"] - fn vprorq(a: i64x8, i8: i32) -> i64x8; - #[link_name = "llvm.x86.avx512.mask.pror.q.256"] - fn vprorq256(a: i64x4, i8: i32) -> i64x4; - #[link_name = "llvm.x86.avx512.mask.pror.q.128"] - fn vprorq128(a: i64x2, i8: i32) -> i64x2; - - #[link_name = "llvm.x86.avx512.mask.prolv.d.512"] - fn vprolvd(a: i32x16, b: i32x16) -> i32x16; - #[link_name = "llvm.x86.avx512.mask.prolv.d.256"] - fn vprolvd256(a: i32x8, b: i32x8) -> i32x8; - #[link_name = "llvm.x86.avx512.mask.prolv.d.128"] - fn vprolvd128(a: i32x4, b: i32x4) -> i32x4; - - #[link_name = "llvm.x86.avx512.mask.prorv.d.512"] - fn vprorvd(a: i32x16, b: i32x16) -> i32x16; - #[link_name = "llvm.x86.avx512.mask.prorv.d.256"] - fn vprorvd256(a: i32x8, b: i32x8) -> i32x8; - #[link_name = "llvm.x86.avx512.mask.prorv.d.128"] - fn vprorvd128(a: i32x4, b: i32x4) -> i32x4; - - #[link_name = "llvm.x86.avx512.mask.prolv.q.512"] - fn vprolvq(a: i64x8, b: i64x8) -> i64x8; - #[link_name = "llvm.x86.avx512.mask.prolv.q.256"] - fn vprolvq256(a: i64x4, b: i64x4) -> i64x4; - #[link_name = "llvm.x86.avx512.mask.prolv.q.128"] - fn vprolvq128(a: i64x2, b: i64x2) -> i64x2; - - #[link_name = "llvm.x86.avx512.mask.prorv.q.512"] - fn vprorvq(a: i64x8, b: i64x8) -> i64x8; - #[link_name = "llvm.x86.avx512.mask.prorv.q.256"] - fn vprorvq256(a: i64x4, b: i64x4) -> i64x4; - #[link_name = "llvm.x86.avx512.mask.prorv.q.128"] - fn vprorvq128(a: i64x2, b: i64x2) -> i64x2; - - #[link_name = "llvm.x86.avx512.psllv.d.512"] - fn vpsllvd(a: i32x16, b: i32x16) -> i32x16; - #[link_name = "llvm.x86.avx512.psrlv.d.512"] - fn vpsrlvd(a: i32x16, b: i32x16) -> i32x16; - #[link_name = "llvm.x86.avx512.psllv.q.512"] - fn vpsllvq(a: i64x8, b: i64x8) -> i64x8; - #[link_name = "llvm.x86.avx512.psrlv.q.512"] - fn vpsrlvq(a: i64x8, b: i64x8) -> i64x8; - #[link_name = "llvm.x86.avx512.psll.d.512"] fn vpslld(a: i32x16, count: i32x4) -> i32x16; #[link_name = "llvm.x86.avx512.psrl.d.512"] @@ -42986,16 +42777,6 @@ unsafe extern "C" { #[link_name = "llvm.x86.avx512.psra.q.128"] fn vpsraq128(a: i64x2, count: i64x2) -> i64x2; - #[link_name = "llvm.x86.avx512.psrav.d.512"] - fn vpsravd(a: i32x16, count: i32x16) -> i32x16; - - #[link_name = "llvm.x86.avx512.psrav.q.512"] - fn vpsravq(a: i64x8, count: i64x8) -> i64x8; - #[link_name = "llvm.x86.avx512.psrav.q.256"] - fn vpsravq256(a: i64x4, count: i64x4) -> i64x4; - #[link_name = "llvm.x86.avx512.psrav.q.128"] - fn vpsravq128(a: i64x2, count: i64x2) -> i64x2; - #[link_name = "llvm.x86.avx512.vpermilvar.ps.512"] fn 
vpermilps(a: f32x16, b: i32x16) -> f32x16; #[link_name = "llvm.x86.avx512.vpermilvar.pd.512"] diff --git a/crates/core_arch/src/x86/avx512fp16.rs b/crates/core_arch/src/x86/avx512fp16.rs index a86fc7199b..a8cf1f246a 100644 --- a/crates/core_arch/src/x86/avx512fp16.rs +++ b/crates/core_arch/src/x86/avx512fp16.rs @@ -1615,7 +1615,7 @@ pub fn _mm_maskz_add_round_sh(k: __mmask8, a: __m128h, b: _ #[cfg_attr(test, assert_instr(vaddsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_add_sh(a: __m128h, b: __m128h) -> __m128h { - _mm_add_round_sh::<_MM_FROUND_CUR_DIRECTION>(a, b) + unsafe { simd_insert!(a, 0, _mm_cvtsh_h(a) + _mm_cvtsh_h(b)) } } /// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the @@ -1628,7 +1628,16 @@ pub fn _mm_add_sh(a: __m128h, b: __m128h) -> __m128h { #[cfg_attr(test, assert_instr(vaddsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_mask_add_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - _mm_mask_add_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b) + unsafe { + let extractsrc: f16 = simd_extract!(src, 0); + let mut add: f16 = extractsrc; + if (k & 0b00000001) != 0 { + let extracta: f16 = simd_extract!(a, 0); + let extractb: f16 = simd_extract!(b, 0); + add = extracta + extractb; + } + simd_insert!(a, 0, add) + } } /// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the @@ -1641,7 +1650,15 @@ pub fn _mm_mask_add_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m #[cfg_attr(test, assert_instr(vaddsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_maskz_add_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - _mm_maskz_add_round_sh::<_MM_FROUND_CUR_DIRECTION>(k, a, b) + unsafe { + let mut add: f16 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f16 = simd_extract!(a, 0); + let extractb: f16 = simd_extract!(b, 0); + add = extracta + extractb; + } + simd_insert!(a, 0, add) + } } /// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst. 
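// A scalar model (f32 stands in for the unstable f16 type) of the lane-0
// pattern used by the `_mm_add_sh` / `_mm_sub_sh` rewrites above: only
// element 0 is computed, and elements 1..8 are copied through from `a`.
fn add_sh_model(a: [f32; 8], b: [f32; 8]) -> [f32; 8] {
    let mut r = a;      // upper lanes pass through from `a`
    r[0] = a[0] + b[0]; // only the lowest lane is computed
    r
}

fn main() {
    let a = [1.0f32; 8];
    let b = [2.0f32; 8];
    assert_eq!(add_sh_model(a, b)[0], 3.0);
    assert_eq!(&add_sh_model(a, b)[1..], &a[1..]);
}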
@@ -1927,7 +1944,7 @@ pub fn _mm_maskz_sub_round_sh(k: __mmask8, a: __m128h, b: _ #[cfg_attr(test, assert_instr(vsubsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_sub_sh(a: __m128h, b: __m128h) -> __m128h { - _mm_sub_round_sh::<_MM_FROUND_CUR_DIRECTION>(a, b) + unsafe { simd_insert!(a, 0, _mm_cvtsh_h(a) - _mm_cvtsh_h(b)) } } /// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the @@ -1940,7 +1957,16 @@ pub fn _mm_sub_sh(a: __m128h, b: __m128h) -> __m128h { #[cfg_attr(test, assert_instr(vsubsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_mask_sub_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - _mm_mask_sub_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b) + unsafe { + let extractsrc: f16 = simd_extract!(src, 0); + let mut add: f16 = extractsrc; + if (k & 0b00000001) != 0 { + let extracta: f16 = simd_extract!(a, 0); + let extractb: f16 = simd_extract!(b, 0); + add = extracta - extractb; + } + simd_insert!(a, 0, add) + } } /// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the @@ -1953,7 +1979,15 @@ pub fn _mm_mask_sub_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m #[cfg_attr(test, assert_instr(vsubsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_maskz_sub_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - _mm_maskz_sub_round_sh::<_MM_FROUND_CUR_DIRECTION>(k, a, b) + unsafe { + let mut add: f16 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f16 = simd_extract!(a, 0); + let extractb: f16 = simd_extract!(b, 0); + add = extracta - extractb; + } + simd_insert!(a, 0, add) + } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst. 
@@ -2239,7 +2273,7 @@ pub fn _mm_maskz_mul_round_sh(k: __mmask8, a: __m128h, b: _ #[cfg_attr(test, assert_instr(vmulsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_mul_sh(a: __m128h, b: __m128h) -> __m128h { - _mm_mul_round_sh::<_MM_FROUND_CUR_DIRECTION>(a, b) + unsafe { simd_insert!(a, 0, _mm_cvtsh_h(a) * _mm_cvtsh_h(b)) } } /// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the @@ -2252,7 +2286,16 @@ pub fn _mm_mul_sh(a: __m128h, b: __m128h) -> __m128h { #[cfg_attr(test, assert_instr(vmulsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_mask_mul_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - _mm_mask_mul_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b) + unsafe { + let extractsrc: f16 = simd_extract!(src, 0); + let mut add: f16 = extractsrc; + if (k & 0b00000001) != 0 { + let extracta: f16 = simd_extract!(a, 0); + let extractb: f16 = simd_extract!(b, 0); + add = extracta * extractb; + } + simd_insert!(a, 0, add) + } } /// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the @@ -2265,7 +2308,15 @@ pub fn _mm_mask_mul_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m #[cfg_attr(test, assert_instr(vmulsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_maskz_mul_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - _mm_maskz_mul_round_sh::<_MM_FROUND_CUR_DIRECTION>(k, a, b) + unsafe { + let mut add: f16 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f16 = simd_extract!(a, 0); + let extractb: f16 = simd_extract!(b, 0); + add = extracta * extractb; + } + simd_insert!(a, 0, add) + } } /// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst. 
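// A scalar model of the write-mask handling added above for the masked `_sh`
// intrinsics (again with f32 standing in for f16): bit 0 of `k` selects
// between the computed value and `src[0]` (or 0.0 for the maskz variants),
// while the remaining lanes still come from `a`.
fn mask_div_sh_model(src: [f32; 8], k: u8, a: [f32; 8], b: [f32; 8]) -> [f32; 8] {
    let mut r = a;
    r[0] = if k & 1 != 0 { a[0] / b[0] } else { src[0] };
    r
}

fn main() {
    let (src, a, b) = ([9.0f32; 8], [8.0f32; 8], [2.0f32; 8]);
    assert_eq!(mask_div_sh_model(src, 0, a, b)[0], 9.0); // mask bit clear: keep src
    assert_eq!(mask_div_sh_model(src, 1, a, b)[0], 4.0); // mask bit set: a0 / b0
}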
@@ -2551,7 +2602,7 @@ pub fn _mm_maskz_div_round_sh(k: __mmask8, a: __m128h, b: _ #[cfg_attr(test, assert_instr(vdivsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_div_sh(a: __m128h, b: __m128h) -> __m128h { - _mm_div_round_sh::<_MM_FROUND_CUR_DIRECTION>(a, b) + unsafe { simd_insert!(a, 0, _mm_cvtsh_h(a) / _mm_cvtsh_h(b)) } } /// Divide the lower half-precision (16-bit) floating-point elements in a by b, store the result in the @@ -2564,7 +2615,16 @@ pub fn _mm_div_sh(a: __m128h, b: __m128h) -> __m128h { #[cfg_attr(test, assert_instr(vdivsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_mask_div_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - _mm_mask_div_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b) + unsafe { + let extractsrc: f16 = simd_extract!(src, 0); + let mut add: f16 = extractsrc; + if (k & 0b00000001) != 0 { + let extracta: f16 = simd_extract!(a, 0); + let extractb: f16 = simd_extract!(b, 0); + add = extracta / extractb; + } + simd_insert!(a, 0, add) + } } /// Divide the lower half-precision (16-bit) floating-point elements in a by b, store the result in the @@ -2577,7 +2637,15 @@ pub fn _mm_mask_div_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m #[cfg_attr(test, assert_instr(vdivsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_maskz_div_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - _mm_maskz_div_round_sh::<_MM_FROUND_CUR_DIRECTION>(k, a, b) + unsafe { + let mut add: f16 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f16 = simd_extract!(a, 0); + let extractb: f16 = simd_extract!(b, 0); + add = extracta / extractb; + } + simd_insert!(a, 0, add) + } } /// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is diff --git a/crates/core_arch/src/x86/f16c.rs b/crates/core_arch/src/x86/f16c.rs index 7686b317d4..519cc38294 100644 --- a/crates/core_arch/src/x86/f16c.rs +++ b/crates/core_arch/src/x86/f16c.rs @@ -3,16 +3,13 @@ //! 
[F16C intrinsics]: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=fp16&expand=1769 use crate::core_arch::{simd::*, x86::*}; +use crate::intrinsics::simd::*; #[cfg(test)] use stdarch_test::assert_instr; #[allow(improper_ctypes)] unsafe extern "unadjusted" { - #[link_name = "llvm.x86.vcvtph2ps.128"] - fn llvm_vcvtph2ps_128(a: i16x8) -> f32x4; - #[link_name = "llvm.x86.vcvtph2ps.256"] - fn llvm_vcvtph2ps_256(a: i16x8) -> f32x8; #[link_name = "llvm.x86.vcvtps2ph.128"] fn llvm_vcvtps2ph_128(a: f32x4, rounding: i32) -> i16x8; #[link_name = "llvm.x86.vcvtps2ph.256"] @@ -29,7 +26,11 @@ unsafe extern "unadjusted" { #[cfg_attr(test, assert_instr("vcvtph2ps"))] #[stable(feature = "x86_f16c_intrinsics", since = "1.68.0")] pub fn _mm_cvtph_ps(a: __m128i) -> __m128 { - unsafe { transmute(llvm_vcvtph2ps_128(transmute(a))) } + unsafe { + let a: f16x8 = transmute(a); + let a: f16x4 = simd_shuffle!(a, a, [0, 1, 2, 3]); + simd_cast(a) + } } /// Converts the 8 x 16-bit half-precision float values in the 128-bit vector @@ -41,7 +42,10 @@ pub fn _mm_cvtph_ps(a: __m128i) -> __m128 { #[cfg_attr(test, assert_instr("vcvtph2ps"))] #[stable(feature = "x86_f16c_intrinsics", since = "1.68.0")] pub fn _mm256_cvtph_ps(a: __m128i) -> __m256 { - unsafe { transmute(llvm_vcvtph2ps_256(transmute(a))) } + unsafe { + let a: f16x8 = transmute(a); + simd_cast(a) + } } /// Converts the 4 x 32-bit float values in the 128-bit vector `a` into 4 x diff --git a/crates/core_arch/src/x86/sse.rs b/crates/core_arch/src/x86/sse.rs index 1eca66adc2..c5c6dc26b5 100644 --- a/crates/core_arch/src/x86/sse.rs +++ b/crates/core_arch/src/x86/sse.rs @@ -882,7 +882,7 @@ pub fn _mm_cvtss_f32(a: __m128) -> f32 { #[cfg_attr(test, assert_instr(cvtsi2ss))] #[stable(feature = "simd_x86", since = "1.27.0")] pub fn _mm_cvtsi32_ss(a: __m128, b: i32) -> __m128 { - unsafe { cvtsi2ss(a, b) } + unsafe { simd_insert!(a, 0, b as f32) } } /// Alias for [`_mm_cvtsi32_ss`](fn._mm_cvtsi32_ss.html). 
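// A minimal model of the `simd_insert!` rewrite of `_mm_cvtsi32_ss` above,
// viewing __m128 as [f32; 4]: the integer is converted and written into
// lane 0 only, and the remaining lanes of `a` are untouched.
fn cvtsi32_ss_model(a: [f32; 4], b: i32) -> [f32; 4] {
    let mut r = a;
    r[0] = b as f32; // convert the integer and replace only the lowest lane
    r
}

fn main() {
    assert_eq!(cvtsi32_ss_model([1.0, 2.0, 3.0, 4.0], 7), [7.0, 2.0, 3.0, 4.0]);
}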
@@ -1989,8 +1989,6 @@ unsafe extern "C" { fn cvtss2si(a: __m128) -> i32; #[link_name = "llvm.x86.sse.cvttss2si"] fn cvttss2si(a: __m128) -> i32; - #[link_name = "llvm.x86.sse.cvtsi2ss"] - fn cvtsi2ss(a: __m128, b: i32) -> __m128; #[link_name = "llvm.x86.sse.sfence"] fn sfence(); #[link_name = "llvm.x86.sse.stmxcsr"] diff --git a/crates/core_arch/src/x86/sse2.rs b/crates/core_arch/src/x86/sse2.rs index 1eaa89663b..c9530a237a 100644 --- a/crates/core_arch/src/x86/sse2.rs +++ b/crates/core_arch/src/x86/sse2.rs @@ -201,7 +201,12 @@ pub fn _mm_avg_epu16(a: __m128i, b: __m128i) -> __m128i { #[cfg_attr(test, assert_instr(pmaddwd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub fn _mm_madd_epi16(a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(pmaddwd(a.as_i16x8(), b.as_i16x8())) } + unsafe { + let r: i32x8 = simd_mul(simd_cast(a.as_i16x8()), simd_cast(b.as_i16x8())); + let even: i32x4 = simd_shuffle!(r, r, [0, 2, 4, 6]); + let odd: i32x4 = simd_shuffle!(r, r, [1, 3, 5, 7]); + simd_add(even, odd).as_m128i() + } } /// Compares packed 16-bit integers in `a` and `b`, and returns the packed @@ -2417,7 +2422,10 @@ pub fn _mm_cvtsd_f64(a: __m128d) -> f64 { #[cfg_attr(test, assert_instr(cvtss2sd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub fn _mm_cvtss_sd(a: __m128d, b: __m128) -> __m128d { - unsafe { cvtss2sd(a, b) } + unsafe { + let elt: f32 = simd_extract!(b, 0); + simd_insert!(a, 0, elt as f64) + } } /// Converts packed double-precision (64-bit) floating-point elements in `a` to @@ -3043,8 +3051,6 @@ unsafe extern "C" { fn lfence(); #[link_name = "llvm.x86.sse2.mfence"] fn mfence(); - #[link_name = "llvm.x86.sse2.pmadd.wd"] - fn pmaddwd(a: i16x8, b: i16x8) -> i32x4; #[link_name = "llvm.x86.sse2.psad.bw"] fn psadbw(a: u8x16, b: u8x16) -> u64x2; #[link_name = "llvm.x86.sse2.psll.w"] @@ -3115,8 +3121,6 @@ unsafe extern "C" { fn cvtsd2si(a: __m128d) -> i32; #[link_name = "llvm.x86.sse2.cvtsd2ss"] fn cvtsd2ss(a: __m128, b: __m128d) -> __m128; - #[link_name = "llvm.x86.sse2.cvtss2sd"] - fn cvtss2sd(a: __m128d, b: __m128) -> __m128d; #[link_name = "llvm.x86.sse2.cvttpd2dq"] fn cvttpd2dq(a: __m128d) -> i32x4; #[link_name = "llvm.x86.sse2.cvttsd2si"] diff --git a/crates/core_arch/src/x86/sse3.rs b/crates/core_arch/src/x86/sse3.rs index 7a32cfe472..79be7a7e9b 100644 --- a/crates/core_arch/src/x86/sse3.rs +++ b/crates/core_arch/src/x86/sse3.rs @@ -51,7 +51,11 @@ pub fn _mm_addsub_pd(a: __m128d, b: __m128d) -> __m128d { #[cfg_attr(test, assert_instr(haddpd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub fn _mm_hadd_pd(a: __m128d, b: __m128d) -> __m128d { - unsafe { haddpd(a, b) } + unsafe { + let even = simd_shuffle!(a, b, [0, 2]); + let odd = simd_shuffle!(a, b, [1, 3]); + simd_add(even, odd) + } } /// Horizontally adds adjacent pairs of single-precision (32-bit) @@ -63,7 +67,11 @@ pub fn _mm_hadd_pd(a: __m128d, b: __m128d) -> __m128d { #[cfg_attr(test, assert_instr(haddps))] #[stable(feature = "simd_x86", since = "1.27.0")] pub fn _mm_hadd_ps(a: __m128, b: __m128) -> __m128 { - unsafe { haddps(a, b) } + unsafe { + let even = simd_shuffle!(a, b, [0, 2, 4, 6]); + let odd = simd_shuffle!(a, b, [1, 3, 5, 7]); + simd_add(even, odd) + } } /// Horizontally subtract adjacent pairs of double-precision (64-bit) @@ -75,7 +83,11 @@ pub fn _mm_hadd_ps(a: __m128, b: __m128) -> __m128 { #[cfg_attr(test, assert_instr(hsubpd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub fn _mm_hsub_pd(a: __m128d, b: __m128d) -> __m128d { - unsafe { hsubpd(a, b) } + unsafe { + let even = 
simd_shuffle!(a, b, [0, 2]); + let odd = simd_shuffle!(a, b, [1, 3]); + simd_sub(even, odd) + } } /// Horizontally adds adjacent pairs of single-precision (32-bit) @@ -87,7 +99,11 @@ pub fn _mm_hsub_pd(a: __m128d, b: __m128d) -> __m128d { #[cfg_attr(test, assert_instr(hsubps))] #[stable(feature = "simd_x86", since = "1.27.0")] pub fn _mm_hsub_ps(a: __m128, b: __m128) -> __m128 { - unsafe { hsubps(a, b) } + unsafe { + let even = simd_shuffle!(a, b, [0, 2, 4, 6]); + let odd = simd_shuffle!(a, b, [1, 3, 5, 7]); + simd_sub(even, odd) + } } /// Loads 128-bits of integer data from unaligned memory. @@ -153,14 +169,6 @@ pub fn _mm_moveldup_ps(a: __m128) -> __m128 { #[allow(improper_ctypes)] unsafe extern "C" { - #[link_name = "llvm.x86.sse3.hadd.pd"] - fn haddpd(a: __m128d, b: __m128d) -> __m128d; - #[link_name = "llvm.x86.sse3.hadd.ps"] - fn haddps(a: __m128, b: __m128) -> __m128; - #[link_name = "llvm.x86.sse3.hsub.pd"] - fn hsubpd(a: __m128d, b: __m128d) -> __m128d; - #[link_name = "llvm.x86.sse3.hsub.ps"] - fn hsubps(a: __m128, b: __m128) -> __m128; #[link_name = "llvm.x86.sse3.ldu.dq"] fn lddqu(mem_addr: *const i8) -> i8x16; } diff --git a/crates/core_arch/src/x86/sse41.rs b/crates/core_arch/src/x86/sse41.rs index 9aa200dfc0..f457c74aa9 100644 --- a/crates/core_arch/src/x86/sse41.rs +++ b/crates/core_arch/src/x86/sse41.rs @@ -1006,7 +1006,10 @@ pub fn _mm_mpsadbw_epu8(a: __m128i, b: __m128i) -> __m128i { #[cfg_attr(test, assert_instr(ptest))] #[stable(feature = "simd_x86", since = "1.27.0")] pub fn _mm_testz_si128(a: __m128i, mask: __m128i) -> i32 { - unsafe { ptestz(a.as_i64x2(), mask.as_i64x2()) } + unsafe { + let r = simd_reduce_or(simd_and(a.as_i64x2(), mask.as_i64x2())); + (0i64 == r) as i32 + } } /// Tests whether the specified bits in a 128-bit integer vector are all @@ -1029,7 +1032,13 @@ pub fn _mm_testz_si128(a: __m128i, mask: __m128i) -> i32 { #[cfg_attr(test, assert_instr(ptest))] #[stable(feature = "simd_x86", since = "1.27.0")] pub fn _mm_testc_si128(a: __m128i, mask: __m128i) -> i32 { - unsafe { ptestc(a.as_i64x2(), mask.as_i64x2()) } + unsafe { + let r = simd_reduce_or(simd_and( + simd_xor(a.as_i64x2(), i64x2::splat(!0)), + mask.as_i64x2(), + )); + (0i64 == r) as i32 + } } /// Tests whether the specified bits in a 128-bit integer vector are @@ -1165,10 +1174,6 @@ unsafe extern "C" { fn phminposuw(a: u16x8) -> u16x8; #[link_name = "llvm.x86.sse41.mpsadbw"] fn mpsadbw(a: u8x16, b: u8x16, imm8: u8) -> u16x8; - #[link_name = "llvm.x86.sse41.ptestz"] - fn ptestz(a: i64x2, mask: i64x2) -> i32; - #[link_name = "llvm.x86.sse41.ptestc"] - fn ptestc(a: i64x2, mask: i64x2) -> i32; #[link_name = "llvm.x86.sse41.ptestnzc"] fn ptestnzc(a: i64x2, mask: i64x2) -> i32; } diff --git a/crates/core_arch/src/x86/ssse3.rs b/crates/core_arch/src/x86/ssse3.rs index 2be182e88f..ac067bd4b5 100644 --- a/crates/core_arch/src/x86/ssse3.rs +++ b/crates/core_arch/src/x86/ssse3.rs @@ -164,7 +164,13 @@ pub fn _mm_alignr_epi8(a: __m128i, b: __m128i) -> __m128i { #[cfg_attr(test, assert_instr(phaddw))] #[stable(feature = "simd_x86", since = "1.27.0")] pub fn _mm_hadd_epi16(a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(phaddw128(a.as_i16x8(), b.as_i16x8())) } + let a = a.as_i16x8(); + let b = b.as_i16x8(); + unsafe { + let even: i16x8 = simd_shuffle!(a, b, [0, 2, 4, 6, 8, 10, 12, 14]); + let odd: i16x8 = simd_shuffle!(a, b, [1, 3, 5, 7, 9, 11, 13, 15]); + simd_add(even, odd).as_m128i() + } } /// Horizontally adds the adjacent pairs of values contained in 2 packed @@ -189,7 +195,13 @@ pub fn 
_mm_hadds_epi16(a: __m128i, b: __m128i) -> __m128i { #[cfg_attr(test, assert_instr(phaddd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub fn _mm_hadd_epi32(a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(phaddd128(a.as_i32x4(), b.as_i32x4())) } + let a = a.as_i32x4(); + let b = b.as_i32x4(); + unsafe { + let even: i32x4 = simd_shuffle!(a, b, [0, 2, 4, 6]); + let odd: i32x4 = simd_shuffle!(a, b, [1, 3, 5, 7]); + simd_add(even, odd).as_m128i() + } } /// Horizontally subtract the adjacent pairs of values contained in 2 @@ -201,7 +213,13 @@ pub fn _mm_hadd_epi32(a: __m128i, b: __m128i) -> __m128i { #[cfg_attr(test, assert_instr(phsubw))] #[stable(feature = "simd_x86", since = "1.27.0")] pub fn _mm_hsub_epi16(a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(phsubw128(a.as_i16x8(), b.as_i16x8())) } + let a = a.as_i16x8(); + let b = b.as_i16x8(); + unsafe { + let even: i16x8 = simd_shuffle!(a, b, [0, 2, 4, 6, 8, 10, 12, 14]); + let odd: i16x8 = simd_shuffle!(a, b, [1, 3, 5, 7, 9, 11, 13, 15]); + simd_sub(even, odd).as_m128i() + } } /// Horizontally subtract the adjacent pairs of values contained in 2 @@ -227,7 +245,13 @@ pub fn _mm_hsubs_epi16(a: __m128i, b: __m128i) -> __m128i { #[cfg_attr(test, assert_instr(phsubd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub fn _mm_hsub_epi32(a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(phsubd128(a.as_i32x4(), b.as_i32x4())) } + let a = a.as_i32x4(); + let b = b.as_i32x4(); + unsafe { + let even: i32x4 = simd_shuffle!(a, b, [0, 2, 4, 6]); + let odd: i32x4 = simd_shuffle!(a, b, [1, 3, 5, 7]); + simd_sub(even, odd).as_m128i() + } } /// Multiplies corresponding pairs of packed 8-bit unsigned integer @@ -305,24 +329,12 @@ unsafe extern "C" { #[link_name = "llvm.x86.ssse3.pshuf.b.128"] fn pshufb128(a: u8x16, b: u8x16) -> u8x16; - #[link_name = "llvm.x86.ssse3.phadd.w.128"] - fn phaddw128(a: i16x8, b: i16x8) -> i16x8; - #[link_name = "llvm.x86.ssse3.phadd.sw.128"] fn phaddsw128(a: i16x8, b: i16x8) -> i16x8; - #[link_name = "llvm.x86.ssse3.phadd.d.128"] - fn phaddd128(a: i32x4, b: i32x4) -> i32x4; - - #[link_name = "llvm.x86.ssse3.phsub.w.128"] - fn phsubw128(a: i16x8, b: i16x8) -> i16x8; - #[link_name = "llvm.x86.ssse3.phsub.sw.128"] fn phsubsw128(a: i16x8, b: i16x8) -> i16x8; - #[link_name = "llvm.x86.ssse3.phsub.d.128"] - fn phsubd128(a: i32x4, b: i32x4) -> i32x4; - #[link_name = "llvm.x86.ssse3.pmadd.ub.sw.128"] fn pmaddubsw128(a: u8x16, b: i8x16) -> i16x8; diff --git a/crates/core_arch/src/x86_64/adx.rs b/crates/core_arch/src/x86_64/adx.rs index bdc534b5a5..cf378cc169 100644 --- a/crates/core_arch/src/x86_64/adx.rs +++ b/crates/core_arch/src/x86_64/adx.rs @@ -5,8 +5,6 @@ use stdarch_test::assert_instr; unsafe extern "unadjusted" { #[link_name = "llvm.x86.addcarry.64"] fn llvm_addcarry_u64(a: u8, b: u64, c: u64) -> (u8, u64); - #[link_name = "llvm.x86.addcarryx.u64"] - fn llvm_addcarryx_u64(a: u8, b: u64, c: u64, d: *mut u64) -> u8; #[link_name = "llvm.x86.subborrow.64"] fn llvm_subborrow_u64(a: u8, b: u64, c: u64) -> (u8, u64); } @@ -35,7 +33,7 @@ pub unsafe fn _addcarry_u64(c_in: u8, a: u64, b: u64, out: &mut u64) -> u8 { #[cfg_attr(test, assert_instr(adc))] #[stable(feature = "simd_x86_adx", since = "1.33.0")] pub unsafe fn _addcarryx_u64(c_in: u8, a: u64, b: u64, out: &mut u64) -> u8 { - llvm_addcarryx_u64(c_in, a, b, out as *mut _) + _addcarry_u64(c_in, a, b, out) } /// Adds unsigned 64-bit integers `a` and `b` with unsigned 8-bit carry-in `c_in`. 
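// A scalar sketch of the even/odd shuffle decomposition used for the
// horizontal add/subtract rewrites above, modelling a 128-bit vector of i32
// as [i32; 4]: hadd(a, b) = [a0+a1, a2+a3, b0+b1, b2+b3], i.e. the even-index
// lanes of (a, b) plus the odd-index lanes of (a, b).
fn hadd_epi32_model(a: [i32; 4], b: [i32; 4]) -> [i32; 4] {
    let even = [a[0], a[2], b[0], b[2]]; // simd_shuffle!(a, b, [0, 2, 4, 6])
    let odd = [a[1], a[3], b[1], b[3]];  // simd_shuffle!(a, b, [1, 3, 5, 7])
    [even[0] + odd[0], even[1] + odd[1], even[2] + odd[2], even[3] + odd[3]]
}

fn main() {
    assert_eq!(hadd_epi32_model([1, 2, 3, 4], [10, 20, 30, 40]), [3, 7, 30, 70]);
}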
@@ -95,27 +93,27 @@ mod tests { let a = u64::MAX; let mut out = 0; - let r = _addcarry_u64(0, a, 1, &mut out); + let r = _addcarryx_u64(0, a, 1, &mut out); assert_eq!(r, 1); assert_eq!(out, 0); - let r = _addcarry_u64(0, a, 0, &mut out); + let r = _addcarryx_u64(0, a, 0, &mut out); assert_eq!(r, 0); assert_eq!(out, a); - let r = _addcarry_u64(1, a, 1, &mut out); + let r = _addcarryx_u64(1, a, 1, &mut out); assert_eq!(r, 1); assert_eq!(out, 1); - let r = _addcarry_u64(1, a, 0, &mut out); + let r = _addcarryx_u64(1, a, 0, &mut out); assert_eq!(r, 1); assert_eq!(out, 0); - let r = _addcarry_u64(0, 3, 4, &mut out); + let r = _addcarryx_u64(0, 3, 4, &mut out); assert_eq!(r, 0); assert_eq!(out, 7); - let r = _addcarry_u64(1, 3, 4, &mut out); + let r = _addcarryx_u64(1, 3, 4, &mut out); assert_eq!(r, 0); assert_eq!(out, 8); } diff --git a/crates/core_arch/src/x86_64/sse.rs b/crates/core_arch/src/x86_64/sse.rs index 863c3cd2e7..6bd7ec83ec 100644 --- a/crates/core_arch/src/x86_64/sse.rs +++ b/crates/core_arch/src/x86_64/sse.rs @@ -11,8 +11,6 @@ unsafe extern "C" { fn cvtss2si64(a: __m128) -> i64; #[link_name = "llvm.x86.sse.cvttss2si64"] fn cvttss2si64(a: __m128) -> i64; - #[link_name = "llvm.x86.sse.cvtsi642ss"] - fn cvtsi642ss(a: __m128, b: i64) -> __m128; } /// Converts the lowest 32 bit float in the input vector to a 64 bit integer. @@ -65,7 +63,7 @@ pub fn _mm_cvttss_si64(a: __m128) -> i64 { #[cfg_attr(test, assert_instr(cvtsi2ss))] #[stable(feature = "simd_x86", since = "1.27.0")] pub fn _mm_cvtsi64_ss(a: __m128, b: i64) -> __m128 { - unsafe { cvtsi642ss(a, b) } + unsafe { simd_insert!(a, 0, b as f32) } } #[cfg(test)]
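// A scalar sketch of the carry chain that `_addcarryx_u64` now shares with
// `_addcarry_u64` after the rewrite above, assuming `u64::overflowing_add`
// models the adc carry-out.
fn addcarry_u64_model(c_in: u8, a: u64, b: u64, out: &mut u64) -> u8 {
    let (s1, c1) = a.overflowing_add(b);
    let (s2, c2) = s1.overflowing_add(c_in as u64);
    *out = s2;
    (c1 | c2) as u8
}

fn main() {
    let mut out = 0u64;
    assert_eq!(addcarry_u64_model(0, u64::MAX, 1, &mut out), 1);
    assert_eq!(out, 0);
    assert_eq!(addcarry_u64_model(1, 3, 4, &mut out), 0);
    assert_eq!(out, 8);
}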