@@ -587,7 +587,11 @@ pub fn _mm256_dp_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
 #[cfg_attr(test, assert_instr(vhaddpd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm256_hadd_pd(a: __m256d, b: __m256d) -> __m256d {
-    unsafe { vhaddpd(a, b) }
+    unsafe {
+        let even = simd_shuffle!(a, b, [0, 4, 2, 6]);
+        let odd = simd_shuffle!(a, b, [1, 5, 3, 7]);
+        simd_add(even, odd)
+    }
 }

 /// Horizontal addition of adjacent pairs in the two packed vectors
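Note on the hadd/hsub rewrites: shuffle indices 0..=3 address `a` and 4..=7 address `b`, so `even` gathers the first member of each adjacent pair and `odd` the second, and adding them reproduces the documented VHADDPD result layout. A minimal scalar reference model of that layout (the helper below is illustrative only, not part of the patch):

```rust
/// Scalar reference for _mm256_hadd_pd (illustration only): even result
/// elements are pair-sums from `a`, odd ones are pair-sums from `b`.
fn hadd_pd_reference(a: [f64; 4], b: [f64; 4]) -> [f64; 4] {
    [a[0] + a[1], b[0] + b[1], a[2] + a[3], b[2] + b[3]]
}

fn main() {
    assert_eq!(
        hadd_pd_reference([1.0, 2.0, 3.0, 4.0], [10.0, 20.0, 30.0, 40.0]),
        [3.0, 30.0, 7.0, 70.0]
    );
}
```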
@@ -602,7 +606,11 @@ pub fn _mm256_hadd_pd(a: __m256d, b: __m256d) -> __m256d {
 #[cfg_attr(test, assert_instr(vhaddps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm256_hadd_ps(a: __m256, b: __m256) -> __m256 {
-    unsafe { vhaddps(a, b) }
+    unsafe {
+        let even = simd_shuffle!(a, b, [0, 2, 8, 10, 4, 6, 12, 14]);
+        let odd = simd_shuffle!(a, b, [1, 3, 9, 11, 5, 7, 13, 15]);
+        simd_add(even, odd)
+    }
 }

 /// Horizontal subtraction of adjacent pairs in the two packed vectors
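For the `f32` variants the operation is still per 128-bit lane, which is why the index lists interleave blocks of `a` (indices 0..=7) and `b` (indices 8..=15) within each half. A scalar sketch of the expected layout (illustrative helper, not part of the patch):

```rust
/// Scalar reference for _mm256_hadd_ps (illustration only): within each
/// 128-bit lane, the two pair-sums from `a` come first, then the two from `b`.
fn hadd_ps_reference(a: [f32; 8], b: [f32; 8]) -> [f32; 8] {
    [
        a[0] + a[1], a[2] + a[3], b[0] + b[1], b[2] + b[3], // low lane
        a[4] + a[5], a[6] + a[7], b[4] + b[5], b[6] + b[7], // high lane
    ]
}

fn main() {
    let a = [1.0f32; 8];
    let b = [2.0f32; 8];
    assert_eq!(hadd_ps_reference(a, b), [2.0, 2.0, 4.0, 4.0, 2.0, 2.0, 4.0, 4.0]);
}
```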
@@ -616,7 +624,11 @@ pub fn _mm256_hadd_ps(a: __m256, b: __m256) -> __m256 {
 #[cfg_attr(test, assert_instr(vhsubpd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm256_hsub_pd(a: __m256d, b: __m256d) -> __m256d {
-    unsafe { vhsubpd(a, b) }
+    unsafe {
+        let even = simd_shuffle!(a, b, [0, 4, 2, 6]);
+        let odd = simd_shuffle!(a, b, [1, 5, 3, 7]);
+        simd_sub(even, odd)
+    }
 }

 /// Horizontal subtraction of adjacent pairs in the two packed vectors
@@ -631,7 +643,11 @@ pub fn _mm256_hsub_pd(a: __m256d, b: __m256d) -> __m256d {
 #[cfg_attr(test, assert_instr(vhsubps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm256_hsub_ps(a: __m256, b: __m256) -> __m256 {
-    unsafe { vhsubps(a, b) }
+    unsafe {
+        let even = simd_shuffle!(a, b, [0, 2, 8, 10, 4, 6, 12, 14]);
+        let odd = simd_shuffle!(a, b, [1, 3, 9, 11, 5, 7, 13, 15]);
+        simd_sub(even, odd)
+    }
 }

 /// Computes the bitwise XOR of packed double-precision (64-bit) floating-point
@@ -1218,7 +1234,10 @@ pub fn _mm_permute_pd<const IMM2: i32>(a: __m128d) -> __m128d {
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm256_permute2f128_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
     static_assert_uimm_bits!(IMM8, 8);
-    unsafe { vperm2f128ps256(a, b, IMM8 as i8) }
+    _mm256_castsi256_ps(_mm256_permute2f128_si256::<IMM8>(
+        _mm256_castps_si256(a),
+        _mm256_castps_si256(b),
+    ))
 }

 /// Shuffles 256 bits (composed of 4 packed double-precision (64-bit)
@@ -1232,7 +1251,10 @@ pub fn _mm256_permute2f128_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm256_permute2f128_pd<const IMM8: i32>(a: __m256d, b: __m256d) -> __m256d {
     static_assert_uimm_bits!(IMM8, 8);
-    unsafe { vperm2f128pd256(a, b, IMM8 as i8) }
+    _mm256_castsi256_pd(_mm256_permute2f128_si256::<IMM8>(
+        _mm256_castpd_si256(a),
+        _mm256_castpd_si256(b),
+    ))
 }

 /// Shuffles 128-bits (composed of integer data) selected by `imm8`
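The `ps`/`pd` variants now delegate to the integer version through bit-casts, which is sound because VPERM2F128 moves whole 128-bit lanes regardless of element type. A usage sketch of the control byte, assuming an x86_64 target with AVX available at runtime (the wrapper function name is hypothetical):

```rust
use std::arch::x86_64::*;

/// Low half of the result <- high 128-bit lane of `a` (control nibble 1);
/// high half <- high 128-bit lane of `b` (control nibble 3), i.e. IMM8 = 0x31.
#[target_feature(enable = "avx")]
unsafe fn high_lanes(a: __m256, b: __m256) -> __m256 {
    _mm256_permute2f128_ps::<0x31>(a, b)
}

fn main() {
    if is_x86_feature_detected!("avx") {
        unsafe {
            let a = _mm256_setr_ps(0., 1., 2., 3., 4., 5., 6., 7.);
            let b = _mm256_setr_ps(10., 11., 12., 13., 14., 15., 16., 17.);
            let mut out = [0.0f32; 8];
            _mm256_storeu_ps(out.as_mut_ptr(), high_lanes(a, b));
            assert_eq!(out, [4., 5., 6., 7., 14., 15., 16., 17.]);
        }
    }
}
```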
@@ -1246,7 +1268,35 @@ pub fn _mm256_permute2f128_pd<const IMM8: i32>(a: __m256d, b: __m256d) -> __m256d {
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm256_permute2f128_si256<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
     static_assert_uimm_bits!(IMM8, 8);
-    unsafe { transmute(vperm2f128si256(a.as_i32x8(), b.as_i32x8(), IMM8 as i8)) }
+    const fn idx(imm8: i32, pos: u32) -> u32 {
+        let part = if pos < 2 {
+            imm8 & 0xf
+        } else {
+            (imm8 & 0xf0) >> 4
+        };
+        2 * (part as u32 & 0b11) + (pos & 1)
+    }
+    const fn idx0(imm8: i32, pos: u32) -> u32 {
+        let part = if pos < 2 {
+            imm8 & 0xf
+        } else {
+            (imm8 & 0xf0) >> 4
+        };
+        if part & 0b1000 != 0 { 4 } else { pos }
+    }
+    unsafe {
+        let r = simd_shuffle!(
+            a.as_i64x4(),
+            b.as_i64x4(),
+            [idx(IMM8, 0), idx(IMM8, 1), idx(IMM8, 2), idx(IMM8, 3)]
+        );
+        let r: i64x4 = simd_shuffle!(
+            r,
+            i64x4::ZERO,
+            [idx0(IMM8, 0), idx0(IMM8, 1), idx0(IMM8, 2), idx0(IMM8, 3)]
+        );
+        r.as_m256i()
+    }
 }

 /// Broadcasts a single-precision (32-bit) floating-point element from memory
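To make the two-step shuffle above easier to follow, here is a standalone copy of the index helpers with one worked control byte: with IMM8 = 0x81 the low result lane takes the high 128 bits of `a`, and bit 3 of the upper nibble zeroes the high result lane (index 4 points into the all-zero second shuffle operand). This is only an illustration of the logic, not additional code in the patch:

```rust
// Standalone copies of the helpers from the change above.
const fn idx(imm8: i32, pos: u32) -> u32 {
    let part = if pos < 2 { imm8 & 0xf } else { (imm8 & 0xf0) >> 4 };
    2 * (part as u32 & 0b11) + (pos & 1)
}
const fn idx0(imm8: i32, pos: u32) -> u32 {
    let part = if pos < 2 { imm8 & 0xf } else { (imm8 & 0xf0) >> 4 };
    if part & 0b1000 != 0 { 4 } else { pos }
}

fn main() {
    // First shuffle: gather i64 pair [2, 3] (a's high lane) then [0, 1] (a's low lane).
    assert_eq!(
        [idx(0x81, 0), idx(0x81, 1), idx(0x81, 2), idx(0x81, 3)],
        [2, 3, 0, 1]
    );
    // Second shuffle: keep positions 0 and 1, replace positions 2 and 3 with zeros
    // (index 4 selects from the all-zero second operand).
    assert_eq!(
        [idx0(0x81, 0), idx0(0x81, 1), idx0(0x81, 2), idx0(0x81, 3)],
        [0, 1, 4, 4]
    );
}
```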
@@ -1933,7 +1983,10 @@ pub fn _mm256_unpacklo_ps(a: __m256, b: __m256) -> __m256 {
 #[cfg_attr(test, assert_instr(vptest))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm256_testz_si256(a: __m256i, b: __m256i) -> i32 {
-    unsafe { ptestz256(a.as_i64x4(), b.as_i64x4()) }
+    unsafe {
+        let r = simd_and(a.as_i64x4(), b.as_i64x4());
+        (0i64 == simd_reduce_or(r)) as i32
+    }
 }

 /// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
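The `vptest`-based predicates are now expressed with generic SIMD ops: ZF is set exactly when `a & b` has no bit set, which is what the `simd_and` plus `simd_reduce_or` pair checks (and the `testc` variant does the same for `!a & b`). A scalar sketch of that condition (illustrative only):

```rust
/// Scalar reference for _mm256_testz_si256 (illustration only): returns 1
/// when no bit is set in `a & b`, mirroring the ZF result of VPTEST.
fn testz_reference(a: [u64; 4], b: [u64; 4]) -> i32 {
    let any = a.iter().zip(b).any(|(&x, y)| x & y != 0);
    (!any) as i32
}

fn main() {
    assert_eq!(testz_reference([1, 0, 0, 0], [2, 0, 0, 0]), 1); // disjoint bits
    assert_eq!(testz_reference([3, 0, 0, 0], [2, 0, 0, 0]), 0); // overlap at bit 1
}
```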
@@ -1947,7 +2000,10 @@ pub fn _mm256_testz_si256(a: __m256i, b: __m256i) -> i32 {
 #[cfg_attr(test, assert_instr(vptest))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm256_testc_si256(a: __m256i, b: __m256i) -> i32 {
-    unsafe { ptestc256(a.as_i64x4(), b.as_i64x4()) }
+    unsafe {
+        let r = simd_and(simd_xor(a.as_i64x4(), i64x4::splat(!0)), b.as_i64x4());
+        (0i64 == simd_reduce_or(r)) as i32
+    }
 }

 /// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
@@ -2031,7 +2087,10 @@ pub fn _mm256_testnzc_pd(a: __m256d, b: __m256d) -> i32 {
 #[cfg_attr(test, assert_instr(vtestpd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm_testz_pd(a: __m128d, b: __m128d) -> i32 {
-    unsafe { vtestzpd(a, b) }
+    unsafe {
+        let r: i64x2 = simd_lt(transmute(_mm_and_pd(a, b)), i64x2::ZERO);
+        (0i64 == simd_reduce_or(r)) as i32
+    }
 }

 /// Computes the bitwise AND of 128 bits (representing double-precision (64-bit)
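VTESTPD only inspects the sign bit of each 64-bit element, so the rewrite compares the bitwise AND against zero as signed integers (`simd_lt(..., ZERO)` is true exactly when the sign bit is set) and then ORs the lane masks together; the `ps` variants below do the same with 32-bit elements. A scalar sketch of the ZF condition (illustrative only):

```rust
/// Scalar reference for _mm_testz_pd's ZF (illustration only): only the sign
/// bit (bit 63) of each element of `a & b` participates in the result.
fn testz_pd_reference(a: [u64; 2], b: [u64; 2]) -> i32 {
    let any_sign = (0..2).any(|i| (a[i] & b[i]) >> 63 != 0);
    (!any_sign) as i32
}

fn main() {
    let neg = f64::to_bits(-1.0);
    let pos = f64::to_bits(1.0);
    assert_eq!(testz_pd_reference([neg, pos], [neg, neg]), 0); // shared sign bit
    assert_eq!(testz_pd_reference([pos, pos], [neg, neg]), 1); // no shared sign bit
}
```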
@@ -2048,7 +2107,10 @@ pub fn _mm_testz_pd(a: __m128d, b: __m128d) -> i32 {
 #[cfg_attr(test, assert_instr(vtestpd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm_testc_pd(a: __m128d, b: __m128d) -> i32 {
-    unsafe { vtestcpd(a, b) }
+    unsafe {
+        let r: i64x2 = simd_lt(transmute(_mm_andnot_pd(a, b)), i64x2::ZERO);
+        (0i64 == simd_reduce_or(r)) as i32
+    }
 }

 /// Computes the bitwise AND of 128 bits (representing double-precision (64-bit)
@@ -2135,7 +2197,10 @@ pub fn _mm256_testnzc_ps(a: __m256, b: __m256) -> i32 {
 #[cfg_attr(test, assert_instr(vtestps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm_testz_ps(a: __m128, b: __m128) -> i32 {
-    unsafe { vtestzps(a, b) }
+    unsafe {
+        let r: i32x4 = simd_lt(transmute(_mm_and_ps(a, b)), i32x4::ZERO);
+        (0i32 == simd_reduce_or(r)) as i32
+    }
 }

 /// Computes the bitwise AND of 128 bits (representing single-precision (32-bit)
@@ -2152,7 +2217,10 @@ pub fn _mm_testz_ps(a: __m128, b: __m128) -> i32 {
 #[cfg_attr(test, assert_instr(vtestps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm_testc_ps(a: __m128, b: __m128) -> i32 {
-    unsafe { vtestcps(a, b) }
+    unsafe {
+        let r: i32x4 = simd_lt(transmute(_mm_andnot_ps(a, b)), i32x4::ZERO);
+        (0i32 == simd_reduce_or(r)) as i32
+    }
 }

 /// Computes the bitwise AND of 128 bits (representing single-precision (32-bit)
@@ -3044,14 +3112,6 @@ unsafe extern "C" {
     fn roundps256(a: __m256, b: i32) -> __m256;
     #[link_name = "llvm.x86.avx.dp.ps.256"]
     fn vdpps(a: __m256, b: __m256, imm8: i8) -> __m256;
-    #[link_name = "llvm.x86.avx.hadd.pd.256"]
-    fn vhaddpd(a: __m256d, b: __m256d) -> __m256d;
-    #[link_name = "llvm.x86.avx.hadd.ps.256"]
-    fn vhaddps(a: __m256, b: __m256) -> __m256;
-    #[link_name = "llvm.x86.avx.hsub.pd.256"]
-    fn vhsubpd(a: __m256d, b: __m256d) -> __m256d;
-    #[link_name = "llvm.x86.avx.hsub.ps.256"]
-    fn vhsubps(a: __m256, b: __m256) -> __m256;
     #[link_name = "llvm.x86.sse2.cmp.pd"]
     fn vcmppd(a: __m128d, b: __m128d, imm8: i8) -> __m128d;
     #[link_name = "llvm.x86.avx.cmp.pd.256"]
@@ -3084,12 +3144,6 @@ unsafe extern "C" {
     fn vpermilpd256(a: __m256d, b: i64x4) -> __m256d;
     #[link_name = "llvm.x86.avx.vpermilvar.pd"]
     fn vpermilpd(a: __m128d, b: i64x2) -> __m128d;
-    #[link_name = "llvm.x86.avx.vperm2f128.ps.256"]
-    fn vperm2f128ps256(a: __m256, b: __m256, imm8: i8) -> __m256;
-    #[link_name = "llvm.x86.avx.vperm2f128.pd.256"]
-    fn vperm2f128pd256(a: __m256d, b: __m256d, imm8: i8) -> __m256d;
-    #[link_name = "llvm.x86.avx.vperm2f128.si.256"]
-    fn vperm2f128si256(a: i32x8, b: i32x8, imm8: i8) -> i32x8;
     #[link_name = "llvm.x86.avx.maskload.pd.256"]
     fn maskloadpd256(mem_addr: *const i8, mask: i64x4) -> __m256d;
     #[link_name = "llvm.x86.avx.maskstore.pd.256"]
@@ -3112,10 +3166,6 @@ unsafe extern "C" {
     fn vrcpps(a: __m256) -> __m256;
     #[link_name = "llvm.x86.avx.rsqrt.ps.256"]
     fn vrsqrtps(a: __m256) -> __m256;
-    #[link_name = "llvm.x86.avx.ptestz.256"]
-    fn ptestz256(a: i64x4, b: i64x4) -> i32;
-    #[link_name = "llvm.x86.avx.ptestc.256"]
-    fn ptestc256(a: i64x4, b: i64x4) -> i32;
     #[link_name = "llvm.x86.avx.ptestnzc.256"]
     fn ptestnzc256(a: i64x4, b: i64x4) -> i32;
     #[link_name = "llvm.x86.avx.vtestz.pd.256"]
@@ -3124,10 +3174,6 @@ unsafe extern "C" {
     fn vtestcpd256(a: __m256d, b: __m256d) -> i32;
     #[link_name = "llvm.x86.avx.vtestnzc.pd.256"]
     fn vtestnzcpd256(a: __m256d, b: __m256d) -> i32;
-    #[link_name = "llvm.x86.avx.vtestz.pd"]
-    fn vtestzpd(a: __m128d, b: __m128d) -> i32;
-    #[link_name = "llvm.x86.avx.vtestc.pd"]
-    fn vtestcpd(a: __m128d, b: __m128d) -> i32;
     #[link_name = "llvm.x86.avx.vtestnzc.pd"]
     fn vtestnzcpd(a: __m128d, b: __m128d) -> i32;
     #[link_name = "llvm.x86.avx.vtestz.ps.256"]
@@ -3136,10 +3182,6 @@ unsafe extern "C" {
     fn vtestcps256(a: __m256, b: __m256) -> i32;
     #[link_name = "llvm.x86.avx.vtestnzc.ps.256"]
     fn vtestnzcps256(a: __m256, b: __m256) -> i32;
-    #[link_name = "llvm.x86.avx.vtestz.ps"]
-    fn vtestzps(a: __m128, b: __m128) -> i32;
-    #[link_name = "llvm.x86.avx.vtestc.ps"]
-    fn vtestcps(a: __m128, b: __m128) -> i32;
     #[link_name = "llvm.x86.avx.vtestnzc.ps"]
     fn vtestnzcps(a: __m128, b: __m128) -> i32;
     #[link_name = "llvm.x86.avx.min.ps.256"]