@@ -1234,7 +1234,10 @@ pub fn _mm_permute_pd<const IMM2: i32>(a: __m128d) -> __m128d {
1234
1234
#[ stable( feature = "simd_x86" , since = "1.27.0" ) ]
1235
1235
pub fn _mm256_permute2f128_ps < const IMM8 : i32 > ( a : __m256 , b : __m256 ) -> __m256 {
// Single-precision variant of the 128-bit-half permute. This hunk swaps the
// direct call to the `vperm2f128ps256` LLVM binding for a delegation to
// `_mm256_permute2f128_si256`, so the selection logic lives in one place.
1236
1236
// NOTE(review): presumably a compile-time check that IMM8 fits in 8 unsigned
// bits — confirm against the macro definition, which is not visible here.
static_assert_uimm_bits ! ( IMM8 , 8 ) ;
1237
// Removed side of the diff: the old body called the extern binding inside `unsafe`.
- unsafe { vperm2f128ps256 ( a, b, IMM8 as i8 ) }
1237
// Added side of the diff: both inputs are cast to `__m256i`, permuted by the
// integer intrinsic with the same IMM8, and the result is cast back to `__m256`.
// The new body contains no `unsafe` block of its own.
// NOTE(review): assumes the `_mm256_cast*` helpers are pure bit reinterpretations
// (no value conversion) — confirm.
+ _mm256_castsi256_ps ( _mm256_permute2f128_si256 :: < IMM8 > (
1238
+ _mm256_castps_si256 ( a) ,
1239
+ _mm256_castps_si256 ( b) ,
1240
+ ) )
1238
1241
}
1239
1242
1240
1243
/// Shuffles 256 bits (composed of 4 packed double-precision (64-bit)
@@ -1248,7 +1251,10 @@ pub fn _mm256_permute2f128_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
1248
1251
#[ stable( feature = "simd_x86" , since = "1.27.0" ) ]
1249
1252
pub fn _mm256_permute2f128_pd < const IMM8 : i32 > ( a : __m256d , b : __m256d ) -> __m256d {
// Double-precision variant of the 128-bit-half permute. This hunk swaps the
// direct call to the `vperm2f128pd256` LLVM binding for a delegation to
// `_mm256_permute2f128_si256`, mirroring the change made to the `_ps` variant.
1250
1253
// NOTE(review): presumably a compile-time check that IMM8 fits in 8 unsigned
// bits — confirm against the macro definition, which is not visible here.
static_assert_uimm_bits ! ( IMM8 , 8 ) ;
1251
// Removed side of the diff: the old body called the extern binding inside `unsafe`.
- unsafe { vperm2f128pd256 ( a, b, IMM8 as i8 ) }
1254
// Added side of the diff: both inputs are cast to `__m256i`, permuted by the
// integer intrinsic with the same IMM8, and the result is cast back to `__m256d`.
// NOTE(review): assumes the `_mm256_cast*` helpers are pure bit reinterpretations
// (no value conversion) — confirm.
+ _mm256_castsi256_pd ( _mm256_permute2f128_si256 :: < IMM8 > (
1255
+ _mm256_castpd_si256 ( a) ,
1256
+ _mm256_castpd_si256 ( b) ,
1257
+ ) )
1252
1258
}
1253
1259
1254
1260
/// Shuffles 128-bits (composed of integer data) selected by `imm8`
@@ -1262,7 +1268,35 @@ pub fn _mm256_permute2f128_pd<const IMM8: i32>(a: __m256d, b: __m256d) -> __m256
1262
1268
#[ stable( feature = "simd_x86" , since = "1.27.0" ) ]
1263
1269
pub fn _mm256_permute2f128_si256 < const IMM8 : i32 > ( a : __m256i , b : __m256i ) -> __m256i {
// Integer variant of the 128-bit-half permute. This hunk replaces the call to
// the `vperm2f128si256` LLVM binding with two `simd_shuffle!` invocations whose
// index arrays are computed from IMM8 by the local `const fn`s below.
1264
1270
// NOTE(review): presumably a compile-time check that IMM8 fits in 8 unsigned
// bits — confirm against the macro definition, which is not visible here.
static_assert_uimm_bits ! ( IMM8 , 8 ) ;
1265
// Removed side of the diff: the old body viewed the inputs as i32x8, called the
// extern binding, and transmuted the result back.
- unsafe { transmute ( vperm2f128si256 ( a. as_i32x8 ( ) , b. as_i32x8 ( ) , IMM8 as i8 ) ) }
1271
// `idx` maps an output 64-bit lane position (0..=3) to a source lane index.
// Positions 0 and 1 are controlled by the low nibble of `imm8`, positions 2 and 3
// by the high nibble. The low two bits of that nibble pick one of four 2-lane
// groups (hence the `2 *`), and `pos & 1` picks the lane within the group, so the
// result is always in 0..=7.
// NOTE(review): this assumes `simd_shuffle!` numbers the lanes of its first
// operand 0..=3 and those of its second operand 4..=7 — confirm.
+ const fn idx ( imm8 : i32 , pos : u32 ) -> u32 {
1272
+ let part = if pos < 2 {
1273
+ imm8 & 0xf
1274
+ } else {
1275
+ ( imm8 & 0xf0 ) >> 4
1276
+ } ;
1277
+ 2 * ( part as u32 & 0b11 ) + ( pos & 1 )
1278
+ }
1279
// `idx0` builds the indices for the second (zeroing) shuffle. It uses the same
// nibble selection as `idx`; when bit 3 of the nibble is set it returns 4,
// otherwise it returns `pos` unchanged (keep the lane produced by the first
// shuffle).
// NOTE(review): index 4 presumably addresses lane 0 of the second shuffle
// operand (`i64x4::ZERO`), i.e. a zero lane — confirm lane numbering.
+ const fn idx0 ( imm8 : i32 , pos : u32 ) -> u32 {
1280
+ let part = if pos < 2 {
1281
+ imm8 & 0xf
1282
+ } else {
1283
+ ( imm8 & 0xf0 ) >> 4
1284
+ } ;
1285
+ if part & 0b1000 != 0 { 4 } else { pos }
1286
+ }
1287
// All shuffle indices below are derived from the const generic IMM8, and by the
// arithmetic above they stay within 0..=7 (`idx`) and 0..=4 (`idx0`).
// NOTE(review): the reason `unsafe` is needed here (requirements of
// `simd_shuffle!`) is not visible in this hunk — confirm before writing a
// formal SAFETY comment.
+ unsafe {
1288
// Step 1: select the requested 2-lane groups from `a` and `b`, both viewed as i64x4.
+ let r = simd_shuffle ! (
1289
+ a. as_i64x4( ) ,
1290
+ b. as_i64x4( ) ,
1291
+ [ idx( IMM8 , 0 ) , idx( IMM8 , 1 ) , idx( IMM8 , 2 ) , idx( IMM8 , 3 ) ]
1292
+ ) ;
1293
// Step 2: shuffle the intermediate result against `i64x4::ZERO`, substituting
// index 4 for the lanes of any half whose nibble has bit 3 set.
+ let r: i64x4 = simd_shuffle ! (
1294
+ r,
1295
+ i64x4:: ZERO ,
1296
+ [ idx0( IMM8 , 0 ) , idx0( IMM8 , 1 ) , idx0( IMM8 , 2 ) , idx0( IMM8 , 3 ) ]
1297
+ ) ;
1298
// Convert the i64x4 result back to the public `__m256i` type.
+ r. as_m256i ( )
1299
+ }
1266
1300
}
1267
1301
1268
1302
/// Broadcasts a single-precision (32-bit) floating-point element from memory
@@ -3092,12 +3126,6 @@ unsafe extern "C" {
3092
3126
fn vpermilpd256 ( a : __m256d , b : i64x4 ) -> __m256d ;
3093
3127
#[ link_name = "llvm.x86.avx.vpermilvar.pd" ]
3094
3128
fn vpermilpd ( a : __m128d , b : i64x2 ) -> __m128d ;
3095
- #[ link_name = "llvm.x86.avx.vperm2f128.ps.256" ]
3096
- fn vperm2f128ps256 ( a : __m256 , b : __m256 , imm8 : i8 ) -> __m256 ;
3097
- #[ link_name = "llvm.x86.avx.vperm2f128.pd.256" ]
3098
- fn vperm2f128pd256 ( a : __m256d , b : __m256d , imm8 : i8 ) -> __m256d ;
3099
- #[ link_name = "llvm.x86.avx.vperm2f128.si.256" ]
3100
- fn vperm2f128si256 ( a : i32x8 , b : i32x8 , imm8 : i8 ) -> i32x8 ;
3101
3129
#[ link_name = "llvm.x86.avx.maskload.pd.256" ]
3102
3130
fn maskloadpd256 ( mem_addr : * const i8 , mask : i64x4 ) -> __m256d ;
3103
3131
#[ link_name = "llvm.x86.avx.maskstore.pd.256" ]
0 commit comments