@@ -891,7 +891,21 @@ pub fn _mm256_extracti128_si256<const IMM1: i32>(a: __m256i) -> __m128i {
 #[cfg_attr(test, assert_instr(vphaddw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm256_hadd_epi16(a: __m256i, b: __m256i) -> __m256i {
-    unsafe { transmute(phaddw(a.as_i16x16(), b.as_i16x16())) }
+    let a = a.as_i16x16();
+    let b = b.as_i16x16();
+    unsafe {
+        let even: i16x16 = simd_shuffle!(
+            a,
+            b,
+            [0, 2, 4, 6, 16, 18, 20, 22, 8, 10, 12, 14, 24, 26, 28, 30]
+        );
+        let odd: i16x16 = simd_shuffle!(
+            a,
+            b,
+            [1, 3, 5, 7, 17, 19, 21, 23, 9, 11, 13, 15, 25, 27, 29, 31]
+        );
+        simd_add(even, odd).as_m256i()
+    }
 }

 /// Horizontally adds adjacent pairs of 32-bit integers in `a` and `b`.
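The shuffle indices in the hunk above encode `vphaddw`'s lane-restricted behavior: within each 128-bit lane, the low four results are pair sums from `a` and the high four are pair sums from `b`. As a sanity check, here is a minimal scalar model of that pairing (`hadd_epi16_model` is a hypothetical helper for illustration, not part of the crate):

```rust
// Scalar reference for _mm256_hadd_epi16: per 128-bit lane, pair sums
// from `a` fill the low half and pair sums from `b` the high half.
// Sums wrap on overflow, matching vphaddw.
fn hadd_epi16_model(a: [i16; 16], b: [i16; 16]) -> [i16; 16] {
    let mut r = [0i16; 16];
    for lane in 0..2 {
        let o = lane * 8; // start of this 128-bit lane (8 x i16)
        for i in 0..4 {
            r[o + i] = a[o + 2 * i].wrapping_add(a[o + 2 * i + 1]);
            r[o + 4 + i] = b[o + 2 * i].wrapping_add(b[o + 2 * i + 1]);
        }
    }
    r
}
```

Adding `even` to `odd` reproduces exactly this table, which is why the index arrays jump between `a`'s lane half (0..8, 8..16) and `b`'s (16..24, 24..32).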
@@ -902,7 +916,13 @@ pub fn _mm256_hadd_epi16(a: __m256i, b: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vphaddd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm256_hadd_epi32(a: __m256i, b: __m256i) -> __m256i {
-    unsafe { transmute(phaddd(a.as_i32x8(), b.as_i32x8())) }
+    let a = a.as_i32x8();
+    let b = b.as_i32x8();
+    unsafe {
+        let even: i32x8 = simd_shuffle!(a, b, [0, 2, 8, 10, 4, 6, 12, 14]);
+        let odd: i32x8 = simd_shuffle!(a, b, [1, 3, 9, 11, 5, 7, 13, 15]);
+        simd_add(even, odd).as_m256i()
+    }
 }

 /// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b`
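For reference, a small usage sketch of `_mm256_hadd_epi32` with the per-lane result order worked out by hand (assumes an AVX2-capable target; `hadd_epi32_demo` is illustrative only):

```rust
use core::arch::x86_64::*;

#[target_feature(enable = "avx2")]
unsafe fn hadd_epi32_demo() {
    let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
    let b = _mm256_setr_epi32(10, 20, 30, 40, 50, 60, 70, 80);
    let r = _mm256_hadd_epi32(a, b);
    // Lane 0: a0+a1, a2+a3, b0+b1, b2+b3; lane 1: a4+a5, a6+a7, b4+b5, b6+b7
    let e = _mm256_setr_epi32(3, 7, 30, 70, 11, 15, 110, 150);
    // All 32 bytes compare equal, so the movemask is all ones.
    assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi32(r, e)), -1);
}
```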
@@ -925,7 +945,21 @@ pub fn _mm256_hadds_epi16(a: __m256i, b: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vphsubw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm256_hsub_epi16(a: __m256i, b: __m256i) -> __m256i {
-    unsafe { transmute(phsubw(a.as_i16x16(), b.as_i16x16())) }
+    let a = a.as_i16x16();
+    let b = b.as_i16x16();
+    unsafe {
+        let even: i16x16 = simd_shuffle!(
+            a,
+            b,
+            [0, 2, 4, 6, 16, 18, 20, 22, 8, 10, 12, 14, 24, 26, 28, 30]
+        );
+        let odd: i16x16 = simd_shuffle!(
+            a,
+            b,
+            [1, 3, 5, 7, 17, 19, 21, 23, 9, 11, 13, 15, 25, 27, 29, 31]
+        );
+        simd_sub(even, odd).as_m256i()
+    }
 }

 /// Horizontally subtract adjacent pairs of 32-bit integers in `a` and `b`.
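The only difference from the `hadd` hunk is `simd_sub(even, odd)`: each result is the left element of a pair minus the right. A quick check in the style of the crate's own tests (`assert_eq_m256i` is stdarch's test helper; `check_hsub_epi16` is a hypothetical sketch):

```rust
#[target_feature(enable = "avx2")]
unsafe fn check_hsub_epi16() {
    let a = _mm256_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
    let b = _mm256_set1_epi16(1);
    // a's consecutive pairs each differ by -1; b's pairs are equal, so 0.
    // a-pairs fill the low half of each 128-bit lane, b-pairs the high half.
    let r = _mm256_hsub_epi16(a, b);
    let e = _mm256_setr_epi16(-1, -1, -1, -1, 0, 0, 0, 0, -1, -1, -1, -1, 0, 0, 0, 0);
    assert_eq_m256i(r, e);
}
```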
@@ -936,7 +970,13 @@ pub fn _mm256_hsub_epi16(a: __m256i, b: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vphsubd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm256_hsub_epi32(a: __m256i, b: __m256i) -> __m256i {
-    unsafe { transmute(phsubd(a.as_i32x8(), b.as_i32x8())) }
+    let a = a.as_i32x8();
+    let b = b.as_i32x8();
+    unsafe {
+        let even: i32x8 = simd_shuffle!(a, b, [0, 2, 8, 10, 4, 6, 12, 14]);
+        let odd: i32x8 = simd_shuffle!(a, b, [1, 3, 9, 11, 5, 7, 13, 15]);
+        simd_sub(even, odd).as_m256i()
+    }
 }

 /// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`
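The 8-element index arrays shared by `_mm256_hadd_epi32` and `_mm256_hsub_epi32` interleave lane halves of `a` and `b` in a fixed order. A hypothetical `const fn` showing how they could be derived rather than hand-written (`hadd_hsub_idx` is illustrative only, not how the crate builds them):

```rust
// Derives the `even` (BASE = 0) or `odd` (BASE = 1) shuffle indices used
// by the 32-bit horizontal ops. In simd_shuffle! index space, `a` is
// elements 0..8 and `b` is elements 8..16.
const fn hadd_hsub_idx<const BASE: u32>() -> [u32; 8] {
    [
        BASE, BASE + 2,       // pairs from `a`, lane 0
        BASE + 8, BASE + 10,  // pairs from `b`, lane 0
        BASE + 4, BASE + 6,   // pairs from `a`, lane 1
        BASE + 12, BASE + 14, // pairs from `b`, lane 1
    ]
}
// hadd_hsub_idx::<0>() == [0, 2, 8, 10, 4, 6, 12, 14]
// hadd_hsub_idx::<1>() == [1, 3, 9, 11, 5, 7, 13, 15]
```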
@@ -1714,7 +1754,12 @@ pub fn _mm256_inserti128_si256<const IMM1: i32>(a: __m256i, b: __m128i) -> __m256i {
 #[cfg_attr(test, assert_instr(vpmaddwd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm256_madd_epi16(a: __m256i, b: __m256i) -> __m256i {
-    unsafe { transmute(pmaddwd(a.as_i16x16(), b.as_i16x16())) }
+    unsafe {
+        let r: i32x16 = simd_mul(simd_cast(a.as_i16x16()), simd_cast(b.as_i16x16()));
+        let even: i32x8 = simd_shuffle!(r, r, [0, 2, 4, 6, 8, 10, 12, 14]);
+        let odd: i32x8 = simd_shuffle!(r, r, [1, 3, 5, 7, 9, 11, 13, 15]);
+        simd_add(even, odd).as_m256i()
+    }
 }

 /// Vertically multiplies each unsigned 8-bit integer from `a` with the
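The widening `simd_cast` to `i32x16` in the hunk above is what makes this correct: products are formed at 32-bit width before the pairwise add, so even `i16::MIN * i16::MIN` cannot overflow a product. A scalar model of the same computation (`madd_epi16_model` is a hypothetical helper, not part of the crate):

```rust
// Scalar reference for _mm256_madd_epi16 / vpmaddwd: widen each i16 to
// i32, multiply element-wise, then add adjacent products.
fn madd_epi16_model(a: [i16; 16], b: [i16; 16]) -> [i32; 8] {
    let mut r = [0i32; 8];
    for i in 0..8 {
        // Only the final pair sum can wrap (it does so only when both
        // products are i16::MIN * i16::MIN), matching the instruction.
        r[i] = (a[2 * i] as i32 * b[2 * i] as i32)
            .wrapping_add(a[2 * i + 1] as i32 * b[2 * i + 1] as i32);
    }
    r
}
```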
@@ -3594,20 +3639,10 @@ pub fn _mm256_extract_epi16<const INDEX: i32>(a: __m256i) -> i32 {

 #[allow(improper_ctypes)]
 unsafe extern "C" {
-    #[link_name = "llvm.x86.avx2.phadd.w"]
-    fn phaddw(a: i16x16, b: i16x16) -> i16x16;
-    #[link_name = "llvm.x86.avx2.phadd.d"]
-    fn phaddd(a: i32x8, b: i32x8) -> i32x8;
     #[link_name = "llvm.x86.avx2.phadd.sw"]
     fn phaddsw(a: i16x16, b: i16x16) -> i16x16;
-    #[link_name = "llvm.x86.avx2.phsub.w"]
-    fn phsubw(a: i16x16, b: i16x16) -> i16x16;
-    #[link_name = "llvm.x86.avx2.phsub.d"]
-    fn phsubd(a: i32x8, b: i32x8) -> i32x8;
     #[link_name = "llvm.x86.avx2.phsub.sw"]
     fn phsubsw(a: i16x16, b: i16x16) -> i16x16;
-    #[link_name = "llvm.x86.avx2.pmadd.wd"]
-    fn pmaddwd(a: i16x16, b: i16x16) -> i32x8;
     #[link_name = "llvm.x86.avx2.pmadd.ub.sw"]
     fn pmaddubsw(a: u8x32, b: u8x32) -> i16x16;
     #[link_name = "llvm.x86.avx2.maskload.d"]