//! [wiki]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions

use super::avx_handwritten::*;
+ use super::sse::*;
use super::sse2::*;
use super::types::*;
use crate::abstractions::simd::*;
@@ -774,16 +775,15 @@ pub fn _mm256_permute_ps<const IMM8: i32>(a: __m256) -> __m256 {
/// using the control in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permute_ps)
- // NOTE: Not modeled yet
- // pub fn _mm_permute_ps<const IMM8: i32>(a: __m128) -> __m128 {
- //     static_assert_uimm_bits!(IMM8, 8);
- //     {
- //         transmute(simd_shuffle(
- //             a.as_f32x4(), _mm_undefined_ps(), [(IMM8 as u32 >> 0) & 0b11, (IMM8 as u32 >> 2) & 0b11,
- //             (IMM8 as u32 >> 4) & 0b11, (IMM8 as u32 >> 6) & 0b11,],
- //         ))
- //     }
- // }
+ pub fn _mm_permute_ps<const IMM8: i32>(a: __m128) -> __m128 {
+     static_assert_uimm_bits!(IMM8, 8);
+     {
+         transmute(simd_shuffle(
+             a.as_f32x4(), _mm_undefined_ps().as_f32x4(), [(IMM8 as u32 >> 0) & 0b11, (IMM8 as u32 >> 2) & 0b11,
+             (IMM8 as u32 >> 4) & 0b11, (IMM8 as u32 >> 6) & 0b11],
+         ))
+     }
+ }
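For orientation, here is a minimal usage sketch against the real `core::arch::x86_64::_mm_permute_ps` that this function models (the sketch and its demo function are not part of this change and assume an AVX-capable x86_64 target): each 2-bit field of `IMM8` selects which source lane feeds the corresponding result lane.

    // Illustrative only, not part of the patch: exercises the hardware intrinsic
    // modeled above. IMM8 = 0b00_01_10_11 reads source lanes 3, 2, 1, 0.
    #[cfg(all(target_arch = "x86_64", target_feature = "avx"))]
    fn permute_ps_demo() {
        use core::arch::x86_64::{_mm_permute_ps, _mm_setr_ps};
        unsafe {
            let a = _mm_setr_ps(10.0, 11.0, 12.0, 13.0);
            let r = _mm_permute_ps::<0b00_01_10_11>(a);
            let lanes: [f32; 4] = core::mem::transmute(r);
            assert_eq!(lanes, [13.0, 12.0, 11.0, 10.0]);
        }
    }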
/// Shuffles double-precision (64-bit) floating-point elements in `a`
/// within 256-bit lanes using the control in `b`.
///
@@ -886,10 +886,9 @@ pub fn _mm256_broadcast_ss(f: &f32) -> __m256 {
/// (32-bit) floating-point elements) to all elements of the returned vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_ps)
- // NOTE: Not modeled yet
- // pub fn _mm256_broadcast_ps(a: &__m128) -> __m256 {
- //     { transmute(simd_shuffle((*a).as_f32x4(), _mm_setzero_ps(), [0, 1, 2, 3, 0, 1, 2, 3])) }
- // }
+ pub fn _mm256_broadcast_ps(a: &__m128) -> __m256 {
+     { transmute(simd_shuffle((*a).as_f32x4(), _mm_setzero_ps().as_f32x4(), [0, 1, 2, 3, 0, 1, 2, 3])) }
+ }
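As a sanity check on the shuffle indices `[0, 1, 2, 3, 0, 1, 2, 3]`, a hedged sketch of the corresponding `core::arch::x86_64` intrinsic (not part of this patch; assumes AVX): the four source lanes appear in both 128-bit halves of the result.

    // Illustrative only: the referenced 128-bit vector is repeated in both halves.
    #[cfg(all(target_arch = "x86_64", target_feature = "avx"))]
    fn broadcast_ps_demo() {
        use core::arch::x86_64::{_mm256_broadcast_ps, _mm_setr_ps};
        unsafe {
            let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
            let lanes: [f32; 8] = core::mem::transmute(_mm256_broadcast_ps(&a));
            assert_eq!(lanes, [1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0]);
        }
    }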
/// Broadcasts 128 bits from memory (composed of 2 packed double-precision
/// (64-bit) floating-point elements) to all elements of the returned vector.
///
@@ -906,30 +905,29 @@ pub fn _mm256_broadcast_pd(a: &__m128d) -> __m256d {
/// at the location specified by `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insertf128_ps)
- // NOTE: Not modeled yet
- // pub fn _mm256_insertf128_ps<const IMM1: i32>(a: __m256, b: __m128) -> __m256 {
- //     static_assert_uimm_bits!(IMM1, 1);
- //     {
- //         transmute(simd_shuffle(
- //             a.as_f32x8(), _mm256_castps128_ps256(b), [[8, 9, 10, 11, 4, 5, 6, 7], [0, 1, 2, 3, 8, 9,
- //             10, 11]] [IMM1 as usize],
- //         ))
- //     }
- // }
+ pub fn _mm256_insertf128_ps<const IMM1: i32>(a: __m256, b: __m128) -> __m256 {
+     static_assert_uimm_bits!(IMM1, 1);
+     {
+         transmute(simd_shuffle(
+             a.as_f32x8(), _mm256_castps128_ps256(b).as_f32x8(), [[8, 9, 10, 11, 4, 5, 6, 7], [0, 1, 2, 3, 8, 9,
+             10, 11]][IMM1 as usize],
+         ))
+     }
+ }
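In the shuffle above, indices 0..=7 refer to lanes of `a` and 8..=11 to the lanes of the widened `b`, so `IMM1 = 0` selects `[8, 9, 10, 11, 4, 5, 6, 7]` (replace the low half) and `IMM1 = 1` selects `[0, 1, 2, 3, 8, 9, 10, 11]` (replace the high half). A hedged usage sketch against the real intrinsic (not part of this patch; assumes AVX):

    // Illustrative only: IMM1 = 1 keeps the low half of `a` and overwrites the
    // high half with `b`.
    #[cfg(all(target_arch = "x86_64", target_feature = "avx"))]
    fn insertf128_ps_demo() {
        use core::arch::x86_64::{_mm256_insertf128_ps, _mm256_setr_ps, _mm_setr_ps};
        unsafe {
            let a = _mm256_setr_ps(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
            let b = _mm_setr_ps(10.0, 11.0, 12.0, 13.0);
            let lanes: [f32; 8] = core::mem::transmute(_mm256_insertf128_ps::<1>(a, b));
            assert_eq!(lanes, [0.0, 1.0, 2.0, 3.0, 10.0, 11.0, 12.0, 13.0]);
        }
    }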
/// Copies `a` to result, then inserts 128 bits (composed of 2 packed
/// double-precision (64-bit) floating-point elements) from `b` into result
/// at the location specified by `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insertf128_pd)
- // NOTE: Not modeled yet
- // pub fn _mm256_insertf128_pd<const IMM1: i32>(a: __m256d, b: __m128d) -> __m256d {
- //     static_assert_uimm_bits!(IMM1, 1);
- //     {
- //         simd_shuffle(
- //             a, _mm256_castpd128_pd256(b), [[4, 5, 2, 3], [0, 1, 4, 5]] [IMM1 as usize],
- //         )
- //     }
- // }
+ pub fn _mm256_insertf128_pd<const IMM1: i32>(a: __m256d, b: __m128d) -> __m256d {
+     static_assert_uimm_bits!(IMM1, 1);
+     {
+         transmute(simd_shuffle(
+             a.as_f64x4(), _mm256_castpd128_pd256(b).as_f64x4(),
+             [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize],
+         ))
+     }
+ }
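The double-precision variant follows the same pattern: indices 0..=3 refer to `a` and 4..=5 to the widened `b`, so `[4, 5, 2, 3]` replaces the low half and `[0, 1, 4, 5]` the high half. A hedged sketch (not part of this patch; assumes AVX):

    // Illustrative only: IMM1 = 0 overwrites the low half of `a` with `b`.
    #[cfg(all(target_arch = "x86_64", target_feature = "avx"))]
    fn insertf128_pd_demo() {
        use core::arch::x86_64::{_mm256_insertf128_pd, _mm256_setr_pd, _mm_setr_pd};
        unsafe {
            let a = _mm256_setr_pd(0.0, 1.0, 2.0, 3.0);
            let b = _mm_setr_pd(10.0, 11.0);
            let lanes: [f64; 4] = core::mem::transmute(_mm256_insertf128_pd::<0>(a, b));
            assert_eq!(lanes, [10.0, 11.0, 2.0, 3.0]);
        }
    }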
/// Copies `a` to result, then inserts 128 bits from `b` into result
/// at the location specified by `imm8`.
///
@@ -1600,10 +1598,9 @@ pub fn _mm256_castsi256_si128(a: __m256i) -> __m128i {
/// the upper 128 bits of the result are undefined.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps128_ps256)
- // NOTE: Not modeled yet
- // pub fn _mm256_castps128_ps256(a: __m128) -> __m256 {
- //     { simd_shuffle(a, _mm_undefined_ps(), [0, 1, 2, 3, 4, 4, 4, 4]) }
- // }
+ pub fn _mm256_castps128_ps256(a: __m128) -> __m256 {
+     { transmute(simd_shuffle(a.as_f32x4(), _mm_undefined_ps().as_f32x4(), [0, 1, 2, 3, 4, 4, 4, 4])) }
+ }
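Because the upper lanes of the shuffle are taken from `_mm_undefined_ps()`, only the low 128 bits of the result carry meaning. A hedged sketch against the real intrinsic (not part of this patch; assumes AVX) that checks just the defined half by casting back down:

    // Illustrative only: round-trip through the cast; the upper half is
    // unspecified and deliberately not inspected.
    #[cfg(all(target_arch = "x86_64", target_feature = "avx"))]
    fn castps128_ps256_demo() {
        use core::arch::x86_64::{_mm256_castps128_ps256, _mm256_castps256_ps128, _mm_setr_ps};
        unsafe {
            let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
            let wide = _mm256_castps128_ps256(a);
            let low: [f32; 4] = core::mem::transmute(_mm256_castps256_ps128(wide));
            assert_eq!(low, [1.0, 2.0, 3.0, 4.0]);
        }
    }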
/// Casts vector of type __m128d to type __m256d;
/// the upper 128 bits of the result are undefined.
///
@@ -1632,10 +1629,9 @@ pub fn _mm256_castsi128_si256(a: __m128i) -> __m256i {
/// the value of the source vector. The upper 128 bits are set to zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zextps128_ps256)
- // NOTE: Not modeled yet
- // pub fn _mm256_zextps128_ps256(a: __m128) -> __m256 {
- //     { simd_shuffle(a, _mm_setzero_ps(), [0, 1, 2, 3, 4, 5, 6, 7]) }
- // }
+ pub fn _mm256_zextps128_ps256(a: __m128) -> __m256 {
+     { transmute(simd_shuffle(a.as_f32x4(), _mm_setzero_ps().as_f32x4(), [0, 1, 2, 3, 4, 5, 6, 7])) }
+ }
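Unlike `_mm256_castps128_ps256`, which leaves the upper half undefined, the zero-extension pads with `_mm_setzero_ps()`, so the upper four lanes are guaranteed to be zero. A hedged sketch (not part of this patch; assumes AVX):

    // Illustrative only: the upper 128 bits of the result are all zeros.
    #[cfg(all(target_arch = "x86_64", target_feature = "avx"))]
    fn zextps128_ps256_demo() {
        use core::arch::x86_64::{_mm256_zextps128_ps256, _mm_setr_ps};
        unsafe {
            let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
            let lanes: [f32; 8] = core::mem::transmute(_mm256_zextps128_ps256(a));
            assert_eq!(lanes, [1.0, 2.0, 3.0, 4.0, 0.0, 0.0, 0.0, 0.0]);
        }
    }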
/// Constructs a 256-bit integer vector from a 128-bit integer vector.
/// The lower 128 bits contain the value of the source vector. The upper
/// 128 bits are set to zero.
@@ -1655,9 +1651,9 @@ pub fn _mm256_zextsi128_si256(a: __m128i) -> __m256i {
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zextpd128_pd256)
// NOTE: Not modeled yet
- // pub fn _mm256_zextpd128_pd256(a: __m128d) -> __m256d {
- //     { simd_shuffle(a, _mm_setzero_pd(), [0, 1, 2, 3]) }
- // }
+ pub fn _mm256_zextpd128_pd256(a: __m128d) -> __m256d {
+     { transmute(simd_shuffle(a.as_f64x2(), _mm_setzero_pd().as_f64x2(), [0, 1, 2, 3])) }
+ }
/// Returns vector of type __m256 with indeterminate elements.
/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].