1414//! [wiki]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions
1515
1616use super :: avx_handwritten:: * ;
17+ use super :: sse:: * ;
1718use super :: sse2:: * ;
1819use super :: types:: * ;
1920use crate :: abstractions:: simd:: * ;
@@ -774,16 +775,15 @@ pub fn _mm256_permute_ps<const IMM8: i32>(a: __m256) -> __m256 {
774775/// using the control in `imm8`.
775776///
776777/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permute_ps)
777- // NOTE: Not modeled yet
778- // pub fn _mm_permute_ps<const IMM8: i32>(a: __m128) -> __m128 {
779- // static_assert_uimm_bits!(IMM8, 8);
780- // {
781- // transmute(simd_shuffle(
782- // a.as_f32x4(), _mm_undefined_ps(), [(IMM8 as u32 >> 0) & 0b11, (IMM8 as u32 >> 2) & 0b11,
783- // (IMM8 as u32 >> 4) & 0b11, (IMM8 as u32 >> 6) & 0b11,],
784- // ))
785- // }
786- // }
778+ pub fn _mm_permute_ps<const IMM8: i32>(a: __m128) -> __m128 {
779+     static_assert_uimm_bits!(IMM8, 8);
780+     let src = a.as_f32x4();
781+     // Second shuffle operand; every index built below is masked to the range 0..=3.
782+     let filler = _mm_undefined_ps().as_f32x4();
783+     // Output lane `i` is chosen by the 2-bit field of `IMM8` at bit offset `2 * i`.
784+     let ctrl = IMM8 as u32;
785+     transmute(simd_shuffle(src, filler, [ctrl & 0b11, (ctrl >> 2) & 0b11, (ctrl >> 4) & 0b11, (ctrl >> 6) & 0b11]))
786+ }
787787/// Shuffles double-precision (64-bit) floating-point elements in `a`
788788/// within 256-bit lanes using the control in `b`.
789789///
@@ -886,10 +886,9 @@ pub fn _mm256_broadcast_ss(f: &f32) -> __m256 {
886886/// (32-bit) floating-point elements) to all elements of the returned vector.
887887///
888888/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_ps)
889- // NOTE: Not modeled yet
890- // pub fn _mm256_broadcast_ps(a: &__m128) -> __m256 {
891- // { transmute(simd_shuffle((*a).as_f32x4(), _mm_setzero_ps(), [0, 1, 2, 3, 0, 1, 2, 3])) }
892- // }
889+ pub fn _mm256_broadcast_ps(a: &__m128) -> __m256 {
890+     transmute(simd_shuffle((*a).as_f32x4(), _mm_setzero_ps().as_f32x4(), [0, 1, 2, 3, 0, 1, 2, 3])) // indices 0..=3 listed twice
891+ }
893892/// Broadcasts 128 bits from memory (composed of 2 packed double-precision
894893/// (64-bit) floating-point elements) to all elements of the returned vector.
895894///
@@ -906,30 +905,29 @@ pub fn _mm256_broadcast_pd(a: &__m128d) -> __m256d {
906905/// at the location specified by `imm8`.
907906///
908907/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insertf128_ps)
909- // NOTE: Not modeled yet
910- // pub fn _mm256_insertf128_ps<const IMM1: i32>(a: __m256, b: __m128) -> __m256 {
911- // static_assert_uimm_bits!(IMM1, 1);
912- // {
913- // transmute(simd_shuffle(
914- // a.as_f32x8(), _mm256_castps128_ps256(b), [[8, 9, 10, 11, 4, 5, 6, 7], [0, 1, 2, 3, 8, 9,
915- // 10, 11]] [IMM1 as usize],
916- // ))
917- // }
918- // }
908+ pub fn _mm256_insertf128_ps<const IMM1: i32>(a: __m256, b: __m128) -> __m256 {
909+     static_assert_uimm_bits!(IMM1, 1);
910+     let base = a.as_f32x8();
911+     // `b` is widened to eight lanes so it can act as the second shuffle operand.
912+     let widened = _mm256_castps128_ps256(b).as_f32x8();
913+     // Row 0 is used when `IMM1 == 0`, row 1 when `IMM1 == 1`.
914+     let lane_table = [[8, 9, 10, 11, 4, 5, 6, 7], [0, 1, 2, 3, 8, 9, 10, 11]];
915+     transmute(simd_shuffle(base, widened, lane_table[IMM1 as usize]))
916+ }
919917/// Copies `a` to result, then inserts 128 bits (composed of 2 packed
920918/// double-precision (64-bit) floating-point elements) from `b` into result
921919/// at the location specified by `imm8`.
922920///
923921/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insertf128_pd)
924- // NOTE: Not modeled yet
925- // pub fn _mm256_insertf128_pd<const IMM1: i32>(a: __m256d, b: __m128d) -> __m256d {
926- // static_assert_uimm_bits!(IMM1, 1);
927- // {
928- // simd_shuffle(
929- // a, _mm256_castpd128_pd256(b), [[4, 5, 2, 3], [0, 1, 4, 5]] [IMM1 as usize],
930- // )
931- // }
932- // }
922+ pub fn _mm256_insertf128_pd<const IMM1: i32>(a: __m256d, b: __m128d) -> __m256d {
923+     static_assert_uimm_bits!(IMM1, 1);
924+     let base = a.as_f64x4();
925+     // `b` is widened to four lanes so it can act as the second shuffle operand.
926+     let widened = _mm256_castpd128_pd256(b).as_f64x4();
927+     // Row 0 is used when `IMM1 == 0`, row 1 when `IMM1 == 1`.
928+     let lane_table = [[4, 5, 2, 3], [0, 1, 4, 5]];
929+     transmute(simd_shuffle(base, widened, lane_table[IMM1 as usize]))
930+ }
933931/// Copies `a` to result, then inserts 128 bits from `b` into result
934932/// at the location specified by `imm8`.
935933///
@@ -1600,10 +1598,9 @@ pub fn _mm256_castsi256_si128(a: __m256i) -> __m128i {
16001598/// the upper 128 bits of the result are undefined.
16011599///
16021600/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps128_ps256)
1603- // NOTE: Not modeled yet
1604- // pub fn _mm256_castps128_ps256(a: __m128) -> __m256 {
1605- // { simd_shuffle(a, _mm_undefined_ps(), [0, 1, 2, 3, 4, 4, 4, 4]) }
1606- // }
1601+ pub fn _mm256_castps128_ps256(a: __m128) -> __m256 {
1602+     transmute(simd_shuffle(a.as_f32x4(), _mm_undefined_ps().as_f32x4(), [0, 1, 2, 3, 4, 4, 4, 4])) // the last four indices all repeat 4
1603+ }
16071604/// Casts vector of type __m128d to type __m256d;
16081605/// the upper 128 bits of the result are undefined.
16091606///
@@ -1632,10 +1629,9 @@ pub fn _mm256_castsi128_si256(a: __m128i) -> __m256i {
16321629/// the value of the source vector. The upper 128 bits are set to zero.
16331630///
16341631/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zextps128_ps256)
1635- // NOTE: Not modeled yet
1636- // pub fn _mm256_zextps128_ps256(a: __m128) -> __m256 {
1637- // { simd_shuffle(a, _mm_setzero_ps(), [0, 1, 2, 3, 4, 5, 6, 7]) }
1638- // }
1632+ pub fn _mm256_zextps128_ps256(a: __m128) -> __m256 {
1633+     transmute(simd_shuffle(a.as_f32x4(), _mm_setzero_ps().as_f32x4(), [0, 1, 2, 3, 4, 5, 6, 7])) // indices 0..=7 in ascending order
1634+ }
16391635/// Constructs a 256-bit integer vector from a 128-bit integer vector.
16401636/// The lower 128 bits contain the value of the source vector. The upper
16411637/// 128 bits are set to zero.
@@ -1655,9 +1651,9 @@ pub fn _mm256_zextsi128_si256(a: __m128i) -> __m256i {
16551651///
16561652/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zextpd128_pd256)
16571653// NOTE: Modeled below via `simd_shuffle` on the `f64x2` views of `a` and `_mm_setzero_pd()`.
1658- // pub fn _mm256_zextpd128_pd256(a: __m128d) -> __m256d {
1659- // { simd_shuffle(a, _mm_setzero_pd(), [0, 1, 2, 3]) }
1660- // }
1654+ pub fn _mm256_zextpd128_pd256(a: __m128d) -> __m256d {
1655+     transmute(simd_shuffle(a.as_f64x2(), _mm_setzero_pd().as_f64x2(), [0, 1, 2, 3])) // indices 0..=3 in ascending order
1656+ }
16611657/// Returns vector of type `__m256` with indeterminate elements.
16621658/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
16631659/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
0 commit comments