Skip to content

Commit 5b12d03

Browse files
sse
1 parent 09a0cbd commit 5b12d03

File tree

4 files changed

+75
-60
lines changed

4 files changed

+75
-60
lines changed

testable-simd-models/src/core_arch/x86/models/avx.rs

Lines changed: 40 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
//! [wiki]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions
1515
1616
use super::avx_handwritten::*;
17+
use super::sse::*;
1718
use super::sse2::*;
1819
use super::types::*;
1920
use crate::abstractions::simd::*;
@@ -774,16 +775,15 @@ pub fn _mm256_permute_ps<const IMM8: i32>(a: __m256) -> __m256 {
774775
/// using the control in `imm8`.
775776
///
776777
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permute_ps)
777-
// NOTE: Not modeled yet
778-
// pub fn _mm_permute_ps<const IMM8: i32>(a: __m128) -> __m128 {
779-
// static_assert_uimm_bits!(IMM8, 8);
780-
// {
781-
// transmute(simd_shuffle(
782-
// a.as_f32x4(), _mm_undefined_ps(), [(IMM8 as u32 >> 0) & 0b11, (IMM8 as u32 >> 2) & 0b11,
783-
// (IMM8 as u32 >> 4) & 0b11, (IMM8 as u32 >> 6) & 0b11,],
784-
// ))
785-
// }
786-
// }
778+
pub fn _mm_permute_ps<const IMM8: i32>(a: __m128) -> __m128 {
779+
static_assert_uimm_bits!(IMM8, 8);
780+
{
781+
transmute(simd_shuffle(
782+
a.as_f32x4(), _mm_undefined_ps().as_f32x4(), [(IMM8 as u32 >> 0) & 0b11, (IMM8 as u32 >> 2) & 0b11,
783+
(IMM8 as u32 >> 4) & 0b11, (IMM8 as u32 >> 6) & 0b11,],
784+
))
785+
}
786+
}
787787
/// Shuffles double-precision (64-bit) floating-point elements in `a`
788788
/// within 256-bit lanes using the control in `b`.
789789
///
@@ -886,10 +886,9 @@ pub fn _mm256_broadcast_ss(f: &f32) -> __m256 {
886886
/// (32-bit) floating-point elements) to all elements of the returned vector.
887887
///
888888
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_ps)
889-
// NOTE: Not modeled yet
890-
// pub fn _mm256_broadcast_ps(a: &__m128) -> __m256 {
891-
// { transmute(simd_shuffle((*a).as_f32x4(), _mm_setzero_ps(), [0, 1, 2, 3, 0, 1, 2, 3])) }
892-
// }
889+
pub fn _mm256_broadcast_ps(a: &__m128) -> __m256 {
890+
{ transmute(simd_shuffle((*a).as_f32x4(), _mm_setzero_ps().as_f32x4(), [0, 1, 2, 3, 0, 1, 2, 3])) }
891+
}
893892
/// Broadcasts 128 bits from memory (composed of 2 packed double-precision
894893
/// (64-bit) floating-point elements) to all elements of the returned vector.
895894
///
@@ -906,30 +905,29 @@ pub fn _mm256_broadcast_pd(a: &__m128d) -> __m256d {
906905
/// at the location specified by `imm8`.
907906
///
908907
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insertf128_ps)
909-
// NOTE: Not modeled yet
910-
// pub fn _mm256_insertf128_ps<const IMM1: i32>(a: __m256, b: __m128) -> __m256 {
911-
// static_assert_uimm_bits!(IMM1, 1);
912-
// {
913-
// transmute(simd_shuffle(
914-
// a.as_f32x8(), _mm256_castps128_ps256(b), [[8, 9, 10, 11, 4, 5, 6, 7], [0, 1, 2, 3, 8, 9,
915-
// 10, 11]] [IMM1 as usize],
916-
// ))
917-
// }
918-
// }
908+
pub fn _mm256_insertf128_ps<const IMM1: i32>(a: __m256, b: __m128) -> __m256 {
909+
static_assert_uimm_bits!(IMM1, 1);
910+
{
911+
transmute(simd_shuffle(
912+
a.as_f32x8(), _mm256_castps128_ps256(b).as_f32x8(), [[8, 9, 10, 11, 4, 5, 6, 7], [0, 1, 2, 3, 8, 9,
913+
10, 11]] [IMM1 as usize],
914+
))
915+
}
916+
}
919917
/// Copies `a` to result, then inserts 128 bits (composed of 2 packed
920918
/// double-precision (64-bit) floating-point elements) from `b` into result
921919
/// at the location specified by `imm8`.
922920
///
923921
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insertf128_pd)
924-
// NOTE: Not modeled yet
925-
// pub fn _mm256_insertf128_pd<const IMM1: i32>(a: __m256d, b: __m128d) -> __m256d {
926-
// static_assert_uimm_bits!(IMM1, 1);
927-
// {
928-
// simd_shuffle(
929-
// a, _mm256_castpd128_pd256(b), [[4, 5, 2, 3], [0, 1, 4, 5]] [IMM1 as usize],
930-
// )
931-
// }
932-
// }
922+
pub fn _mm256_insertf128_pd<const IMM1: i32>(a: __m256d, b: __m128d) -> __m256d {
923+
static_assert_uimm_bits!(IMM1, 1);
924+
{
925+
transmute(simd_shuffle(
926+
a.as_f64x4(), _mm256_castpd128_pd256(b).as_f64x4(),
927+
[[4, 5, 2, 3], [0, 1, 4, 5]] [IMM1 as usize],
928+
))
929+
}
930+
}
933931
/// Copies `a` to result, then inserts 128 bits from `b` into result
934932
/// at the location specified by `imm8`.
935933
///
@@ -1600,10 +1598,9 @@ pub fn _mm256_castsi256_si128(a: __m256i) -> __m128i {
16001598
/// the upper 128 bits of the result are undefined.
16011599
///
16021600
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps128_ps256)
1603-
// NOTE: Not modeled yet
1604-
// pub fn _mm256_castps128_ps256(a: __m128) -> __m256 {
1605-
// { simd_shuffle(a, _mm_undefined_ps(), [0, 1, 2, 3, 4, 4, 4, 4]) }
1606-
// }
1601+
pub fn _mm256_castps128_ps256(a: __m128) -> __m256 {
1602+
{ transmute(simd_shuffle(a.as_f32x4(), _mm_undefined_ps().as_f32x4(), [0, 1, 2, 3, 4, 4, 4, 4])) }
1603+
}
16071604
/// Casts vector of type __m128d to type __m256d;
16081605
/// the upper 128 bits of the result are undefined.
16091606
///
@@ -1632,10 +1629,9 @@ pub fn _mm256_castsi128_si256(a: __m128i) -> __m256i {
16321629
/// the value of the source vector. The upper 128 bits are set to zero.
16331630
///
16341631
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zextps128_ps256)
1635-
// NOTE: Not modeled yet
1636-
// pub fn _mm256_zextps128_ps256(a: __m128) -> __m256 {
1637-
// { simd_shuffle(a, _mm_setzero_ps(), [0, 1, 2, 3, 4, 5, 6, 7]) }
1638-
// }
1632+
pub fn _mm256_zextps128_ps256(a: __m128) -> __m256 {
1633+
{ transmute(simd_shuffle(a.as_f32x4(), _mm_setzero_ps().as_f32x4(), [0, 1, 2, 3, 4, 5, 6, 7])) }
1634+
}
16391635
/// Constructs a 256-bit integer vector from a 128-bit integer vector.
16401636
/// The lower 128 bits contain the value of the source vector. The upper
16411637
/// 128 bits are set to zero.
@@ -1655,9 +1651,9 @@ pub fn _mm256_zextsi128_si256(a: __m128i) -> __m256i {
16551651
///
16561652
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zextpd128_pd256)
16571653
// NOTE: Modeled as a shuffle of `a` with `_mm_setzero_pd()`, which supplies the zeroed upper 128 bits.
1658-
// pub fn _mm256_zextpd128_pd256(a: __m128d) -> __m256d {
1659-
// { simd_shuffle(a, _mm_setzero_pd(), [0, 1, 2, 3]) }
1660-
// }
1654+
pub fn _mm256_zextpd128_pd256(a: __m128d) -> __m256d {
1655+
{ transmute(simd_shuffle(a.as_f64x2(), _mm_setzero_pd().as_f64x2(), [0, 1, 2, 3])) }
1656+
}
16611657
/// Returns vector of type `__m256` with indeterminate elements.
16621658
/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
16631659
/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].

testable-simd-models/src/core_arch/x86/models/avx2.rs

Lines changed: 14 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@
2222
use crate::abstractions::simd::*;
2323
use crate::abstractions::utilities::*;
2424

25+
use super::sse::*;
26+
use super::sse2::*;
2527
use super::avx::*;
2628
use super::avx2_handwritten::*;
2729
use super::types::*;
@@ -386,18 +388,16 @@ pub fn _mm256_broadcastq_epi64(a: __m128i) -> __m256i {
386388
/// from `a` to all elements of the 128-bit returned value.
387389
///
388390
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastsd_pd)
389-
// NOTE: Not modeled yet
390-
// pub fn _mm_broadcastsd_pd(a: __m128d) -> __m128d {
391-
// { simd_shuffle(a, _mm_setzero_pd(), [0_u32; 2]) }
392-
// }
391+
pub fn _mm_broadcastsd_pd(a: __m128d) -> __m128d {
392+
{ transmute(simd_shuffle(a.as_f64x2(), _mm_setzero_pd().as_f64x2(), [0_u32; 2])) }
393+
}
393394
/// Broadcasts the low double-precision (64-bit) floating-point element
394395
/// from `a` to all elements of the 256-bit returned value.
395396
///
396397
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastsd_pd)
397-
// NOTE: Not modeled yet
398-
// pub fn _mm256_broadcastsd_pd(a: __m128d) -> __m256d {
399-
// { simd_shuffle(a, _mm_setzero_pd(), [0_u32; 4]) }
400-
// }
398+
pub fn _mm256_broadcastsd_pd(a: __m128d) -> __m256d {
399+
{ transmute(simd_shuffle(a.as_f64x2(), _mm_setzero_pd().as_f64x2(), [0_u32; 4])) }
400+
}
401401
/// Broadcasts 128 bits of integer data from a to all 128-bit lanes in
402402
/// the 256-bit returned value.
403403
///
@@ -422,18 +422,16 @@ pub fn _mm256_broadcastsi128_si256(a: __m128i) -> __m256i {
422422
/// from `a` to all elements of the 128-bit returned value.
423423
///
424424
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastss_ps)
425-
// NOTE: Not modeled yet
426-
// pub fn _mm_broadcastss_ps(a: __m128) -> __m128 {
427-
// { simd_shuffle(a, _mm_setzero_ps(), [0_u32; 4]) }
428-
// }
425+
pub fn _mm_broadcastss_ps(a: __m128) -> __m128 {
426+
{ transmute(simd_shuffle(a.as_f32x4(), _mm_setzero_ps().as_f32x4(), [0_u32; 4])) }
427+
}
429428
/// Broadcasts the low single-precision (32-bit) floating-point element
430429
/// from `a` to all elements of the 256-bit returned value.
431430
///
432431
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastss_ps)
433-
// NOTE: Not modeled yet
434-
// pub fn _mm256_broadcastss_ps(a: __m128) -> __m256 {
435-
// { simd_shuffle(a, _mm_setzero_ps(), [0_u32; 8]) }
436-
// }
432+
pub fn _mm256_broadcastss_ps(a: __m128) -> __m256 {
433+
{ transmute(simd_shuffle(a.as_f32x4(), _mm_setzero_ps().as_f32x4(), [0_u32; 8])) }
434+
}
437435
/// Broadcasts the low packed 16-bit integer from a to all elements of
438436
/// the 128-bit returned value
439437
///

testable-simd-models/src/core_arch/x86/models/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
//! In general, it is best to gain an idea of how an implementation should be written by looking
2121
//! at how other functions are implemented. Also see `core::arch::x86` for [reference](https://github.com/rust-lang/stdarch/tree/master/crates/core_arch).
2222
23+
pub mod sse;
2324
pub mod avx;
2425
pub mod avx2;
2526
pub mod avx2_handwritten;
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
//! Streaming SIMD Extensions (SSE)
2+
use crate::abstractions::simd::*;
3+
use crate::abstractions::utilities::*;
4+
use super::types::*;
5+
6+
/// Returns vector of type __m128 with indeterminate elements.
7+
/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
8+
/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
9+
/// In practice, this is typically equivalent to [`mem::zeroed`].
10+
///
11+
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_ps)
12+
pub fn _mm_undefined_ps() -> __m128 {
13+
transmute(f32x4::ZERO())
14+
}
15+
16+
/// Construct a `__m128` with all elements initialized to zero.
17+
///
18+
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_ps)
19+
pub fn _mm_setzero_ps() -> __m128 {
20+
transmute(f32x4::ZERO()) }

0 commit comments

Comments
 (0)