diff --git a/.gitignore b/.gitignore index 39ad701a8883f..82d2291fd22b0 100644 --- a/.gitignore +++ b/.gitignore @@ -56,3 +56,4 @@ goto-transcoder # already existing elements were commented out #/target +testable-simd-models/target diff --git a/library/Cargo.lock b/library/Cargo.lock index a9a611fe1ed56..3e34ee6173741 100644 --- a/library/Cargo.lock +++ b/library/Cargo.lock @@ -28,6 +28,7 @@ version = "0.0.0" dependencies = [ "compiler_builtins", "core", + "safety", ] [[package]] @@ -67,6 +68,9 @@ dependencies = [ [[package]] name = "core" version = "0.0.0" +dependencies = [ + "safety", +] [[package]] name = "coretests" @@ -200,6 +204,39 @@ dependencies = [ "unwind", ] +[[package]] +name = "proc-macro-error" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" +dependencies = [ + "proc-macro-error-attr", + "proc-macro2", + "quote", + "syn 1.0.109", + "version_check", +] + +[[package]] +name = "proc-macro-error-attr" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" +dependencies = [ + "proc-macro2", + "quote", + "version_check", +] + +[[package]] +name = "proc-macro2" +version = "1.0.95" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778" +dependencies = [ + "unicode-ident", +] + [[package]] name = "proc_macro" version = "0.0.0" @@ -216,6 +253,15 @@ dependencies = [ "cc", ] +[[package]] +name = "quote" +version = "1.0.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" +dependencies = [ + "proc-macro2", +] + [[package]] name = "r-efi" version = "5.3.0" @@ -300,6 +346,16 @@ dependencies = [ "std", ] +[[package]] +name = "safety" +version = "0.1.0" +dependencies = [ + "proc-macro-error", + "proc-macro2", + "quote", + "syn 2.0.104", +] + [[package]] name = "shlex" version = "1.3.0" @@ -329,6 +385,7 @@ dependencies = [ "rand", "rand_xorshift", "rustc-demangle", + "safety", "std_detect", "unwind", "wasi", @@ -345,6 +402,27 @@ dependencies = [ "libc", ] +[[package]] +name = "syn" +version = "1.0.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "unicode-ident", +] + +[[package]] +name = "syn" +version = "2.0.104" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "17b6f705963418cdb9927482fa304bc562ece2fdd4f616084c50b7023b435a40" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + [[package]] name = "sysroot" version = "0.0.0" @@ -365,6 +443,12 @@ dependencies = [ "std", ] +[[package]] +name = "unicode-ident" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" + [[package]] name = "unicode-width" version = "0.2.1" @@ -397,6 +481,12 @@ dependencies = [ "rustc-std-workspace-core", ] +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + [[package]] name = "wasi" version = "0.11.1+wasi-snapshot-preview1" diff --git a/testable-simd-models/Cargo.toml 
b/testable-simd-models/Cargo.toml new file mode 100644 index 0000000000000..6e2116fec82e0 --- /dev/null +++ b/testable-simd-models/Cargo.toml @@ -0,0 +1,16 @@ +[package] +name = "testable-simd-models" +version = "0.0.2" +authors = ["Cryspen"] +license = "Apache-2.0" +homepage = "https://github.com/cryspen/verify-rust-std/testable-simd-models" +edition = "2021" +repository = "https://github.com/cryspen/verify-rust-std/testable-simd-models" +readme = "README.md" + +[dependencies] +rand = "0.9" +pastey = "0.1.0" + +[lints.rust] +unexpected_cfgs = { level = "warn" } diff --git a/testable-simd-models/README.md b/testable-simd-models/README.md new file mode 100644 index 0000000000000..470c51072c8e5 --- /dev/null +++ b/testable-simd-models/README.md @@ -0,0 +1,226 @@ +# testable-simd-models + +This crate contains executable, independently testable specifications +for the SIMD intrinsics provided by the `core::arch` library in Rust. +The structure of this crate is based on [rust-lang/stdarch/crates/core_arch](https://github.com/rust-lang/stdarch/tree/master/crates/core_arch). + +## Code Structure +Within the `core_arch` folder in this crate, there is a different +folder for each architecture for which we have written models. +In particular, it contains folders for `x86` and `arm_shared`. +Each such folder has 2 sub-folders: `models` and `tests`. + +The `models` folder contains the models of the intrinsics, with +different files for different target features (e.g. `sse2`, `avx2` +etc.). The code in this folder is written using the various +abstractions implemented in `abstractions`, especially those in +`abstractions::simd`. These models are meant to closely +resemble their implementations within the Rust core itself. + +The `tests` folder contains the tests of these models, and is +structured the same way as `models`. Each file additionally includes +the definition of a macro that makes writing these tests easier. The +tests work by testing the models against the intrinsics in the Rust +core, trying out random inputs (generally 1000), and comparing their +outputs. + +All tests can be run by executing `cargo test`, and we expect this to be +run as part of CI. + +## Modeling a SIMD Intrinsic + +There are three kinds of SIMD intrinsics in `core::arch`. + +The first kind are builtin Rust compiler intrinsics, some of which are +in the [`intrinsics/simd.rs` file](https://github.com/model-checking/verify-rust-std/blob/main/library/core/src/intrinsics/simd.rs) +in the `core` crate, and others are in the [`simd.rs` file of `core_arch`](https://github.com/model-checking/verify-rust-std/blob/main/library/stdarch/crates/core_arch/src/simd.rs). +These builtin intrinsics define generic SIMD operations that the Rust compiler knows how to implement on each platform. + +The second kind are `extern` intrinsics that are links to definitions in LLVM. +See, for example, [this list](https://github.com/rust-lang/stdarch/blob/master/crates/core_arch/src/x86/avx2.rs#L3596C8-L3596C14) +of `extern` intrinsics used in the Intel x86 AVX2 library. +These extern intrinsics are typically platform-specific functions that map to low-level instructions. + +The third kind are `defined` intrinsics that are given proper definitions in Rust, and their code may +depend on the builtin intrinsics or the extern intrinsics. These defined intrinsics represent higher-level +operations that are wrappers around one or more assembly instructions. 
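+
+For orientation, the three kinds look roughly as follows in `stdarch` (a simplified sketch for
+illustration only; the attributes and the LLVM link name are indicative of the upstream code,
+not verbatim copies of it):
+
+```rust
+// 1. Builtin compiler intrinsic: generic, implemented by the compiler itself.
+#[rustc_intrinsic]
+pub unsafe fn simd_add<T>(x: T, y: T) -> T;
+
+// 2. Extern intrinsic: a declaration that links against an LLVM definition.
+unsafe extern "C" {
+    #[link_name = "llvm.x86.avx2.phadd.w"]
+    fn phaddw(a: i16x16, b: i16x16) -> i16x16;
+}
+
+// 3. Defined intrinsic: ordinary Rust code layered on top of the other two kinds.
+pub fn _mm256_hadd_epi16(a: __m256i, b: __m256i) -> __m256i {
+    unsafe { transmute(phaddw(a.as_i16x16(), b.as_i16x16())) }
+}
+```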
+ +### Modeling builtin intrinsics manually + +We model all three kinds of intrinsics, but in slightly different +ways. For the builtin intrinsics, we can write implementations once +and for all, and to this end, we use a library within the +`abstractions/simd.rs` file, where we copy the signatures of the +intrinsics from Rust but give them our own implementation. In +particular, we model each SIMD vector as an array of scalars, and +define each generic operation as functions over such arrays. This can +be seen as a reference implementation of the builtin intrinsics. + +Hence, for example, the SIMD add intrinsic `simd_add` is modeled as follows, +it takes two arrays of machine integers and adds them pointwise using a +`wrapping_add` operation: + +```rust +pub fn simd_add( + x: FunArray, + y: FunArray, +) -> FunArray { + FunArray::from_fn(|i| (x[i].wrapping_add(y[i]))) +} +``` + +Notably, we model a strongly typed version of `simd_add`, in contrast to the compiler +intrinsic, which is too generic and unimplementable in safe Rust: + +```rust +/// Adds two simd vectors elementwise. +/// +/// `T` must be a vector of integers or floats. +#[rustc_intrinsic] +#[rustc_nounwind] +pub unsafe fn simd_add(x: T, y: T) -> T; +``` + +The main rules for writing these models are that they should be simple and self-contained, +relying only on the libraries in `abstractions`, on builtin Rust language features, or +other testable models. In particular, they should not themselves directly call Rust libraries +or external crates, without going through the abstractions API. + + +### Modeling extern intrinsics manually + +For each file in `core::arch`, we split the code into extern +intrinsics that must be modeled by hand and defined intrinsics whose +models can be derived semi-automatically. The extern intrinsics are +placed in a module suffixed with `_handwritten`. Hence, for example, +the extern intrinsics used in `avx2.rs` can be found in `avx2_handwritten.rs`. + +Modeling extern intrinsics is similar to modeling the builtin ones, +in that the models are written by hand and treat the SIMD vectors +as arrays of machine integers. The main difference is that these intrinsics +are platform-specific and so their modeling requires looking at the Intel or ARM +documentation for the underlying operation. + +For example, the extern intrinsic `phaddw` used in `avx2` corresponds to an +Intel instruction called "Packed Horizontal Add" and is used in AVX2 intrinsics +like `_mm256_hadd_epi16` documented [here](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_epi16&ig_expand=3667_) +By inspecting the Intel documentation, we can write a Rust model for it +as follows + +```rust +pub fn phaddw(a: i16x16, b: i16x16) -> i16x16 { + i16x16::from_fn(|i| { + if i < 4 { + a[2 * i].wrapping_add(a[2 * i + 1]) + } else if i < 8 { + b[2 * (i - 4)].wrapping_add(b[2 * (i - 4) + 1]) + } else if i < 12 { + a[2 * (i - 4)].wrapping_add(a[2 * (i - 4) + 1]) + } else { + b[2 * (i - 8)].wrapping_add(b[2 * (i - 8) + 1]) + } + }) +} +``` + +### Modeling defined intrinsics semi-automatically + +To model a defined intrinsic, we essentially copy the Rust code of +the intrinsic from `core::arch` and adapt it to use our underlying abstractions. The +changes needed to the code are sometimes scriptable, and indeed most +of our models were generated from a script, but some changes are still +needed by hand. + +For example, let us say the intrinsic we are modeling is +`_mm256_bsrli_epi128` from the avx2 feature set. 
+ +1. We go to [rust-lang/stdarch/crates/core_arch/src/x86/](https://github.com/rust-lang/stdarch/tree/master/crates/core_arch/src/x86/), and find the implementation of the intrinsic in `avx2.rs`. + +2. We see that the implementation looks like this: +``` rust +/// Shifts 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_bsrli_epi128) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpsrldq, IMM8 = 1))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_bsrli_epi128(a: __m256i) -> __m256i { + static_assert_uimm_bits!(IMM8, 8); + const fn mask(shift: i32, i: u32) -> u32 { + let shift = shift as u32 & 0xff; + if shift > 15 || (15 - (i % 16)) < shift { + 0 + } else { + 32 + (i + shift) + } + } + unsafe { + let a = a.as_i8x32(); + let r: i8x32 = simd_shuffle!( + i8x32::ZERO, + a, + [ + mask(IMM8, 0), + mask(IMM8, 1), + mask(IMM8, 2), + mask(IMM8, 3), + ... + mask(IMM8, 31), + ], + ); + transmute(r) + } +} +``` + +Thus, we then go to `core_arch/x86/models/avx2.rs`, and add this implementation. +The only change it requires here is that the `simd_shuffle` macro is a function in our model, +and we discard all the function attributes. + +For other intrinsics, we sometimes need to make more changes. Since our model of the builtin intrinsics +is more precise concerning the type of their arguments compared to their Rust counterparts, we +sometimes need to add more type annotations in our defined models. We also remove all `unsafe` guards, +since our models are always in safe Rust. Otherwise, our code for the defined intrinsics looks very +similar to the upstream code in `core::arch`. + +3. Next, we add a test for this intrinsic in `core_arch/avx2/tests/avx2.rs`. For convenience purposes, we have defined a `mk!` macro, which can be used to automatically generate + tests. The test generated by the macro generates a number of random inputs (by default, 1000), and compares the output generated by the model + and that generated by the intrinsic in upstream `core::arch`. A valid test of the intrinsic above looks like this. + ```rust + mk!([100]_mm256_bsrli_epi128{<0>,<1>,<2>,<3>,...,<255>}(a: BitVec)); + ``` + The macro invocation has four parts. + 1. `mk!([100]...`: By default, the macro tests for a thousand randomly generated inputs. If needed, this can be modified, such as here, where the `[100]` is used so that + only 100 inputs are generated. + 2. `_mm256_bsrli_epi128`: This is the name of the intrinsic being tested, and is necessary in all cases. + 3. `{<0>,<1>,<2>,<3>,...,<255>}`: This part only appears when the intrinsic has a const generic argument, like the `IMM8` in this intrinsic. + As the name indicates, this constant argument is supposed to be at most 8 bits wide. + We can confirm this by looking at the implementation and spotting the `static_assert_uimm_bits!(IMM8, 8);` + line, which asserts that constant argument is positive and fits in 8 bits. Thus, we add `{<0>,<1>,<2>,<3>,...,<255>}` to test for each possible constant + value of the constant argument. + 4. `(a: BitVec)`: This part contains all the arguments of the intrinsic and their types. + + This summarizes the steps needed to use the `mk!` macro to generate a test. 
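+
+    For an intrinsic that takes no const-generic argument, the `{...}` part is simply dropped.
+    A hypothetical invocation, shown only to illustrate the shape (the intrinsic name is not
+    asserted to be modeled in this crate), would be:
+    ```rust
+    mk!(_mm256_add_epi16(a: BitVec, b: BitVec));
+    ```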
There is a caveat: in the case that the output of an intrinsic is _not_ + a bit-vector (and is instead, say, an integer like `i32`), then the macro will not work, and a manual test has to be written. You can see examples in the test files. + + + +## Contributing Models + +To contribute new models of intrinsics, we expect the author to follow +the above steps and provide comprehensive tests. It is important that +the model author looks carefully at both the Intel/ARM specifications +and the Rust `stdarch` implementation, because they may look quite different +from each other. + +In some cases, the Rust implementation may not be correct. +Indeed, the previous implementation of `_mm256_bsrli_epi128` (and a +similar intrinsic called `_mm512_bsrli_epi128`) in `stdarch` had a +bug, which we found during the process of modeling and testing this +intrinsic. This bug was [reported by +us](https://github.com/rust-lang/stdarch/issues/1822) using a failing +test case generated from the testable model and then fixed by [our +PR](https://github.com/rust-lang/stdarch/pull/1823) in the 2025-06-30 +version of `stdarch`. diff --git a/testable-simd-models/src/abstractions/bit.rs b/testable-simd-models/src/abstractions/bit.rs new file mode 100644 index 0000000000000..f8b67f2ca20f1 --- /dev/null +++ b/testable-simd-models/src/abstractions/bit.rs @@ -0,0 +1,248 @@ +//! # Bit Manipulation and Machine Integer Utilities +//! +//! This module provides utilities for working with individual bits and machine integer types. +//! It defines a [`Bit`] enum to represent a single bit (`0` or `1`) along with convenient +//! conversion implementations between `Bit`, [`bool`], and various primitive integer types. +//! +//! In addition, the module introduces the [`MachineInteger`] trait which abstracts over +//! integer types, providing associated constants: +//! +//! - `BITS`: The size of the integer type in bits. +//! - `SIGNED`: A flag indicating whether the type is signed. +//! +//! The [`Bit`] type includes methods for extracting the value of a specific bit from an integer. +//! For example, [`Bit::of_int`] returns the bit at a given position for a provided integer, +//! handling both positive and negative values (assuming a two's complement representation). +//! +//! # Examples +//! +//! ```rust +//! use testable_simd_models::abstractions::bit::{Bit, MachineInteger}; +//! +//! // Extract the 3rd bit (0-indexed) from an integer. +//! let bit = Bit::nth_bit(42, 2); +//! println!("The extracted bit is: {:?}", bit); +//! +//! // Convert Bit to a primitive integer type. +//! let num: u8 = bit.into(); +//! println!("As an integer: {}", num); +//! ``` +//! +//! [`bool`]: https://doc.rust-lang.org/std/primitive.bool.html +//! [`Bit::of_int`]: enum.Bit.html#method.of_int + +/// Represent a bit: `0` or `1`. 
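+///
+/// Bits support the usual boolean-style operators via `std::ops` (illustrative doctest):
+///
+/// ```rust
+/// use testable_simd_models::abstractions::bit::Bit;
+///
+/// assert_eq!(Bit::One & Bit::Zero, Bit::Zero);
+/// assert_eq!(Bit::One | Bit::Zero, Bit::One);
+/// assert_eq!(Bit::One ^ Bit::One, Bit::Zero);
+/// assert_eq!(!Bit::Zero, Bit::One);
+/// ```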
+#[derive(Copy, Clone, Eq, PartialEq, Debug)] +pub enum Bit { + Zero, + One, +} +impl std::ops::BitAnd for Bit { + type Output = Self; + fn bitand(self, rhs: Self) -> Self { + match self { + Bit::Zero => Bit::Zero, + Bit::One => rhs, + } + } +} + +impl std::ops::BitOr for Bit { + type Output = Self; + fn bitor(self, rhs: Self) -> Self { + match self { + Bit::Zero => rhs, + Bit::One => Bit::One, + } + } +} + +impl std::ops::BitXor for Bit { + type Output = Self; + fn bitxor(self, rhs: Self) -> Self { + match (self, rhs) { + (Bit::Zero, Bit::Zero) => Bit::Zero, + (Bit::One, Bit::One) => Bit::Zero, + _ => Bit::One, + } + } +} + +impl std::ops::Not for Bit { + type Output = Self; + fn not(self) -> Self { + match self { + Bit::One => Bit::Zero, + Bit::Zero => Bit::One, + } + } +} + +impl std::ops::Neg for Bit { + type Output = Self; + fn neg(self) -> Self { + match self { + Bit::One => Bit::Zero, + Bit::Zero => Bit::One, + } + } +} +macro_rules! generate_from_bit_impls { + ($($ty:ident),*) => { + $(impl From for $ty { + fn from(bit: Bit) -> Self { + bool::from(bit) as $ty + } + })* + }; +} +generate_from_bit_impls!(u8, u16, u32, u64, u128, i8, i16, i32, i64, i128); + +impl From for bool { + fn from(bit: Bit) -> Self { + match bit { + Bit::Zero => false, + Bit::One => true, + } + } +} + +impl From for Bit { + fn from(b: bool) -> Bit { + match b { + false => Bit::Zero, + true => Bit::One, + } + } +} + +/// A trait for integers and floats + +pub trait MachineNumeric { + /// The size of this integer type in bits. + const BITS: u32; + /// The signedness of this integer type. + const SIGNED: bool; + /// Element of the integer type with every bit as 0. + const ZEROS: Self; + /// Element of the integer type with every bit as 1. + const ONES: Self; + /// Minimum value of the integer type. + const MIN: Self; + /// Maximum value of the integer type. + const MAX: Self; + /// Raw transmutation of bits to u128 + fn to_u128(self) -> u128; + /// Raw transmutation of bits from u128 + fn from_u128(x: u128) -> Self; +} + +/// A trait for types that represent machine integers. +pub trait MachineInteger: MachineNumeric { + /// Implements functionality for `simd_add` in `crate::abstractions::simd`. + fn wrapping_add(self, rhs: Self) -> Self; + /// Implements functionality for `simd_sub` in `crate::abstractions::simd`. + fn wrapping_sub(self, rhs: Self) -> Self; + /// Implements functionality for `simd_mul` in `crate::abstractions::simd`. + fn overflowing_mul(self, rhs: Self) -> Self; + /// Implements functionality for `simd_saturating_add` in `crate::abstractions::simd`. + fn saturating_add(self, rhs: Self) -> Self; + /// Implements functionality for `simd_saturating_sub` in `crate::abstractions::simd`. + fn saturating_sub(self, rhs: Self) -> Self; + /// Implements functionality for `simd_abs_diff` in `crate::abstractions::simd`. + fn wrapping_abs_diff(self, rhs: Self) -> Self; + /// Implements functionality for `simd_abs` in `crate::abstractions::simd`. + fn wrapping_abs(self) -> Self; +} + +macro_rules! 
generate_imachine_integer_impls { + ($($ty:ident),*) => { + $( + impl MachineNumeric for $ty { + const BITS: u32 = $ty::BITS; + const SIGNED: bool = true; + const ZEROS: $ty = 0; + const ONES: $ty = -1; + const MIN: $ty = $ty::MIN; + const MAX: $ty = $ty::MAX; + fn to_u128(self) -> u128 {self as u128} + fn from_u128(x:u128) -> Self {x as $ty} + } + impl MachineInteger for $ty { + fn wrapping_add(self, rhs: Self) -> Self { self.wrapping_add(rhs) } + fn wrapping_sub(self, rhs: Self) -> Self { self.wrapping_sub(rhs) } + fn overflowing_mul(self, rhs: Self) -> Self { self.overflowing_mul(rhs).0 } + fn saturating_add(self, rhs: Self) -> Self { self.saturating_add(rhs)} + fn saturating_sub(self, rhs: Self) -> Self { self.saturating_sub(rhs) } + fn wrapping_abs_diff(self, rhs: Self) -> Self {if self > rhs {$ty::wrapping_sub(self, rhs)} else {$ty::wrapping_sub(rhs, self)}} + fn wrapping_abs(self) -> Self {if self == $ty::MIN {self} else {self.abs()}} + })* + }; +} + +macro_rules! generate_umachine_integer_impls { + ($($ty:ident),*) => { + $( + impl MachineNumeric for $ty { + const BITS: u32 = $ty::BITS; + const SIGNED: bool = false; + const ZEROS: $ty = 0; + const ONES: $ty = $ty::MAX; + const MIN: $ty = $ty::MIN; + const MAX: $ty = $ty::MAX; + fn to_u128(self) -> u128 {self as u128} + fn from_u128(x:u128) -> Self {x as $ty} + } + impl MachineInteger for $ty { + fn wrapping_add(self, rhs: Self) -> Self { self.wrapping_add(rhs) } + fn wrapping_sub(self, rhs: Self) -> Self { self.wrapping_sub(rhs) } + fn overflowing_mul(self, rhs: Self) -> Self { self.overflowing_mul(rhs).0 } + fn saturating_add(self, rhs: Self) -> Self { self.saturating_add(rhs)} + fn saturating_sub(self, rhs: Self) -> Self { self.saturating_sub(rhs)} + fn wrapping_abs_diff(self, rhs: Self) -> Self {if self > rhs {self - rhs} else {rhs - self}} + fn wrapping_abs(self) -> Self {self} + })* + }; +} +generate_imachine_integer_impls!(i8, i16, i32, i64, i128); +generate_umachine_integer_impls!(u8, u16, u32, u64, u128); + +impl MachineNumeric for f32 { + const BITS: u32 = 32; + const SIGNED: bool = false; + const ZEROS: f32 = 0.0; + const ONES: f32 = f32::from_bits(0xffffffffu32); + const MIN: f32 = f32::MIN; + const MAX: f32 = f32::MAX; + fn to_u128(self) -> u128 { + self.to_bits() as u128 + } + fn from_u128(x: u128) -> Self { + f32::from_bits(x as u32) + } +} + +impl MachineNumeric for f64 { + const BITS: u32 = 64; + const SIGNED: bool = false; + const ZEROS: f64 = 0.0; + const ONES: f64 = f64::from_bits(0xffffffffffffffffu64); + const MIN: f64 = f64::MIN; + const MAX: f64 = f64::MAX; + fn to_u128(self) -> u128 { + self.to_bits() as u128 + } + fn from_u128(x: u128) -> Self { + f64::from_bits(x as u64) + } +} + +impl Bit { + pub fn nth_bit(x: T, nth: usize) -> Self { + if (x.to_u128() >> nth) % 2 == 1 { + Self::One + } else { + Self::Zero + } + } +} diff --git a/testable-simd-models/src/abstractions/bitvec.rs b/testable-simd-models/src/abstractions/bitvec.rs new file mode 100644 index 0000000000000..ac73749482e37 --- /dev/null +++ b/testable-simd-models/src/abstractions/bitvec.rs @@ -0,0 +1,158 @@ +//! This module provides a specification-friendly bit vector type. +use super::bit::{Bit, MachineNumeric}; +use super::funarr::*; + +use std::fmt::Formatter; + +/// A fixed-size bit vector type. +/// +/// `BitVec` is a specification-friendly, fixed-length bit vector that internally +/// stores an array of [`Bit`] values, where each `Bit` represents a single binary digit (0 or 1). 
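+///
+/// A small round-trip sketch (illustrative; mirrors the crate-level example in `abstractions`):
+///
+/// ```rust
+/// use testable_simd_models::abstractions::bitvec::BitVec;
+///
+/// // Build an 8-bit vector from a machine integer and read it back.
+/// let bv = BitVec::<8>::from_int(0b1010_0110u8);
+/// let n: u8 = bv.to_int();
+/// assert_eq!(n, 0b1010_0110);
+/// ```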
+/// +/// This type provides several utility methods for constructing and converting bit vectors: +/// +/// The [`Debug`] implementation for `BitVec` pretty-prints the bits in groups of eight, +/// making the bit pattern more human-readable. The type also implements indexing, +/// allowing for easy access to individual bits. +#[derive(Copy, Clone, Eq, PartialEq)] +pub struct BitVec(FunArray); + +impl BitVec { + #[allow(non_snake_case)] + pub fn ZERO() -> Self { + Self::from_fn(|_| Bit::Zero) + } +} + +/// Pretty prints a bit slice by group of 8 +fn bit_slice_to_string(bits: &[Bit]) -> String { + bits.iter() + .map(|bit| match bit { + Bit::Zero => '0', + Bit::One => '1', + }) + .collect::>() + .chunks(8) + .map(|bits| bits.iter().collect::()) + .map(|s| format!("{s} ")) + .collect::() + .trim() + .into() +} + +impl core::fmt::Debug for BitVec { + fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), std::fmt::Error> { + write!(f, "{}", bit_slice_to_string(&self.0.as_vec())) + } +} + +impl core::ops::Index for BitVec { + type Output = Bit; + fn index(&self, index: u32) -> &Self::Output { + self.0.get(index) + } +} + +/// Convert a bit slice into an unsigned number. + +fn u128_int_from_bit_slice(bits: &[Bit]) -> u128 { + bits.iter() + .enumerate() + .map(|(i, bit)| u128::from(*bit) << i) + .sum::() +} + +/// Convert a bit slice into a machine integer of type `T`. +fn int_from_bit_slice(bits: &[Bit]) -> T { + debug_assert!(bits.len() <= T::BITS as usize); + let result = if T::SIGNED { + let is_negative = matches!(bits[T::BITS as usize - 1], Bit::One); + let s = u128_int_from_bit_slice(&bits[0..T::BITS as usize - 1]) as i128; + if is_negative { + s + (-2i128).pow(T::BITS - 1) + } else { + s + } + } else { + u128_int_from_bit_slice(bits) as i128 + }; + T::from_u128(result as u128) +} +impl BitVec { + /// Constructor for BitVec. `BitVec::::from_fn` constructs a bitvector out of a function that takes usizes smaller than `N` and produces bits. + pub fn from_fn Bit>(f: F) -> Self { + Self(FunArray::from_fn(f)) + } + /// Convert a slice of machine integers where only the `d` least significant bits are relevant. + pub fn from_slice(x: &[T], d: u32) -> Self { + Self::from_fn(|i| Bit::nth_bit::(x[(i / d) as usize], (i % d) as usize)) + } + + /// Construct a BitVec out of a machine integer. + pub fn from_int(n: T) -> Self { + Self::from_slice::(&[n], T::BITS as u32) + } + + /// Convert a BitVec into a machine integer of type `T`. + pub fn to_int(self) -> T { + int_from_bit_slice(&self.0.as_vec()) + } + + /// Convert a BitVec into a vector of machine integers of type `T`. + pub fn to_vec(&self) -> Vec { + self.0 + .as_vec() + .chunks(T::BITS as usize) + .map(int_from_bit_slice) + .collect() + } + + /// Generate a random BitVec. 
+ pub fn rand() -> Self { + use rand::prelude::*; + let random_source: Vec<_> = { + let mut rng = rand::rng(); + (0..N).map(|_| rng.random::()).collect() + }; + Self::from_fn(|i| random_source[i as usize].into()) + } +} + +impl BitVec { + pub fn chunked_shift( + self, + shl: FunArray, + ) -> BitVec { + fn chunked_shift( + bitvec: BitVec, + shl: FunArray, + ) -> BitVec { + BitVec::from_fn(|i| { + let nth_bit = i % CHUNK; + let nth_chunk = i / CHUNK; + let shift: i128 = if nth_chunk < SHIFTS { + shl[nth_chunk] + } else { + 0 + }; + let local_index = (nth_bit as i128).wrapping_sub(shift); + if local_index < CHUNK as i128 && local_index >= 0 { + let local_index = local_index as u32; + bitvec[nth_chunk * CHUNK + local_index] + } else { + Bit::Zero + } + }) + } + chunked_shift::(self, shl) + } + + /// Folds over the array, accumulating a result. + /// + /// # Arguments + /// * `init` - The initial value of the accumulator. + /// * `f` - A function combining the accumulator and each element. + pub fn fold(&self, init: A, f: fn(A, Bit) -> A) -> A { + self.0.fold(init, f) + } +} diff --git a/testable-simd-models/src/abstractions/funarr.rs b/testable-simd-models/src/abstractions/funarr.rs new file mode 100644 index 0000000000000..4026efb66c1f5 --- /dev/null +++ b/testable-simd-models/src/abstractions/funarr.rs @@ -0,0 +1,185 @@ +//! This module implements a fixed-size array wrapper with functional semantics +//! which are used in formulating abstractions. + +use crate::abstractions::bit::MachineNumeric; + +/// `FunArray` represents an array of `T` values of length `N`, where `N` is a compile-time constant. +/// Internally, it uses a fixed-length array of `Option` with a maximum capacity of 512 elements. +/// Unused elements beyond `N` are filled with `None`. +#[derive(Copy, Clone, Eq, PartialEq)] +pub struct FunArray([Option; 512]); + +impl FunArray { + /// Gets a reference to the element at index `i`. + pub fn get(&self, i: u32) -> &T { + self.0[i as usize].as_ref().unwrap() + } + /// Constructor for FunArray. `FunArray::from_fn` constructs a funarray out of a function that takes usizes smaller than `N` and produces an element of type T. + pub fn from_fn T>(f: F) -> Self { + // let vec = (0..N).map(f).collect(); + let arr = core::array::from_fn(|i| { + if (i as u32) < N { + Some(f(i as u32)) + } else { + None + } + }); + Self(arr) + } + + /// Converts the `FunArray` into a `Vec`. + pub fn as_vec(&self) -> Vec + where + T: Clone, + { + self.0[0..(N as usize)] + .iter() + .cloned() + .map(|x| x.unwrap()) + .collect() + } + + /// Folds over the array, accumulating a result. + /// + /// # Arguments + /// * `init` - The initial value of the accumulator. + /// * `f` - A function combining the accumulator and each element. 
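+    ///
+    /// An illustrative doctest (assumes the `FunArray` API defined in this file):
+    ///
+    /// ```rust
+    /// use testable_simd_models::abstractions::funarr::FunArray;
+    ///
+    /// // Sum the elements 0, 1, 2, 3.
+    /// let arr = FunArray::<4, u32>::from_fn(|i| i);
+    /// assert_eq!(arr.fold(0u32, |acc, x| acc + x), 6);
+    /// ```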
+ pub fn fold(&self, mut init: A, f: fn(A, T) -> A) -> A + where + T: Clone, + { + for i in 0..N { + init = f(init, self[i].clone()); + } + init + } +} + +impl FunArray { + #[allow(non_snake_case)] + pub fn ZERO() -> Self { + Self::from_fn(|_| T::ZEROS) + } +} + +impl TryFrom> for FunArray { + type Error = (); + fn try_from(v: Vec) -> Result { + if (v.len() as u32) < N { + Err(()) + } else { + Ok(Self::from_fn(|i| v[i as usize].clone())) + } + } +} + +impl core::fmt::Debug for FunArray { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + write!(f, "{:?}", self.as_vec()) + } +} + +impl core::ops::Index for FunArray { + type Output = T; + + fn index(&self, index: u32) -> &Self::Output { + self.get(index) + } +} + +impl FunArray<1, T> { + pub fn new(x: T) -> Self { + let v = [x]; + Self::from_fn(|i| v[i as usize]) + } +} + +impl FunArray<2, T> { + pub fn new(x0: T, x1: T) -> Self { + let v = [x0, x1]; + Self::from_fn(|i| v[i as usize]) + } +} + +impl FunArray<4, T> { + pub fn new(x0: T, x1: T, x2: T, x3: T) -> Self { + let v = [x0, x1, x2, x3]; + Self::from_fn(|i| v[i as usize]) + } +} + +impl FunArray<8, T> { + pub fn new(x0: T, x1: T, x2: T, x3: T, x4: T, x5: T, x6: T, x7: T) -> Self { + let v = [x0, x1, x2, x3, x4, x5, x6, x7]; + Self::from_fn(|i| v[i as usize]) + } +} + +impl FunArray<16, T> { + pub fn new( + x0: T, + x1: T, + x2: T, + x3: T, + x4: T, + x5: T, + x6: T, + x7: T, + x8: T, + x9: T, + x10: T, + x11: T, + x12: T, + x13: T, + x14: T, + x15: T, + ) -> Self { + let v = [ + x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, + ]; + Self::from_fn(|i| v[i as usize]) + } +} + +impl FunArray<32, T> { + pub fn new( + x0: T, + x1: T, + x2: T, + x3: T, + x4: T, + x5: T, + x6: T, + x7: T, + x8: T, + x9: T, + x10: T, + x11: T, + x12: T, + x13: T, + x14: T, + x15: T, + x16: T, + x17: T, + x18: T, + x19: T, + x20: T, + x21: T, + x22: T, + x23: T, + x24: T, + x25: T, + x26: T, + x27: T, + x28: T, + x29: T, + x30: T, + x31: T, + ) -> Self { + let v = [ + x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x18, + x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, x30, x31, + ]; + Self::from_fn(|i| v[i as usize]) + } +} diff --git a/testable-simd-models/src/abstractions/mod.rs b/testable-simd-models/src/abstractions/mod.rs new file mode 100644 index 0000000000000..4f840ab60235d --- /dev/null +++ b/testable-simd-models/src/abstractions/mod.rs @@ -0,0 +1,27 @@ +//! This module provides abstractions that are useful for writing +//! specifications for the intrinsics. Currently it provides two abstractions: bits and +//! bit vectors. +//! +//! # Examples +//! +//! Converting an integer to a bit vector and back: +//! +//! ```rust +//! use testable_simd_models::abstractions::{bit::{Bit, MachineInteger}, bitvec::BitVec}; +//! +//! // Create a BitVec from a machine integer (using the integer's bit-width) +//! let bv = BitVec::<16>::from_int(42u16); +//! println!("BitVec: {:?}", bv); +//! +//! // Convert the BitVec back into a machine integer +//! let n: u16 = bv.to_int(); +//! println!("Integer: {}", n); +//! +//! assert!(n == 42); +//! ``` + +pub mod bit; +pub mod bitvec; +pub mod funarr; +pub mod simd; +pub mod utilities; diff --git a/testable-simd-models/src/abstractions/simd.rs b/testable-simd-models/src/abstractions/simd.rs new file mode 100644 index 0000000000000..70e0556618288 --- /dev/null +++ b/testable-simd-models/src/abstractions/simd.rs @@ -0,0 +1,947 @@ +//! Models of SIMD compiler intrinsics. +//! +//! 
Operations are defined on FunArrs. + +use crate::abstractions::{bit::*, bitvec::*, funarr::*}; +use std::convert::*; +use std::ops::*; + +#[allow(dead_code)] +/// Derives interpretations functions, and type synonyms. +macro_rules! interpretations { +($n:literal; $($name:ident [$ty:ty; $m:literal]),*) => { + $( + #[doc = concat!(stringify!($ty), " vectors of size ", stringify!($m))] + #[allow(non_camel_case_types)] + pub type $name = FunArray<$m, $ty>; + pastey::paste! { + const _: () = { + impl BitVec<$n> { + #[doc = concat!("Conversion from ", stringify!($ty), " vectors of size ", stringify!($m), "to bit vectors of size ", stringify!($n))] + pub fn [< from_ $name >](iv: $name) -> BitVec<$n> { + let vec: Vec<$ty> = iv.as_vec(); + Self::from_slice(&vec[..], <$ty>::BITS as u32) + } + #[doc = concat!("Conversion from bit vectors of size ", stringify!($n), " to ", stringify!($ty), " vectors of size ", stringify!($m))] + pub fn [< to_ $name >](bv: BitVec<$n>) -> $name { + let vec: Vec<$ty> = bv.to_vec(); + $name::from_fn(|i| vec[i as usize]) + } + #[doc = concat!("Conversion from bit vectors of size ", stringify!($n), " to ", stringify!($ty), " vectors of size ", stringify!($m))] + pub fn [< as_ $name >](self) -> $name { + let vec: Vec<$ty> = self.to_vec(); + $name::from_fn(|i| vec[i as usize]) + } + + + } + + + impl From> for $name { + fn from(bv: BitVec<$n>) -> Self { + BitVec::[< to_ $name >](bv) + } + } + + impl From<$name> for BitVec<$n> { + fn from(iv: $name) -> Self { + BitVec::[< from_ $name >](iv) + } + } + + impl $name { + + pub fn splat(value: $ty) -> Self { + FunArray::from_fn(|_| value) + } + } + }; + } + )* +}; +} + +interpretations!(256; i32x8 [i32; 8], i64x4 [i64; 4], i16x16 [i16; 16], i128x2 [i128; 2], i8x32 [i8; 32], + u32x8 [u32; 8], u64x4 [u64; 4], u16x16 [u16; 16], u8x32 [u8; 32], f32x8 [f32; 8], f64x4 [f64; 4]); +interpretations!(128; i32x4 [i32; 4], i64x2 [i64; 2], i16x8 [i16; 8], i128x1 [i128; 1], i8x16 [i8; 16], + u32x4 [u32; 4], u64x2 [u64; 2], u16x8 [u16; 8], u8x16 [u8; 16], f32x4 [f32; 4], f64x2 [f64; 2]); + +interpretations!(512; u32x16 [u32; 16], u16x32 [u16; 32], i32x16 [i32; 16], i16x32 [i16; 32]); +interpretations!(64; i64x1 [i64; 1], i32x2 [i32; 2], i16x4 [i16; 4], i8x8 [i8; 8], u64x1 [u64; 1], u32x2 [u32; 2],u16x4 [u16; 4], u8x8 [u8; 8], f32x2 [f32; 2], f64x1 [f64; 1]); +interpretations!(32; i8x4 [i8; 4], u8x4 [u8; 4]); + +/// Inserts an element into a vector, returning the updated vector. +/// +/// # Safety +/// +/// `idx` must be in-bounds of the vector, ie. idx < N +pub fn simd_insert(x: FunArray, idx: u32, val: T) -> FunArray { + FunArray::from_fn(|i| if i == idx { val } else { x[i] }) +} + +/// Extracts an element from a vector. +/// +/// # Safety +/// +/// `idx` must be in-bounds of the vector, ie. idx < N +pub fn simd_extract(x: FunArray, idx: u32) -> T { + x.get(idx).clone() +} + +/// Adds two vectors elementwise with wrapping on overflow/underflow. +pub fn simd_add( + x: FunArray, + y: FunArray, +) -> FunArray { + FunArray::from_fn(|i| x[i].wrapping_add(y[i])) +} + +/// Subtracts `rhs` from `lhs` elementwise with wrapping on overflow/underflow. +pub fn simd_sub( + x: FunArray, + y: FunArray, +) -> FunArray { + FunArray::from_fn(|i| x[i].wrapping_sub(y[i])) +} + +/// Multiplies two vectors elementwise with wrapping on overflow/underflow. +pub fn simd_mul( + x: FunArray, + y: FunArray, +) -> FunArray { + FunArray::from_fn(|i| x[i].overflowing_mul(y[i])) +} + +/// Produces the elementwise absolute values. 
+/// For vectors of unsigned integers it returns the vector untouched. +/// If the element is the minimum value of a signed integer, it returns the element as is. +pub fn simd_abs(x: FunArray) -> FunArray { + FunArray::from_fn(|i| x[i].wrapping_abs()) +} + +/// Produces the elementwise absolute difference of two vectors. +/// Note: Absolute difference in this case is simply the element with the smaller value subtracted from the element with the larger value, with overflow/underflow. +/// For example, if the elements are i8, the absolute difference of 255 and -2 is -255. +pub fn simd_abs_diff( + x: FunArray, + y: FunArray, +) -> FunArray { + FunArray::from_fn(|i| x[i].wrapping_abs_diff(y[i])) +} + +/// Shifts vector left elementwise, with UB on overflow. +/// +/// # Safety +/// +/// Each element of `rhs` must be less than `::BITS`. +pub fn simd_shl( + x: FunArray, + y: FunArray, +) -> FunArray::Output> { + FunArray::from_fn(|i| x[i] << y[i]) +} + +/// Shifts vector right elementwise, with UB on overflow. +/// +/// Shifts `lhs` right by `rhs`, shifting in sign bits for signed types. +/// +/// # Safety +/// +/// Each element of `rhs` must be less than `::BITS`. + +pub fn simd_shr( + x: FunArray, + y: FunArray, +) -> FunArray::Output> { + FunArray::from_fn(|i| x[i] >> y[i]) +} + +/// "Ands" vectors elementwise. + +pub fn simd_and( + x: FunArray, + y: FunArray, +) -> FunArray::Output> { + FunArray::from_fn(|i| x[i] & y[i]) +} + +/// "Ors" vectors elementwise. + +pub fn simd_or( + x: FunArray, + y: FunArray, +) -> FunArray::Output> { + FunArray::from_fn(|i| x[i] | y[i]) +} + +/// "Exclusive ors" vectors elementwise. + +pub fn simd_xor( + x: FunArray, + y: FunArray, +) -> FunArray::Output> { + FunArray::from_fn(|i| x[i] ^ y[i]) +} + +pub trait CastsFrom { + fn cast(a: T) -> Self; +} +pub trait TruncateFrom { + /// Truncates into [`Self`] from a larger integer + fn truncate_from(v: T) -> Self; +} + +macro_rules! from_impls{ + ($([$ty1:ty, $ty2: ty]),*) => { + $( + impl CastsFrom<$ty2> for $ty1 { + fn cast(a: $ty2) -> $ty1 { + a as $ty1 + } + } + )* + }; +} +macro_rules! truncate_from_order { + ($t:ty, $($from:ty),+) => { + $( + impl TruncateFrom<$from> for $t { + #[inline] + fn truncate_from(v: $from) -> $t { v as $t } + } + )* + truncate_from_order!($($from),+); + }; + + ($t:ty) => {}; +} +truncate_from_order!(u8, u16, u32, u64, u128); +truncate_from_order!(i8, i16, i32, i64, i128); + +macro_rules! truncate_from_impls{ + ($([$ty1:ty, $ty2: ty]),*) => { + $( + impl CastsFrom<$ty2> for $ty1 { + fn cast(a: $ty2) -> $ty1 { + <$ty1>::truncate_from(a) + } + } + )* + }; +} + +macro_rules! symm_impls{ + ($([$ty1:ty, $ty2: ty]),*) => { + $( + impl CastsFrom<$ty2> for $ty1 { + fn cast(a: $ty2) -> $ty1 { + a as $ty1 + } + } + impl CastsFrom<$ty1> for $ty2 { + fn cast(a: $ty1) -> $ty2 { + a as $ty2 + } + } + )* + }; +} +macro_rules! 
self_impls{ + ($($ty1:ty),*) => { + $( + impl CastsFrom<$ty1> for $ty1 { + fn cast(a: $ty1) -> $ty1 { + a + } + } + + )* + }; +} +from_impls!( + [u16, u8], + [u32, u8], + [u32, u16], + [u64, u8], + [u64, u16], + [u64, u32], + [u128, u8], + [u128, u16], + [u128, u32], + [u128, u64], + [i16, i8], + [i32, i8], + [i32, i16], + [i64, i8], + [i64, i16], + [i64, i32], + [i128, i8], + [i128, i16], + [i128, i32], + [i128, i64], + [f64, u32], + [f64, i32], + [f32, u32], + [f32, i32], + [f32, f64], + [f64, f32] +); +truncate_from_impls!( + [u8, u16], + [u8, u32], + [u16, u32], + [u8, u64], + [u16, u64], + [u32, u64], + [u8, u128], + [u16, u128], + [u32, u128], + [u64, u128], + [i8, i16], + [i8, i32], + [i16, i32], + [i8, i64], + [i16, i64], + [i32, i64], + [i8, i128], + [i16, i128], + [i32, i128], + [i64, i128] +); + +symm_impls!([u8, i8], [u16, i16], [u32, i32], [u64, i64], [u128, i128]); + +self_impls!(u8, u16, u32, u64, u128, i8, i16, i32, i64, i128); + +// Would like to do the below instead of using the above macros, but currently this is an active issue in Rust (#31844) +// impl CastsFrom for U +// where +// U : From { +// fn cast(a: T) -> U { +// U::from(a) +// } +// } + +// impl CastsFrom for U +// where +// U : TruncateFrom { +// fn cast(a: T) -> U { +// U::truncate_from(a) +// } +// } + +/// Numerically casts a vector, elementwise. +/// +/// Casting can only happen between two integers of the same signedness. +/// +/// When casting from a wider number to a smaller number, the higher bits are removed. +/// Otherwise, it extends the number, following signedness. +pub fn simd_cast>(x: FunArray) -> FunArray { + FunArray::from_fn(|i| T2::cast(x[i])) +} + +/// Negates a vector elementwise. +/// +/// Rust panics for `-::Min` due to overflow, but here, it just returns the element as is. + +pub fn simd_neg::Output> + MachineInteger + Eq + Neg + Copy>( + x: FunArray, +) -> FunArray { + FunArray::from_fn(|i| { + if x[i] == T::MIN { + T::MIN + } else { + T::from(-x[i]) + } + }) +} +/// Tests elementwise equality of two vectors. +/// +/// Returns `0` (all zeros) for false and `!0` (all ones) for true. + +pub fn simd_eq( + x: FunArray, + y: FunArray, +) -> FunArray { + FunArray::from_fn(|i| if x[i] == y[i] { T::ONES } else { T::ZEROS }) +} + +/// Tests elementwise inequality equality of two vectors. +/// +/// Returns `0` (all zeros) for false and `!0` (all ones) for true. + +pub fn simd_ne( + x: FunArray, + y: FunArray, +) -> FunArray { + FunArray::from_fn(|i| if x[i] != y[i] { T::ONES } else { T::ZEROS }) +} + +/// Tests if `x` is less than `y`, elementwise. +/// +/// Returns `0` (all zeros) for false and `!0` (all ones) for true. + +pub fn simd_lt( + x: FunArray, + y: FunArray, +) -> FunArray { + FunArray::from_fn(|i| if x[i] < y[i] { T::ONES } else { T::ZEROS }) +} + +/// Tests if `x` is less than or equal to `y`, elementwise. +/// +/// Returns `0` (all zeros) for false and `!0` (all ones) for true. + +pub fn simd_le( + x: FunArray, + y: FunArray, +) -> FunArray { + FunArray::from_fn(|i| if x[i] <= y[i] { T::ONES } else { T::ZEROS }) +} + +/// Tests if `x` is greater than `y`, elementwise. +/// +/// Returns `0` (all zeros) for false and `!0` (all ones) for true. + +pub fn simd_gt( + x: FunArray, + y: FunArray, +) -> FunArray { + FunArray::from_fn(|i| if x[i] > y[i] { T::ONES } else { T::ZEROS }) +} + +/// Tests if `x` is greater than or equal to `y`, elementwise. +/// +/// Returns `0` (all zeros) for false and `!0` (all ones) for true. 
+ +pub fn simd_ge( + x: FunArray, + y: FunArray, +) -> FunArray { + FunArray::from_fn(|i| if x[i] >= y[i] { T::ONES } else { T::ZEROS }) +} + +/// Shuffles two vectors by the indices in idx. +/// +/// For safety, `N2 <= N1 + N3` must hold. +pub fn simd_shuffle( + x: FunArray, + y: FunArray, + idx: [u32; N2], +) -> FunArray { + FunArray::from_fn(|i| { + let i = idx[i as usize]; + if i < N1 { + x[i] + } else { + y[i - N1] + } + }) +} + +/// Adds two vectors elementwise, with saturation. + +pub fn simd_saturating_add( + x: FunArray, + y: FunArray, +) -> FunArray { + FunArray::from_fn(|i| x[i].saturating_add(y[i])) +} + +/// Subtracts `y` from `x` elementwise, with saturation. + +pub fn simd_saturating_sub( + x: FunArray, + y: FunArray, +) -> FunArray { + FunArray::from_fn(|i| x[i].saturating_sub(y[i])) +} + +/// Truncates an integer vector to a bitmask. +/// Macro for that expands to an expression which is equivalent to truncating an integer vector to a bitmask, as it would on little endian systems. +/// +/// The macro takes 3 arguments. +/// The first is the highest index of the vector. +/// The second is the vector itself, which should just contain `0` and `!0`. +/// The third is the type to which the truncation happens, which should be atleast as wide as the number of elements in the vector. +/// +/// Thus for example, to truncate the vector, +/// `let a : i32 = [!0, 0, 0, 0, 0, 0, 0, 0, !0, !0, 0, 0, 0, 0, !0, 0]` +/// to u16, you would call, +/// `simd_bitmask_little!(15, a, u16)` +/// to get, +/// `0b0100001100000001u16` +/// +/// # Safety +/// The second argument must be a vector of signed integer types. +/// The length of the vector must be 64 at most. + +// The numbers in here are powers of 2. If it is needed to extend the length of the vector, simply add more cases in the same manner. +// The reason for doing this is that the expression becomes easier to work with when compiled for a proof assistant. +macro_rules! 
simd_bitmask_little { + (63, $a:ident, $ty:ty) => { + 9223372036854775808 * ((if $a[63] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(62, $a, $ty) + }; + (62, $a:ident, $ty:ty) => { + 4611686018427387904 * ((if $a[62] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(61, $a, $ty) + }; + (61, $a:ident, $ty:ty) => { + 2305843009213693952 * ((if $a[61] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(60, $a, $ty) + }; + (60, $a:ident, $ty:ty) => { + 1152921504606846976 * ((if $a[60] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(59, $a, $ty) + }; + (59, $a:ident, $ty:ty) => { + 576460752303423488 * ((if $a[59] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(58, $a, $ty) + }; + (58, $a:ident, $ty:ty) => { + 288230376151711744 * ((if $a[58] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(57, $a, $ty) + }; + (57, $a:ident, $ty:ty) => { + 144115188075855872 * ((if $a[57] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(56, $a, $ty) + }; + (56, $a:ident, $ty:ty) => { + 72057594037927936 * ((if $a[56] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(55, $a, $ty) + }; + (55, $a:ident, $ty:ty) => { + 36028797018963968 * ((if $a[55] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(54, $a, $ty) + }; + (54, $a:ident, $ty:ty) => { + 18014398509481984 * ((if $a[54] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(53, $a, $ty) + }; + (53, $a:ident, $ty:ty) => { + 9007199254740992 * ((if $a[53] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(52, $a, $ty) + }; + (52, $a:ident, $ty:ty) => { + 4503599627370496 * ((if $a[52] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(51, $a, $ty) + }; + (51, $a:ident, $ty:ty) => { + 2251799813685248 * ((if $a[51] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(50, $a, $ty) + }; + (50, $a:ident, $ty:ty) => { + 1125899906842624 * ((if $a[50] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(49, $a, $ty) + }; + (49, $a:ident, $ty:ty) => { + 562949953421312 * ((if $a[49] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(48, $a, $ty) + }; + (48, $a:ident, $ty:ty) => { + 281474976710656 * ((if $a[48] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(47, $a, $ty) + }; + (47, $a:ident, $ty:ty) => { + 140737488355328 * ((if $a[47] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(46, $a, $ty) + }; + (46, $a:ident, $ty:ty) => { + 70368744177664 * ((if $a[46] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(45, $a, $ty) + }; + (45, $a:ident, $ty:ty) => { + 35184372088832 * ((if $a[45] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(44, $a, $ty) + }; + (44, $a:ident, $ty:ty) => { + 17592186044416 * ((if $a[44] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(43, $a, $ty) + }; + (43, $a:ident, $ty:ty) => { + 8796093022208 * ((if $a[43] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(42, $a, $ty) + }; + (42, $a:ident, $ty:ty) => { + 4398046511104 * ((if $a[42] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(41, $a, $ty) + }; + (41, $a:ident, $ty:ty) => { + 2199023255552 * ((if $a[41] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(40, $a, $ty) + }; + (40, $a:ident, $ty:ty) => { + 1099511627776 * ((if $a[40] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(39, $a, $ty) + }; + (39, $a:ident, $ty:ty) => { + 549755813888 * ((if $a[39] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(38, $a, $ty) + }; + (38, $a:ident, $ty:ty) => { + 274877906944 * ((if $a[38] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(37, $a, $ty) + }; + (37, 
$a:ident, $ty:ty) => { + 137438953472 * ((if $a[37] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(36, $a, $ty) + }; + (36, $a:ident, $ty:ty) => { + 68719476736 * ((if $a[36] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(35, $a, $ty) + }; + (35, $a:ident, $ty:ty) => { + 34359738368 * ((if $a[35] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(34, $a, $ty) + }; + (34, $a:ident, $ty:ty) => { + 17179869184 * ((if $a[34] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(33, $a, $ty) + }; + (33, $a:ident, $ty:ty) => { + 8589934592 * ((if $a[33] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(32, $a, $ty) + }; + (32, $a:ident, $ty:ty) => { + 4294967296 * ((if $a[32] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(31, $a, $ty) + }; + (31, $a:ident, $ty:ty) => { + 2147483648 * ((if $a[31] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(30, $a, $ty) + }; + (30, $a:ident, $ty:ty) => { + 1073741824 * ((if $a[30] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(29, $a, $ty) + }; + (29, $a:ident, $ty:ty) => { + 536870912 * ((if $a[29] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(28, $a, $ty) + }; + (28, $a:ident, $ty:ty) => { + 268435456 * ((if $a[28] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(27, $a, $ty) + }; + (27, $a:ident, $ty:ty) => { + 134217728 * ((if $a[27] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(26, $a, $ty) + }; + (26, $a:ident, $ty:ty) => { + 67108864 * ((if $a[26] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(25, $a, $ty) + }; + (25, $a:ident, $ty:ty) => { + 33554432 * ((if $a[25] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(24, $a, $ty) + }; + (24, $a:ident, $ty:ty) => { + 16777216 * ((if $a[24] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(23, $a, $ty) + }; + (23, $a:ident, $ty:ty) => { + 8388608 * ((if $a[23] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(22, $a, $ty) + }; + (22, $a:ident, $ty:ty) => { + 4194304 * ((if $a[22] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(21, $a, $ty) + }; + (21, $a:ident, $ty:ty) => { + 2097152 * ((if $a[21] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(20, $a, $ty) + }; + (20, $a:ident, $ty:ty) => { + 1048576 * ((if $a[20] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(19, $a, $ty) + }; + (19, $a:ident, $ty:ty) => { + 524288 * ((if $a[19] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(18, $a, $ty) + }; + (18, $a:ident, $ty:ty) => { + 262144 * ((if $a[18] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(17, $a, $ty) + }; + (17, $a:ident, $ty:ty) => { + 131072 * ((if $a[17] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(16, $a, $ty) + }; + (16, $a:ident, $ty:ty) => { + 65536 * ((if $a[16] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(15, $a, $ty) + }; + (15, $a:ident, $ty:ty) => { + 32768 * ((if $a[15] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(14, $a, $ty) + }; + (14, $a:ident, $ty:ty) => { + 16384 * ((if $a[14] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(13, $a, $ty) + }; + (13, $a:ident, $ty:ty) => { + 8192 * ((if $a[13] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(12, $a, $ty) + }; + (12, $a:ident, $ty:ty) => { + 4096 * ((if $a[12] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(11, $a, $ty) + }; + (11, $a:ident, $ty:ty) => { + 2048 * ((if $a[11] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(10, $a, $ty) + }; + (10, $a:ident, $ty:ty) => { + 1024 * ((if $a[10] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(9, $a, $ty) + }; + (9, $a:ident, $ty:ty) => { + 512 * ((if 
$a[9] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(8, $a, $ty) + }; + (8, $a:ident, $ty:ty) => { + 256 * ((if $a[8] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(7, $a, $ty) + }; + (7, $a:ident, $ty:ty) => { + 128 * ((if $a[7] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(6, $a, $ty) + }; + (6, $a:ident, $ty:ty) => { + 64 * ((if $a[6] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(5, $a, $ty) + }; + (5, $a:ident, $ty:ty) => { + 32 * ((if $a[5] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(4, $a, $ty) + }; + (4, $a:ident, $ty:ty) => { + 16 * ((if $a[4] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(3, $a, $ty) + }; + (3, $a:ident, $ty:ty) => { + 8 * ((if $a[3] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(2, $a, $ty) + }; + (2, $a:ident, $ty:ty) => { + 4 * ((if $a[2] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(1, $a, $ty) + }; + (1, $a:ident, $ty:ty) => { + 2 * ((if $a[1] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(0, $a, $ty) + }; + (0, $a:ident, $ty:ty) => { + ((if $a[0] < 0 { 1 } else { 0 }) as $ty) + }; +} +pub(crate) use simd_bitmask_little; + +/// Truncates an integer vector to a bitmask. +/// Macro for that expands to an expression which is equivalent to truncating an integer vector to a bitmask, as it would on big endian systems. +/// +/// The macro takes 3 arguments. +/// The first is the highest index of the vector. +/// The second is the vector itself, which should just contain `0` and `!0`. +/// The third is the type to which the truncation happens, which should be atleast as wide as the number of elements in the vector. +/// +/// Thus for example, to truncate the vector, +/// `let a : i32 = [!0, 0, 0, 0, 0, 0, 0, 0, !0, !0, 0, 0, 0, 0, !0, 0]` +/// to u16, you would call, +/// `simd_bitmask_big!(15, a, u16)` +/// to get, +/// `0b1000000011000010u16` +/// +/// # Safety +/// The second argument must be a vector of signed integer types. + +#[allow(unused)] +macro_rules! 
simd_bitmask_big { + (63, $a:ident, $ty:ty) => { + 1 * ((if $a[63] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(62, $a, $ty) + }; + (62, $a:ident, $ty:ty) => { + 2 * ((if $a[62] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(61, $a, $ty) + }; + (61, $a:ident, $ty:ty) => { + 4 * ((if $a[61] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(60, $a, $ty) + }; + (60, $a:ident, $ty:ty) => { + 8 * ((if $a[60] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(59, $a, $ty) + }; + (59, $a:ident, $ty:ty) => { + 16 * ((if $a[59] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(58, $a, $ty) + }; + (58, $a:ident, $ty:ty) => { + 32 * ((if $a[58] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(57, $a, $ty) + }; + (57, $a:ident, $ty:ty) => { + 64 * ((if $a[57] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(56, $a, $ty) + }; + (56, $a:ident, $ty:ty) => { + 128 * ((if $a[56] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(55, $a, $ty) + }; + (55, $a:ident, $ty:ty) => { + 256 * ((if $a[55] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(54, $a, $ty) + }; + (54, $a:ident, $ty:ty) => { + 512 * ((if $a[54] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(53, $a, $ty) + }; + (53, $a:ident, $ty:ty) => { + 1024 * ((if $a[53] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(52, $a, $ty) + }; + (52, $a:ident, $ty:ty) => { + 2048 * ((if $a[52] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(51, $a, $ty) + }; + (51, $a:ident, $ty:ty) => { + 4096 * ((if $a[51] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(50, $a, $ty) + }; + (50, $a:ident, $ty:ty) => { + 8192 * ((if $a[50] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(49, $a, $ty) + }; + (49, $a:ident, $ty:ty) => { + 16384 * ((if $a[49] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(48, $a, $ty) + }; + (48, $a:ident, $ty:ty) => { + 32768 * ((if $a[48] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(47, $a, $ty) + }; + (47, $a:ident, $ty:ty) => { + 65536 * ((if $a[47] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(46, $a, $ty) + }; + (46, $a:ident, $ty:ty) => { + 131072 * ((if $a[46] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(45, $a, $ty) + }; + (45, $a:ident, $ty:ty) => { + 262144 * ((if $a[45] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(44, $a, $ty) + }; + (44, $a:ident, $ty:ty) => { + 524288 * ((if $a[44] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(43, $a, $ty) + }; + (43, $a:ident, $ty:ty) => { + 1048576 * ((if $a[43] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(42, $a, $ty) + }; + (42, $a:ident, $ty:ty) => { + 2097152 * ((if $a[42] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(41, $a, $ty) + }; + (41, $a:ident, $ty:ty) => { + 4194304 * ((if $a[41] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(40, $a, $ty) + }; + (40, $a:ident, $ty:ty) => { + 8388608 * ((if $a[40] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(39, $a, $ty) + }; + (39, $a:ident, $ty:ty) => { + 16777216 * ((if $a[39] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(38, $a, $ty) + }; + (38, $a:ident, $ty:ty) => { + 33554432 * ((if $a[38] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(37, $a, $ty) + }; + (37, $a:ident, $ty:ty) => { + 67108864 * ((if $a[37] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(36, $a, $ty) + }; + (36, $a:ident, $ty:ty) => { + 134217728 * ((if $a[36] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(35, $a, $ty) + }; + (35, $a:ident, $ty:ty) => { + 268435456 * ((if $a[35] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(34, $a, $ty) + }; + (34, $a:ident, $ty:ty) => { + 536870912 * ((if $a[34] < 0 { 
1 } else { 0 }) as $ty) + simd_bitmask_big!(33, $a, $ty) + }; + (33, $a:ident, $ty:ty) => { + 1073741824 * ((if $a[33] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(32, $a, $ty) + }; + (32, $a:ident, $ty:ty) => { + 2147483648 * ((if $a[32] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(31, $a, $ty) + }; + (31, $a:ident, $ty:ty) => { + 4294967296 * ((if $a[31] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(30, $a, $ty) + }; + (30, $a:ident, $ty:ty) => { + 8589934592 * ((if $a[30] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(29, $a, $ty) + }; + (29, $a:ident, $ty:ty) => { + 17179869184 * ((if $a[29] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(28, $a, $ty) + }; + (28, $a:ident, $ty:ty) => { + 34359738368 * ((if $a[28] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(27, $a, $ty) + }; + (27, $a:ident, $ty:ty) => { + 68719476736 * ((if $a[27] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(26, $a, $ty) + }; + (26, $a:ident, $ty:ty) => { + 137438953472 * ((if $a[26] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(25, $a, $ty) + }; + (25, $a:ident, $ty:ty) => { + 274877906944 * ((if $a[25] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(24, $a, $ty) + }; + (24, $a:ident, $ty:ty) => { + 549755813888 * ((if $a[24] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(23, $a, $ty) + }; + (23, $a:ident, $ty:ty) => { + 1099511627776 * ((if $a[23] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(22, $a, $ty) + }; + (22, $a:ident, $ty:ty) => { + 2199023255552 * ((if $a[22] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(21, $a, $ty) + }; + (21, $a:ident, $ty:ty) => { + 4398046511104 * ((if $a[21] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(20, $a, $ty) + }; + (20, $a:ident, $ty:ty) => { + 8796093022208 * ((if $a[20] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(19, $a, $ty) + }; + (19, $a:ident, $ty:ty) => { + 17592186044416 * ((if $a[19] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(18, $a, $ty) + }; + (18, $a:ident, $ty:ty) => { + 35184372088832 * ((if $a[18] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(17, $a, $ty) + }; + (17, $a:ident, $ty:ty) => { + 70368744177664 * ((if $a[17] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(16, $a, $ty) + }; + (16, $a:ident, $ty:ty) => { + 140737488355328 * ((if $a[16] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(15, $a, $ty) + }; + (15, $a:ident, $ty:ty) => { + 281474976710656 * ((if $a[15] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(14, $a, $ty) + }; + (14, $a:ident, $ty:ty) => { + 562949953421312 * ((if $a[14] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(13, $a, $ty) + }; + (13, $a:ident, $ty:ty) => { + 1125899906842624 * ((if $a[13] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_big!(12, $a, $ty) + }; + (12, $a:ident, $ty:ty) => { + 2251799813685248 * ((if $a[12] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_big!(11, $a, $ty) + }; + (11, $a:ident, $ty:ty) => { + 4503599627370496 * ((if $a[11] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_big!(10, $a, $ty) + }; + (10, $a:ident, $ty:ty) => { + 9007199254740992 * ((if $a[10] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(9, $a, $ty) + }; + (9, $a:ident, $ty:ty) => { + 18014398509481984 * ((if $a[9] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(8, $a, $ty) + }; + (8, $a:ident, $ty:ty) => { + 36028797018963968 * ((if $a[8] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(7, $a, $ty) + }; + (7, $a:ident, $ty:ty) => { + 72057594037927936 * ((if $a[7] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(6, $a, $ty) + }; + (6, $a:ident, $ty:ty) => { 
+ 144115188075855872 * ((if $a[6] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_big!(5, $a, $ty) + }; + (5, $a:ident, $ty:ty) => { + 288230376151711744 * ((if $a[5] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_big!(4, $a, $ty) + }; + (4, $a:ident, $ty:ty) => { + 576460752303423488 * ((if $a[4] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_big!(3, $a, $ty) + }; + (3, $a:ident, $ty:ty) => { + 1152921504606846976 * ((if $a[3] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_big!(2, $a, $ty) + }; + (2, $a:ident, $ty:ty) => { + 2305843009213693952 * ((if $a[2] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_big!(1, $a, $ty) + }; + (1, $a:ident, $ty:ty) => { + 4611686018427387904 * ((if $a[1] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_big!(0, $a, $ty) + }; + (0, $a:ident, $ty:ty) => { + 9223372036854775808 * ((if $a[0] < 0 { 1 } else { 0 }) as $ty) + }; +} +#[allow(unused)] +pub(crate) use simd_bitmask_big; + +/// Selects elements from a mask. +/// +/// For each element, if the corresponding value in `mask` is `!0`, select the element from +/// `if_true`. If the corresponding value in `mask` is `0`, select the element from +/// `if_false`. +/// +/// # Safety +/// `mask` must only contain `0` and `!0`. + +pub fn simd_select( + mask: FunArray, + if_true: FunArray, + if_false: FunArray, +) -> FunArray { + FunArray::from_fn(|i| { + if mask[i] == T1::ONES { + if_true[i] + } else { + if_false[i] + } + }) +} diff --git a/testable-simd-models/src/abstractions/utilities.rs b/testable-simd-models/src/abstractions/utilities.rs new file mode 100644 index 0000000000000..86e1c0ba52de1 --- /dev/null +++ b/testable-simd-models/src/abstractions/utilities.rs @@ -0,0 +1,59 @@ +/// Converts one type to another +pub fn transmute>(a: T) -> U { + a.into() +} + +#[allow(unused)] +#[macro_export] +macro_rules! static_assert { + ($e:expr) => { + const { + assert!($e); + } + }; + ($e:expr, $msg:expr) => { + const { + assert!($e, $msg); + } + }; +} + +#[allow(unused_macros)] +#[macro_export] +macro_rules! static_assert_uimm_bits { + ($imm:ident, $bits:expr) => { + // `0 <= $imm` produces a warning if the immediate has an unsigned type + #[allow(unused_comparisons)] + { + static_assert!( + 0 <= $imm && $imm < (1 << $bits), + concat!( + stringify!($imm), + " doesn't fit in ", + stringify!($bits), + " bits", + ) + ) + } + }; +} + +#[allow(unused_macros)] +#[macro_export] +macro_rules! 
static_assert_simm_bits {
+    ($imm:ident, $bits:expr) => {
+        static_assert!(
+            (-1 << ($bits - 1)) - 1 < $imm && $imm < (1 << ($bits - 1)),
+            concat!(
+                stringify!($imm),
+                " doesn't fit in ",
+                stringify!($bits),
+                " bits",
+            )
+        )
+    };
+}
+
+pub use static_assert;
+pub use static_assert_simm_bits;
+pub use static_assert_uimm_bits;
diff --git a/testable-simd-models/src/core_arch.rs b/testable-simd-models/src/core_arch.rs
new file mode 100644
index 0000000000000..19e643885f4ce
--- /dev/null
+++ b/testable-simd-models/src/core_arch.rs
@@ -0,0 +1,5 @@
+/// This is a (partial) mirror of [`core::arch`]
+pub mod x86;
+pub use x86 as x86_64;
+
+pub mod arm_shared;
diff --git a/testable-simd-models/src/core_arch/arm_shared/mod.rs b/testable-simd-models/src/core_arch/arm_shared/mod.rs
new file mode 100644
index 0000000000000..6e2272ec0e50a
--- /dev/null
+++ b/testable-simd-models/src/core_arch/arm_shared/mod.rs
@@ -0,0 +1,4 @@
+pub mod models;
+#[cfg(test)]
+#[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
+pub mod tests;
diff --git a/testable-simd-models/src/core_arch/arm_shared/models/mod.rs b/testable-simd-models/src/core_arch/arm_shared/models/mod.rs
new file mode 100644
index 0000000000000..fb7844c6d0441
--- /dev/null
+++ b/testable-simd-models/src/core_arch/arm_shared/models/mod.rs
@@ -0,0 +1,44 @@
+//! Rust models for ARM intrinsics.
+//!
+//! This module contains models for the intrinsics as they are defined in the Rust core.
+//! Since this is supposed to model the Rust core, the implemented functions must
+//! mirror the Rust implementations as closely as possible.
+//!
+//! For example, calls to simd functions like simd_add and simd_sub are left as-is,
+//! with their implementations defined in `crate::abstractions::simd`. Some other
+//! operations like simd_cast or simd_shuffle might need a little modification
+//! for correct compilation.
+//!
+//! Calls to transmute are replaced with either an explicit call to a `BitVec::from_*` function,
+//! or with `.into()`.
+//!
+//! Sometimes, an intrinsic in Rust is implemented by directly using the corresponding
+//! LLVM instruction via an `unsafe extern "C"` module. In those cases, the corresponding
+//! function is defined in the `c_extern` module in each file, which contains manually
+//! written implementations made by consulting the appropriate Arm documentation.
+//!
+//! In general, it is best to gain an idea of how an implementation should be written by looking
+//! at how other functions are implemented. See also [`core::arch::arm`](https://github.com/rust-lang/stdarch/tree/master/crates/core_arch) for reference.
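To make the `c_extern` convention described above concrete, here is a minimal, hypothetical sketch of how an intrinsic whose upstream implementation is an LLVM builtin can be modeled with a hand-written replacement. It is not part of the diff, and it uses plain arrays instead of the crate's `FunArray`/`BitVec` abstractions; the helper and function names are illustrative only.

```rust
// Hypothetical sketch only; the real models are written against the crate's
// own vector abstractions, but the overall shape is the same.
mod c_extern {
    // Hand-written stand-in for what upstream implements as an
    // `unsafe extern "C"` LLVM builtin: lane-wise saturating addition.
    pub fn saturating_add_i8x8(a: [i8; 8], b: [i8; 8]) -> [i8; 8] {
        core::array::from_fn(|i| a[i].saturating_add(b[i]))
    }
}

// The model mirrors the shape of the upstream intrinsic and simply delegates
// to the hand-written helper, so it can later be tested against the real
// `vqadd_s8` on an Arm target.
pub fn vqadd_s8_model(a: [i8; 8], b: [i8; 8]) -> [i8; 8] {
    c_extern::saturating_add_i8x8(a, b)
}
```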
+#![allow(unused)] +#[allow(non_camel_case_types)] +mod types { + use crate::abstractions::simd::*; + pub type int32x4_t = i32x4; + pub type int64x1_t = i64x1; + pub type int64x2_t = i64x2; + pub type int16x8_t = i16x8; + pub type int8x16_t = i8x16; + pub type uint32x4_t = u32x4; + pub type uint64x1_t = u64x1; + pub type uint64x2_t = u64x2; + pub type uint16x8_t = u16x8; + pub type uint8x16_t = u8x16; + pub type int32x2_t = i32x2; + pub type int16x4_t = i16x4; + pub type int8x8_t = i8x8; + pub type uint32x2_t = u32x2; + pub type uint16x4_t = u16x4; + pub type uint8x8_t = u8x8; +} + +pub mod neon; diff --git a/testable-simd-models/src/core_arch/arm_shared/models/neon.rs b/testable-simd-models/src/core_arch/arm_shared/models/neon.rs new file mode 100644 index 0000000000000..794fd25285b47 --- /dev/null +++ b/testable-simd-models/src/core_arch/arm_shared/models/neon.rs @@ -0,0 +1,873 @@ +use super::types::*; +use crate::abstractions::simd::*; + +pub fn vaba_s16(a: int16x4_t, b: int16x4_t, c: int16x4_t) -> int16x4_t { + simd_add(a, vabd_s16(b, c)) +} + +pub fn vaba_s32(a: int32x2_t, b: int32x2_t, c: int32x2_t) -> int32x2_t { + simd_add(a, vabd_s32(b, c)) +} + +pub fn vaba_s8(a: int8x8_t, b: int8x8_t, c: int8x8_t) -> int8x8_t { + simd_add(a, vabd_s8(b, c)) +} + +pub fn vaba_u16(a: uint16x4_t, b: uint16x4_t, c: uint16x4_t) -> uint16x4_t { + simd_add(a, vabd_u16(b, c)) +} + +pub fn vaba_u32(a: uint32x2_t, b: uint32x2_t, c: uint32x2_t) -> uint32x2_t { + simd_add(a, vabd_u32(b, c)) +} + +pub fn vaba_u8(a: uint8x8_t, b: uint8x8_t, c: uint8x8_t) -> uint8x8_t { + simd_add(a, vabd_u8(b, c)) +} + +pub fn vabal_u8(a: uint16x8_t, b: uint8x8_t, c: uint8x8_t) -> uint16x8_t { + let d: uint8x8_t = vabd_u8(b, c); + simd_add(a, simd_cast(d)) +} + +pub fn vabal_u16(a: uint32x4_t, b: uint16x4_t, c: uint16x4_t) -> uint32x4_t { + let d: uint16x4_t = vabd_u16(b, c); + simd_add(a, simd_cast(d)) +} + +pub fn vabal_u32(a: uint64x2_t, b: uint32x2_t, c: uint32x2_t) -> uint64x2_t { + let d: uint32x2_t = vabd_u32(b, c); + simd_add(a, simd_cast(d)) +} + +pub fn vabaq_s16(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t { + simd_add(a, vabdq_s16(b, c)) +} + +pub fn vabaq_s32(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t { + simd_add(a, vabdq_s32(b, c)) +} + +pub fn vabaq_s8(a: int8x16_t, b: int8x16_t, c: int8x16_t) -> int8x16_t { + simd_add(a, vabdq_s8(b, c)) +} + +pub fn vabaq_u16(a: uint16x8_t, b: uint16x8_t, c: uint16x8_t) -> uint16x8_t { + simd_add(a, vabdq_u16(b, c)) +} + +pub fn vabaq_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t { + simd_add(a, vabdq_u32(b, c)) +} + +pub fn vabaq_u8(a: uint8x16_t, b: uint8x16_t, c: uint8x16_t) -> uint8x16_t { + simd_add(a, vabdq_u8(b, c)) +} + +pub fn vabd_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { + simd_abs_diff(a, b) +} + +pub fn vabdq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { + simd_abs_diff(a, b) +} + +pub fn vabd_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { + simd_abs_diff(a, b) +} + +pub fn vabdq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { + simd_abs_diff(a, b) +} + +pub fn vabd_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { + simd_abs_diff(a, b) +} + +pub fn vabdq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + simd_abs_diff(a, b) +} + +pub fn vabd_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + simd_abs_diff(a, b) +} + +pub fn vabdq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + simd_abs_diff(a, b) +} + +pub fn vabd_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + simd_abs_diff(a, b) +} + +pub fn vabdq_u16(a: uint16x8_t, b: 
uint16x8_t) -> uint16x8_t { + simd_abs_diff(a, b) +} + +pub fn vabd_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + simd_abs_diff(a, b) +} + +pub fn vabdq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + simd_abs_diff(a, b) +} + +pub fn vabdl_u8(a: uint8x8_t, b: uint8x8_t) -> uint16x8_t { + simd_cast(vabd_u8(a, b)) +} + +pub fn vabdl_u16(a: uint16x4_t, b: uint16x4_t) -> uint32x4_t { + simd_cast(vabd_u16(a, b)) +} + +pub fn vabdl_u32(a: uint32x2_t, b: uint32x2_t) -> uint64x2_t { + simd_cast(vabd_u32(a, b)) +} + +pub fn vabs_s8(a: int8x8_t) -> int8x8_t { + simd_abs(a) +} + +pub fn vabsq_s8(a: int8x16_t) -> int8x16_t { + simd_abs(a) +} + +pub fn vabs_s16(a: int16x4_t) -> int16x4_t { + simd_abs(a) +} + +pub fn vabsq_s16(a: int16x8_t) -> int16x8_t { + simd_abs(a) +} + +pub fn vabs_s32(a: int32x2_t) -> int32x2_t { + simd_abs(a) +} + +pub fn vabsq_s32(a: int32x4_t) -> int32x4_t { + simd_abs(a) +} + +pub fn vadd_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { + simd_add(a, b) +} + +pub fn vadd_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { + simd_add(a, b) +} + +pub fn vadd_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { + simd_add(a, b) +} + +pub fn vadd_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + simd_add(a, b) +} + +pub fn vadd_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + simd_add(a, b) +} + +pub fn vadd_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + simd_add(a, b) +} + +pub fn vaddq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { + simd_add(a, b) +} + +pub fn vaddq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + simd_add(a, b) +} + +pub fn vaddq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t { + simd_add(a, b) +} + +pub fn vaddq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { + simd_add(a, b) +} + +pub fn vaddq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + simd_add(a, b) +} + +pub fn vaddq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + simd_add(a, b) +} + +pub fn vaddq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { + simd_add(a, b) +} + +pub fn vaddq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + simd_add(a, b) +} + +pub fn vaddhn_high_s16(r: int8x8_t, a: int16x8_t, b: int16x8_t) -> int8x16_t { + let x = simd_cast(simd_shr(simd_add(a, b), int16x8_t::splat(8))); + simd_shuffle(r, x, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) +} + +pub fn vaddhn_high_s32(r: int16x4_t, a: int32x4_t, b: int32x4_t) -> int16x8_t { + let x = simd_cast(simd_shr(simd_add(a, b), int32x4_t::splat(16))); + simd_shuffle(r, x, [0, 1, 2, 3, 4, 5, 6, 7]) +} + +pub fn vaddhn_high_s64(r: int32x2_t, a: int64x2_t, b: int64x2_t) -> int32x4_t { + let x = simd_cast(simd_shr(simd_add(a, b), int64x2_t::splat(32))); + simd_shuffle(r, x, [0, 1, 2, 3]) +} + +pub fn vaddhn_high_u16(r: uint8x8_t, a: uint16x8_t, b: uint16x8_t) -> uint8x16_t { + let x = simd_cast(simd_shr(simd_add(a, b), uint16x8_t::splat(8))); + simd_shuffle(r, x, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) +} + +pub fn vaddhn_high_u32(r: uint16x4_t, a: uint32x4_t, b: uint32x4_t) -> uint16x8_t { + let x = simd_cast(simd_shr(simd_add(a, b), uint32x4_t::splat(16))); + simd_shuffle(r, x, [0, 1, 2, 3, 4, 5, 6, 7]) +} + +pub fn vaddhn_high_u64(r: uint32x2_t, a: uint64x2_t, b: uint64x2_t) -> uint32x4_t { + let x = simd_cast(simd_shr(simd_add(a, b), uint64x2_t::splat(32))); + simd_shuffle(r, x, [0, 1, 2, 3]) +} + +pub fn vaddhn_s16(a: int16x8_t, b: int16x8_t) -> int8x8_t { + simd_cast(simd_shr(simd_add(a, b), int16x8_t::splat(8))) +} + +pub fn vaddhn_s32(a: int32x4_t, b: int32x4_t) -> int16x4_t { + 
simd_cast(simd_shr(simd_add(a, b), int32x4_t::splat(16))) +} + +pub fn vaddhn_s64(a: int64x2_t, b: int64x2_t) -> int32x2_t { + simd_cast(simd_shr(simd_add(a, b), int64x2_t::splat(32))) +} + +pub fn vaddhn_u16(a: uint16x8_t, b: uint16x8_t) -> uint8x8_t { + simd_cast(simd_shr(simd_add(a, b), uint16x8_t::splat(8))) +} + +pub fn vaddhn_u32(a: uint32x4_t, b: uint32x4_t) -> uint16x4_t { + simd_cast(simd_shr(simd_add(a, b), uint32x4_t::splat(16))) +} + +pub fn vaddhn_u64(a: uint64x2_t, b: uint64x2_t) -> uint32x2_t { + simd_cast(simd_shr(simd_add(a, b), uint64x2_t::splat(32))) +} + +pub fn vaddl_high_s16(a: int16x8_t, b: int16x8_t) -> int32x4_t { + let a: int16x4_t = simd_shuffle(a, a, [4, 5, 6, 7]); + let b: int16x4_t = simd_shuffle(b, b, [4, 5, 6, 7]); + let a: int32x4_t = simd_cast(a); + let b: int32x4_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddl_high_s32(a: int32x4_t, b: int32x4_t) -> int64x2_t { + let a: int32x2_t = simd_shuffle(a, a, [2, 3]); + let b: int32x2_t = simd_shuffle(b, b, [2, 3]); + let a: int64x2_t = simd_cast(a); + let b: int64x2_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddl_high_s8(a: int8x16_t, b: int8x16_t) -> int16x8_t { + let a: int8x8_t = simd_shuffle(a, a, [8, 9, 10, 11, 12, 13, 14, 15]); + let b: int8x8_t = simd_shuffle(b, b, [8, 9, 10, 11, 12, 13, 14, 15]); + let a: int16x8_t = simd_cast(a); + let b: int16x8_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddl_high_u16(a: uint16x8_t, b: uint16x8_t) -> uint32x4_t { + let a: uint16x4_t = simd_shuffle(a, a, [4, 5, 6, 7]); + let b: uint16x4_t = simd_shuffle(b, b, [4, 5, 6, 7]); + let a: uint32x4_t = simd_cast(a); + let b: uint32x4_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddl_high_u32(a: uint32x4_t, b: uint32x4_t) -> uint64x2_t { + let a: uint32x2_t = simd_shuffle(a, a, [2, 3]); + let b: uint32x2_t = simd_shuffle(b, b, [2, 3]); + let a: uint64x2_t = simd_cast(a); + let b: uint64x2_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddl_high_u8(a: uint8x16_t, b: uint8x16_t) -> uint16x8_t { + let a: uint8x8_t = simd_shuffle(a, a, [8, 9, 10, 11, 12, 13, 14, 15]); + let b: uint8x8_t = simd_shuffle(b, b, [8, 9, 10, 11, 12, 13, 14, 15]); + let a: uint16x8_t = simd_cast(a); + let b: uint16x8_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddl_s16(a: int16x4_t, b: int16x4_t) -> int32x4_t { + let a: int32x4_t = simd_cast(a); + let b: int32x4_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddl_s32(a: int32x2_t, b: int32x2_t) -> int64x2_t { + let a: int64x2_t = simd_cast(a); + let b: int64x2_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddl_s8(a: int8x8_t, b: int8x8_t) -> int16x8_t { + let a: int16x8_t = simd_cast(a); + let b: int16x8_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddl_u16(a: uint16x4_t, b: uint16x4_t) -> uint32x4_t { + let a: uint32x4_t = simd_cast(a); + let b: uint32x4_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddl_u32(a: uint32x2_t, b: uint32x2_t) -> uint64x2_t { + let a: uint64x2_t = simd_cast(a); + let b: uint64x2_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddl_u8(a: uint8x8_t, b: uint8x8_t) -> uint16x8_t { + let a: uint16x8_t = simd_cast(a); + let b: uint16x8_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddw_high_s16(a: int32x4_t, b: int16x8_t) -> int32x4_t { + let b: int16x4_t = simd_shuffle(b, b, [4, 5, 6, 7]); + let b: int32x4_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddw_high_s32(a: int64x2_t, b: int32x4_t) -> int64x2_t { + let b: int32x2_t = simd_shuffle(b, b, [2, 3]); + let b: int64x2_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddw_high_s8(a: 
int16x8_t, b: int8x16_t) -> int16x8_t { + let b: int8x8_t = simd_shuffle(b, b, [8, 9, 10, 11, 12, 13, 14, 15]); + let b: int16x8_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddw_high_u16(a: uint32x4_t, b: uint16x8_t) -> uint32x4_t { + let b: uint16x4_t = simd_shuffle(b, b, [4, 5, 6, 7]); + let b: uint32x4_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddw_high_u32(a: uint64x2_t, b: uint32x4_t) -> uint64x2_t { + let b: uint32x2_t = simd_shuffle(b, b, [2, 3]); + let b: uint64x2_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddw_high_u8(a: uint16x8_t, b: uint8x16_t) -> uint16x8_t { + let b: uint8x8_t = simd_shuffle(b, b, [8, 9, 10, 11, 12, 13, 14, 15]); + let b: uint16x8_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddw_s16(a: int32x4_t, b: int16x4_t) -> int32x4_t { + let b: int32x4_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddw_s32(a: int64x2_t, b: int32x2_t) -> int64x2_t { + let b: int64x2_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddw_s8(a: int16x8_t, b: int8x8_t) -> int16x8_t { + let b: int16x8_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddw_u16(a: uint32x4_t, b: uint16x4_t) -> uint32x4_t { + let b: uint32x4_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddw_u32(a: uint64x2_t, b: uint32x2_t) -> uint64x2_t { + let b: uint64x2_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddw_u8(a: uint16x8_t, b: uint8x8_t) -> uint16x8_t { + let b: uint16x8_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vand_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { + simd_and(a, b) +} + +pub fn vandq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { + simd_and(a, b) +} + +pub fn vand_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { + simd_and(a, b) +} + +pub fn vandq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { + simd_and(a, b) +} + +pub fn vand_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { + simd_and(a, b) +} + +pub fn vandq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + simd_and(a, b) +} + +pub fn vand_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t { + simd_and(a, b) +} + +pub fn vandq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t { + simd_and(a, b) +} + +pub fn vand_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + simd_and(a, b) +} + +pub fn vandq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + simd_and(a, b) +} + +pub fn vand_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + simd_and(a, b) +} + +pub fn vandq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + simd_and(a, b) +} + +pub fn vand_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + simd_and(a, b) +} + +pub fn vandq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + simd_and(a, b) +} + +pub fn vand_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t { + simd_and(a, b) +} + +pub fn vandq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { + simd_and(a, b) +} + +pub fn vbic_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { + let c = int16x4_t::splat(-1); + simd_and(simd_xor(b, c), a) +} + +pub fn vbic_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { + let c = int32x2_t::splat(-1); + simd_and(simd_xor(b, c), a) +} + +pub fn vbic_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t { + let c = int64x1_t::splat(-1); + simd_and(simd_xor(b, c), a) +} + +pub fn vbic_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { + let c = int8x8_t::splat(-1); + simd_and(simd_xor(b, c), a) +} + +pub fn vbicq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { + let c = int16x8_t::splat(-1); + simd_and(simd_xor(b, c), a) +} + +pub fn vbicq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + let c = int32x4_t::splat(-1); + simd_and(simd_xor(b, c), a) +} + +pub fn 
vbicq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t { + let c = int64x2_t::splat(-1); + simd_and(simd_xor(b, c), a) +} + +pub fn vbicq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { + let c = int8x16_t::splat(-1); + simd_and(simd_xor(b, c), a) +} + +pub fn vbic_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + let c = int16x4_t::splat(-1); + simd_and(simd_xor(b, simd_cast(c)), a) +} + +pub fn vbic_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + let c = int32x2_t::splat(-1); + simd_and(simd_xor(b, simd_cast(c)), a) +} + +pub fn vbic_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t { + let c = int64x1_t::splat(-1); + simd_and(simd_xor(b, simd_cast(c)), a) +} + +pub fn vbic_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + let c = int8x8_t::splat(-1); + simd_and(simd_xor(b, simd_cast(c)), a) +} + +pub fn vbicq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + let c = int16x8_t::splat(-1); + simd_and(simd_xor(b, simd_cast(c)), a) +} + +pub fn vbicq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + let c = int32x4_t::splat(-1); + simd_and(simd_xor(b, simd_cast(c)), a) +} + +pub fn vbicq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { + let c = int64x2_t::splat(-1); + simd_and(simd_xor(b, simd_cast(c)), a) +} + +pub fn vbicq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + let c = int8x16_t::splat(-1); + simd_and(simd_xor(b, simd_cast(c)), a) +} + +pub fn vbsl_s16(a: uint16x4_t, b: int16x4_t, c: int16x4_t) -> int16x4_t { + let not = int16x4_t::splat(-1); + simd_cast(simd_or( + simd_and(a, simd_cast(b)), + simd_and(simd_xor(a, simd_cast(not)), simd_cast(c)), + )) +} + +pub fn vbsl_s32(a: uint32x2_t, b: int32x2_t, c: int32x2_t) -> int32x2_t { + let not = int32x2_t::splat(-1); + simd_cast(simd_or( + simd_and(a, simd_cast(b)), + simd_and(simd_xor(a, simd_cast(not)), simd_cast(c)), + )) +} + +pub fn vbsl_s64(a: uint64x1_t, b: int64x1_t, c: int64x1_t) -> int64x1_t { + let not = int64x1_t::splat(-1); + simd_cast(simd_or( + simd_and(a, simd_cast(b)), + simd_and(simd_xor(a, simd_cast(not)), simd_cast(c)), + )) +} + +pub fn vbsl_s8(a: uint8x8_t, b: int8x8_t, c: int8x8_t) -> int8x8_t { + let not = int8x8_t::splat(-1); + simd_cast(simd_or( + simd_and(a, simd_cast(b)), + simd_and(simd_xor(a, simd_cast(not)), simd_cast(c)), + )) +} + +pub fn vbslq_s16(a: uint16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t { + let not = int16x8_t::splat(-1); + simd_cast(simd_or( + simd_and(a, simd_cast(b)), + simd_and(simd_xor(a, simd_cast(not)), simd_cast(c)), + )) +} + +pub fn vbslq_s32(a: uint32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t { + let not = int32x4_t::splat(-1); + simd_cast(simd_or( + simd_and(a, simd_cast(b)), + simd_and(simd_xor(a, simd_cast(not)), simd_cast(c)), + )) +} + +pub fn vbslq_s64(a: uint64x2_t, b: int64x2_t, c: int64x2_t) -> int64x2_t { + let not = int64x2_t::splat(-1); + simd_cast(simd_or( + simd_and(a, simd_cast(b)), + simd_and(simd_xor(a, simd_cast(not)), simd_cast(c)), + )) +} + +pub fn vbslq_s8(a: uint8x16_t, b: int8x16_t, c: int8x16_t) -> int8x16_t { + let not = int8x16_t::splat(-1); + simd_cast(simd_or( + simd_and(a, simd_cast(b)), + simd_and(simd_xor(a, simd_cast(not)), simd_cast(c)), + )) +} + +pub fn vbsl_u16(a: uint16x4_t, b: uint16x4_t, c: uint16x4_t) -> uint16x4_t { + let not = int16x4_t::splat(-1); + simd_or( + simd_and(a, simd_cast(b)), + simd_and(simd_xor(a, simd_cast(not)), c), + ) +} + +pub fn vbsl_u32(a: uint32x2_t, b: uint32x2_t, c: uint32x2_t) -> uint32x2_t { + let not = int32x2_t::splat(-1); + simd_or( + simd_and(a, simd_cast(b)), + simd_and(simd_xor(a, 
simd_cast(not)), c), + ) +} + +pub fn vbsl_u64(a: uint64x1_t, b: uint64x1_t, c: uint64x1_t) -> uint64x1_t { + let not = int64x1_t::splat(-1); + simd_or( + simd_and(a, simd_cast(b)), + simd_and(simd_xor(a, simd_cast(not)), c), + ) +} + +pub fn vbsl_u8(a: uint8x8_t, b: uint8x8_t, c: uint8x8_t) -> uint8x8_t { + let not = int8x8_t::splat(-1); + simd_or( + simd_and(a, simd_cast(b)), + simd_and(simd_xor(a, simd_cast(not)), c), + ) +} + +pub fn vbslq_u16(a: uint16x8_t, b: uint16x8_t, c: uint16x8_t) -> uint16x8_t { + let not = int16x8_t::splat(-1); + simd_or( + simd_and(a, simd_cast(b)), + simd_and(simd_xor(a, simd_cast(not)), c), + ) +} + +pub fn vbslq_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t { + let not = int32x4_t::splat(-1); + simd_or( + simd_and(a, simd_cast(b)), + simd_and(simd_xor(a, simd_cast(not)), c), + ) +} + +pub fn vbslq_u64(a: uint64x2_t, b: uint64x2_t, c: uint64x2_t) -> uint64x2_t { + let not = int64x2_t::splat(-1); + simd_or( + simd_and(a, simd_cast(b)), + simd_and(simd_xor(a, simd_cast(not)), c), + ) +} + +pub fn vbslq_u8(a: uint8x16_t, b: uint8x16_t, c: uint8x16_t) -> uint8x16_t { + let not = int8x16_t::splat(-1); + simd_or( + simd_and(a, simd_cast(b)), + simd_and(simd_xor(a, simd_cast(not)), c), + ) +} + +pub fn vceq_s8(a: int8x8_t, b: int8x8_t) -> uint8x8_t { + simd_cast(simd_eq(a, b)) +} + +pub fn vceqq_s8(a: int8x16_t, b: int8x16_t) -> uint8x16_t { + simd_cast(simd_eq(a, b)) +} + +pub fn vceq_s16(a: int16x4_t, b: int16x4_t) -> uint16x4_t { + simd_cast(simd_eq(a, b)) +} + +pub fn vceqq_s16(a: int16x8_t, b: int16x8_t) -> uint16x8_t { + simd_cast(simd_eq(a, b)) +} + +pub fn vceq_s32(a: int32x2_t, b: int32x2_t) -> uint32x2_t { + simd_cast(simd_eq(a, b)) +} + +pub fn vceqq_s32(a: int32x4_t, b: int32x4_t) -> uint32x4_t { + simd_cast(simd_eq(a, b)) +} + +pub fn vceq_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + simd_eq(a, b) +} + +pub fn vceqq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + simd_eq(a, b) +} + +pub fn vceq_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + simd_eq(a, b) +} + +pub fn vceqq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + simd_eq(a, b) +} + +pub fn vceq_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + simd_eq(a, b) +} + +pub fn vceqq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + simd_eq(a, b) +} + +pub fn vcge_s8(a: int8x8_t, b: int8x8_t) -> uint8x8_t { + simd_cast(simd_ge(a, b)) +} + +pub fn vcgeq_s8(a: int8x16_t, b: int8x16_t) -> uint8x16_t { + simd_cast(simd_ge(a, b)) +} + +pub fn vcge_s16(a: int16x4_t, b: int16x4_t) -> uint16x4_t { + simd_cast(simd_ge(a, b)) +} + +pub fn vcgeq_s16(a: int16x8_t, b: int16x8_t) -> uint16x8_t { + simd_cast(simd_ge(a, b)) +} + +pub fn vcge_s32(a: int32x2_t, b: int32x2_t) -> uint32x2_t { + simd_cast(simd_ge(a, b)) +} + +pub fn vcgeq_s32(a: int32x4_t, b: int32x4_t) -> uint32x4_t { + simd_cast(simd_ge(a, b)) +} + +pub fn vcge_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + simd_ge(a, b) +} + +pub fn vcgeq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + simd_ge(a, b) +} + +pub fn vcge_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + simd_ge(a, b) +} + +pub fn vcgeq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + simd_ge(a, b) +} + +pub fn vcge_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + simd_ge(a, b) +} + +pub fn vcgeq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + simd_ge(a, b) +} + +pub fn vcgt_s8(a: int8x8_t, b: int8x8_t) -> uint8x8_t { + simd_cast(simd_gt(a, b)) +} + +pub fn vcgtq_s8(a: int8x16_t, b: int8x16_t) -> uint8x16_t { + simd_cast(simd_gt(a, b)) +} + 
+pub fn vcgt_s16(a: int16x4_t, b: int16x4_t) -> uint16x4_t {
+    simd_cast(simd_gt(a, b))
+}
+
+pub fn vcgtq_s16(a: int16x8_t, b: int16x8_t) -> uint16x8_t {
+    simd_cast(simd_gt(a, b))
+}
+
+pub fn vcgt_s32(a: int32x2_t, b: int32x2_t) -> uint32x2_t {
+    simd_cast(simd_gt(a, b))
+}
+
+pub fn vcgtq_s32(a: int32x4_t, b: int32x4_t) -> uint32x4_t {
+    simd_cast(simd_gt(a, b))
+}
+
+pub fn vcgt_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
+    simd_gt(a, b)
+}
+
+pub fn vcgtq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+    simd_gt(a, b)
+}
+
+pub fn vcgt_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
+    simd_gt(a, b)
+}
+
+pub fn vcgtq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
+    simd_gt(a, b)
+}
+
+pub fn vcgt_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
+    simd_gt(a, b)
+}
+
+pub fn vcgtq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
+    simd_gt(a, b)
+}
+
+pub fn vcle_s8(a: int8x8_t, b: int8x8_t) -> uint8x8_t {
+    simd_cast(simd_le(a, b))
+}
+
+pub fn vcleq_s8(a: int8x16_t, b: int8x16_t) -> uint8x16_t {
+    simd_cast(simd_le(a, b))
+}
+
+pub fn vcle_s16(a: int16x4_t, b: int16x4_t) -> uint16x4_t {
+    simd_cast(simd_le(a, b))
+}
+
+pub fn vcleq_s16(a: int16x8_t, b: int16x8_t) -> uint16x8_t {
+    simd_cast(simd_le(a, b))
+}
+
+pub fn vcle_s32(a: int32x2_t, b: int32x2_t) -> uint32x2_t {
+    simd_cast(simd_le(a, b))
+}
+
+pub fn vcleq_s32(a: int32x4_t, b: int32x4_t) -> uint32x4_t {
+    simd_cast(simd_le(a, b))
+}
+
+pub fn vcle_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
+    simd_le(a, b)
+}
+
+pub fn vcleq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+    simd_le(a, b)
+}
+
+pub fn vcle_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
+    simd_le(a, b)
+}
+
+pub fn vcleq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
+    simd_le(a, b)
+}
+
+pub fn vcle_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
+    simd_le(a, b)
+}
+
+pub fn vcleq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
+    simd_le(a, b)
+}
diff --git a/testable-simd-models/src/core_arch/arm_shared/tests/mod.rs b/testable-simd-models/src/core_arch/arm_shared/tests/mod.rs
new file mode 100644
index 0000000000000..7ec0df1263b7f
--- /dev/null
+++ b/testable-simd-models/src/core_arch/arm_shared/tests/mod.rs
@@ -0,0 +1,112 @@
+//! Tests for intrinsics defined in `crate::core_arch::models::arm_shared`
+//!
+//! Each and every modelled intrinsic is tested against the Rust
+//! implementation here. For the most part, the tests work by
+//! generating random inputs, passing them as arguments
+//! both to the models in this crate and to the corresponding intrinsics
+//! in the Rust core, and then comparing their outputs.
+//!
+//! To add a test for a modelled intrinsic, go to the appropriate file and
+//! use the `mk!` macro to define it.
+//!
+//! A `mk!` macro invocation looks like the following,
+//! `mk!([<number of tests>]<intrinsic name>{<<const value 1>, <const value 2>, ...>}(<argument>: <type>, ...))`
+//!
+//! For example, some valid invocations are
+//!
+//! `mk!([100]_mm256_extracti128_si256{<0>,<1>}(a: BitVec));`
+//! `mk!(_mm256_extracti128_si256{<0>,<1>}(a: BitVec));`
+//! `mk!(_mm256_abs_epi16(a: BitVec));`
+//!
+//! The number of random tests is optional. If not provided, it is taken to be 1000 by default.
+//! The const values are necessary if the function has constant arguments, but should be omitted if not.
+//! The function name and the function arguments are necessary in all cases.
+//!
+//! Note: This only works if the function returns a bit-vector or `FunArray`. If it returns an integer, the
+//! test has to be written manually. It is recommended that the manually defined test follows
+//! the pattern of tests defined via the `mk!` invocation. It is also recommended that, when
+//! the intrinsic takes constant arguments, each and every possible constant value
+//! (up to a maximum of 255) that can be passed to the function be used for testing. The number
+//! of constant values to test depends on whether the Rust intrinsic statically asserts that the
+//! constant argument fits within a certain number of bits.
+
+pub mod neon;
+
+#[allow(non_camel_case_types)]
+mod types {
+    use crate::abstractions::simd::*;
+    pub type int32x4_t = i32x4;
+    pub type int64x1_t = i64x1;
+    pub type int64x2_t = i64x2;
+    pub type int16x8_t = i16x8;
+    pub type int8x16_t = i8x16;
+    pub type uint32x4_t = u32x4;
+    pub type uint64x1_t = u64x1;
+    pub type uint64x2_t = u64x2;
+    pub type uint16x8_t = u16x8;
+    pub type uint8x16_t = u8x16;
+    pub type int32x2_t = i32x2;
+    pub type int16x4_t = i16x4;
+    pub type int8x8_t = i8x8;
+    pub type uint32x2_t = u32x2;
+    pub type uint16x4_t = u16x4;
+    pub type uint8x8_t = u8x8;
+}
+
+pub(crate) mod upstream {
+    #[cfg(target_arch = "aarch64")]
+    pub use core::arch::aarch64::*;
+    #[cfg(target_arch = "arm")]
+    pub use core::arch::arm::*;
+}
+
+#[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
+pub mod conversions {
+    use super::upstream::*;
+
+    use super::types;
+    use crate::abstractions::bitvec::BitVec;
+    use crate::abstractions::funarr::FunArray;
+
+    macro_rules! convert{
+        ($($ty1:ident [$ty2:ty ; $n:literal]),*) => {
+            $(
+                impl From<$ty1> for types::$ty1 {
+                    fn from (arg: $ty1) -> types::$ty1 {
+                        let stuff = unsafe { *(&arg as *const $ty1 as *const [$ty2; $n])};
+                        FunArray::from_fn(|i|
+                            stuff[i as usize]
+                        )
+                    }
+                }
+                impl From<types::$ty1> for $ty1 {
+                    fn from (arg: types::$ty1) -> $ty1 {
+                        let bv: &[u8] = &(BitVec::from(arg)).to_vec()[..];
+                        unsafe {
+                            *(bv.as_ptr() as *const [$ty2; $n] as *const _)
+                        }
+                    }
+                }
+            )*
+        }
+    }
+
+    convert!(
+        int32x4_t [i32; 4],
+        int64x1_t [i64; 1],
+        int64x2_t [i64; 2],
+        int16x8_t [i16; 8],
+        int8x16_t [i8; 16],
+        uint32x4_t [u32; 4],
+        uint64x1_t [u64; 1],
+        uint64x2_t [u64; 2],
+        uint16x8_t [u16; 8],
+        uint8x16_t [u8; 16],
+        int32x2_t [i32; 2],
+        int16x4_t [i16; 4],
+        int8x8_t [i8; 8],
+        uint32x2_t [u32; 2],
+        uint16x4_t [u16; 4],
+        uint8x8_t [u8; 8]
+    );
+}
diff --git a/testable-simd-models/src/core_arch/arm_shared/tests/neon.rs b/testable-simd-models/src/core_arch/arm_shared/tests/neon.rs
new file mode 100644
index 0000000000000..e07d385f656f6
--- /dev/null
+++ b/testable-simd-models/src/core_arch/arm_shared/tests/neon.rs
@@ -0,0 +1,218 @@
+#[cfg(test)]
+use super::upstream;
+use crate::abstractions::funarr::FunArray;
+use crate::helpers::test::HasRandom;
+/// Derives a test for a given intrinsic, checking that the intrinsic and its model compute the same thing over random values (1000 by default).
+macro_rules! mk {
+    ($([$N:literal])?$name:ident$({$(<$($c:literal),*>),*})?($($x:ident : $ty:ident),*)) => {
+        #[test]
+        fn $name() {
+            #[allow(unused)]
+            const N: usize = {
+                let n: usize = 1000;
+                $(let n: usize = $N;)?
+ n + }; + mk!(@[N]$name$($(<$($c),*>)*)?($($x : $ty),*)); + } + }; + (@[$N:ident]$name:ident$(<$($c:literal),*>)?($($x:ident : $ty:ident),*)) => { + for _ in 0..$N { + $(let $x = $ty::random();)* + assert_eq!(super::super::models::neon::$name$(::<$($c,)*>)?($($x.into(),)*), unsafe { + FunArray::from(upstream::$name$(::<$($c,)*>)?($($x.into(),)*)).into() + }); + } + }; + (@[$N:ident]$name:ident<$($c1:literal),*>$(<$($c:literal),*>)*($($x:ident : $ty:ident),*)) => { + let one = || { + mk!(@[$N]$name<$($c1),*>($($x : $ty),*)); + }; + one(); + mk!(@[$N]$name$(<$($c),*>)*($($x : $ty),*)); + } + +} + +use super::types::*; +mk!(vaba_s16(a: int16x4_t, b: int16x4_t, c: int16x4_t)); +mk!(vaba_s32(a: int32x2_t, b: int32x2_t, c: int32x2_t)); +mk!(vaba_s8(a: int8x8_t, b: int8x8_t, c: int8x8_t)); +mk!(vaba_u16(a: uint16x4_t, b: uint16x4_t, c: uint16x4_t)); +mk!(vaba_u32(a: uint32x2_t, b: uint32x2_t, c: uint32x2_t)); +mk!(vaba_u8(a: uint8x8_t, b: uint8x8_t, c: uint8x8_t)); +mk!(vabal_u8(a: uint16x8_t, b: uint8x8_t, c: uint8x8_t)); +mk!(vabal_u16(a: uint32x4_t, b: uint16x4_t, c: uint16x4_t)); +mk!(vabal_u32(a: uint64x2_t, b: uint32x2_t, c: uint32x2_t)); +mk!(vabaq_s16(a: int16x8_t, b: int16x8_t, c: int16x8_t)); +mk!(vabaq_s32(a: int32x4_t, b: int32x4_t, c: int32x4_t)); +mk!(vabaq_s8(a: int8x16_t, b: int8x16_t, c: int8x16_t)); +mk!(vabaq_u16(a: uint16x8_t, b: uint16x8_t, c: uint16x8_t)); +mk!(vabaq_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t)); +mk!(vabaq_u8(a: uint8x16_t, b: uint8x16_t, c: uint8x16_t)); +mk!(vabd_s8(a: int8x8_t, b: int8x8_t)); +mk!(vabdq_s8(a: int8x16_t, b: int8x16_t)); +mk!(vabd_s16(a: int16x4_t, b: int16x4_t)); +mk!(vabdq_s16(a: int16x8_t, b: int16x8_t)); +mk!(vabd_s32(a: int32x2_t, b: int32x2_t)); +mk!(vabdq_s32(a: int32x4_t, b: int32x4_t)); +mk!(vabd_u8(a: uint8x8_t, b: uint8x8_t)); +mk!(vabdq_u8(a: uint8x16_t, b: uint8x16_t)); +mk!(vabd_u16(a: uint16x4_t, b: uint16x4_t)); +mk!(vabdq_u16(a: uint16x8_t, b: uint16x8_t)); +mk!(vabd_u32(a: uint32x2_t, b: uint32x2_t)); +mk!(vabdq_u32(a: uint32x4_t, b: uint32x4_t)); +mk!(vabdl_u8(a: uint8x8_t, b: uint8x8_t)); +mk!(vabdl_u16(a: uint16x4_t, b: uint16x4_t)); +mk!(vabdl_u32(a: uint32x2_t, b: uint32x2_t)); +mk!(vabs_s8(a: int8x8_t)); +mk!(vabsq_s8(a: int8x16_t)); +mk!(vabs_s16(a: int16x4_t)); +mk!(vabsq_s16(a: int16x8_t)); +mk!(vabs_s32(a: int32x2_t)); +mk!(vabsq_s32(a: int32x4_t)); +mk!(vadd_s16(a: int16x4_t, b: int16x4_t)); +mk!(vadd_s32(a: int32x2_t, b: int32x2_t)); +mk!(vadd_s8(a: int8x8_t, b: int8x8_t)); +mk!(vadd_u16(a: uint16x4_t, b: uint16x4_t)); +mk!(vadd_u32(a: uint32x2_t, b: uint32x2_t)); +mk!(vadd_u8(a: uint8x8_t, b: uint8x8_t)); +mk!(vaddq_s16(a: int16x8_t, b: int16x8_t)); +mk!(vaddq_s32(a: int32x4_t, b: int32x4_t)); +mk!(vaddq_s64(a: int64x2_t, b: int64x2_t)); +mk!(vaddq_s8(a: int8x16_t, b: int8x16_t)); +mk!(vaddq_u16(a: uint16x8_t, b: uint16x8_t)); +mk!(vaddq_u32(a: uint32x4_t, b: uint32x4_t)); +mk!(vaddq_u64(a: uint64x2_t, b: uint64x2_t)); +mk!(vaddq_u8(a: uint8x16_t, b: uint8x16_t)); +mk!(vaddhn_high_s16(r: int8x8_t, a: int16x8_t, b: int16x8_t)); +mk!(vaddhn_high_s32(r: int16x4_t, a: int32x4_t, b: int32x4_t)); +mk!(vaddhn_high_s64(r: int32x2_t, a: int64x2_t, b: int64x2_t)); +mk!(vaddhn_high_u16(r: uint8x8_t, a: uint16x8_t, b: uint16x8_t)); +mk!(vaddhn_high_u32(r: uint16x4_t, a: uint32x4_t, b: uint32x4_t)); +mk!(vaddhn_high_u64(r: uint32x2_t, a: uint64x2_t, b: uint64x2_t)); +mk!(vaddhn_s16(a: int16x8_t, b: int16x8_t)); +mk!(vaddhn_s32(a: int32x4_t, b: int32x4_t)); +mk!(vaddhn_s64(a: int64x2_t, b: int64x2_t)); 
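An editorial aside on the note in the tests-module documentation above: intrinsics that return a plain integer cannot be covered by `mk!` and need a hand-written test following the same generate-random-inputs-and-compare pattern. The sketch below is hypothetical and self-contained; it uses a toy model, a stand-in reference function, and a small inline generator rather than the crate's `HasRandom` and `upstream` helpers.

```rust
// Hypothetical sketch of a manually written test for an integer-returning
// operation, mirroring the structure of the `mk!`-generated tests.
fn model_sum_u8(a: [u8; 8]) -> u16 {
    // Toy "model": widening horizontal add across lanes.
    a.iter().map(|&x| x as u16).sum()
}

fn reference_sum_u8(a: [u8; 8]) -> u16 {
    // Stand-in for the upstream intrinsic the model would be compared against.
    a.iter().fold(0u16, |acc, &x| acc + x as u16)
}

#[test]
fn manual_integer_returning_test() {
    // Small deterministic generator so the sketch has no external dependencies.
    let mut state: u32 = 0x1234_5678;
    let mut next_u8 = move || {
        state = state.wrapping_mul(1_664_525).wrapping_add(1_013_904_223);
        (state >> 24) as u8
    };
    for _ in 0..1000 {
        let a: [u8; 8] = core::array::from_fn(|_| next_u8());
        assert_eq!(model_sum_u8(a), reference_sum_u8(a));
    }
}
```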
+mk!(vaddhn_u16(a: uint16x8_t, b: uint16x8_t)); +mk!(vaddhn_u32(a: uint32x4_t, b: uint32x4_t)); +mk!(vaddhn_u64(a: uint64x2_t, b: uint64x2_t)); +mk!(vaddl_high_s16(a: int16x8_t, b: int16x8_t)); +mk!(vaddl_high_s32(a: int32x4_t, b: int32x4_t)); +mk!(vaddl_high_s8(a: int8x16_t, b: int8x16_t)); +mk!(vaddl_high_u16(a: uint16x8_t, b: uint16x8_t)); +mk!(vaddl_high_u32(a: uint32x4_t, b: uint32x4_t)); +mk!(vaddl_high_u8(a: uint8x16_t, b: uint8x16_t)); +mk!(vaddl_s16(a: int16x4_t, b: int16x4_t)); +mk!(vaddl_s32(a: int32x2_t, b: int32x2_t)); +mk!(vaddl_s8(a: int8x8_t, b: int8x8_t)); +mk!(vaddl_u16(a: uint16x4_t, b: uint16x4_t)); +mk!(vaddl_u32(a: uint32x2_t, b: uint32x2_t)); +mk!(vaddl_u8(a: uint8x8_t, b: uint8x8_t)); +mk!(vaddw_high_s16(a: int32x4_t, b: int16x8_t)); +mk!(vaddw_high_s32(a: int64x2_t, b: int32x4_t)); +mk!(vaddw_high_s8(a: int16x8_t, b: int8x16_t)); +mk!(vaddw_high_u16(a: uint32x4_t, b: uint16x8_t)); +mk!(vaddw_high_u32(a: uint64x2_t, b: uint32x4_t)); +mk!(vaddw_high_u8(a: uint16x8_t, b: uint8x16_t)); +mk!(vaddw_s16(a: int32x4_t, b: int16x4_t)); +mk!(vaddw_s32(a: int64x2_t, b: int32x2_t)); +mk!(vaddw_s8(a: int16x8_t, b: int8x8_t)); +mk!(vaddw_u16(a: uint32x4_t, b: uint16x4_t)); +mk!(vaddw_u32(a: uint64x2_t, b: uint32x2_t)); +mk!(vaddw_u8(a: uint16x8_t, b: uint8x8_t)); +mk!(vand_s8(a: int8x8_t, b: int8x8_t)); +mk!(vandq_s8(a: int8x16_t, b: int8x16_t)); +mk!(vand_s16(a: int16x4_t, b: int16x4_t)); +mk!(vandq_s16(a: int16x8_t, b: int16x8_t)); +mk!(vand_s32(a: int32x2_t, b: int32x2_t)); +mk!(vandq_s32(a: int32x4_t, b: int32x4_t)); +mk!(vand_s64(a: int64x1_t, b: int64x1_t)); +mk!(vandq_s64(a: int64x2_t, b: int64x2_t)); +mk!(vand_u8(a: uint8x8_t, b: uint8x8_t)); +mk!(vandq_u8(a: uint8x16_t, b: uint8x16_t)); +mk!(vand_u16(a: uint16x4_t, b: uint16x4_t)); +mk!(vandq_u16(a: uint16x8_t, b: uint16x8_t)); +mk!(vand_u32(a: uint32x2_t, b: uint32x2_t)); +mk!(vandq_u32(a: uint32x4_t, b: uint32x4_t)); +mk!(vand_u64(a: uint64x1_t, b: uint64x1_t)); +mk!(vandq_u64(a: uint64x2_t, b: uint64x2_t)); +mk!(vbic_s16(a: int16x4_t, b: int16x4_t)); +mk!(vbic_s32(a: int32x2_t, b: int32x2_t)); +mk!(vbic_s8(a: int8x8_t, b: int8x8_t)); +mk!(vbicq_s16(a: int16x8_t, b: int16x8_t)); +mk!(vbicq_s32(a: int32x4_t, b: int32x4_t)); +mk!(vbicq_s64(a: int64x2_t, b: int64x2_t)); +mk!(vbicq_s8(a: int8x16_t, b: int8x16_t)); +mk!(vbic_u16(a: uint16x4_t, b: uint16x4_t)); +mk!(vbic_u32(a: uint32x2_t, b: uint32x2_t)); +mk!(vbic_u64(a: uint64x1_t, b: uint64x1_t)); +mk!(vbic_u8(a: uint8x8_t, b: uint8x8_t)); +mk!(vbicq_u16(a: uint16x8_t, b: uint16x8_t)); +mk!(vbicq_u32(a: uint32x4_t, b: uint32x4_t)); +mk!(vbicq_u64(a: uint64x2_t, b: uint64x2_t)); +mk!(vbicq_u8(a: uint8x16_t, b: uint8x16_t)); +mk!(vbsl_s16(a: uint16x4_t, b: int16x4_t, c: int16x4_t)); +mk!(vbsl_s32(a: uint32x2_t, b: int32x2_t, c: int32x2_t)); +mk!(vbsl_s64(a: uint64x1_t, b: int64x1_t, c: int64x1_t)); +mk!(vbsl_s8(a: uint8x8_t, b: int8x8_t, c: int8x8_t)); +mk!(vbslq_s16(a: uint16x8_t, b: int16x8_t, c: int16x8_t)); +mk!(vbslq_s32(a: uint32x4_t, b: int32x4_t, c: int32x4_t)); +mk!(vbslq_s64(a: uint64x2_t, b: int64x2_t, c: int64x2_t)); +mk!(vbslq_s8(a: uint8x16_t, b: int8x16_t, c: int8x16_t)); +mk!(vbsl_u16(a: uint16x4_t, b: uint16x4_t, c: uint16x4_t)); +mk!(vbsl_u32(a: uint32x2_t, b: uint32x2_t, c: uint32x2_t)); +mk!(vbsl_u64(a: uint64x1_t, b: uint64x1_t, c: uint64x1_t)); +mk!(vbsl_u8(a: uint8x8_t, b: uint8x8_t, c: uint8x8_t)); +mk!(vbslq_u16(a: uint16x8_t, b: uint16x8_t, c: uint16x8_t)); +mk!(vbslq_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t)); +mk!(vbslq_u64(a: uint64x2_t, 
b: uint64x2_t, c: uint64x2_t)); +mk!(vbslq_u8(a: uint8x16_t, b: uint8x16_t, c: uint8x16_t)); +mk!(vceq_s8(a: int8x8_t, b: int8x8_t)); +mk!(vceqq_s8(a: int8x16_t, b: int8x16_t)); +mk!(vceq_s16(a: int16x4_t, b: int16x4_t)); +mk!(vceqq_s16(a: int16x8_t, b: int16x8_t)); +mk!(vceq_s32(a: int32x2_t, b: int32x2_t)); +mk!(vceqq_s32(a: int32x4_t, b: int32x4_t)); +mk!(vceq_u8(a: uint8x8_t, b: uint8x8_t)); +mk!(vceqq_u8(a: uint8x16_t, b: uint8x16_t)); +mk!(vceq_u16(a: uint16x4_t, b: uint16x4_t)); +mk!(vceqq_u16(a: uint16x8_t, b: uint16x8_t)); +mk!(vceq_u32(a: uint32x2_t, b: uint32x2_t)); +mk!(vceqq_u32(a: uint32x4_t, b: uint32x4_t)); +mk!(vcge_s8(a: int8x8_t, b: int8x8_t)); +mk!(vcgeq_s8(a: int8x16_t, b: int8x16_t)); +mk!(vcge_s16(a: int16x4_t, b: int16x4_t)); +mk!(vcgeq_s16(a: int16x8_t, b: int16x8_t)); +mk!(vcge_s32(a: int32x2_t, b: int32x2_t)); +mk!(vcgeq_s32(a: int32x4_t, b: int32x4_t)); +mk!(vcge_u8(a: uint8x8_t, b: uint8x8_t)); +mk!(vcgeq_u8(a: uint8x16_t, b: uint8x16_t)); +mk!(vcge_u16(a: uint16x4_t, b: uint16x4_t)); +mk!(vcgeq_u16(a: uint16x8_t, b: uint16x8_t)); +mk!(vcge_u32(a: uint32x2_t, b: uint32x2_t)); +mk!(vcgeq_u32(a: uint32x4_t, b: uint32x4_t)); +mk!(vcgt_s8(a: int8x8_t, b: int8x8_t)); +mk!(vcgtq_s8(a: int8x16_t, b: int8x16_t)); +mk!(vcgt_s16(a: int16x4_t, b: int16x4_t)); +mk!(vcgtq_s16(a: int16x8_t, b: int16x8_t)); +mk!(vcgt_s32(a: int32x2_t, b: int32x2_t)); +mk!(vcgtq_s32(a: int32x4_t, b: int32x4_t)); +mk!(vcgt_u8(a: uint8x8_t, b: uint8x8_t)); +mk!(vcgtq_u8(a: uint8x16_t, b: uint8x16_t)); +mk!(vcgt_u16(a: uint16x4_t, b: uint16x4_t)); +mk!(vcgtq_u16(a: uint16x8_t, b: uint16x8_t)); +mk!(vcgt_u32(a: uint32x2_t, b: uint32x2_t)); +mk!(vcgtq_u32(a: uint32x4_t, b: uint32x4_t)); +mk!(vcle_s8(a: int8x8_t, b: int8x8_t)); +mk!(vcleq_s8(a: int8x16_t, b: int8x16_t)); +mk!(vcle_s16(a: int16x4_t, b: int16x4_t)); +mk!(vcleq_s16(a: int16x8_t, b: int16x8_t)); +mk!(vcle_s32(a: int32x2_t, b: int32x2_t)); +mk!(vcleq_s32(a: int32x4_t, b: int32x4_t)); +mk!(vcle_u8(a: uint8x8_t, b: uint8x8_t)); +mk!(vcleq_u8(a: uint8x16_t, b: uint8x16_t)); +mk!(vcle_u16(a: uint16x4_t, b: uint16x4_t)); +mk!(vcleq_u16(a: uint16x8_t, b: uint16x8_t)); +mk!(vcle_u32(a: uint32x2_t, b: uint32x2_t)); +mk!(vcleq_u32(a: uint32x4_t, b: uint32x4_t)); diff --git a/testable-simd-models/src/core_arch/x86/mod.rs b/testable-simd-models/src/core_arch/x86/mod.rs new file mode 100644 index 0000000000000..3c5cd51d9c56b --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/mod.rs @@ -0,0 +1,4 @@ +pub mod models; +#[cfg(test)] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +mod tests; diff --git a/testable-simd-models/src/core_arch/x86/models/avx.rs b/testable-simd-models/src/core_arch/x86/models/avx.rs new file mode 100644 index 0000000000000..8e2fb37319d36 --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/models/avx.rs @@ -0,0 +1,1828 @@ +//! Advanced Vector Extensions (AVX) +//! +//! The references are: +//! +//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2: +//! Instruction Set Reference, A-Z][intel64_ref]. - [AMD64 Architecture +//! Programmer's Manual, Volume 3: General-Purpose and System +//! Instructions][amd64_ref]. +//! +//! [Wikipedia][wiki] provides a quick overview of the instructions available. +//! +//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf +//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf +//! 
[wiki]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions + +use super::avx_handwritten::*; +use super::sse::*; +use super::sse2::*; +use super::types::*; +use crate::abstractions::simd::*; +use crate::abstractions::utilities::*; + +/// Adds packed double-precision (64-bit) floating-point elements +/// in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_add_pd) +// NOTE: Not modeled yet +// pub fn _mm256_add_pd(a: __m256d, b: __m256d) -> __m256d { +// { transmute(simd_add(a.as_f64x4(), b.as_f64x4())) } +// } + +/// Adds packed single-precision (32-bit) floating-point elements in `a` and +/// `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_add_ps) +// NOTE: Not modeled yet +// pub fn _mm256_add_ps(a: __m256, b: __m256) -> __m256 { +// { transmute(simd_add(a.as_f32x8(), b.as_f32x8())) } +// } + +/// Computes the bitwise AND of a packed double-precision (64-bit) +/// floating-point elements in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_and_pd) +pub fn _mm256_and_pd(a: __m256d, b: __m256d) -> __m256d { + { + let a: u64x4 = transmute(a); + let b: u64x4 = transmute(b); + transmute(simd_and(a, b)) + } +} +/// Computes the bitwise AND of packed single-precision (32-bit) floating-point +/// elements in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_and_ps) +pub fn _mm256_and_ps(a: __m256, b: __m256) -> __m256 { + { + let a: u32x8 = transmute(a); + let b: u32x8 = transmute(b); + transmute(simd_and(a, b)) + } +} +/// Computes the bitwise OR packed double-precision (64-bit) floating-point +/// elements in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_or_pd) +pub fn _mm256_or_pd(a: __m256d, b: __m256d) -> __m256d { + { + let a: u64x4 = transmute(a); + let b: u64x4 = transmute(b); + transmute(simd_or(a, b)) + } +} +/// Computes the bitwise OR packed single-precision (32-bit) floating-point +/// elements in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_or_ps) +pub fn _mm256_or_ps(a: __m256, b: __m256) -> __m256 { + { + let a: u32x8 = transmute(a); + let b: u32x8 = transmute(b); + transmute(simd_or(a, b)) + } +} +/// Shuffles double-precision (64-bit) floating-point elements within 128-bit +/// lanes using the control in `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_shuffle_pd) +pub fn _mm256_shuffle_pd(a: __m256d, b: __m256d) -> __m256d { + static_assert_uimm_bits!(MASK, 8); + { + transmute(simd_shuffle( + a.as_f64x4(), + b.as_f64x4(), + [ + MASK as u32 & 0b1, + ((MASK as u32 >> 1) & 0b1) + 4, + ((MASK as u32 >> 2) & 0b1) + 2, + ((MASK as u32 >> 3) & 0b1) + 6, + ], + )) + } +} +/// Shuffles single-precision (32-bit) floating-point elements in `a` within +/// 128-bit lanes using the control in `imm8`. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_shuffle_ps) +pub fn _mm256_shuffle_ps(a: __m256, b: __m256) -> __m256 { + static_assert_uimm_bits!(MASK, 8); + { + transmute(simd_shuffle( + a.as_f32x8(), + b.as_f32x8(), + [ + MASK as u32 & 0b11, + (MASK as u32 >> 2) & 0b11, + ((MASK as u32 >> 4) & 0b11) + 8, + ((MASK as u32 >> 6) & 0b11) + 8, + (MASK as u32 & 0b11) + 4, + ((MASK as u32 >> 2) & 0b11) + 4, + ((MASK as u32 >> 4) & 0b11) + 12, + ((MASK as u32 >> 6) & 0b11) + 12, + ], + )) + } +} +/// Computes the bitwise NOT of packed double-precision (64-bit) floating-point +/// elements in `a`, and then AND with `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_andnot_pd) +pub fn _mm256_andnot_pd(a: __m256d, b: __m256d) -> __m256d { + { + let a: u64x4 = transmute(a); + let b: u64x4 = transmute(b); + transmute(simd_and(simd_xor(u64x4::splat(!(0_u64)), a), b)) + } +} +/// Computes the bitwise NOT of packed single-precision (32-bit) floating-point +/// elements in `a` +/// and then AND with `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_andnot_ps) +pub fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 { + { + let a: u32x8 = transmute(a); + let b: u32x8 = transmute(b); + transmute(simd_and(simd_xor(u32x8::splat(!(0_u32)), a), b)) + } +} +/// Compares packed double-precision (64-bit) floating-point elements +/// in `a` and `b`, and returns packed maximum values +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_max_pd) +// NOTE: Not modeled yet +// pub fn _mm256_max_pd(a: __m256d, b: __m256d) -> __m256d { +// { vmaxpd(a, b) } +// } + +/// Compares packed single-precision (32-bit) floating-point elements in `a` +/// and `b`, and returns packed maximum values +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_max_ps) +// NOTE: Not modeled yet +// pub fn _mm256_max_ps(a: __m256, b: __m256) -> __m256 { +// { vmaxps(a, b) } +// } + +/// Compares packed double-precision (64-bit) floating-point elements +/// in `a` and `b`, and returns packed minimum values +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_min_pd) +// NOTE: Not modeled yet +// pub fn _mm256_min_pd(a: __m256d, b: __m256d) -> __m256d { +// { vminpd(a, b) } +// } + +/// Compares packed single-precision (32-bit) floating-point elements in `a` +/// and `b`, and returns packed minimum values +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_min_ps) +// NOTE: Not modeled yet +// pub fn _mm256_min_ps(a: __m256, b: __m256) -> __m256 { +// { vminps(a, b) } +// } + +/// Multiplies packed double-precision (64-bit) floating-point elements +/// in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_mul_pd) +// NOTE: Not modeled yet +// pub fn _mm256_mul_pd(a: __m256d, b: __m256d) -> __m256d { +// { transmute(simd_mul(a.as_f64x4(), b.as_f64x4())) } +// } + +/// Multiplies packed single-precision (32-bit) floating-point elements in `a` and +/// `b`. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_mul_ps) +// NOTE: Not modeled yet +// pub fn _mm256_mul_ps(a: __m256, b: __m256) -> __m256 { +// { transmute(simd_mul(a.as_f32x8(), b.as_f32x8())) } +// } + +/// Alternatively adds and subtracts packed double-precision (64-bit) +/// floating-point elements in `a` to/from packed elements in `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_addsub_pd) +// NOTE: Not modeled yet +// pub fn _mm256_addsub_pd(a: __m256d, b: __m256d) -> __m256d { +// { +// let a = a.as_f64x4(); +// let b = b.as_f64x4(); +// let add = simd_add(a, b); +// let sub = simd_sub(a, b); +// simd_shuffle(add, sub, [4, 1, 6, 3]) +// } +// } + +/// Alternatively adds and subtracts packed single-precision (32-bit) +/// floating-point elements in `a` to/from packed elements in `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_addsub_ps) +// NOTE: Not modeled yet +// pub fn _mm256_addsub_ps(a: __m256, b: __m256) -> __m256 { +// { +// let a = a.as_f32x8(); +// let b = b.as_f32x8(); +// let add = simd_add(a, b); +// let sub = simd_sub(a, b); +// simd_shuffle(add, sub, [8, 1, 10, 3, 12, 5, 14, 7]) +// } +// } + +/// Subtracts packed double-precision (64-bit) floating-point elements in `b` +/// from packed elements in `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_sub_pd) +// NOTE: Not modeled yet +// pub fn _mm256_sub_pd(a: __m256d, b: __m256d) -> __m256d { +// { simd_sub(a, b) } +// } + +/// Subtracts packed single-precision (32-bit) floating-point elements in `b` +/// from packed elements in `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_sub_ps) +// NOTE: Not modeled yet +// pub fn _mm256_sub_ps(a: __m256, b: __m256) -> __m256 { +// { simd_sub(a, b) } +// } + +/// Computes the division of each of the 8 packed 32-bit floating-point elements +/// in `a` by the corresponding packed elements in `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_div_ps) +// NOTE: Not modeled yet +// pub fn _mm256_div_ps(a: __m256, b: __m256) -> __m256 { +// { simd_div(a, b) } +// } + +/// Computes the division of each of the 4 packed 64-bit floating-point elements +/// in `a` by the corresponding packed elements in `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_div_pd) +// NOTE: Not modeled yet +// pub fn _mm256_div_pd(a: __m256d, b: __m256d) -> __m256d { +// { simd_div(a, b) } +// } + +/// Rounds packed double-precision (64-bit) floating point elements in `a` +/// according to the flag `ROUNDING`. The value of `ROUNDING` may be as follows: +/// +/// - `0x00`: Round to the nearest whole number. +/// - `0x01`: Round down, toward negative infinity. +/// - `0x02`: Round up, toward positive infinity. +/// - `0x03`: Truncate the values. +/// +/// For a complete list of options, check [the LLVM docs][llvm_docs]. 
+/// +/// [llvm_docs]: https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382 +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_round_pd) +// NOTE: Not modeled yet +// pub fn _mm256_round_pd(a: __m256d) -> __m256d { +// static_assert_uimm_bits!(ROUNDING, 4); +// { roundpd256(a, ROUNDING) } +// } + +/// Rounds packed double-precision (64-bit) floating point elements in `a` +/// toward positive infinity. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_ceil_pd) +// NOTE: Not modeled yet +// pub fn _mm256_ceil_pd(a: __m256d) -> __m256d { +// { simd_ceil(a) } +// } + +/// Rounds packed double-precision (64-bit) floating point elements in `a` +/// toward negative infinity. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_floor_pd) +// NOTE: Not modeled yet +// pub fn _mm256_floor_pd(a: __m256d) -> __m256d { +// { simd_floor(a) } +// } + +/// Rounds packed single-precision (32-bit) floating point elements in `a` +/// according to the flag `ROUNDING`. The value of `ROUNDING` may be as follows: +/// +/// - `0x00`: Round to the nearest whole number. +/// - `0x01`: Round down, toward negative infinity. +/// - `0x02`: Round up, toward positive infinity. +/// - `0x03`: Truncate the values. +/// +/// For a complete list of options, check [the LLVM docs][llvm_docs]. +/// +/// [llvm_docs]: https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382 +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_round_ps) +// NOTE: Not modeled yet +// pub fn _mm256_round_ps(a: __m256) -> __m256 { +// static_assert_uimm_bits!(ROUNDING, 4); +// { roundps256(a, ROUNDING) } +// } + +/// Rounds packed single-precision (32-bit) floating point elements in `a` +/// toward positive infinity. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_ceil_ps) +// NOTE: Not modeled yet +// pub fn _mm256_ceil_ps(a: __m256) -> __m256 { +// { simd_ceil(a) } +// } + +/// Rounds packed single-precision (32-bit) floating point elements in `a` +/// toward negative infinity. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_floor_ps) +// NOTE: Not modeled yet +// pub fn _mm256_floor_ps(a: __m256) -> __m256 { +// { simd_floor(a) } +// } + +/// Returns the square root of packed single-precision (32-bit) floating point +/// elements in `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_sqrt_ps) +// NOTE: Not modeled yet +// pub fn _mm256_sqrt_ps(a: __m256) -> __m256 { +// { simd_fsqrt(a) } +// } + +/// Returns the square root of packed double-precision (64-bit) floating point +/// elements in `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_sqrt_pd) +// NOTE: Not modeled yet +// pub fn _mm256_sqrt_pd(a: __m256d) -> __m256d { +// { simd_fsqrt(a) } +// } + +/// Blends packed double-precision (64-bit) floating-point elements from +/// `a` and `b` using control mask `imm8`. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_blend_pd) +pub fn _mm256_blend_pd(a: __m256d, b: __m256d) -> __m256d { + static_assert_uimm_bits!(IMM4, 4); + { + transmute(simd_shuffle( + a.as_f64x4(), + b.as_f64x4(), + [ + ((IMM4 as u32 >> 0) & 1) * 4 + 0, + ((IMM4 as u32 >> 1) & 1) * 4 + 1, + ((IMM4 as u32 >> 2) & 1) * 4 + 2, + ((IMM4 as u32 >> 3) & 1) * 4 + 3, + ], + )) + } +} +/// Blends packed single-precision (32-bit) floating-point elements from +/// `a` and `b` using control mask `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_blend_ps) +pub fn _mm256_blend_ps(a: __m256, b: __m256) -> __m256 { + static_assert_uimm_bits!(IMM8, 8); + { + transmute(simd_shuffle( + a.as_f32x8(), + b.as_f32x8(), + [ + ((IMM8 as u32 >> 0) & 1) * 8 + 0, + ((IMM8 as u32 >> 1) & 1) * 8 + 1, + ((IMM8 as u32 >> 2) & 1) * 8 + 2, + ((IMM8 as u32 >> 3) & 1) * 8 + 3, + ((IMM8 as u32 >> 4) & 1) * 8 + 4, + ((IMM8 as u32 >> 5) & 1) * 8 + 5, + ((IMM8 as u32 >> 6) & 1) * 8 + 6, + ((IMM8 as u32 >> 7) & 1) * 8 + 7, + ], + )) + } +} +/// Blends packed double-precision (64-bit) floating-point elements from +/// `a` and `b` using `c` as a mask. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_blendv_pd) +pub fn _mm256_blendv_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d { + { + let mask: i64x4 = simd_lt(transmute::<_, i64x4>(c), i64x4::ZERO()); + transmute(simd_select(mask, b.as_f64x4(), a.as_f64x4())) + } +} +/// Blends packed single-precision (32-bit) floating-point elements from +/// `a` and `b` using `c` as a mask. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_blendv_ps) +pub fn _mm256_blendv_ps(a: __m256, b: __m256, c: __m256) -> __m256 { + { + let mask: i32x8 = simd_lt(transmute::<_, i32x8>(c), i32x8::ZERO()); + transmute(simd_select(mask, b.as_f32x8(), a.as_f32x8())) + } +} +/// Conditionally multiplies the packed single-precision (32-bit) floating-point +/// elements in `a` and `b` using the high 4 bits in `imm8`, +/// sum the four products, and conditionally return the sum +/// using the low 4 bits of `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_dp_ps) +// NOTE: Not modeled yet +// pub fn _mm256_dp_ps(a: __m256, b: __m256) -> __m256 { +// static_assert_uimm_bits!(IMM8, 8); +// { vdpps(a, b, IMM8 as i8) } +// } + +/// Horizontal addition of adjacent pairs in the two packed vectors +/// of 4 64-bit floating points `a` and `b`. +/// In the result, sums of elements from `a` are returned in even locations, +/// while sums of elements from `b` are returned in odd locations. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_hadd_pd) +// NOTE: Not modeled yet +// pub fn _mm256_hadd_pd(a: __m256d, b: __m256d) -> __m256d { +// { vhaddpd(a, b) } +// } + +/// Horizontal addition of adjacent pairs in the two packed vectors +/// of 8 32-bit floating points `a` and `b`. +/// In the result, sums of elements from `a` are returned in locations of +/// indices 0, 1, 4, 5; while sums of elements from `b` are locations +/// 2, 3, 6, 7. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_ps) +// NOTE: Not modeled yet +// pub fn _mm256_hadd_ps(a: __m256, b: __m256) -> __m256 { +// { vhaddps(a, b) } +// } + +/// Horizontal subtraction of adjacent pairs in the two packed vectors +/// of 4 64-bit floating points `a` and `b`. +/// In the result, differences of elements from `a` are returned in even locations, +/// while differences of elements from `b` are returned in odd locations. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_pd) +// NOTE: Not modeled yet +// pub fn _mm256_hsub_pd(a: __m256d, b: __m256d) -> __m256d { +// { vhsubpd(a, b) } +// } + +/// Horizontal subtraction of adjacent pairs in the two packed vectors +/// of 8 32-bit floating points `a` and `b`. +/// In the result, differences of elements from `a` are returned in locations of +/// indices 0, 1, 4, 5; while differences of elements from `b` are in locations +/// 2, 3, 6, 7. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_ps) +// NOTE: Not modeled yet +// pub fn _mm256_hsub_ps(a: __m256, b: __m256) -> __m256 { +// { vhsubps(a, b) } +// } + +/// Computes the bitwise XOR of packed double-precision (64-bit) floating-point +/// elements in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_xor_pd) +pub fn _mm256_xor_pd(a: __m256d, b: __m256d) -> __m256d { + { + let a: u64x4 = transmute(a); + let b: u64x4 = transmute(b); + transmute(simd_xor(a, b)) + } +} +/// Computes the bitwise XOR of packed single-precision (32-bit) floating-point +/// elements in `a` and `b`.
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_xor_ps) +pub fn _mm256_xor_ps(a: __m256, b: __m256) -> __m256 { + { + let a: u32x8 = transmute(a); + let b: u32x8 = transmute(b); + transmute(simd_xor(a, b)) + } +} +/// Equal (ordered, non-signaling) +pub const _CMP_EQ_OQ: i32 = 0x00; +/// Less-than (ordered, signaling) +pub const _CMP_LT_OS: i32 = 0x01; +/// Less-than-or-equal (ordered, signaling) +pub const _CMP_LE_OS: i32 = 0x02; +/// Unordered (non-signaling) +pub const _CMP_UNORD_Q: i32 = 0x03; +/// Not-equal (unordered, non-signaling) +pub const _CMP_NEQ_UQ: i32 = 0x04; +/// Not-less-than (unordered, signaling) +pub const _CMP_NLT_US: i32 = 0x05; +/// Not-less-than-or-equal (unordered, signaling) +pub const _CMP_NLE_US: i32 = 0x06; +/// Ordered (non-signaling) +pub const _CMP_ORD_Q: i32 = 0x07; +/// Equal (unordered, non-signaling) +pub const _CMP_EQ_UQ: i32 = 0x08; +/// Not-greater-than-or-equal (unordered, signaling) +pub const _CMP_NGE_US: i32 = 0x09; +/// Not-greater-than (unordered, signaling) +pub const _CMP_NGT_US: i32 = 0x0a; +/// False (ordered, non-signaling) +pub const _CMP_FALSE_OQ: i32 = 0x0b; +/// Not-equal (ordered, non-signaling) +pub const _CMP_NEQ_OQ: i32 = 0x0c; +/// Greater-than-or-equal (ordered, signaling) +pub const _CMP_GE_OS: i32 = 0x0d; +/// Greater-than (ordered, signaling) +pub const _CMP_GT_OS: i32 = 0x0e; +/// True (unordered, non-signaling) +pub const _CMP_TRUE_UQ: i32 = 0x0f; +/// Equal (ordered, signaling) +pub const _CMP_EQ_OS: i32 = 0x10; +/// Less-than (ordered, non-signaling) +pub const _CMP_LT_OQ: i32 = 0x11; +/// Less-than-or-equal (ordered, non-signaling) +pub const _CMP_LE_OQ: i32 = 0x12; +/// Unordered (signaling) +pub const _CMP_UNORD_S: i32 = 0x13; +/// Not-equal (unordered, signaling) +pub const _CMP_NEQ_US: i32 = 0x14; +/// Not-less-than (unordered, non-signaling) +pub const _CMP_NLT_UQ: i32 = 0x15; +/// Not-less-than-or-equal (unordered, non-signaling) +pub const _CMP_NLE_UQ: i32 = 0x16; +/// Ordered (signaling) +pub const _CMP_ORD_S: i32 = 0x17; +/// Equal (unordered, signaling) +pub const _CMP_EQ_US: i32 = 0x18; +/// Not-greater-than-or-equal (unordered, non-signaling) +pub const _CMP_NGE_UQ: i32 = 0x19; +/// Not-greater-than (unordered, non-signaling) +pub const _CMP_NGT_UQ: i32 = 0x1a; +/// False (ordered, signaling) +pub const _CMP_FALSE_OS: i32 = 0x1b; +/// Not-equal (ordered, signaling) +pub const _CMP_NEQ_OS: i32 = 0x1c; +/// Greater-than-or-equal (ordered, non-signaling) +pub const _CMP_GE_OQ: i32 = 0x1d; +/// Greater-than (ordered, non-signaling) +pub const _CMP_GT_OQ: i32 = 0x1e; +/// True (unordered, signaling) +pub const _CMP_TRUE_US: i32 = 0x1f; +/// Compares packed double-precision (64-bit) floating-point +/// elements in `a` and `b` based on the comparison operand +/// specified by `IMM5`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm_cmp_pd) +// NOTE: Not modeled yet +// pub fn _mm_cmp_pd(a: __m128d, b: __m128d) -> __m128d { +// static_assert_uimm_bits!(IMM5, 5); +// { vcmppd(a, b, const { IMM5 as i8 }) } +// } + +/// Compares packed double-precision (64-bit) floating-point +/// elements in `a` and `b` based on the comparison operand +/// specified by `IMM5`. 
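The XOR models above reinterpret the float lanes as unsigned integers before calling `simd_xor`, because XOR is defined on bit patterns rather than on floating-point values. A scalar sketch of the same idea using `to_bits`/`from_bits` (illustrative only, not the crate's API):

```rust
/// Scalar reference for the lane-wise behaviour of the `_mm256_xor_ps`
/// model: reinterpret each f32 as raw bits, XOR, reinterpret back.
fn xor_f32(a: f32, b: f32) -> f32 {
    f32::from_bits(a.to_bits() ^ b.to_bits())
}

fn main() {
    // XOR-ing with a sign-bit mask flips the sign without touching the
    // other bits, a common use of the *_xor_ps intrinsics.
    let sign_mask = f32::from_bits(0x8000_0000);
    assert_eq!(xor_f32(1.5, sign_mask), -1.5);
    // XOR-ing a value with itself gives the all-zero bit pattern.
    assert_eq!(xor_f32(42.0, 42.0).to_bits(), 0);
}
```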
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_cmp_pd) +// NOTE: Not modeled yet +// pub fn _mm256_cmp_pd(a: __m256d, b: __m256d) -> __m256d { +// static_assert_uimm_bits!(IMM5, 5); +// { vcmppd256(a, b, IMM5 as u8) } +// } + +/// Compares packed single-precision (32-bit) floating-point +/// elements in `a` and `b` based on the comparison operand +/// specified by `IMM5`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm_cmp_ps) +// NOTE: Not modeled yet +// pub fn _mm_cmp_ps(a: __m128, b: __m128) -> __m128 { +// static_assert_uimm_bits!(IMM5, 5); +// { vcmpps(a, b, const { IMM5 as i8 }) } +// } + +/// Compares packed single-precision (32-bit) floating-point +/// elements in `a` and `b` based on the comparison operand +/// specified by `IMM5`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_cmp_ps) +// NOTE: Not modeled yet +// pub fn _mm256_cmp_ps(a: __m256, b: __m256) -> __m256 { +// static_assert_uimm_bits!(IMM5, 5); +// { vcmpps256(a, b, const { IMM5 as u8 }) } +// } + +/// Compares the lower double-precision (64-bit) floating-point element in +/// `a` and `b` based on the comparison operand specified by `IMM5`, +/// store the result in the lower element of returned vector, +/// and copies the upper element from `a` to the upper element of returned +/// vector. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm_cmp_sd) +// NOTE: Not modeled yet +// pub fn _mm_cmp_sd(a: __m128d, b: __m128d) -> __m128d { +// static_assert_uimm_bits!(IMM5, 5); +// { vcmpsd(a, b, IMM5 as i8) } +// } + +/// Compares the lower single-precision (32-bit) floating-point element in +/// `a` and `b` based on the comparison operand specified by `IMM5`, +/// store the result in the lower element of returned vector, +/// and copies the upper 3 packed elements from `a` to the upper elements of +/// returned vector. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm_cmp_ss) +// NOTE: Not modeled yet +// pub fn _mm_cmp_ss(a: __m128, b: __m128) -> __m128 { +// static_assert_uimm_bits!(IMM5, 5); +// { vcmpss(a, b, IMM5 as i8) } +// } + +/// Converts packed 32-bit integers in `a` to packed double-precision (64-bit) +/// floating-point elements. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_cvtepi32_pd) +pub fn _mm256_cvtepi32_pd(a: __m128i) -> __m256d { + transmute(simd_cast::<4, i32, f64>(a.as_i32x4())) +} +/// Converts packed 32-bit integers in `a` to packed single-precision (32-bit) +/// floating-point elements. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_cvtepi32_ps) +pub fn _mm256_cvtepi32_ps(a: __m256i) -> __m256 { + transmute(simd_cast::<8, _, f32>(a.as_i32x8())) +} +/// Converts packed double-precision (64-bit) floating-point elements in `a` +/// to packed single-precision (32-bit) floating-point elements. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_cvtpd_ps) +pub fn _mm256_cvtpd_ps(a: __m256d) -> __m128 { + transmute(simd_cast::<4, _, f32>(a.as_f64x4())) +} +/// Converts packed single-precision (32-bit) floating-point elements in `a` +/// to packed 32-bit integers. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_cvtps_epi32) +// NOTE: Not modeled yet +// pub fn _mm256_cvtps_epi32(a: __m256) -> __m256i { +// { transmute(vcvtps2dq(a)) } +// } + +/// Converts packed single-precision (32-bit) floating-point elements in `a` +/// to packed double-precision (64-bit) floating-point elements. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_cvtps_pd) +pub fn _mm256_cvtps_pd(a: __m128) -> __m256d { + transmute(simd_cast::<4, _, f64>(a.as_f32x4())) +} +/// Returns the first element of the input vector of `[4 x double]`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_cvtsd_f64) +pub fn _mm256_cvtsd_f64(a: __m256d) -> f64 { + simd_extract(a.as_f64x4(), 0) +} + +/// Converts packed double-precision (64-bit) floating-point elements in `a` +/// to packed 32-bit integers with truncation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_cvttpd_epi32) +// NOTE: Not modeled yet +// pub fn _mm256_cvttpd_epi32(a: __m256d) -> __m128i { +// { transmute(vcvttpd2dq(a)) } +// } + +/// Converts packed double-precision (64-bit) floating-point elements in `a` +/// to packed 32-bit integers. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_cvtpd_epi32) +// NOTE: Not modeled yet +// pub fn _mm256_cvtpd_epi32(a: __m256d) -> __m128i { +// { transmute(vcvtpd2dq(a)) } +// } + +/// Converts packed single-precision (32-bit) floating-point elements in `a` +/// to packed 32-bit integers with truncation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_cvttps_epi32) +// NOTE: Not modeled yet +// pub fn _mm256_cvttps_epi32(a: __m256) -> __m256i { +// { transmute(vcvttps2dq(a)) } +// } + +/// Extracts 128 bits (composed of 4 packed single-precision (32-bit) +/// floating-point elements) from `a`, selected with `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_extractf128_ps) +pub fn _mm256_extractf128_ps(a: __m256) -> __m128 { + static_assert_uimm_bits!(IMM1, 1); + { + transmute(simd_shuffle( + a.as_f32x8(), + _mm256_undefined_ps().as_f32x8(), + [[0, 1, 2, 3], [4, 5, 6, 7]][IMM1 as usize], + )) + } +} +/// Extracts 128 bits (composed of 2 packed double-precision (64-bit) +/// floating-point elements) from `a`, selected with `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_extractf128_pd) +pub fn _mm256_extractf128_pd(a: __m256d) -> __m128d { + static_assert_uimm_bits!(IMM1, 1); + transmute(simd_shuffle( + a.as_f64x4(), + _mm256_undefined_pd().as_f64x4(), + [[0, 1], [2, 3]][IMM1 as usize], + )) +} +/// Extracts 128 bits (composed of integer data) from `a`, selected with `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_extractf128_si256) +pub fn _mm256_extractf128_si256(a: __m256i) -> __m128i { + static_assert_uimm_bits!(IMM1, 1); + { + let dst: i64x2 = simd_shuffle(a.as_i64x4(), i64x4::ZERO(), [[0, 1], [2, 3]][IMM1 as usize]); + transmute(dst) + } +} +/// Extracts a 32-bit integer from `a`, selected with `INDEX`. 
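The `_mm256_extractf128_*` models above reduce to picking either the low or the high 128-bit half, i.e. lanes `0..4` or `4..8` of an 8-lane single-precision vector. A standalone scalar sketch (helper name and arrays are illustrative):

```rust
/// Scalar reference for the `_mm256_extractf128_ps` model: IMM1 = 0
/// keeps lanes 0..4 (the low 128 bits), IMM1 = 1 keeps lanes 4..8.
fn extract_half(imm1: usize, a: [f32; 8]) -> [f32; 4] {
    let base = imm1 * 4;
    [a[base], a[base + 1], a[base + 2], a[base + 3]]
}

fn main() {
    let a = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0];
    assert_eq!(extract_half(0, a), [0.0, 1.0, 2.0, 3.0]);
    assert_eq!(extract_half(1, a), [4.0, 5.0, 6.0, 7.0]);
}
```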
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_extract_epi32) +pub fn _mm256_extract_epi32(a: __m256i) -> i32 { + static_assert_uimm_bits!(INDEX, 3); + simd_extract(a.as_i32x8(), INDEX as u32) +} +/// Returns the first element of the input vector of `[8 x i32]`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_cvtsi256_si32) +pub fn _mm256_cvtsi256_si32(a: __m256i) -> i32 { + simd_extract(a.as_i32x8(), 0) +} +/// Zeroes the contents of all XMM or YMM registers. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_zeroall) +// NOTE: Not modeled yet +// pub fn _mm256_zeroall() { +// { vzeroall() } +// } + +/// Zeroes the upper 128 bits of all YMM registers; +/// the lower 128-bits of the registers are unmodified. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_zeroupper) +// NOTE: Not modeled yet +// pub fn _mm256_zeroupper() { +// { vzeroupper() } +// } + +/// Shuffles single-precision (32-bit) floating-point elements in `a` +/// within 128-bit lanes using the control in `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_permutevar_ps) +// NOTE: Not modeled yet +// pub fn _mm256_permutevar_ps(a: __m256, b: __m256i) -> __m256 { +// { vpermilps256(a, b.as_i32x8()) } +// } + +/// Shuffles single-precision (32-bit) floating-point elements in `a` +/// using the control in `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm_permutevar_ps) +// NOTE: Not modeled yet +// pub fn _mm_permutevar_ps(a: __m128, b: __m128i) -> __m128 { +// { vpermilps(a, b.as_i32x4()) } +// } + +/// Shuffles single-precision (32-bit) floating-point elements in `a` +/// within 128-bit lanes using the control in `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_permute_ps) +pub fn _mm256_permute_ps(a: __m256) -> __m256 { + static_assert_uimm_bits!(IMM8, 8); + { + transmute(simd_shuffle( + a.as_f32x8(), + _mm256_undefined_ps().as_f32x8(), + [ + (IMM8 as u32 >> 0) & 0b11, + (IMM8 as u32 >> 2) & 0b11, + (IMM8 as u32 >> 4) & 0b11, + (IMM8 as u32 >> 6) & 0b11, + ((IMM8 as u32 >> 0) & 0b11) + 4, + ((IMM8 as u32 >> 2) & 0b11) + 4, + ((IMM8 as u32 >> 4) & 0b11) + 4, + ((IMM8 as u32 >> 6) & 0b11) + 4, + ], + )) + } +} +/// Shuffles single-precision (32-bit) floating-point elements in `a` +/// using the control in `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm_permute_ps) +pub fn _mm_permute_ps(a: __m128) -> __m128 { + static_assert_uimm_bits!(IMM8, 8); + { + transmute(simd_shuffle( + a.as_f32x4(), + _mm_undefined_ps().as_f32x4(), + [ + (IMM8 as u32 >> 0) & 0b11, + (IMM8 as u32 >> 2) & 0b11, + (IMM8 as u32 >> 4) & 0b11, + (IMM8 as u32 >> 6) & 0b11, + ], + )) + } +} + +/// Shuffles double-precision (64-bit) floating-point elements in `a` +/// within 256-bit lanes using the control in `b`. 
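The `_mm256_permute_ps`/`_mm_permute_ps` models above decode `IMM8` as four 2-bit selectors, one per destination lane, applied independently to each 128-bit lane (the `+ 4` offsets handle the upper lane of the 256-bit form). A scalar sketch of the decoding for a single lane (illustrative only, not the crate's API):

```rust
/// Scalar reference for the in-lane shuffle performed by the
/// `_mm_permute_ps` / `_mm256_permute_ps` models: each 2-bit field of
/// `imm8` selects one of the four elements of a 128-bit lane.
fn permute4(imm8: u8, lane: [f32; 4]) -> [f32; 4] {
    let mut out = [0.0; 4];
    for j in 0..4 {
        let sel = ((imm8 >> (2 * j)) & 0b11) as usize;
        out[j] = lane[sel];
    }
    out
}

fn main() {
    let lane = [10.0, 11.0, 12.0, 13.0];
    // 0b00_01_10_11: fields are read from the low bits up, so this
    // immediate reverses the lane.
    assert_eq!(permute4(0b0001_1011, lane), [13.0, 12.0, 11.0, 10.0]);
}
```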
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_permutevar_pd) +// NOTE: Not modeled yet +// pub fn _mm256_permutevar_pd(a: __m256d, b: __m256i) -> __m256d { +// { vpermilpd256(a, b.as_i64x4()) } +// } + +/// Shuffles double-precision (64-bit) floating-point elements in `a` +/// using the control in `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm_permutevar_pd) +// NOTE: Not modeled yet +// pub fn _mm_permutevar_pd(a: __m128d, b: __m128i) -> __m128d { +// { vpermilpd(a, b.as_i64x2()) } +// } + +/// Shuffles double-precision (64-bit) floating-point elements in `a` +/// within 128-bit lanes using the control in `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_permute_pd) +pub fn _mm256_permute_pd(a: __m256d) -> __m256d { + static_assert_uimm_bits!(IMM4, 4); + { + transmute(simd_shuffle( + a.as_f64x4(), + _mm256_undefined_pd().as_f64x4(), + [ + ((IMM4 as u32 >> 0) & 1), + ((IMM4 as u32 >> 1) & 1), + ((IMM4 as u32 >> 2) & 1) + 2, + ((IMM4 as u32 >> 3) & 1) + 2, + ], + )) + } +} +/// Shuffles double-precision (64-bit) floating-point elements in `a` +/// using the control in `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm_permute_pd) +pub fn _mm_permute_pd(a: __m128d) -> __m128d { + static_assert_uimm_bits!(IMM2, 2); + { + transmute(simd_shuffle( + a.as_f64x2(), + _mm_undefined_pd().as_f64x2(), + [(IMM2 as u32) & 1, (IMM2 as u32 >> 1) & 1], + )) + } +} +/// Shuffles 256 bits (composed of 8 packed single-precision (32-bit) +/// floating-point elements) selected by `imm8` from `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_permute2f128_ps) +// NOTE: Not modeled yet +// pub fn _mm256_permute2f128_ps(a: __m256, b: __m256) -> __m256 { +// static_assert_uimm_bits!(IMM8, 8); +// { vperm2f128ps256(a, b, IMM8 as i8) } +// } +/// Shuffles 256 bits (composed of 4 packed double-precision (64-bit) +/// floating-point elements) selected by `imm8` from `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_permute2f128_pd) +// NOTE: Not modeled yet +// pub fn _mm256_permute2f128_pd(a: __m256d, b: __m256d) -> __m256d { +// static_assert_uimm_bits!(IMM8, 8); +// { vperm2f128pd256(a, b, IMM8 as i8) } +// } +/// Shuffles 128-bits (composed of integer data) selected by `imm8` +/// from `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_permute2f128_si256) +pub fn _mm256_permute2f128_si256(a: __m256i, b: __m256i) -> __m256i { + static_assert_uimm_bits!(IMM8, 8); + transmute(vperm2f128si256(a.as_i32x8(), b.as_i32x8(), IMM8 as i8)) +} +/// Broadcasts a single-precision (32-bit) floating-point element from memory +/// to all elements of the returned vector. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_broadcast_ss) +pub fn _mm256_broadcast_ss(f: &f32) -> __m256 { + _mm256_set1_ps(*f) +} +/// Broadcasts a single-precision (32-bit) floating-point element from memory +/// to all elements of the returned vector. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm_broadcast_ss) +// NOTE: Not modeled yet +// pub fn _mm_broadcast_ss(f: &f32) -> __m128 { +// _mm_set1_ps(*f) +// } +/// Broadcasts a double-precision (64-bit) floating-point element from memory +/// to all elements of the returned vector. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_broadcast_sd) +// NOTE: Not modeled yet +// pub fn _mm256_broadcast_sd(f: &f64) -> __m256d { +// _mm256_set1_pd(*f) +// } +/// Broadcasts 128 bits from memory (composed of 4 packed single-precision +/// (32-bit) floating-point elements) to all elements of the returned vector. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_broadcast_ps) +pub fn _mm256_broadcast_ps(a: &__m128) -> __m256 { + { + transmute(simd_shuffle( + (*a).as_f32x4(), + _mm_setzero_ps().as_f32x4(), + [0, 1, 2, 3, 0, 1, 2, 3], + )) + } +} +/// Broadcasts 128 bits from memory (composed of 2 packed double-precision +/// (64-bit) floating-point elements) to all elements of the returned vector. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_broadcast_pd) +pub fn _mm256_broadcast_pd(a: &__m128d) -> __m256d { + transmute(simd_shuffle( + (*a).as_f64x2(), + _mm_setzero_pd().as_f64x2(), + [0, 1, 0, 1], + )) +} +/// Copies `a` to result, then inserts 128 bits (composed of 4 packed +/// single-precision (32-bit) floating-point elements) from `b` into result +/// at the location specified by `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_insertf128_ps) +pub fn _mm256_insertf128_ps(a: __m256, b: __m128) -> __m256 { + static_assert_uimm_bits!(IMM1, 1); + { + transmute(simd_shuffle( + a.as_f32x8(), + _mm256_castps128_ps256(b).as_f32x8(), + [[8, 9, 10, 11, 4, 5, 6, 7], [0, 1, 2, 3, 8, 9, 10, 11]][IMM1 as usize], + )) + } +} +/// Copies `a` to result, then inserts 128 bits (composed of 2 packed +/// double-precision (64-bit) floating-point elements) from `b` into result +/// at the location specified by `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_insertf128_pd) +pub fn _mm256_insertf128_pd(a: __m256d, b: __m128d) -> __m256d { + static_assert_uimm_bits!(IMM1, 1); + { + transmute(simd_shuffle( + a.as_f64x4(), + _mm256_castpd128_pd256(b).as_f64x4(), + [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize], + )) + } +} +/// Copies `a` to result, then inserts 128 bits from `b` into result +/// at the location specified by `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_insertf128_si256) +pub fn _mm256_insertf128_si256(a: __m256i, b: __m128i) -> __m256i { + static_assert_uimm_bits!(IMM1, 1); + { + let dst: i64x4 = simd_shuffle( + a.as_i64x4(), + _mm256_castsi128_si256(b).as_i64x4(), + [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize], + ); + transmute(dst) + } +} +/// Copies `a` to result, and inserts the 8-bit integer `i` into result +/// at the location specified by `index`. 
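The `_mm256_insertf128_*` models above copy `a` and overwrite the 128-bit half selected by `IMM1` with `b`, which is what the two hard-coded index tables express. A standalone scalar sketch for the double-precision case (helper name is illustrative):

```rust
/// Scalar reference for the `_mm256_insertf128_pd` model: copy `a`,
/// then overwrite the half selected by `imm1` with the two lanes of `b`.
fn insert_half(imm1: usize, a: [f64; 4], b: [f64; 2]) -> [f64; 4] {
    let mut out = a;
    out[imm1 * 2] = b[0];
    out[imm1 * 2 + 1] = b[1];
    out
}

fn main() {
    let a = [0.0, 1.0, 2.0, 3.0];
    let b = [8.0, 9.0];
    assert_eq!(insert_half(0, a, b), [8.0, 9.0, 2.0, 3.0]);
    assert_eq!(insert_half(1, a, b), [0.0, 1.0, 8.0, 9.0]);
}
```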
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_insert_epi8) +pub fn _mm256_insert_epi8(a: __m256i, i: i8) -> __m256i { + static_assert_uimm_bits!(INDEX, 5); + transmute(simd_insert(a.as_i8x32(), INDEX as u32, i)) +} +/// Copies `a` to result, and inserts the 16-bit integer `i` into result +/// at the location specified by `index`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_insert_epi16) +pub fn _mm256_insert_epi16(a: __m256i, i: i16) -> __m256i { + static_assert_uimm_bits!(INDEX, 4); + transmute(simd_insert(a.as_i16x16(), INDEX as u32, i)) +} +/// Copies `a` to result, and inserts the 32-bit integer `i` into result +/// at the location specified by `index`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_insert_epi32) +pub fn _mm256_insert_epi32(a: __m256i, i: i32) -> __m256i { + static_assert_uimm_bits!(INDEX, 3); + transmute(simd_insert(a.as_i32x8(), INDEX as u32, i)) +} +/// Duplicate odd-indexed single-precision (32-bit) floating-point elements +/// from `a`, and returns the results. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_movehdup_ps) +pub fn _mm256_movehdup_ps(a: __m256) -> __m256 { + transmute(simd_shuffle( + a.as_f32x8(), + a.as_f32x8(), + [1, 1, 3, 3, 5, 5, 7, 7], + )) +} +/// Duplicate even-indexed single-precision (32-bit) floating-point elements +/// from `a`, and returns the results. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_moveldup_ps) +pub fn _mm256_moveldup_ps(a: __m256) -> __m256 { + transmute(simd_shuffle( + a.as_f32x8(), + a.as_f32x8(), + [0, 0, 2, 2, 4, 4, 6, 6], + )) +} +/// Duplicate even-indexed double-precision (64-bit) floating-point elements +/// from `a`, and returns the results. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_movedup_pd) +pub fn _mm256_movedup_pd(a: __m256d) -> __m256d { + transmute(simd_shuffle(a.as_f64x4(), a.as_f64x4(), [0, 0, 2, 2])) +} +/// Computes the approximate reciprocal of packed single-precision (32-bit) +/// floating-point elements in `a`, and returns the results. The maximum +/// relative error for this approximation is less than 1.5*2^-12. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_rcp_ps) +// NOTE: Not modeled yet +// pub fn _mm256_rcp_ps(a: __m256) -> __m256 { +// { vrcpps(a) } +// } +/// Computes the approximate reciprocal square root of packed single-precision +/// (32-bit) floating-point elements in `a`, and returns the results. +/// The maximum relative error for this approximation is less than 1.5*2^-12. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_rsqrt_ps) +// NOTE: Not modeled yet +// pub fn _mm256_rsqrt_ps(a: __m256) -> __m256 { +// { vrsqrtps(a) } +// } +/// Unpacks and interleave double-precision (64-bit) floating-point elements +/// from the high half of each 128-bit lane in `a` and `b`. 
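The `movehdup`/`moveldup` models above duplicate the odd- or even-indexed member of each adjacent pair, which is all the fixed shuffle masks `[1, 1, 3, 3, ...]` and `[0, 0, 2, 2, ...]` encode. A scalar sketch (illustrative only):

```rust
/// Scalar reference for the duplication pattern used by the
/// `_mm256_movehdup_ps` and `_mm256_moveldup_ps` models.
fn dup_lanes(a: [f32; 8], odd: bool) -> [f32; 8] {
    let mut out = [0.0; 8];
    for i in 0..8 {
        // Both members of each pair (0,1), (2,3), ... receive the
        // odd (or even) member of that pair.
        let src = if odd { i | 1 } else { i & !1 };
        out[i] = a[src];
    }
    out
}

fn main() {
    let a = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0];
    assert_eq!(dup_lanes(a, true), [1.0, 1.0, 3.0, 3.0, 5.0, 5.0, 7.0, 7.0]);
    assert_eq!(dup_lanes(a, false), [0.0, 0.0, 2.0, 2.0, 4.0, 4.0, 6.0, 6.0]);
}
```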
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_unpackhi_pd) +pub fn _mm256_unpackhi_pd(a: __m256d, b: __m256d) -> __m256d { + transmute(simd_shuffle(a.as_f64x4(), b.as_f64x4(), [1, 5, 3, 7])) +} +/// Unpacks and interleave single-precision (32-bit) floating-point elements +/// from the high half of each 128-bit lane in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_unpackhi_ps) +pub fn _mm256_unpackhi_ps(a: __m256, b: __m256) -> __m256 { + transmute(simd_shuffle( + a.as_f32x8(), + b.as_f32x8(), + [2, 10, 3, 11, 6, 14, 7, 15], + )) +} +/// Unpacks and interleave double-precision (64-bit) floating-point elements +/// from the low half of each 128-bit lane in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_unpacklo_pd) +pub fn _mm256_unpacklo_pd(a: __m256d, b: __m256d) -> __m256d { + transmute(simd_shuffle(a.as_f64x4(), b.as_f64x4(), [0, 4, 2, 6])) +} +/// Unpacks and interleave single-precision (32-bit) floating-point elements +/// from the low half of each 128-bit lane in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_unpacklo_ps) +pub fn _mm256_unpacklo_ps(a: __m256, b: __m256) -> __m256 { + transmute(simd_shuffle( + a.as_f32x8(), + b.as_f32x8(), + [0, 8, 1, 9, 4, 12, 5, 13], + )) +} +/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and +/// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0. +/// Computes the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if +/// the result is zero, otherwise set `CF` to 0. Return the `ZF` value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_testz_si256) +pub fn _mm256_testz_si256(a: __m256i, b: __m256i) -> i32 { + ptestz256(a.as_i64x4(), b.as_i64x4()) +} +/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and +/// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0. +/// Computes the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if +/// the result is zero, otherwise set `CF` to 0. Return the `CF` value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_testc_si256) +pub fn _mm256_testc_si256(a: __m256i, b: __m256i) -> i32 { + ptestc256(a.as_i64x4(), b.as_i64x4()) +} + +/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and +/// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0. +/// Computes the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if +/// the result is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and +/// `CF` values are zero, otherwise return 0. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_testnzc_si256) +// NOTE: Not modeled yet +// pub fn _mm256_testnzc_si256(a: __m256i, b: __m256i) -> i32 { +// { ptestnzc256(a.as_i64x4(), b.as_i64x4()) } +// } + +/// Computes the bitwise AND of 256 bits (representing double-precision (64-bit) +/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit +/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the +/// intermediate value is zero, otherwise set `ZF` to 0. 
Compute the bitwise +/// NOT of `a` and then AND with `b`, producing an intermediate value, and set +/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value +/// is zero, otherwise set `CF` to 0. Return the `ZF` value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_testz_pd) +// NOTE: Not modeled yet +// pub fn _mm256_testz_pd(a: __m256d, b: __m256d) -> i32 { +// { vtestzpd256(a, b) } +// } + +/// Computes the bitwise AND of 256 bits (representing double-precision (64-bit) +/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit +/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the +/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise +/// NOT of `a` and then AND with `b`, producing an intermediate value, and set +/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value +/// is zero, otherwise set `CF` to 0. Return the `CF` value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_testc_pd) +// NOTE: Not modeled yet +// pub fn _mm256_testc_pd(a: __m256d, b: __m256d) -> i32 { +// { vtestcpd256(a, b) } +// } + +/// Computes the bitwise AND of 256 bits (representing double-precision (64-bit) +/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit +/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the +/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise +/// NOT of `a` and then AND with `b`, producing an intermediate value, and set +/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value +/// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values +/// are zero, otherwise return 0. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_testnzc_pd) +// NOTE: Not modeled yet +// pub fn _mm256_testnzc_pd(a: __m256d, b: __m256d) -> i32 { +// { vtestnzcpd256(a, b) } +// } + +/// Computes the bitwise AND of 128 bits (representing double-precision (64-bit) +/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit +/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the +/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise +/// NOT of `a` and then AND with `b`, producing an intermediate value, and set +/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value +/// is zero, otherwise set `CF` to 0. Return the `ZF` value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm_testz_pd) +// NOTE: Not modeled yet +// pub fn _mm_testz_pd(a: __m128d, b: __m128d) -> i32 { +// { vtestzpd(a, b) } +// } + +/// Computes the bitwise AND of 128 bits (representing double-precision (64-bit) +/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit +/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the +/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise +/// NOT of `a` and then AND with `b`, producing an intermediate value, and set +/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value +/// is zero, otherwise set `CF` to 0. Return the `CF` value. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm_testc_pd) +// NOTE: Not modeled yet +// pub fn _mm_testc_pd(a: __m128d, b: __m128d) -> i32 { +// { vtestcpd(a, b) } +// } + +/// Computes the bitwise AND of 128 bits (representing double-precision (64-bit) +/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit +/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the +/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise +/// NOT of `a` and then AND with `b`, producing an intermediate value, and set +/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value +/// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values +/// are zero, otherwise return 0. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm_testnzc_pd) +// NOTE: Not modeled yet +// pub fn _mm_testnzc_pd(a: __m128d, b: __m128d) -> i32 { +// { vtestnzcpd(a, b) } +// } + +/// Computes the bitwise AND of 256 bits (representing single-precision (32-bit) +/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit +/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the +/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise +/// NOT of `a` and then AND with `b`, producing an intermediate value, and set +/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value +/// is zero, otherwise set `CF` to 0. Return the `ZF` value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_testz_ps) +// NOTE: Not modeled yet +// pub fn _mm256_testz_ps(a: __m256, b: __m256) -> i32 { +// { vtestzps256(a, b) } +// } + +/// Computes the bitwise AND of 256 bits (representing single-precision (32-bit) +/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit +/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the +/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise +/// NOT of `a` and then AND with `b`, producing an intermediate value, and set +/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value +/// is zero, otherwise set `CF` to 0. Return the `CF` value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_testc_ps) +// NOTE: Not modeled yet +// pub fn _mm256_testc_ps(a: __m256, b: __m256) -> i32 { +// { vtestcps256(a, b) } +// } + +/// Computes the bitwise AND of 256 bits (representing single-precision (32-bit) +/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit +/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the +/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise +/// NOT of `a` and then AND with `b`, producing an intermediate value, and set +/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value +/// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values +/// are zero, otherwise return 0. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_testnzc_ps) +// NOTE: Not modeled yet +// pub fn _mm256_testnzc_ps(a: __m256, b: __m256) -> i32 { +// { vtestnzcps256(a, b) } +// } + +/// Computes the bitwise AND of 128 bits (representing single-precision (32-bit) +/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit +/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the +/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise +/// NOT of `a` and then AND with `b`, producing an intermediate value, and set +/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value +/// is zero, otherwise set `CF` to 0. Return the `ZF` value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm_testz_ps) +// NOTE: Not modeled yet +// pub fn _mm_testz_ps(a: __m128, b: __m128) -> i32 { +// { vtestzps(a, b) } +// } + +/// Computes the bitwise AND of 128 bits (representing single-precision (32-bit) +/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit +/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the +/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise +/// NOT of `a` and then AND with `b`, producing an intermediate value, and set +/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value +/// is zero, otherwise set `CF` to 0. Return the `CF` value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm_testc_ps) +// NOTE: Not modeled yet +// pub fn _mm_testc_ps(a: __m128, b: __m128) -> i32 { +// { vtestcps(a, b) } +// } + +/// Computes the bitwise AND of 128 bits (representing single-precision (32-bit) +/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit +/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the +/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise +/// NOT of `a` and then AND with `b`, producing an intermediate value, and set +/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value +/// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values +/// are zero, otherwise return 0. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm_testnzc_ps) +// NOTE: Not modeled yet +// pub fn _mm_testnzc_ps(a: __m128, b: __m128) -> i32 { +// { vtestnzcps(a, b) } +// } + +/// Sets each bit of the returned mask based on the most significant bit of the +/// corresponding packed double-precision (64-bit) floating-point element in +/// `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_movemask_pd) +pub fn _mm256_movemask_pd(a: __m256d) -> i32 { + { + let mask: i64x4 = simd_lt(a.as_i64x4(), i64x4::ZERO()); + simd_bitmask_little!(3, mask, u8) as i32 + } +} +/// Sets each bit of the returned mask based on the most significant bit of the +/// corresponding packed single-precision (32-bit) floating-point element in +/// `a`. 
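The movemask models above compare the lanes against zero as signed integers, which is equivalent to reading each lane's sign bit, and then pack those bits into the low bits of the result. A scalar sketch for the 4-lane double-precision case (helper name is illustrative, not the crate's API):

```rust
/// Scalar reference for the `_mm256_movemask_pd` model: bit i of the
/// result is the sign bit (most significant bit) of lane i.
fn movemask_pd(a: [f64; 4]) -> i32 {
    let mut mask = 0;
    for (i, x) in a.iter().enumerate() {
        if x.to_bits() >> 63 == 1 {
            mask |= 1 << i;
        }
    }
    mask
}

fn main() {
    // Negative zero still has its sign bit set, so it contributes a bit.
    assert_eq!(movemask_pd([1.0, -2.0, -0.0, 3.0]), 0b0110);
}
```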
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_movemask_ps) +pub fn _mm256_movemask_ps(a: __m256) -> i32 { + { + let mask: i32x8 = simd_lt(transmute(a), i32x8::ZERO()); + simd_bitmask_little!(7, mask, u8) as i32 + } +} +/// Returns vector of type __m256d with all elements set to zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_setzero_pd) +pub fn _mm256_setzero_pd() -> __m256d { + transmute(f64x4::ZERO()) +} +/// Returns vector of type __m256 with all elements set to zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_setzero_ps) +pub fn _mm256_setzero_ps() -> __m256 { + transmute(f32x8::ZERO()) +} +/// Returns vector of type __m256i with all elements set to zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_setzero_si256) +pub fn _mm256_setzero_si256() -> __m256i { + transmute(i64x4::ZERO()) +} +/// Sets packed double-precision (64-bit) floating-point elements in returned +/// vector with the supplied values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_set_pd) +pub fn _mm256_set_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d { + _mm256_setr_pd(d, c, b, a) +} +/// Sets packed single-precision (32-bit) floating-point elements in returned +/// vector with the supplied values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_set_ps) +pub fn _mm256_set_ps(a: f32, b: f32, c: f32, d: f32, e: f32, f: f32, g: f32, h: f32) -> __m256 { + _mm256_setr_ps(h, g, f, e, d, c, b, a) +} +/// Sets packed 8-bit integers in returned vector with the supplied values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_set_epi8) +pub fn _mm256_set_epi8( + e00: i8, + e01: i8, + e02: i8, + e03: i8, + e04: i8, + e05: i8, + e06: i8, + e07: i8, + e08: i8, + e09: i8, + e10: i8, + e11: i8, + e12: i8, + e13: i8, + e14: i8, + e15: i8, + e16: i8, + e17: i8, + e18: i8, + e19: i8, + e20: i8, + e21: i8, + e22: i8, + e23: i8, + e24: i8, + e25: i8, + e26: i8, + e27: i8, + e28: i8, + e29: i8, + e30: i8, + e31: i8, +) -> __m256i { + _mm256_setr_epi8( + e31, e30, e29, e28, e27, e26, e25, e24, e23, e22, e21, e20, e19, e18, e17, e16, e15, e14, + e13, e12, e11, e10, e09, e08, e07, e06, e05, e04, e03, e02, e01, e00, + ) +} +/// Sets packed 16-bit integers in returned vector with the supplied values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_set_epi16) +pub fn _mm256_set_epi16( + e00: i16, + e01: i16, + e02: i16, + e03: i16, + e04: i16, + e05: i16, + e06: i16, + e07: i16, + e08: i16, + e09: i16, + e10: i16, + e11: i16, + e12: i16, + e13: i16, + e14: i16, + e15: i16, +) -> __m256i { + _mm256_setr_epi16( + e15, e14, e13, e12, e11, e10, e09, e08, e07, e06, e05, e04, e03, e02, e01, e00, + ) +} +/// Sets packed 32-bit integers in returned vector with the supplied values. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_set_epi32) +pub fn _mm256_set_epi32( + e0: i32, + e1: i32, + e2: i32, + e3: i32, + e4: i32, + e5: i32, + e6: i32, + e7: i32, +) -> __m256i { + _mm256_setr_epi32(e7, e6, e5, e4, e3, e2, e1, e0) +} +/// Sets packed 64-bit integers in returned vector with the supplied values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_set_epi64x) +pub fn _mm256_set_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i { + _mm256_setr_epi64x(d, c, b, a) +} +/// Sets packed double-precision (64-bit) floating-point elements in returned +/// vector with the supplied values in reverse order. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_setr_pd) +pub fn _mm256_setr_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d { + transmute(f64x4::new(a, b, c, d)) +} +/// Sets packed single-precision (32-bit) floating-point elements in returned +/// vector with the supplied values in reverse order. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_setr_ps) +pub fn _mm256_setr_ps(a: f32, b: f32, c: f32, d: f32, e: f32, f: f32, g: f32, h: f32) -> __m256 { + transmute(f32x8::new(a, b, c, d, e, f, g, h)) +} +/// Sets packed 8-bit integers in returned vector with the supplied values in +/// reverse order. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_setr_epi8) +pub fn _mm256_setr_epi8( + e00: i8, + e01: i8, + e02: i8, + e03: i8, + e04: i8, + e05: i8, + e06: i8, + e07: i8, + e08: i8, + e09: i8, + e10: i8, + e11: i8, + e12: i8, + e13: i8, + e14: i8, + e15: i8, + e16: i8, + e17: i8, + e18: i8, + e19: i8, + e20: i8, + e21: i8, + e22: i8, + e23: i8, + e24: i8, + e25: i8, + e26: i8, + e27: i8, + e28: i8, + e29: i8, + e30: i8, + e31: i8, +) -> __m256i { + { + transmute(i8x32::new( + e00, e01, e02, e03, e04, e05, e06, e07, e08, e09, e10, e11, e12, e13, e14, e15, e16, + e17, e18, e19, e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31, + )) + } +} +/// Sets packed 16-bit integers in returned vector with the supplied values in +/// reverse order. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_setr_epi16) +pub fn _mm256_setr_epi16( + e00: i16, + e01: i16, + e02: i16, + e03: i16, + e04: i16, + e05: i16, + e06: i16, + e07: i16, + e08: i16, + e09: i16, + e10: i16, + e11: i16, + e12: i16, + e13: i16, + e14: i16, + e15: i16, +) -> __m256i { + { + transmute(i16x16::new( + e00, e01, e02, e03, e04, e05, e06, e07, e08, e09, e10, e11, e12, e13, e14, e15, + )) + } +} +/// Sets packed 32-bit integers in returned vector with the supplied values in +/// reverse order. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_setr_epi32) +pub fn _mm256_setr_epi32( + e0: i32, + e1: i32, + e2: i32, + e3: i32, + e4: i32, + e5: i32, + e6: i32, + e7: i32, +) -> __m256i { + transmute(i32x8::new(e0, e1, e2, e3, e4, e5, e6, e7)) +} +/// Sets packed 64-bit integers in returned vector with the supplied values in +/// reverse order. 
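The `_mm256_set_*` constructors above list their arguments from the highest lane down and simply forward to the `_mm256_setr_*` variants, which store their arguments in lane (memory) order. A plain-Rust analogue of that relationship (names are illustrative):

```rust
/// `setr` stores its arguments in lane order; `set` lists them from the
/// highest lane down and is just `setr` with the arguments reversed.
fn setr_epi32(args: [i32; 8]) -> [i32; 8] {
    args
}

fn set_epi32(mut args: [i32; 8]) -> [i32; 8] {
    args.reverse();
    setr_epi32(args)
}

fn main() {
    // Both calls build the vector whose lane i holds the value i.
    assert_eq!(
        set_epi32([7, 6, 5, 4, 3, 2, 1, 0]),
        setr_epi32([0, 1, 2, 3, 4, 5, 6, 7])
    );
}
```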
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_setr_epi64x) +pub fn _mm256_setr_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i { + transmute(i64x4::new(a, b, c, d)) +} +/// Broadcasts double-precision (64-bit) floating-point value `a` to all +/// elements of returned vector. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_set1_pd) +pub fn _mm256_set1_pd(a: f64) -> __m256d { + _mm256_setr_pd(a, a, a, a) +} +/// Broadcasts single-precision (32-bit) floating-point value `a` to all +/// elements of returned vector. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_set1_ps) +pub fn _mm256_set1_ps(a: f32) -> __m256 { + _mm256_setr_ps(a, a, a, a, a, a, a, a) +} +/// Broadcasts 8-bit integer `a` to all elements of returned vector. +/// This intrinsic may generate the `vpbroadcastb`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_set1_epi8) +pub fn _mm256_set1_epi8(a: i8) -> __m256i { + _mm256_setr_epi8( + a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, + a, a, + ) +} +/// Broadcasts 16-bit integer `a` to all elements of returned vector. +/// This intrinsic may generate the `vpbroadcastw`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_set1_epi16) +pub fn _mm256_set1_epi16(a: i16) -> __m256i { + _mm256_setr_epi16(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a) +} +/// Broadcasts 32-bit integer `a` to all elements of returned vector. +/// This intrinsic may generate the `vpbroadcastd`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_set1_epi32) +pub fn _mm256_set1_epi32(a: i32) -> __m256i { + _mm256_setr_epi32(a, a, a, a, a, a, a, a) +} +/// Broadcasts 64-bit integer `a` to all elements of returned vector. +/// This intrinsic may generate the `vpbroadcastq`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_set1_epi64x) +pub fn _mm256_set1_epi64x(a: i64) -> __m256i { + _mm256_setr_epi64x(a, a, a, a) +} +/// Cast vector of type __m256d to type __m256. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_castpd_ps) +pub fn _mm256_castpd_ps(a: __m256d) -> __m256 { + transmute(a) +} +/// Cast vector of type __m256 to type __m256d. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_castps_pd) +pub fn _mm256_castps_pd(a: __m256) -> __m256d { + transmute(a) +} +/// Casts vector of type __m256 to type __m256i. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_castps_si256) +pub fn _mm256_castps_si256(a: __m256) -> __m256i { + transmute(a) +} +/// Casts vector of type __m256i to type __m256. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_castsi256_ps) +pub fn _mm256_castsi256_ps(a: __m256i) -> __m256 { + transmute(a) +} +/// Casts vector of type __m256d to type __m256i. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_castpd_si256) +pub fn _mm256_castpd_si256(a: __m256d) -> __m256i { + transmute(a) +} +/// Casts vector of type __m256i to type __m256d. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_castsi256_pd) +pub fn _mm256_castsi256_pd(a: __m256i) -> __m256d { + transmute(a) +} +/// Casts vector of type __m256 to type __m128. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_castps256_ps128) +pub fn _mm256_castps256_ps128(a: __m256) -> __m128 { + transmute(simd_shuffle(a.as_f32x8(), a.as_f32x8(), [0, 1, 2, 3])) +} +/// Casts vector of type __m256d to type __m128d. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_castpd256_pd128) +pub fn _mm256_castpd256_pd128(a: __m256d) -> __m128d { + transmute(simd_shuffle(a.as_f64x4(), a.as_f64x4(), [0, 1])) +} +/// Casts vector of type __m256i to type __m128i. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_castsi256_si128) +pub fn _mm256_castsi256_si128(a: __m256i) -> __m128i { + { + let a = a.as_i64x4(); + let dst: i64x2 = simd_shuffle(a, a, [0, 1]); + transmute(dst) + } +} +/// Casts vector of type __m128 to type __m256; +/// the upper 128 bits of the result are undefined. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_castps128_ps256) +pub fn _mm256_castps128_ps256(a: __m128) -> __m256 { + { + transmute(simd_shuffle( + a.as_f32x4(), + _mm_undefined_ps().as_f32x4(), + [0, 1, 2, 3, 4, 4, 4, 4], + )) + } +} +/// Casts vector of type __m128d to type __m256d; +/// the upper 128 bits of the result are undefined. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_castpd128_pd256) +pub fn _mm256_castpd128_pd256(a: __m128d) -> __m256d { + transmute(simd_shuffle( + a.as_f64x2(), + _mm_undefined_pd().as_f64x2(), + [0, 1, 2, 2], + )) +} +/// Casts vector of type __m128i to type __m256i; +/// the upper 128 bits of the result are undefined. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_castsi128_si256) +pub fn _mm256_castsi128_si256(a: __m128i) -> __m256i { + { + let a = a.as_i64x2(); + let undefined = i64x2::ZERO(); + let dst: i64x4 = simd_shuffle(a, undefined, [0, 1, 2, 2]); + transmute(dst) + } +} +/// Constructs a 256-bit floating-point vector of `[8 x float]` from a +/// 128-bit floating-point vector of `[4 x float]`. The lower 128 bits contain +/// the value of the source vector. The upper 128 bits are set to zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_zextps128_ps256) +pub fn _mm256_zextps128_ps256(a: __m128) -> __m256 { + { + transmute(simd_shuffle( + a.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + [0, 1, 2, 3, 4, 5, 6, 7], + )) + } +} +/// Constructs a 256-bit integer vector from a 128-bit integer vector. +/// The lower 128 bits contain the value of the source vector. The upper +/// 128 bits are set to zero. 
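The `zext` models above guarantee that the upper 128 bits of the result are zero, whereas the corresponding `cast` models leave them unspecified (Intel documents them as undefined). A scalar sketch of the zero-extension (illustrative only, not the crate's API):

```rust
/// Scalar reference for the `_mm256_zextps128_ps256` model: the low
/// four lanes come from the source, the high four lanes are zeroed.
fn zext128_256(a: [f32; 4]) -> [f32; 8] {
    let mut out = [0.0; 8];
    out[..4].copy_from_slice(&a);
    out
}

fn main() {
    assert_eq!(
        zext128_256([1.0, 2.0, 3.0, 4.0]),
        [1.0, 2.0, 3.0, 4.0, 0.0, 0.0, 0.0, 0.0]
    );
}
```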
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zextsi128_si256) +pub fn _mm256_zextsi128_si256(a: __m128i) -> __m256i { + { + let b = i64x2::ZERO(); + let dst: i64x4 = simd_shuffle(a.as_i64x2(), b, [0, 1, 2, 3]); + transmute(dst) + } +} +/// Constructs a 256-bit floating-point vector of `[4 x double]` from a +/// 128-bit floating-point vector of `[2 x double]`. The lower 128 bits +/// contain the value of the source vector. The upper 128 bits are set +/// to zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zextpd128_pd256) +pub fn _mm256_zextpd128_pd256(a: __m128d) -> __m256d { + { + transmute(simd_shuffle( + a.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + [0, 1, 2, 3], + )) + } +} +/// Returns vector of type `__m256` with indeterminate elements. +/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically +/// picks some valid value and is not equivalent to [`mem::MaybeUninit`]. +/// In practice, this is typically equivalent to [`mem::zeroed`]. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_undefined_ps) +pub fn _mm256_undefined_ps() -> __m256 { + transmute(f32x8::ZERO()) +} +/// Returns vector of type `__m256d` with indeterminate elements. +/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically +/// picks some valid value and is not equivalent to [`mem::MaybeUninit`]. +/// In practice, this is typically equivalent to [`mem::zeroed`]. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_undefined_pd) +pub fn _mm256_undefined_pd() -> __m256d { + transmute(f32x8::ZERO()) +} +/// Returns vector of type __m256i with indeterminate elements. +/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically +/// picks some valid value and is not equivalent to [`mem::MaybeUninit`]. +/// In practice, this is typically equivalent to [`mem::zeroed`]. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_undefined_si256) +pub fn _mm256_undefined_si256() -> __m256i { + transmute(i32x8::ZERO()) +} +/// Sets packed __m256 returned vector with the supplied values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_m128) +pub fn _mm256_set_m128(hi: __m128, lo: __m128) -> __m256 { + transmute(simd_shuffle( + lo.as_i32x4(), + hi.as_i32x4(), + [0, 1, 2, 3, 4, 5, 6, 7], + )) +} +/// Sets packed __m256d returned vector with the supplied values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_m128d) +pub fn _mm256_set_m128d(hi: __m128d, lo: __m128d) -> __m256d { + { + let hi: __m128 = transmute(hi); + let lo: __m128 = transmute(lo); + transmute(_mm256_set_m128(hi, lo)) + } +} +/// Sets packed __m256i returned vector with the supplied values.
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_set_m128i) +pub fn _mm256_set_m128i(hi: __m128i, lo: __m128i) -> __m256i { + { + let hi: __m128 = transmute(hi); + let lo: __m128 = transmute(lo); + transmute(_mm256_set_m128(hi, lo)) + } +} +/// Sets packed __m256 returned vector with the supplied values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_setr_m128) +pub fn _mm256_setr_m128(lo: __m128, hi: __m128) -> __m256 { + _mm256_set_m128(hi, lo) +} +/// Sets packed __m256d returned vector with the supplied values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_setr_m128d) +pub fn _mm256_setr_m128d(lo: __m128d, hi: __m128d) -> __m256d { + _mm256_set_m128d(hi, lo) +} +/// Sets packed __m256i returned vector with the supplied values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_setr_m128i) +pub fn _mm256_setr_m128i(lo: __m128i, hi: __m128i) -> __m256i { + _mm256_set_m128i(hi, lo) +} +/// Returns the first element of the input vector of `[8 x float]`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_cvtss_f32) +pub fn _mm256_cvtss_f32(a: __m256) -> f32 { + simd_extract(a.as_f32x8(), 0) +} diff --git a/testable-simd-models/src/core_arch/x86/models/avx2.rs b/testable-simd-models/src/core_arch/x86/models/avx2.rs new file mode 100644 index 0000000000000..2626d04635bd6 --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/models/avx2.rs @@ -0,0 +1,1873 @@ +//! Advanced Vector Extensions 2 (AVX) +//! +//! +//! This module contains models for AVX2 intrinsics. +//! AVX2 expands most AVX commands to 256-bit wide vector registers and +//! adds [FMA](https://en.wikipedia.org/wiki/Fused_multiply-accumulate). +//! +//! The references are: +//! +//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2: +//! Instruction Set Reference, A-Z][intel64_ref]. +//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and +//! System Instructions][amd64_ref]. +//! +//! Wikipedia's [AVX][wiki_avx] and [FMA][wiki_fma] pages provide a quick +//! overview of the instructions available. +//! +//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf +//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf +//! [wiki_avx]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions +//! [wiki_fma]: https://en.wikipedia.org/wiki/Fused_multiply-accumulate +use crate::abstractions::simd::*; +use crate::abstractions::utilities::*; + +use super::avx::*; +use super::avx2_handwritten::*; +use super::sse::*; +use super::sse2::*; +use super::types::*; + +/// Computes the absolute values of packed 32-bit integers in `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi32) +pub fn _mm256_abs_epi32(a: __m256i) -> __m256i { + { + let a = a.as_i32x8(); + let r = simd_select(simd_lt(a, i32x8::ZERO()), simd_neg(a), a); + transmute(r) + } +} +/// Computes the absolute values of packed 16-bit integers in `a`. 
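The `abs` models above are branch-free selects: lanes that compare less than zero are replaced by their negation, other lanes pass through. A scalar sketch of the per-lane rule, assuming the crate's `simd_neg` wraps like the underlying instruction (helper name is illustrative):

```rust
/// Scalar reference for the absolute value used by the
/// `_mm256_abs_epi32` model: negate negative lanes, keep the rest.
/// The negation wraps, so i32::MIN maps to itself, which is also how
/// the PABSD/VPABSD instruction is documented to behave.
fn abs_lane(x: i32) -> i32 {
    if x < 0 { x.wrapping_neg() } else { x }
}

fn main() {
    assert_eq!(abs_lane(-7), 7);
    assert_eq!(abs_lane(5), 5);
    assert_eq!(abs_lane(i32::MIN), i32::MIN);
}
```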
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi16) +pub fn _mm256_abs_epi16(a: __m256i) -> __m256i { + { + let a = a.as_i16x16(); + let r = simd_select(simd_lt(a, i16x16::ZERO()), simd_neg(a), a); + transmute(r) + } +} +/// Computes the absolute values of packed 8-bit integers in `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi8) +pub fn _mm256_abs_epi8(a: __m256i) -> __m256i { + { + let a = a.as_i8x32(); + let r = simd_select(simd_lt(a, i8x32::ZERO()), simd_neg(a), a); + transmute(r) + } +} +/// Adds packed 64-bit integers in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi64) +pub fn _mm256_add_epi64(a: __m256i, b: __m256i) -> __m256i { + { + transmute(simd_add(a.as_i64x4(), b.as_i64x4())) + } +} +/// Adds packed 32-bit integers in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi32) +pub fn _mm256_add_epi32(a: __m256i, b: __m256i) -> __m256i { + { + transmute(simd_add(a.as_i32x8(), b.as_i32x8())) + } +} +/// Adds packed 16-bit integers in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi16) +pub fn _mm256_add_epi16(a: __m256i, b: __m256i) -> __m256i { + { + transmute(simd_add(a.as_i16x16(), b.as_i16x16())) + } +} +/// Adds packed 8-bit integers in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi8) +pub fn _mm256_add_epi8(a: __m256i, b: __m256i) -> __m256i { + { + transmute(simd_add(a.as_i8x32(), b.as_i8x32())) + } +} +/// Adds packed 8-bit integers in `a` and `b` using saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epi8) +pub fn _mm256_adds_epi8(a: __m256i, b: __m256i) -> __m256i { + { + transmute(simd_saturating_add(a.as_i8x32(), b.as_i8x32())) + } +} +/// Adds packed 16-bit integers in `a` and `b` using saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epi16) +pub fn _mm256_adds_epi16(a: __m256i, b: __m256i) -> __m256i { + { + transmute(simd_saturating_add(a.as_i16x16(), b.as_i16x16())) + } +} +/// Adds packed unsigned 8-bit integers in `a` and `b` using saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epu8) +pub fn _mm256_adds_epu8(a: __m256i, b: __m256i) -> __m256i { + { + transmute(simd_saturating_add(a.as_u8x32(), b.as_u8x32())) + } +} +/// Adds packed unsigned 16-bit integers in `a` and `b` using saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epu16) +pub fn _mm256_adds_epu16(a: __m256i, b: __m256i) -> __m256i { + { + transmute(simd_saturating_add(a.as_u16x16(), b.as_u16x16())) + } +} +/// Concatenates pairs of 16-byte blocks in `a` and `b` into a 32-byte temporary +/// result, shifts the result right by `n` bytes, and returns the low 16 bytes. 
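+///
+/// Illustrative per-lane sketch (not from the upstream docs; the helper name
+/// and array types below are made up for exposition). Each 128-bit lane
+/// behaves roughly like:
+///
+/// ```
+/// fn alignr_lane(a: [u8; 16], b: [u8; 16], shift: usize) -> [u8; 16] {
+///     // concatenate with `b` as the low 16 bytes and `a` as the high 16 bytes
+///     let mut concat = [0u8; 32];
+///     concat[..16].copy_from_slice(&b);
+///     concat[16..].copy_from_slice(&a);
+///     let mut r = [0u8; 16];
+///     for i in 0..16 {
+///         // shift right by `shift` bytes, filling with zeros past the end
+///         r[i] = if i + shift < 32 { concat[i + shift] } else { 0 };
+///     }
+///     r
+/// }
+/// ```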
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_alignr_epi8) +pub fn _mm256_alignr_epi8(a: __m256i, b: __m256i) -> __m256i { + static_assert_uimm_bits!(IMM8, 8); + if IMM8 >= 32 { + return _mm256_setzero_si256(); + } + let (a, b) = if IMM8 > 16 { + (_mm256_setzero_si256(), a) + } else { + (a, b) + }; + { + if IMM8 == 16 { + return transmute(a); + } + } + const fn mask(shift: u32, i: u32) -> u32 { + let shift = shift % 16; + let mod_i = i % 16; + if mod_i < (16 - shift) { + i + shift + } else { + i + 16 + shift + } + } + { + let r: i8x32 = simd_shuffle( + b.as_i8x32(), + a.as_i8x32(), + [ + mask(IMM8 as u32, 0), + mask(IMM8 as u32, 1), + mask(IMM8 as u32, 2), + mask(IMM8 as u32, 3), + mask(IMM8 as u32, 4), + mask(IMM8 as u32, 5), + mask(IMM8 as u32, 6), + mask(IMM8 as u32, 7), + mask(IMM8 as u32, 8), + mask(IMM8 as u32, 9), + mask(IMM8 as u32, 10), + mask(IMM8 as u32, 11), + mask(IMM8 as u32, 12), + mask(IMM8 as u32, 13), + mask(IMM8 as u32, 14), + mask(IMM8 as u32, 15), + mask(IMM8 as u32, 16), + mask(IMM8 as u32, 17), + mask(IMM8 as u32, 18), + mask(IMM8 as u32, 19), + mask(IMM8 as u32, 20), + mask(IMM8 as u32, 21), + mask(IMM8 as u32, 22), + mask(IMM8 as u32, 23), + mask(IMM8 as u32, 24), + mask(IMM8 as u32, 25), + mask(IMM8 as u32, 26), + mask(IMM8 as u32, 27), + mask(IMM8 as u32, 28), + mask(IMM8 as u32, 29), + mask(IMM8 as u32, 30), + mask(IMM8 as u32, 31), + ], + ); + transmute(r) + } +} +/// Computes the bitwise AND of 256 bits (representing integer data) +/// in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_and_si256) +pub fn _mm256_and_si256(a: __m256i, b: __m256i) -> __m256i { + { + transmute(simd_and(a.as_i64x4(), b.as_i64x4())) + } +} +/// Computes the bitwise NOT of 256 bits (representing integer data) +/// in `a` and then AND with `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_andnot_si256) +pub fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i { + { + let all_ones = _mm256_set1_epi8(-1); + transmute(simd_and( + simd_xor(a.as_i64x4(), all_ones.as_i64x4()), + b.as_i64x4(), + )) + } +} +/// Averages packed unsigned 16-bit integers in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_avg_epu16) +pub fn _mm256_avg_epu16(a: __m256i, b: __m256i) -> __m256i { + { + let a = simd_cast::<16, _, u32>(a.as_u16x16()); + let b = simd_cast::<16, _, u32>(b.as_u16x16()); + let r = simd_shr(simd_add(simd_add(a, b), u32x16::splat(1)), u32x16::splat(1)); + transmute(simd_cast::<16, _, u16>(r)) + } +} +/// Averages packed unsigned 8-bit integers in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_avg_epu8) +pub fn _mm256_avg_epu8(a: __m256i, b: __m256i) -> __m256i { + { + let a = simd_cast::<32, _, u16>(a.as_u8x32()); + let b = simd_cast::<32, _, u16>(b.as_u8x32()); + let r = simd_shr(simd_add(simd_add(a, b), u16x32::splat(1)), u16x32::splat(1)); + transmute(simd_cast::<32, _, u8>(r)) + } +} +/// Blends packed 32-bit integers from `a` and `b` using control mask `IMM4`. 
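+///
+/// Illustrative sketch of the selection rule (the helper name is made up, not
+/// part of the model): bit `i` of the 4-bit mask picks `b[i]`, otherwise `a[i]`.
+///
+/// ```
+/// fn blend_epi32(a: [i32; 4], b: [i32; 4], imm4: u8) -> [i32; 4] {
+///     let mut r = [0i32; 4];
+///     for i in 0..4 {
+///         r[i] = if (imm4 >> i) & 1 == 1 { b[i] } else { a[i] };
+///     }
+///     r
+/// }
+/// ```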
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi32) +pub fn _mm_blend_epi32(a: __m128i, b: __m128i) -> __m128i { + static_assert_uimm_bits!(IMM4, 4); + { + let a = a.as_i32x4(); + let b = b.as_i32x4(); + let r: i32x4 = simd_shuffle( + a, + b, + [ + [0, 4, 0, 4][IMM4 as usize & 0b11], + [1, 1, 5, 5][IMM4 as usize & 0b11], + [2, 6, 2, 6][(IMM4 as usize >> 2) & 0b11], + [3, 3, 7, 7][(IMM4 as usize >> 2) & 0b11], + ], + ); + transmute(r) + } +} +/// Blends packed 32-bit integers from `a` and `b` using control mask `IMM8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_epi32) +pub fn _mm256_blend_epi32(a: __m256i, b: __m256i) -> __m256i { + static_assert_uimm_bits!(IMM8, 8); + { + let a = a.as_i32x8(); + let b = b.as_i32x8(); + let r: i32x8 = simd_shuffle( + a, + b, + [ + [0, 8, 0, 8][IMM8 as usize & 0b11], + [1, 1, 9, 9][IMM8 as usize & 0b11], + [2, 10, 2, 10][(IMM8 as usize >> 2) & 0b11], + [3, 3, 11, 11][(IMM8 as usize >> 2) & 0b11], + [4, 12, 4, 12][(IMM8 as usize >> 4) & 0b11], + [5, 5, 13, 13][(IMM8 as usize >> 4) & 0b11], + [6, 14, 6, 14][(IMM8 as usize >> 6) & 0b11], + [7, 7, 15, 15][(IMM8 as usize >> 6) & 0b11], + ], + ); + transmute(r) + } +} +/// Blends packed 16-bit integers from `a` and `b` using control mask `IMM8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_epi16) +pub fn _mm256_blend_epi16(a: __m256i, b: __m256i) -> __m256i { + static_assert_uimm_bits!(IMM8, 8); + { + let a = a.as_i16x16(); + let b = b.as_i16x16(); + let r: i16x16 = simd_shuffle( + a, + b, + [ + [0, 16, 0, 16][IMM8 as usize & 0b11], + [1, 1, 17, 17][IMM8 as usize & 0b11], + [2, 18, 2, 18][(IMM8 as usize >> 2) & 0b11], + [3, 3, 19, 19][(IMM8 as usize >> 2) & 0b11], + [4, 20, 4, 20][(IMM8 as usize >> 4) & 0b11], + [5, 5, 21, 21][(IMM8 as usize >> 4) & 0b11], + [6, 22, 6, 22][(IMM8 as usize >> 6) & 0b11], + [7, 7, 23, 23][(IMM8 as usize >> 6) & 0b11], + [8, 24, 8, 24][IMM8 as usize & 0b11], + [9, 9, 25, 25][IMM8 as usize & 0b11], + [10, 26, 10, 26][(IMM8 as usize >> 2) & 0b11], + [11, 11, 27, 27][(IMM8 as usize >> 2) & 0b11], + [12, 28, 12, 28][(IMM8 as usize >> 4) & 0b11], + [13, 13, 29, 29][(IMM8 as usize >> 4) & 0b11], + [14, 30, 14, 30][(IMM8 as usize >> 6) & 0b11], + [15, 15, 31, 31][(IMM8 as usize >> 6) & 0b11], + ], + ); + transmute(r) + } +} +/// Blends packed 8-bit integers from `a` and `b` using `mask`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blendv_epi8) +pub fn _mm256_blendv_epi8(a: __m256i, b: __m256i, mask: __m256i) -> __m256i { + { + let mask: i8x32 = simd_lt(mask.as_i8x32(), i8x32::ZERO()); + transmute(simd_select(mask, b.as_i8x32(), a.as_i8x32())) + } +} +/// Broadcasts the low packed 8-bit integer from `a` to all elements of +/// the 128-bit returned value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastb_epi8) +pub fn _mm_broadcastb_epi8(a: __m128i) -> __m128i { + { + let ret = simd_shuffle(a.as_i8x16(), i8x16::ZERO(), [0_u32; 16]); + transmute::(ret) + } +} +/// Broadcasts the low packed 8-bit integer from `a` to all elements of +/// the 256-bit returned value. 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastb_epi8)
+pub fn _mm256_broadcastb_epi8(a: __m128i) -> __m256i {
+    {
+        let ret = simd_shuffle(a.as_i8x16(), i8x16::ZERO(), [0_u32; 32]);
+        transmute::<i8x32, _>(ret)
+    }
+}
+/// Broadcasts the low packed 32-bit integer from `a` to all elements of
+/// the 128-bit returned value.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastd_epi32)
+pub fn _mm_broadcastd_epi32(a: __m128i) -> __m128i {
+    {
+        let ret = simd_shuffle(a.as_i32x4(), i32x4::ZERO(), [0_u32; 4]);
+        transmute::<i32x4, _>(ret)
+    }
+}
+/// Broadcasts the low packed 32-bit integer from `a` to all elements of
+/// the 256-bit returned value.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastd_epi32)
+pub fn _mm256_broadcastd_epi32(a: __m128i) -> __m256i {
+    {
+        let ret = simd_shuffle(a.as_i32x4(), i32x4::ZERO(), [0_u32; 8]);
+        transmute::<i32x8, _>(ret)
+    }
+}
+/// Broadcasts the low packed 64-bit integer from `a` to all elements of
+/// the 128-bit returned value.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastq_epi64)
+pub fn _mm_broadcastq_epi64(a: __m128i) -> __m128i {
+    {
+        let ret = simd_shuffle(a.as_i64x2(), a.as_i64x2(), [0_u32; 2]);
+        transmute::<i64x2, _>(ret)
+    }
+}
+/// Broadcasts the low packed 64-bit integer from `a` to all elements of
+/// the 256-bit returned value.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastq_epi64)
+pub fn _mm256_broadcastq_epi64(a: __m128i) -> __m256i {
+    {
+        let ret = simd_shuffle(a.as_i64x2(), a.as_i64x2(), [0_u32; 4]);
+        transmute::<i64x4, _>(ret)
+    }
+}
+/// Broadcasts the low double-precision (64-bit) floating-point element
+/// from `a` to all elements of the 128-bit returned value.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastsd_pd)
+pub fn _mm_broadcastsd_pd(a: __m128d) -> __m128d {
+    {
+        transmute(simd_shuffle(
+            a.as_f64x2(),
+            _mm_setzero_pd().as_f64x2(),
+            [0_u32; 2],
+        ))
+    }
+}
+/// Broadcasts the low double-precision (64-bit) floating-point element
+/// from `a` to all elements of the 256-bit returned value.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastsd_pd)
+pub fn _mm256_broadcastsd_pd(a: __m128d) -> __m256d {
+    {
+        transmute(simd_shuffle(
+            a.as_f64x2(),
+            _mm_setzero_pd().as_f64x2(),
+            [0_u32; 4],
+        ))
+    }
+}
+/// Broadcasts 128 bits of integer data from a to all 128-bit lanes in
+/// the 256-bit returned value.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastsi128_si256)
+pub fn _mm_broadcastsi128_si256(a: __m128i) -> __m256i {
+    {
+        let ret = simd_shuffle(a.as_i64x2(), i64x2::ZERO(), [0, 1, 0, 1]);
+        transmute::<i64x4, _>(ret)
+    }
+}
+/// Broadcasts 128 bits of integer data from a to all 128-bit lanes in
+/// the 256-bit returned value.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastsi128_si256)
+pub fn _mm256_broadcastsi128_si256(a: __m128i) -> __m256i {
+    {
+        let ret = simd_shuffle(a.as_i64x2(), i64x2::ZERO(), [0, 1, 0, 1]);
+        transmute::<i64x4, _>(ret)
+    }
+}
+/// Broadcasts the low single-precision (32-bit) floating-point element
+/// from `a` to all elements of the 128-bit returned value.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastss_ps)
+pub fn _mm_broadcastss_ps(a: __m128) -> __m128 {
+    {
+        transmute(simd_shuffle(
+            a.as_f32x4(),
+            _mm_setzero_ps().as_f32x4(),
+            [0_u32; 4],
+        ))
+    }
+}
+/// Broadcasts the low single-precision (32-bit) floating-point element
+/// from `a` to all elements of the 256-bit returned value.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastss_ps)
+pub fn _mm256_broadcastss_ps(a: __m128) -> __m256 {
+    {
+        transmute(simd_shuffle(
+            a.as_f32x4(),
+            _mm_setzero_ps().as_f32x4(),
+            [0_u32; 8],
+        ))
+    }
+}
+/// Broadcasts the low packed 16-bit integer from a to all elements of
+/// the 128-bit returned value
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastw_epi16)
+pub fn _mm_broadcastw_epi16(a: __m128i) -> __m128i {
+    {
+        let ret = simd_shuffle(a.as_i16x8(), i16x8::ZERO(), [0_u32; 8]);
+        transmute::<i16x8, _>(ret)
+    }
+}
+/// Broadcasts the low packed 16-bit integer from a to all elements of
+/// the 256-bit returned value
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastw_epi16)
+pub fn _mm256_broadcastw_epi16(a: __m128i) -> __m256i {
+    {
+        let ret = simd_shuffle(a.as_i16x8(), i16x8::ZERO(), [0_u32; 16]);
+        transmute::<i16x16, _>(ret)
+    }
+}
+/// Compares packed 64-bit integers in `a` and `b` for equality.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi64)
+pub fn _mm256_cmpeq_epi64(a: __m256i, b: __m256i) -> __m256i {
+    {
+        transmute::<i64x4, _>(simd_eq(a.as_i64x4(), b.as_i64x4()))
+    }
+}
+/// Compares packed 32-bit integers in `a` and `b` for equality.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi32)
+pub fn _mm256_cmpeq_epi32(a: __m256i, b: __m256i) -> __m256i {
+    {
+        transmute::<i32x8, _>(simd_eq(a.as_i32x8(), b.as_i32x8()))
+    }
+}
+/// Compares packed 16-bit integers in `a` and `b` for equality.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi16)
+pub fn _mm256_cmpeq_epi16(a: __m256i, b: __m256i) -> __m256i {
+    {
+        transmute::<i16x16, _>(simd_eq(a.as_i16x16(), b.as_i16x16()))
+    }
+}
+/// Compares packed 8-bit integers in `a` and `b` for equality.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi8)
+pub fn _mm256_cmpeq_epi8(a: __m256i, b: __m256i) -> __m256i {
+    {
+        transmute::<i8x32, _>(simd_eq(a.as_i8x32(), b.as_i8x32()))
+    }
+}
+/// Compares packed 64-bit integers in `a` and `b` for greater-than.
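+///
+/// Note that, as in the scalar sketch below (illustrative only, the helper
+/// name is made up), each output element is an all-ones mask rather than a
+/// boolean `1`:
+///
+/// ```
+/// fn cmpgt_i64(a: i64, b: i64) -> i64 {
+///     if a > b { -1 } else { 0 }
+/// }
+/// ```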
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi64)
+pub fn _mm256_cmpgt_epi64(a: __m256i, b: __m256i) -> __m256i {
+    {
+        transmute::<i64x4, _>(simd_gt(a.as_i64x4(), b.as_i64x4()))
+    }
+}
+/// Compares packed 32-bit integers in `a` and `b` for greater-than.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi32)
+pub fn _mm256_cmpgt_epi32(a: __m256i, b: __m256i) -> __m256i {
+    {
+        transmute::<i32x8, _>(simd_gt(a.as_i32x8(), b.as_i32x8()))
+    }
+}
+/// Compares packed 16-bit integers in `a` and `b` for greater-than.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi16)
+pub fn _mm256_cmpgt_epi16(a: __m256i, b: __m256i) -> __m256i {
+    {
+        transmute::<i16x16, _>(simd_gt(a.as_i16x16(), b.as_i16x16()))
+    }
+}
+/// Compares packed 8-bit integers in `a` and `b` for greater-than.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi8)
+pub fn _mm256_cmpgt_epi8(a: __m256i, b: __m256i) -> __m256i {
+    {
+        transmute::<i8x32, _>(simd_gt(a.as_i8x32(), b.as_i8x32()))
+    }
+}
+/// Sign-extend 16-bit integers to 32-bit integers.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi16_epi32)
+pub fn _mm256_cvtepi16_epi32(a: __m128i) -> __m256i {
+    {
+        transmute::<i32x8, _>(simd_cast(a.as_i16x8()))
+    }
+}
+/// Sign-extend 16-bit integers to 64-bit integers.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi16_epi64)
+pub fn _mm256_cvtepi16_epi64(a: __m128i) -> __m256i {
+    {
+        let a = a.as_i16x8();
+        let v64: i16x4 = simd_shuffle(a, a, [0, 1, 2, 3]);
+        transmute::<i64x4, _>(simd_cast(v64))
+    }
+}
+/// Sign-extend 32-bit integers to 64-bit integers.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi32_epi64)
+pub fn _mm256_cvtepi32_epi64(a: __m128i) -> __m256i {
+    {
+        transmute::<i64x4, _>(simd_cast(a.as_i32x4()))
+    }
+}
+/// Sign-extend 8-bit integers to 16-bit integers.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi8_epi16)
+pub fn _mm256_cvtepi8_epi16(a: __m128i) -> __m256i {
+    {
+        transmute::<i16x16, _>(simd_cast(a.as_i8x16()))
+    }
+}
+/// Sign-extend 8-bit integers to 32-bit integers.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi8_epi32)
+pub fn _mm256_cvtepi8_epi32(a: __m128i) -> __m256i {
+    {
+        let a = a.as_i8x16();
+        let v64: i8x8 = simd_shuffle(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
+        transmute::<i32x8, _>(simd_cast(v64))
+    }
+}
+/// Sign-extend 8-bit integers to 64-bit integers.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi8_epi64)
+pub fn _mm256_cvtepi8_epi64(a: __m128i) -> __m256i {
+    {
+        let a = a.as_i8x16();
+        let v32: i8x4 = simd_shuffle(a, a, [0, 1, 2, 3]);
+        transmute::<i64x4, _>(simd_cast(v32))
+    }
+}
+/// Zeroes extend packed unsigned 16-bit integers in `a` to packed 32-bit
+/// integers, and stores the results in `dst`.
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu16_epi32) +pub fn _mm256_cvtepu16_epi32(a: __m128i) -> __m256i { + { + transmute(simd_cast::<8, _, u32>(a.as_u16x8())) + } +} +/// Zero-extend the lower four unsigned 16-bit integers in `a` to 64-bit +/// integers. The upper four elements of `a` are unused. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu16_epi64) +pub fn _mm256_cvtepu16_epi64(a: __m128i) -> __m256i { + { + let a = a.as_u16x8(); + let v64: u16x4 = simd_shuffle(a, a, [0, 1, 2, 3]); + transmute(simd_cast::<4, _, u64>(v64)) + } +} +/// Zero-extend unsigned 32-bit integers in `a` to 64-bit integers. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu32_epi64) +pub fn _mm256_cvtepu32_epi64(a: __m128i) -> __m256i { + { + transmute(simd_cast::<4, _, u64>(a.as_u32x4())) + } +} +/// Zero-extend unsigned 8-bit integers in `a` to 16-bit integers. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu8_epi16) +pub fn _mm256_cvtepu8_epi16(a: __m128i) -> __m256i { + { + transmute(simd_cast::<16, _, u16>(a.as_u8x16())) + } +} +/// Zero-extend the lower eight unsigned 8-bit integers in `a` to 32-bit +/// integers. The upper eight elements of `a` are unused. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu8_epi32) +pub fn _mm256_cvtepu8_epi32(a: __m128i) -> __m256i { + { + let a = a.as_u8x16(); + let v64: u8x8 = simd_shuffle(a, a, [0, 1, 2, 3, 4, 5, 6, 7]); + transmute(simd_cast::<8, _, u32>(v64)) + } +} +/// Zero-extend the lower four unsigned 8-bit integers in `a` to 64-bit +/// integers. The upper twelve elements of `a` are unused. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu8_epi64) +pub fn _mm256_cvtepu8_epi64(a: __m128i) -> __m256i { + { + let a = a.as_u8x16(); + let v32: u8x4 = simd_shuffle(a, a, [0, 1, 2, 3]); + transmute(simd_cast::<4, _, u64>(v32)) + } +} +/// Extracts 128 bits (of integer data) from `a` selected with `IMM1`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extracti128_si256) +pub fn _mm256_extracti128_si256(a: __m256i) -> __m128i { + static_assert_uimm_bits!(IMM1, 1); + { + let a = a.as_i64x4(); + let b = i64x4::ZERO(); + let dst: i64x2 = simd_shuffle(a, b, [[0, 1], [2, 3]][IMM1 as usize]); + transmute(dst) + } +} +/// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_epi16) +pub fn _mm256_hadd_epi16(a: __m256i, b: __m256i) -> __m256i { + { + transmute(phaddw(a.as_i16x16(), b.as_i16x16())) + } +} +/// Horizontally adds adjacent pairs of 32-bit integers in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_epi32) +pub fn _mm256_hadd_epi32(a: __m256i, b: __m256i) -> __m256i { + { + transmute(phaddd(a.as_i32x8(), b.as_i32x8())) + } +} +/// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b` +/// using saturation. 
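+///
+/// Illustrative per-lane sketch (the helper name is made up): within each
+/// 128-bit lane, pairs from `a` fill the low four results and pairs from `b`
+/// the high four, each sum saturating to the `i16` range.
+///
+/// ```
+/// fn hadds_lane(a: [i16; 8], b: [i16; 8]) -> [i16; 8] {
+///     let mut r = [0i16; 8];
+///     for i in 0..4 {
+///         r[i] = a[2 * i].saturating_add(a[2 * i + 1]);
+///         r[i + 4] = b[2 * i].saturating_add(b[2 * i + 1]);
+///     }
+///     r
+/// }
+/// ```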
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadds_epi16) +pub fn _mm256_hadds_epi16(a: __m256i, b: __m256i) -> __m256i { + { + transmute(phaddsw(a.as_i16x16(), b.as_i16x16())) + } +} +/// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_epi16) +pub fn _mm256_hsub_epi16(a: __m256i, b: __m256i) -> __m256i { + { + transmute(phsubw(a.as_i16x16(), b.as_i16x16())) + } +} +/// Horizontally subtract adjacent pairs of 32-bit integers in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_epi32) +pub fn _mm256_hsub_epi32(a: __m256i, b: __m256i) -> __m256i { + { + transmute(phsubd(a.as_i32x8(), b.as_i32x8())) + } +} +/// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b` +/// using saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsubs_epi16) +pub fn _mm256_hsubs_epi16(a: __m256i, b: __m256i) -> __m256i { + { + transmute(phsubsw(a.as_i16x16(), b.as_i16x16())) + } +} +/// Copies `a` to `dst`, then insert 128 bits (of integer data) from `b` at the +/// location specified by `IMM1`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_inserti128_si256) +pub fn _mm256_inserti128_si256(a: __m256i, b: __m128i) -> __m256i { + static_assert_uimm_bits!(IMM1, 1); + { + let a = a.as_i64x4(); + let b = _mm256_castsi128_si256(b).as_i64x4(); + let dst: i64x4 = simd_shuffle(a, b, [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize]); + transmute(dst) + } +} +/// Multiplies packed signed 16-bit integers in `a` and `b`, producing +/// intermediate signed 32-bit integers. Horizontally add adjacent pairs +/// of intermediate 32-bit integers. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_madd_epi16) +pub fn _mm256_madd_epi16(a: __m256i, b: __m256i) -> __m256i { + { + transmute(pmaddwd(a.as_i16x16(), b.as_i16x16())) + } +} +/// Vertically multiplies each unsigned 8-bit integer from `a` with the +/// corresponding signed 8-bit integer from `b`, producing intermediate +/// signed 16-bit integers. Horizontally add adjacent pairs of intermediate +/// signed 16-bit integers +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maddubs_epi16) +pub fn _mm256_maddubs_epi16(a: __m256i, b: __m256i) -> __m256i { + { + transmute(pmaddubsw(a.as_u8x32(), b.as_u8x32())) + } +} +/// Compares packed 16-bit integers in `a` and `b`, and returns the packed +/// maximum values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epi16) +pub fn _mm256_max_epi16(a: __m256i, b: __m256i) -> __m256i { + { + let a = a.as_i16x16(); + let b = b.as_i16x16(); + transmute(simd_select(simd_gt(a, b), a, b)) + } +} +/// Compares packed 32-bit integers in `a` and `b`, and returns the packed +/// maximum values. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epi32) +pub fn _mm256_max_epi32(a: __m256i, b: __m256i) -> __m256i { + { + let a = a.as_i32x8(); + let b = b.as_i32x8(); + transmute(simd_select(simd_gt(a, b), a, b)) + } +} +/// Compares packed 8-bit integers in `a` and `b`, and returns the packed +/// maximum values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epi8) +pub fn _mm256_max_epi8(a: __m256i, b: __m256i) -> __m256i { + { + let a = a.as_i8x32(); + let b = b.as_i8x32(); + transmute(simd_select(simd_gt(a, b), a, b)) + } +} +/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns +/// the packed maximum values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epu16) +pub fn _mm256_max_epu16(a: __m256i, b: __m256i) -> __m256i { + { + let a = a.as_u16x16(); + let b = b.as_u16x16(); + transmute(simd_select(simd_gt(a, b), a, b)) + } +} +/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns +/// the packed maximum values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epu32) +pub fn _mm256_max_epu32(a: __m256i, b: __m256i) -> __m256i { + { + let a = a.as_u32x8(); + let b = b.as_u32x8(); + transmute(simd_select(simd_gt(a, b), a, b)) + } +} +/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns +/// the packed maximum values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epu8) +pub fn _mm256_max_epu8(a: __m256i, b: __m256i) -> __m256i { + { + let a = a.as_u8x32(); + let b = b.as_u8x32(); + transmute(simd_select(simd_gt(a, b), a, b)) + } +} +/// Compares packed 16-bit integers in `a` and `b`, and returns the packed +/// minimum values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epi16) +pub fn _mm256_min_epi16(a: __m256i, b: __m256i) -> __m256i { + { + let a = a.as_i16x16(); + let b = b.as_i16x16(); + transmute(simd_select(simd_lt(a, b), a, b)) + } +} +/// Compares packed 32-bit integers in `a` and `b`, and returns the packed +/// minimum values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epi32) +pub fn _mm256_min_epi32(a: __m256i, b: __m256i) -> __m256i { + { + let a = a.as_i32x8(); + let b = b.as_i32x8(); + transmute(simd_select(simd_lt(a, b), a, b)) + } +} +/// Compares packed 8-bit integers in `a` and `b`, and returns the packed +/// minimum values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epi8) +pub fn _mm256_min_epi8(a: __m256i, b: __m256i) -> __m256i { + { + let a = a.as_i8x32(); + let b = b.as_i8x32(); + transmute(simd_select(simd_lt(a, b), a, b)) + } +} +/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns +/// the packed minimum values. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epu16) +pub fn _mm256_min_epu16(a: __m256i, b: __m256i) -> __m256i { + { + let a = a.as_u16x16(); + let b = b.as_u16x16(); + transmute(simd_select(simd_lt(a, b), a, b)) + } +} +/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns +/// the packed minimum values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epu32) +pub fn _mm256_min_epu32(a: __m256i, b: __m256i) -> __m256i { + { + let a = a.as_u32x8(); + let b = b.as_u32x8(); + transmute(simd_select(simd_lt(a, b), a, b)) + } +} +/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns +/// the packed minimum values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epu8) +pub fn _mm256_min_epu8(a: __m256i, b: __m256i) -> __m256i { + { + let a = a.as_u8x32(); + let b = b.as_u8x32(); + transmute(simd_select(simd_lt(a, b), a, b)) + } +} +/// Creates mask from the most significant bit of each 8-bit element in `a`, +/// return the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movemask_epi8) +pub fn _mm256_movemask_epi8(a: __m256i) -> i32 { + { + let z = i8x32::ZERO(); + let m: i8x32 = simd_lt(a.as_i8x32(), z); + simd_bitmask_little!(31, m, u32) as i32 + } +} +/// Computes the sum of absolute differences (SADs) of quadruplets of unsigned +/// 8-bit integers in `a` compared to those in `b`, and stores the 16-bit +/// results in dst. Eight SADs are performed for each 128-bit lane using one +/// quadruplet from `b` and eight quadruplets from `a`. One quadruplet is +/// selected from `b` starting at on the offset specified in `imm8`. Eight +/// quadruplets are formed from sequential 8-bit integers selected from `a` +/// starting at the offset specified in `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mpsadbw_epu8) +pub fn _mm256_mpsadbw_epu8(a: __m256i, b: __m256i) -> __m256i { + static_assert_uimm_bits!(IMM8, 8); + { + transmute(mpsadbw(a.as_u8x32(), b.as_u8x32(), IMM8 as i8)) + } +} +/// Multiplies the low 32-bit integers from each packed 64-bit element in +/// `a` and `b` +/// +/// Returns the 64-bit results. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_epi32) +pub fn _mm256_mul_epi32(a: __m256i, b: __m256i) -> __m256i { + { + let a = simd_cast::<4, _, i64>(simd_cast::<4, _, i32>(a.as_i64x4())); + let b = simd_cast::<4, _, i64>(simd_cast::<4, _, i32>(b.as_i64x4())); + transmute(simd_mul(a, b)) + } +} +/// Multiplies the low unsigned 32-bit integers from each packed 64-bit +/// element in `a` and `b` +/// +/// Returns the unsigned 64-bit results. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_epu32) +pub fn _mm256_mul_epu32(a: __m256i, b: __m256i) -> __m256i { + { + let a = a.as_u64x4(); + let b = b.as_u64x4(); + let mask = u64x4::splat(u32::MAX.into()); + transmute(simd_mul(simd_and(a, mask), simd_and(b, mask))) + } +} +/// Multiplies the packed 16-bit integers in `a` and `b`, producing +/// intermediate 32-bit integers and returning the high 16 bits of the +/// intermediate integers. 
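+///
+/// Per element this amounts to the following scalar computation (sketch only;
+/// the helper name is not part of the model):
+///
+/// ```
+/// fn mulhi_i16(a: i16, b: i16) -> i16 {
+///     // exact 32-bit product, keep the upper 16 bits
+///     (((a as i32) * (b as i32)) >> 16) as i16
+/// }
+/// ```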
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhi_epi16) +pub fn _mm256_mulhi_epi16(a: __m256i, b: __m256i) -> __m256i { + { + let a = simd_cast::<16, _, i32>(a.as_i16x16()); + let b = simd_cast::<16, _, i32>(b.as_i16x16()); + let r = simd_shr(simd_mul(a, b), i32x16::splat(16)); + transmute(simd_cast::<16, i32, i16>(r)) + } +} +/// Multiplies the packed unsigned 16-bit integers in `a` and `b`, producing +/// intermediate 32-bit integers and returning the high 16 bits of the +/// intermediate integers. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhi_epu16) +pub fn _mm256_mulhi_epu16(a: __m256i, b: __m256i) -> __m256i { + { + let a = simd_cast::<16, _, u32>(a.as_u16x16()); + let b = simd_cast::<16, _, u32>(b.as_u16x16()); + let r = simd_shr(simd_mul(a, b), u32x16::splat(16)); + transmute(simd_cast::<16, u32, u16>(r)) + } +} +/// Multiplies the packed 16-bit integers in `a` and `b`, producing +/// intermediate 32-bit integers, and returns the low 16 bits of the +/// intermediate integers +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mullo_epi16) +pub fn _mm256_mullo_epi16(a: __m256i, b: __m256i) -> __m256i { + { + transmute(simd_mul(a.as_i16x16(), b.as_i16x16())) + } +} +/// Multiplies the packed 32-bit integers in `a` and `b`, producing +/// intermediate 64-bit integers, and returns the low 32 bits of the +/// intermediate integers +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mullo_epi32) +pub fn _mm256_mullo_epi32(a: __m256i, b: __m256i) -> __m256i { + { + transmute(simd_mul(a.as_i32x8(), b.as_i32x8())) + } +} +/// Multiplies packed 16-bit integers in `a` and `b`, producing +/// intermediate signed 32-bit integers. Truncate each intermediate +/// integer to the 18 most significant bits, round by adding 1, and +/// return bits `[16:1]`. 
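+///
+/// A scalar sketch of the rounding step (illustrative only; the helper name is
+/// made up):
+///
+/// ```
+/// fn mulhrs_i16(a: i16, b: i16) -> i16 {
+///     // 32-bit product, drop the low 14 bits, round by adding 1, take bits [16:1]
+///     (((((a as i32) * (b as i32)) >> 14) + 1) >> 1) as i16
+/// }
+/// ```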
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhrs_epi16) +pub fn _mm256_mulhrs_epi16(a: __m256i, b: __m256i) -> __m256i { + { + transmute(pmulhrsw(a.as_i16x16(), b.as_i16x16())) + } +} +/// Computes the bitwise OR of 256 bits (representing integer data) in `a` +/// and `b` +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_or_si256) +pub fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i { + { + transmute(simd_or(a.as_i32x8(), b.as_i32x8())) + } +} +/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers +/// using signed saturation +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi16) +pub fn _mm256_packs_epi16(a: __m256i, b: __m256i) -> __m256i { + { + transmute(packsswb(a.as_i16x16(), b.as_i16x16())) + } +} +/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers +/// using signed saturation +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi32) +pub fn _mm256_packs_epi32(a: __m256i, b: __m256i) -> __m256i { + { + transmute(packssdw(a.as_i32x8(), b.as_i32x8())) + } +} +/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers +/// using unsigned saturation +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi16) +pub fn _mm256_packus_epi16(a: __m256i, b: __m256i) -> __m256i { + { + transmute(packuswb(a.as_i16x16(), b.as_i16x16())) + } +} +/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers +/// using unsigned saturation +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi32) +pub fn _mm256_packus_epi32(a: __m256i, b: __m256i) -> __m256i { + { + transmute(packusdw(a.as_i32x8(), b.as_i32x8())) + } +} +/// Permutes packed 32-bit integers from `a` according to the content of `b`. +/// +/// The last 3 bits of each integer of `b` are used as addresses into the 8 +/// integers of `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar8x32_epi32) +pub fn _mm256_permutevar8x32_epi32(a: __m256i, b: __m256i) -> __m256i { + { + transmute(permd(a.as_u32x8(), b.as_u32x8())) + } +} +/// Permutes 64-bit integers from `a` using control mask `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute4x64_epi64) +pub fn _mm256_permute4x64_epi64(a: __m256i) -> __m256i { + static_assert_uimm_bits!(IMM8, 8); + { + let zero = i64x4::ZERO(); + let r: i64x4 = simd_shuffle( + a.as_i64x4(), + zero, + [ + IMM8 as u32 & 0b11, + (IMM8 as u32 >> 2) & 0b11, + (IMM8 as u32 >> 4) & 0b11, + (IMM8 as u32 >> 6) & 0b11, + ], + ); + transmute(r) + } +} +/// Shuffles 128-bits of integer data selected by `imm8` from `a` and `b`. 
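+///
+/// Illustrative sketch of how the control byte is decoded (the helper name and
+/// the `u128`-pair representation are made up for exposition): each nibble
+/// selects one of the four source halves, and its bit 3 zeroes that half
+/// instead.
+///
+/// ```
+/// fn permute2x128(a: [u128; 2], b: [u128; 2], imm8: u8) -> [u128; 2] {
+///     let pick = |ctl: u8| -> u128 {
+///         if ctl & 0b1000 != 0 {
+///             0
+///         } else {
+///             [a[0], a[1], b[0], b[1]][(ctl & 0b11) as usize]
+///         }
+///     };
+///     [pick(imm8 & 0x0f), pick(imm8 >> 4)]
+/// }
+/// ```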
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2x128_si256) +pub fn _mm256_permute2x128_si256(a: __m256i, b: __m256i) -> __m256i { + static_assert_uimm_bits!(IMM8, 8); + { + transmute(vperm2i128(a.as_i64x4(), b.as_i64x4(), IMM8 as i8)) + } +} +/// Shuffles 64-bit floating-point elements in `a` across lanes using the +/// control in `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute4x64_pd) +// NOTE: Not modeled yet +// pub fn _mm256_permute4x64_pd(a: __m256d) -> __m256d { +// static_assert_uimm_bits!(IMM8, 8); +// { +// transmute(simd_shuffle( +// a, _mm256_undefined_pd(), [IMM8 as u32 & 0b11, (IMM8 as u32 >> 2) & 0b11, +// (IMM8 as u32 >> 4) & 0b11, (IMM8 as u32 >> 6) & 0b11,], +// )) +// } +// } + +/// Shuffles eight 32-bit floating-point elements in `a` across lanes using +/// the corresponding 32-bit integer index in `idx`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar8x32_ps) +// NOTE: Not modeled yet +// pub fn _mm256_permutevar8x32_ps(a: __m256, idx: __m256i) -> __m256 { +// { permps(a, idx.as_i32x8()) } +// } + +/// Computes the absolute differences of packed unsigned 8-bit integers in `a` +/// and `b`, then horizontally sum each consecutive 8 differences to +/// produce four unsigned 16-bit integers, and pack these unsigned 16-bit +/// integers in the low 16 bits of the 64-bit return value +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sad_epu8) +pub fn _mm256_sad_epu8(a: __m256i, b: __m256i) -> __m256i { + { + transmute(psadbw(a.as_u8x32(), b.as_u8x32())) + } +} +/// Shuffles bytes from `a` according to the content of `b`. +/// +/// For each of the 128-bit low and high halves of the vectors, the last +/// 4 bits of each byte of `b` are used as addresses into the respective +/// low or high 16 bytes of `a`. That is, the halves are shuffled separately. +/// +/// In addition, if the highest significant bit of a byte of `b` is set, the +/// respective destination byte is set to 0. +/// +/// Picturing `a` and `b` as `[u8; 32]`, `_mm256_shuffle_epi8` is logically +/// equivalent to: +/// +/// ``` +/// fn mm256_shuffle_epi8(a: [u8; 32], b: [u8; 32]) -> [u8; 32] { +/// let mut r = [0; 32]; +/// for i in 0..16 { +/// // if the most significant bit of b is set, +/// // then the destination byte is set to 0. +/// if b[i] & 0x80 == 0u8 { +/// r[i] = a[(b[i] % 16) as usize]; +/// } +/// if b[i + 16] & 0x80 == 0u8 { +/// r[i + 16] = a[(b[i + 16] % 16 + 16) as usize]; +/// } +/// } +/// r +/// } +/// ``` +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_epi8) +pub fn _mm256_shuffle_epi8(a: __m256i, b: __m256i) -> __m256i { + { + transmute(pshufb(a.as_u8x32(), b.as_u8x32())) + } +} +/// Shuffles 32-bit integers in 128-bit lanes of `a` using the control in +/// `imm8`. 
+/// +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_epi32) +pub fn _mm256_shuffle_epi32(a: __m256i) -> __m256i { + static_assert_uimm_bits!(MASK, 8); + { + let r: i32x8 = simd_shuffle( + a.as_i32x8(), + a.as_i32x8(), + [ + MASK as u32 & 0b11, + (MASK as u32 >> 2) & 0b11, + (MASK as u32 >> 4) & 0b11, + (MASK as u32 >> 6) & 0b11, + (MASK as u32 & 0b11) + 4, + ((MASK as u32 >> 2) & 0b11) + 4, + ((MASK as u32 >> 4) & 0b11) + 4, + ((MASK as u32 >> 6) & 0b11) + 4, + ], + ); + transmute(r) + } +} +/// Shuffles 16-bit integers in the high 64 bits of 128-bit lanes of `a` using +/// the control in `imm8`. The low 64 bits of 128-bit lanes of `a` are copied +/// to the output. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shufflehi_epi16) +pub fn _mm256_shufflehi_epi16(a: __m256i) -> __m256i { + static_assert_uimm_bits!(IMM8, 8); + { + let a = a.as_i16x16(); + let r: i16x16 = simd_shuffle( + a, + a, + [ + 0, + 1, + 2, + 3, + 4 + (IMM8 as u32 & 0b11), + 4 + ((IMM8 as u32 >> 2) & 0b11), + 4 + ((IMM8 as u32 >> 4) & 0b11), + 4 + ((IMM8 as u32 >> 6) & 0b11), + 8, + 9, + 10, + 11, + 12 + (IMM8 as u32 & 0b11), + 12 + ((IMM8 as u32 >> 2) & 0b11), + 12 + ((IMM8 as u32 >> 4) & 0b11), + 12 + ((IMM8 as u32 >> 6) & 0b11), + ], + ); + transmute(r) + } +} +/// Shuffles 16-bit integers in the low 64 bits of 128-bit lanes of `a` using +/// the control in `imm8`. The high 64 bits of 128-bit lanes of `a` are copied +/// to the output. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shufflelo_epi16) +pub fn _mm256_shufflelo_epi16(a: __m256i) -> __m256i { + static_assert_uimm_bits!(IMM8, 8); + { + let a = a.as_i16x16(); + let r: i16x16 = simd_shuffle( + a, + a, + [ + 0 + (IMM8 as u32 & 0b11), + 0 + ((IMM8 as u32 >> 2) & 0b11), + 0 + ((IMM8 as u32 >> 4) & 0b11), + 0 + ((IMM8 as u32 >> 6) & 0b11), + 4, + 5, + 6, + 7, + 8 + (IMM8 as u32 & 0b11), + 8 + ((IMM8 as u32 >> 2) & 0b11), + 8 + ((IMM8 as u32 >> 4) & 0b11), + 8 + ((IMM8 as u32 >> 6) & 0b11), + 12, + 13, + 14, + 15, + ], + ); + transmute(r) + } +} +/// Negates packed 16-bit integers in `a` when the corresponding signed +/// 16-bit integer in `b` is negative, and returns the results. +/// Results are zeroed out when the corresponding element in `b` is zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sign_epi16) +pub fn _mm256_sign_epi16(a: __m256i, b: __m256i) -> __m256i { + { + transmute(psignw(a.as_i16x16(), b.as_i16x16())) + } +} +/// Negates packed 32-bit integers in `a` when the corresponding signed +/// 32-bit integer in `b` is negative, and returns the results. +/// Results are zeroed out when the corresponding element in `b` is zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sign_epi32) +pub fn _mm256_sign_epi32(a: __m256i, b: __m256i) -> __m256i { + { + transmute(psignd(a.as_i32x8(), b.as_i32x8())) + } +} +/// Negates packed 8-bit integers in `a` when the corresponding signed +/// 8-bit integer in `b` is negative, and returns the results. +/// Results are zeroed out when the corresponding element in `b` is zero. 
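+///
+/// Per element this is roughly the following scalar rule (sketch only; the
+/// helper name is made up):
+///
+/// ```
+/// fn sign_i8(a: i8, b: i8) -> i8 {
+///     if b < 0 {
+///         a.wrapping_neg() // i8::MIN stays i8::MIN under two's-complement negation
+///     } else if b == 0 {
+///         0
+///     } else {
+///         a
+///     }
+/// }
+/// ```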
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sign_epi8) +pub fn _mm256_sign_epi8(a: __m256i, b: __m256i) -> __m256i { + { + transmute(psignb(a.as_i8x32(), b.as_i8x32())) + } +} +/// Shifts packed 16-bit integers in `a` left by `count` while +/// shifting in zeros, and returns the result +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sll_epi16) +pub fn _mm256_sll_epi16(a: __m256i, count: __m128i) -> __m256i { + { + transmute(psllw(a.as_i16x16(), count.as_i16x8())) + } +} +/// Shifts packed 32-bit integers in `a` left by `count` while +/// shifting in zeros, and returns the result +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sll_epi32) +pub fn _mm256_sll_epi32(a: __m256i, count: __m128i) -> __m256i { + { + transmute(pslld(a.as_i32x8(), count.as_i32x4())) + } +} +/// Shifts packed 64-bit integers in `a` left by `count` while +/// shifting in zeros, and returns the result +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sll_epi64) +pub fn _mm256_sll_epi64(a: __m256i, count: __m128i) -> __m256i { + { + transmute(psllq(a.as_i64x4(), count.as_i64x2())) + } +} +/// Shifts packed 16-bit integers in `a` left by `IMM8` while +/// shifting in zeros, return the results; +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_epi16) +pub fn _mm256_slli_epi16(a: __m256i) -> __m256i { + static_assert_uimm_bits!(IMM8, 8); + { + if IMM8 >= 16 { + _mm256_setzero_si256() + } else { + transmute(simd_shl(a.as_u16x16(), u16x16::splat(IMM8 as u16))) + } + } +} +/// Shifts packed 32-bit integers in `a` left by `IMM8` while +/// shifting in zeros, return the results; +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_epi32) +pub fn _mm256_slli_epi32(a: __m256i) -> __m256i { + { + static_assert_uimm_bits!(IMM8, 8); + if IMM8 >= 32 { + _mm256_setzero_si256() + } else { + transmute(simd_shl(a.as_u32x8(), u32x8::splat(IMM8 as u32))) + } + } +} +/// Shifts packed 64-bit integers in `a` left by `IMM8` while +/// shifting in zeros, return the results; +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_epi64) +pub fn _mm256_slli_epi64(a: __m256i) -> __m256i { + { + static_assert_uimm_bits!(IMM8, 8); + if IMM8 >= 64 { + _mm256_setzero_si256() + } else { + transmute(simd_shl(a.as_u64x4(), u64x4::splat(IMM8 as u64))) + } + } +} +/// Shifts 128-bit lanes in `a` left by `imm8` bytes while shifting in zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_si256) +pub fn _mm256_slli_si256(a: __m256i) -> __m256i { + static_assert_uimm_bits!(IMM8, 8); + _mm256_bslli_epi128::(a) +} + +/// Shifts 128-bit lanes in `a` left by `imm8` bytes while shifting in zeros. 
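+///
+/// Illustrative per-lane sketch (the helper name is made up): bytes move
+/// towards the high end of each 128-bit lane and zeros are shifted in at the
+/// low end.
+///
+/// ```
+/// fn bslli_lane(a: [u8; 16], shift: usize) -> [u8; 16] {
+///     let mut r = [0u8; 16];
+///     for i in 0..16 {
+///         if i >= shift {
+///             r[i] = a[i - shift];
+///         }
+///     }
+///     r
+/// }
+/// ```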
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_bslli_epi128) +pub fn _mm256_bslli_epi128(a: __m256i) -> __m256i { + static_assert_uimm_bits!(IMM8, 8); + const fn mask(shift: i32, i: u32) -> u32 { + let shift = shift as u32 & 0xff; + if shift > 15 || i % 16 < shift { + 0 + } else { + 32 + (i - shift) + } + } + { + let a = a.as_i8x32(); + let r: i8x32 = simd_shuffle( + i8x32::ZERO(), + a, + [ + mask(IMM8, 0), + mask(IMM8, 1), + mask(IMM8, 2), + mask(IMM8, 3), + mask(IMM8, 4), + mask(IMM8, 5), + mask(IMM8, 6), + mask(IMM8, 7), + mask(IMM8, 8), + mask(IMM8, 9), + mask(IMM8, 10), + mask(IMM8, 11), + mask(IMM8, 12), + mask(IMM8, 13), + mask(IMM8, 14), + mask(IMM8, 15), + mask(IMM8, 16), + mask(IMM8, 17), + mask(IMM8, 18), + mask(IMM8, 19), + mask(IMM8, 20), + mask(IMM8, 21), + mask(IMM8, 22), + mask(IMM8, 23), + mask(IMM8, 24), + mask(IMM8, 25), + mask(IMM8, 26), + mask(IMM8, 27), + mask(IMM8, 28), + mask(IMM8, 29), + mask(IMM8, 30), + mask(IMM8, 31), + ], + ); + transmute(r) + } +} +/// Shifts packed 32-bit integers in `a` left by the amount +/// specified by the corresponding element in `count` while +/// shifting in zeros, and returns the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sllv_epi32) +pub fn _mm_sllv_epi32(a: __m128i, count: __m128i) -> __m128i { + { + transmute(psllvd(a.as_i32x4(), count.as_i32x4())) + } +} +/// Shifts packed 32-bit integers in `a` left by the amount +/// specified by the corresponding element in `count` while +/// shifting in zeros, and returns the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sllv_epi32) +pub fn _mm256_sllv_epi32(a: __m256i, count: __m256i) -> __m256i { + { + transmute(psllvd256(a.as_i32x8(), count.as_i32x8())) + } +} +/// Shifts packed 64-bit integers in `a` left by the amount +/// specified by the corresponding element in `count` while +/// shifting in zeros, and returns the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sllv_epi64) +pub fn _mm_sllv_epi64(a: __m128i, count: __m128i) -> __m128i { + { + transmute(psllvq(a.as_i64x2(), count.as_i64x2())) + } +} +/// Shifts packed 64-bit integers in `a` left by the amount +/// specified by the corresponding element in `count` while +/// shifting in zeros, and returns the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sllv_epi64) +pub fn _mm256_sllv_epi64(a: __m256i, count: __m256i) -> __m256i { + { + transmute(psllvq256(a.as_i64x4(), count.as_i64x4())) + } +} +/// Shifts packed 16-bit integers in `a` right by `count` while +/// shifting in sign bits. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sra_epi16) +pub fn _mm256_sra_epi16(a: __m256i, count: __m128i) -> __m256i { + { + transmute(psraw(a.as_i16x16(), count.as_i16x8())) + } +} +/// Shifts packed 32-bit integers in `a` right by `count` while +/// shifting in sign bits. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sra_epi32) +pub fn _mm256_sra_epi32(a: __m256i, count: __m128i) -> __m256i { + { + transmute(psrad(a.as_i32x8(), count.as_i32x4())) + } +} +/// Shifts packed 16-bit integers in `a` right by `IMM8` while +/// shifting in sign bits. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srai_epi16) +pub fn _mm256_srai_epi16(a: __m256i) -> __m256i { + static_assert_uimm_bits!(IMM8, 8); + { + transmute(simd_shr(a.as_i16x16(), i16x16::splat(IMM8.min(15) as i16))) + } +} +/// Shifts packed 32-bit integers in `a` right by `IMM8` while +/// shifting in sign bits. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srai_epi32) +pub fn _mm256_srai_epi32(a: __m256i) -> __m256i { + static_assert_uimm_bits!(IMM8, 8); + { + transmute(simd_shr(a.as_i32x8(), i32x8::splat(IMM8.min(31)))) + } +} +/// Shifts packed 32-bit integers in `a` right by the amount specified by the +/// corresponding element in `count` while shifting in sign bits. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srav_epi32) +pub fn _mm_srav_epi32(a: __m128i, count: __m128i) -> __m128i { + { + transmute(psravd(a.as_i32x4(), count.as_i32x4())) + } +} +/// Shifts packed 32-bit integers in `a` right by the amount specified by the +/// corresponding element in `count` while shifting in sign bits. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srav_epi32) +pub fn _mm256_srav_epi32(a: __m256i, count: __m256i) -> __m256i { + { + transmute(psravd256(a.as_i32x8(), count.as_i32x8())) + } +} +/// Shifts 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_si256) +pub fn _mm256_srli_si256(a: __m256i) -> __m256i { + static_assert_uimm_bits!(IMM8, 8); + _mm256_bsrli_epi128::(a) +} +/// Shifts 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_bsrli_epi128) +pub fn _mm256_bsrli_epi128(a: __m256i) -> __m256i { + static_assert_uimm_bits!(IMM8, 8); + const fn mask(shift: i32, i: u32) -> u32 { + let shift = shift as u32 & 0xff; + if shift > 15 || (15 - (i % 16)) < shift { + 0 + } else { + 32 + (i + shift) + } + } + { + let a = a.as_i8x32(); + let r: i8x32 = simd_shuffle( + i8x32::ZERO(), + a, + [ + mask(IMM8, 0), + mask(IMM8, 1), + mask(IMM8, 2), + mask(IMM8, 3), + mask(IMM8, 4), + mask(IMM8, 5), + mask(IMM8, 6), + mask(IMM8, 7), + mask(IMM8, 8), + mask(IMM8, 9), + mask(IMM8, 10), + mask(IMM8, 11), + mask(IMM8, 12), + mask(IMM8, 13), + mask(IMM8, 14), + mask(IMM8, 15), + mask(IMM8, 16), + mask(IMM8, 17), + mask(IMM8, 18), + mask(IMM8, 19), + mask(IMM8, 20), + mask(IMM8, 21), + mask(IMM8, 22), + mask(IMM8, 23), + mask(IMM8, 24), + mask(IMM8, 25), + mask(IMM8, 26), + mask(IMM8, 27), + mask(IMM8, 28), + mask(IMM8, 29), + mask(IMM8, 30), + mask(IMM8, 31), + ], + ); + transmute(r) + } +} +/// Shifts packed 16-bit integers in `a` right by `count` while shifting in +/// zeros. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srl_epi16) +pub fn _mm256_srl_epi16(a: __m256i, count: __m128i) -> __m256i { + { + transmute(psrlw(a.as_i16x16(), count.as_i16x8())) + } +} +/// Shifts packed 32-bit integers in `a` right by `count` while shifting in +/// zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srl_epi32) +pub fn _mm256_srl_epi32(a: __m256i, count: __m128i) -> __m256i { + { + transmute(psrld(a.as_i32x8(), count.as_i32x4())) + } +} +/// Shifts packed 64-bit integers in `a` right by `count` while shifting in +/// zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srl_epi64) +pub fn _mm256_srl_epi64(a: __m256i, count: __m128i) -> __m256i { + { + transmute(psrlq(a.as_i64x4(), count.as_i64x2())) + } +} +/// Shifts packed 16-bit integers in `a` right by `IMM8` while shifting in +/// zeros +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_epi16) +pub fn _mm256_srli_epi16(a: __m256i) -> __m256i { + static_assert_uimm_bits!(IMM8, 8); + { + if IMM8 >= 16 { + _mm256_setzero_si256() + } else { + transmute(simd_shr(a.as_u16x16(), u16x16::splat(IMM8 as u16))) + } + } +} +/// Shifts packed 32-bit integers in `a` right by `IMM8` while shifting in +/// zeros +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_epi32) +pub fn _mm256_srli_epi32(a: __m256i) -> __m256i { + static_assert_uimm_bits!(IMM8, 8); + { + if IMM8 >= 32 { + _mm256_setzero_si256() + } else { + transmute(simd_shr(a.as_u32x8(), u32x8::splat(IMM8 as u32))) + } + } +} +/// Shifts packed 64-bit integers in `a` right by `IMM8` while shifting in +/// zeros +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_epi64) +pub fn _mm256_srli_epi64(a: __m256i) -> __m256i { + static_assert_uimm_bits!(IMM8, 8); + { + if IMM8 >= 64 { + _mm256_setzero_si256() + } else { + transmute(simd_shr(a.as_u64x4(), u64x4::splat(IMM8 as u64))) + } + } +} +/// Shifts packed 32-bit integers in `a` right by the amount specified by +/// the corresponding element in `count` while shifting in zeros, +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srlv_epi32) +pub fn _mm_srlv_epi32(a: __m128i, count: __m128i) -> __m128i { + { + transmute(psrlvd(a.as_i32x4(), count.as_i32x4())) + } +} +/// Shifts packed 32-bit integers in `a` right by the amount specified by +/// the corresponding element in `count` while shifting in zeros, +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srlv_epi32) +pub fn _mm256_srlv_epi32(a: __m256i, count: __m256i) -> __m256i { + { + transmute(psrlvd256(a.as_i32x8(), count.as_i32x8())) + } +} +/// Shifts packed 64-bit integers in `a` right by the amount specified by +/// the corresponding element in `count` while shifting in zeros, +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srlv_epi64) +pub fn _mm_srlv_epi64(a: __m128i, count: __m128i) -> __m128i { + { + transmute(psrlvq(a.as_i64x2(), count.as_i64x2())) + } +} +/// Shifts packed 64-bit integers in `a` right by the amount specified by +/// the 
corresponding element in `count` while shifting in zeros, +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srlv_epi64) +pub fn _mm256_srlv_epi64(a: __m256i, count: __m256i) -> __m256i { + { + transmute(psrlvq256(a.as_i64x4(), count.as_i64x4())) + } +} +/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a` +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi16) +pub fn _mm256_sub_epi16(a: __m256i, b: __m256i) -> __m256i { + { + transmute(simd_sub(a.as_i16x16(), b.as_i16x16())) + } +} +/// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a` +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi32) +pub fn _mm256_sub_epi32(a: __m256i, b: __m256i) -> __m256i { + { + transmute(simd_sub(a.as_i32x8(), b.as_i32x8())) + } +} +/// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a` +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi64) +pub fn _mm256_sub_epi64(a: __m256i, b: __m256i) -> __m256i { + { + transmute(simd_sub(a.as_i64x4(), b.as_i64x4())) + } +} +/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a` +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi8) +pub fn _mm256_sub_epi8(a: __m256i, b: __m256i) -> __m256i { + { + transmute(simd_sub(a.as_i8x32(), b.as_i8x32())) + } +} +/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in +/// `a` using saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epi16) +pub fn _mm256_subs_epi16(a: __m256i, b: __m256i) -> __m256i { + { + transmute(simd_saturating_sub(a.as_i16x16(), b.as_i16x16())) + } +} +/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in +/// `a` using saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epi8) +pub fn _mm256_subs_epi8(a: __m256i, b: __m256i) -> __m256i { + { + transmute(simd_saturating_sub(a.as_i8x32(), b.as_i8x32())) + } +} +/// Subtract packed unsigned 16-bit integers in `b` from packed 16-bit +/// integers in `a` using saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epu16) +pub fn _mm256_subs_epu16(a: __m256i, b: __m256i) -> __m256i { + { + transmute(simd_saturating_sub(a.as_u16x16(), b.as_u16x16())) + } +} +/// Subtract packed unsigned 8-bit integers in `b` from packed 8-bit +/// integers in `a` using saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epu8) +pub fn _mm256_subs_epu8(a: __m256i, b: __m256i) -> __m256i { + { + transmute(simd_saturating_sub(a.as_u8x32(), b.as_u8x32())) + } +} +/// Unpacks and interleave 8-bit integers from the high half of each +/// 128-bit lane in `a` and `b`. 
+/// +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi8) +pub fn _mm256_unpackhi_epi8(a: __m256i, b: __m256i) -> __m256i { + { + #[rustfmt::skip] + let r: i8x32 = simd_shuffle( + a.as_i8x32(), b.as_i8x32(), [8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, + 14, 46, 15, 47, 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, + 63,] + ); + transmute(r) + } +} +/// Unpacks and interleave 8-bit integers from the low half of each +/// 128-bit lane of `a` and `b`. +/// +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi8) +pub fn _mm256_unpacklo_epi8(a: __m256i, b: __m256i) -> __m256i { + { + #[rustfmt::skip] + let r: i8x32 = simd_shuffle( + a.as_i8x32(), b.as_i8x32(), [0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, + 7, 39, 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55,] + ); + transmute(r) + } +} +/// Unpacks and interleave 16-bit integers from the high half of each +/// 128-bit lane of `a` and `b`. +/// +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi16) +pub fn _mm256_unpackhi_epi16(a: __m256i, b: __m256i) -> __m256i { + { + let r: i16x16 = simd_shuffle( + a.as_i16x16(), + b.as_i16x16(), + [4, 20, 5, 21, 6, 22, 7, 23, 12, 28, 13, 29, 14, 30, 15, 31], + ); + transmute(r) + } +} +/// Unpacks and interleave 16-bit integers from the low half of each +/// 128-bit lane of `a` and `b`. +/// +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi16) +pub fn _mm256_unpacklo_epi16(a: __m256i, b: __m256i) -> __m256i { + { + let r: i16x16 = simd_shuffle( + a.as_i16x16(), + b.as_i16x16(), + [0, 16, 1, 17, 2, 18, 3, 19, 8, 24, 9, 25, 10, 26, 11, 27], + ); + transmute(r) + } +} +/// Unpacks and interleave 32-bit integers from the high half of each +/// 128-bit lane of `a` and `b`. +/// +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi32) +pub fn _mm256_unpackhi_epi32(a: __m256i, b: __m256i) -> __m256i { + { + let r: i32x8 = simd_shuffle(a.as_i32x8(), b.as_i32x8(), [2, 10, 3, 11, 6, 14, 7, 15]); + transmute(r) + } +} +/// Unpacks and interleave 32-bit integers from the low half of each +/// 128-bit lane of `a` and `b`. +/// +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi32) +pub fn _mm256_unpacklo_epi32(a: __m256i, b: __m256i) -> __m256i { + { + let r: i32x8 = simd_shuffle(a.as_i32x8(), b.as_i32x8(), [0, 8, 1, 9, 4, 12, 5, 13]); + transmute(r) + } +} +/// Unpacks and interleave 64-bit integers from the high half of each +/// 128-bit lane of `a` and `b`. +/// +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi64) +pub fn _mm256_unpackhi_epi64(a: __m256i, b: __m256i) -> __m256i { + { + let r: i64x4 = simd_shuffle(a.as_i64x4(), b.as_i64x4(), [1, 5, 3, 7]); + transmute(r) + } +} +/// Unpacks and interleave 64-bit integers from the low half of each +/// 128-bit lane of `a` and `b`. 
+/// +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi64) +pub fn _mm256_unpacklo_epi64(a: __m256i, b: __m256i) -> __m256i { + { + let r: i64x4 = simd_shuffle(a.as_i64x4(), b.as_i64x4(), [0, 4, 2, 6]); + transmute(r) + } +} +/// Computes the bitwise XOR of 256 bits (representing integer data) +/// in `a` and `b` +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_xor_si256) +pub fn _mm256_xor_si256(a: __m256i, b: __m256i) -> __m256i { + { + transmute(simd_xor(a.as_i64x4(), b.as_i64x4())) + } +} +/// Extracts an 8-bit integer from `a`, selected with `INDEX`. Returns a 32-bit +/// integer containing the zero-extended integer data. +/// +/// See [LLVM commit D20468](https://reviews.llvm.org/D20468). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extract_epi8) +pub fn _mm256_extract_epi8(a: __m256i) -> i32 { + static_assert_uimm_bits!(INDEX, 5); + { + simd_extract(a.as_u8x32(), INDEX as u32) as i32 + } +} +/// Extracts a 16-bit integer from `a`, selected with `INDEX`. Returns a 32-bit +/// integer containing the zero-extended integer data. +/// +/// See [LLVM commit D20468](https://reviews.llvm.org/D20468). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extract_epi16) +pub fn _mm256_extract_epi16(a: __m256i) -> i32 { + static_assert_uimm_bits!(INDEX, 4); + { + simd_extract(a.as_u16x16(), INDEX as u32) as i32 + } +} diff --git a/testable-simd-models/src/core_arch/x86/models/avx2_handwritten.rs b/testable-simd-models/src/core_arch/x86/models/avx2_handwritten.rs new file mode 100644 index 0000000000000..43f0a840b54bd --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/models/avx2_handwritten.rs @@ -0,0 +1,620 @@ +use crate::abstractions::{bit::MachineInteger, simd::*}; +pub fn phaddw(a: i16x16, b: i16x16) -> i16x16 { + i16x16::from_fn(|i| { + if i < 4 { + a[2 * i].wrapping_add(a[2 * i + 1]) + } else if i < 8 { + b[2 * (i - 4)].wrapping_add(b[2 * (i - 4) + 1]) + } else if i < 12 { + a[2 * (i - 4)].wrapping_add(a[2 * (i - 4) + 1]) + } else { + b[2 * (i - 8)].wrapping_add(b[2 * (i - 8) + 1]) + } + }) +} + +pub fn phaddd(a: i32x8, b: i32x8) -> i32x8 { + i32x8::from_fn(|i| { + if i < 2 { + a[2 * i].wrapping_add(a[2 * i + 1]) + } else if i < 4 { + b[2 * (i - 2)].wrapping_add(b[2 * (i - 2) + 1]) + } else if i < 6 { + a[2 * (i - 2)].wrapping_add(a[2 * (i - 2) + 1]) + } else { + b[2 * (i - 4)].wrapping_add(b[2 * (i - 4) + 1]) + } + }) +} + +pub fn phaddsw(a: i16x16, b: i16x16) -> i16x16 { + i16x16::from_fn(|i| { + if i < 4 { + a[2 * i].saturating_add(a[2 * i + 1]) + } else if i < 8 { + b[2 * (i - 4)].saturating_add(b[2 * (i - 4) + 1]) + } else if i < 12 { + a[2 * (i - 4)].saturating_add(a[2 * (i - 4) + 1]) + } else { + b[2 * (i - 8)].saturating_add(b[2 * (i - 8) + 1]) + } + }) +} + +pub fn phsubw(a: i16x16, b: i16x16) -> i16x16 { + i16x16::from_fn(|i| { + if i < 4 { + a[2 * i].wrapping_sub(a[2 * i + 1]) + } else if i < 8 { + b[2 * (i - 4)].wrapping_sub(b[2 * (i - 4) + 1]) + } else if i < 12 { + a[2 * (i - 4)].wrapping_sub(a[2 * (i - 4) + 1]) + } else { + b[2 * (i - 8)].wrapping_sub(b[2 * (i - 8) + 1]) + } + }) +} + +pub fn phsubd(a: i32x8, b: i32x8) -> i32x8 { + i32x8::from_fn(|i| { + if i < 2 { + a[2 * i].wrapping_sub(a[2 * i + 1]) + } else if i < 4 { + b[2 * (i - 2)].wrapping_sub(b[2 * (i - 2) + 1]) + } else 
if i < 6 { + a[2 * (i - 2)].wrapping_sub(a[2 * (i - 2) + 1]) + } else { + b[2 * (i - 4)].wrapping_sub(b[2 * (i - 4) + 1]) + } + }) +} + +pub fn phsubsw(a: i16x16, b: i16x16) -> i16x16 { + i16x16::from_fn(|i| { + if i < 4 { + a[2 * i].saturating_sub(a[2 * i + 1]) + } else if i < 8 { + b[2 * (i - 4)].saturating_sub(b[2 * (i - 4) + 1]) + } else if i < 12 { + a[2 * (i - 4)].saturating_sub(a[2 * (i - 4) + 1]) + } else { + b[2 * (i - 8)].saturating_sub(b[2 * (i - 8) + 1]) + } + }) +} +pub fn pmaddwd(a: i16x16, b: i16x16) -> i32x8 { + i32x8::from_fn(|i| { + (a[2 * i] as i32) * (b[2 * i] as i32) + (a[2 * i + 1] as i32) * (b[2 * i + 1] as i32) + }) +} + +pub fn pmaddubsw(a: u8x32, b: u8x32) -> i16x16 { + i16x16::from_fn(|i| { + ((a[2 * i] as u8 as u16 as i16) * (b[2 * i] as i8 as i16)) + .saturating_add((a[2 * i + 1] as u8 as u16 as i16) * (b[2 * i + 1] as i8 as i16)) + }) +} +pub fn packsswb(a: i16x16, b: i16x16) -> i8x32 { + i8x32::from_fn(|i| { + if i < 8 { + if a[i] > (i8::MAX as i16) { + i8::MAX + } else if a[i] < (i8::MIN as i16) { + i8::MIN + } else { + a[i] as i8 + } + } else if i < 16 { + if b[i - 8] > (i8::MAX as i16) { + i8::MAX + } else if b[i - 8] < (i8::MIN as i16) { + i8::MIN + } else { + b[i - 8] as i8 + } + } else if i < 24 { + if a[i - 8] > (i8::MAX as i16) { + i8::MAX + } else if a[i - 8] < (i8::MIN as i16) { + i8::MIN + } else { + a[i - 8] as i8 + } + } else { + if b[i - 16] > (i8::MAX as i16) { + i8::MAX + } else if b[i - 16] < (i8::MIN as i16) { + i8::MIN + } else { + b[i - 16] as i8 + } + } + }) +} + +pub fn packssdw(a: i32x8, b: i32x8) -> i16x16 { + i16x16::from_fn(|i| { + if i < 4 { + if a[i] > (i16::MAX as i32) { + i16::MAX + } else if a[i] < (i16::MIN as i32) { + i16::MIN + } else { + a[i] as i16 + } + } else if i < 8 { + if b[i - 4] > (i16::MAX as i32) { + i16::MAX + } else if b[i - 4] < (i16::MIN as i32) { + i16::MIN + } else { + b[i - 4] as i16 + } + } else if i < 12 { + if a[i - 4] > (i16::MAX as i32) { + i16::MAX + } else if a[i - 4] < (i16::MIN as i32) { + i16::MIN + } else { + a[i - 4] as i16 + } + } else { + if b[i - 8] > (i16::MAX as i32) { + i16::MAX + } else if b[i - 8] < (i16::MIN as i32) { + i16::MIN + } else { + b[i - 8] as i16 + } + } + }) +} + +pub fn packuswb(a: i16x16, b: i16x16) -> u8x32 { + u8x32::from_fn(|i| { + if i < 8 { + if a[i] > (u8::MAX as i16) { + u8::MAX + } else if a[i] < (u8::MIN as i16) { + u8::MIN + } else { + a[i] as u8 + } + } else if i < 16 { + if b[i - 8] > (u8::MAX as i16) { + u8::MAX + } else if b[i - 8] < (u8::MIN as i16) { + u8::MIN + } else { + b[i - 8] as u8 + } + } else if i < 24 { + if a[i - 8] > (u8::MAX as i16) { + u8::MAX + } else if a[i - 8] < (u8::MIN as i16) { + u8::MIN + } else { + a[i - 8] as u8 + } + } else { + if b[i - 16] > (u8::MAX as i16) { + u8::MAX + } else if b[i - 16] < (u8::MIN as i16) { + u8::MIN + } else { + b[i - 16] as u8 + } + } + }) +} + +pub fn packusdw(a: i32x8, b: i32x8) -> u16x16 { + u16x16::from_fn(|i| { + if i < 4 { + if a[i] > (u16::MAX as i32) { + u16::MAX + } else if a[i] < (u16::MIN as i32) { + u16::MIN + } else { + a[i] as u16 + } + } else if i < 8 { + if b[i - 4] > (u16::MAX as i32) { + u16::MAX + } else if b[i - 4] < (u16::MIN as i32) { + u16::MIN + } else { + b[i - 4] as u16 + } + } else if i < 12 { + if a[i - 4] > (u16::MAX as i32) { + u16::MAX + } else if a[i - 4] < (u16::MIN as i32) { + u16::MIN + } else { + a[i - 4] as u16 + } + } else { + if b[i - 8] > (u16::MAX as i32) { + u16::MAX + } else if b[i - 8] < (u16::MIN as i32) { + u16::MIN + } else { + b[i - 8] as u16 + } + } + }) +} + 
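The saturating pack helpers above are exactly the kind of model that the crate's `tests` folder (described in the README) checks against `core::arch` on random inputs. As a minimal illustration, here is a hypothetical sanity check for `packsswb`, written only against APIs visible in this diff (`from_fn`, lane indexing, the fixed-width vector types); the crate's real test macros are not shown, and the module placement inside `avx2_handwritten.rs` is an assumption.

```rust
// Hypothetical sketch, not part of the patch: a hand-rolled check that the
// `packsswb` model saturates and orders its 128-bit lanes as documented.
#[cfg(test)]
mod packsswb_sanity {
    use super::*;

    #[test]
    fn saturates_and_interleaves_lanes() {
        // 300 overflows an i8 and must clamp to i8::MAX; -300 clamps to i8::MIN.
        let a = i16x16::from_fn(|_| 300);
        let b = i16x16::from_fn(|_| -300);
        let r = packsswb(a, b);
        for i in 0u32..32 {
            // Lanes 0..8 and 16..24 are packed from `a`, the others from `b`.
            let expected = if i < 8 || (16..24).contains(&i) {
                i8::MAX
            } else {
                i8::MIN
            };
            assert_eq!(r[i], expected);
        }
    }
}
```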
+pub fn psignb(a: i8x32, b: i8x32) -> i8x32 {
+    i8x32::from_fn(|i| {
+        if b[i] < 0 {
+            if a[i] == i8::MIN {
+                a[i]
+            } else {
+                -a[i]
+            }
+        } else if b[i] > 0 {
+            a[i]
+        } else {
+            0
+        }
+    })
+}
+pub fn psignw(a: i16x16, b: i16x16) -> i16x16 {
+    i16x16::from_fn(|i| {
+        if b[i] < 0 {
+            if a[i] == i16::MIN {
+                a[i]
+            } else {
+                -a[i]
+            }
+        } else if b[i] > 0 {
+            a[i]
+        } else {
+            0
+        }
+    })
+}
+
+pub fn psignd(a: i32x8, b: i32x8) -> i32x8 {
+    i32x8::from_fn(|i| {
+        if b[i] < 0 {
+            if a[i] == i32::MIN {
+                a[i]
+            } else {
+                -a[i]
+            }
+        } else if b[i] > 0 {
+            a[i]
+        } else {
+            0
+        }
+    })
+}
+
+pub fn psllw(a: i16x16, count: i16x8) -> i16x16 {
+    let count4 = (count[0] as u16) as u64;
+    let count3 = ((count[1] as u16) as u64) * 65536;
+    let count2 = ((count[2] as u16) as u64) * 4294967296;
+    let count1 = ((count[3] as u16) as u64) * 281474976710656;
+    let count = count1 + count2 + count3 + count4;
+    i16x16::from_fn(|i| {
+        if count > 15 {
+            0
+        } else {
+            ((a[i] as u16) << count) as i16
+        }
+    })
+}
+
+pub fn pslld(a: i32x8, count: i32x4) -> i32x8 {
+    let count = ((count[1] as u32) as u64) * 4294967296 + ((count[0] as u32) as u64);
+
+    i32x8::from_fn(|i| {
+        if count > 31 {
+            0
+        } else {
+            ((a[i] as u32) << count) as i32
+        }
+    })
+}
+pub fn psllq(a: i64x4, count: i64x2) -> i64x4 {
+    // The shift count and the lanes are 64-bit; truncating to u32 would drop the high bits.
+    let count = count[0] as u64;
+
+    i64x4::from_fn(|i| {
+        if count > 63 {
+            0
+        } else {
+            ((a[i] as u64) << count) as i64
+        }
+    })
+}
+
+pub fn psllvd(a: i32x4, count: i32x4) -> i32x4 {
+    i32x4::from_fn(|i| {
+        if count[i] > 31 || count[i] < 0 {
+            0
+        } else {
+            ((a[i] as u32) << count[i]) as i32
+        }
+    })
+}
+pub fn psllvd256(a: i32x8, count: i32x8) -> i32x8 {
+    i32x8::from_fn(|i| {
+        if count[i] > 31 || count[i] < 0 {
+            0
+        } else {
+            ((a[i] as u32) << count[i]) as i32
+        }
+    })
+}
+
+pub fn psllvq(a: i64x2, count: i64x2) -> i64x2 {
+    i64x2::from_fn(|i| {
+        if count[i] > 63 || count[i] < 0 {
+            0
+        } else {
+            ((a[i] as u64) << count[i]) as i64
+        }
+    })
+}
+pub fn psllvq256(a: i64x4, count: i64x4) -> i64x4 {
+    i64x4::from_fn(|i| {
+        if count[i] > 63 || count[i] < 0 {
+            0
+        } else {
+            ((a[i] as u64) << count[i]) as i64
+        }
+    })
+}
+
+pub fn psraw(a: i16x16, count: i16x8) -> i16x16 {
+    let count = ((count[3] as u16) as u64) * 281474976710656
+        + ((count[2] as u16) as u64) * 4294967296
+        + ((count[1] as u16) as u64) * 65536
+        + ((count[0] as u16) as u64);
+
+    i16x16::from_fn(|i| {
+        if count > 15 {
+            if a[i] < 0 {
+                -1
+            } else {
+                0
+            }
+        } else {
+            a[i] >> count
+        }
+    })
+}
+
+pub fn psrad(a: i32x8, count: i32x4) -> i32x8 {
+    let count = ((count[1] as u32) as u64) * 4294967296 + ((count[0] as u32) as u64);
+
+    i32x8::from_fn(|i| {
+        if count > 31 {
+            if a[i] < 0 {
+                -1
+            } else {
+                0
+            }
+        } else {
+            // arithmetic shift right, shifting in sign bits
+            a[i] >> count
+        }
+    })
+}
+
+pub fn psravd(a: i32x4, count: i32x4) -> i32x4 {
+    i32x4::from_fn(|i| {
+        if count[i] > 31 || count[i] < 0 {
+            if a[i] < 0 {
+                -1
+            } else {
+                0
+            }
+        } else {
+            a[i] >> count[i]
+        }
+    })
+}
+
+pub fn psravd256(a: i32x8, count: i32x8) -> i32x8 {
+    i32x8::from_fn(|i| {
+        if count[i] > 31 || count[i] < 0 {
+            if a[i] < 0 {
+                -1
+            } else {
+                0
+            }
+        } else {
+            a[i] >> count[i]
+        }
+    })
+}
+
+pub fn psrlw(a: i16x16, count: i16x8) -> i16x16 {
+    let count = (count[3] as u16 as u64) * 281474976710656
+        + (count[2] as u16 as u64) * 4294967296
+        + (count[1] as u16 as u64) * 65536
+        + (count[0] as u16 as u64);
+
+    i16x16::from_fn(|i| {
+        if count > 15 {
+            0
+        } else {
+            ((a[i] as u16) >> count) as i16
+        }
+    })
+}
+
+pub fn psrld(a: i32x8, count: i32x4) -> i32x8 {
+    let count = ((count[1] as u32) as u64) * 4294967296 + ((count[0] as u32) as u64);
+
+    i32x8::from_fn(|i| {
+        if count > 31 {
+            0
+        } else {
+            ((a[i] as u32) >> count) as i32
+        }
+    })
+}
+
+pub fn psrlq(a: i64x4, count: i64x2) -> i64x4 {
+    let count: u64 = count[0] as u64;
+
+    i64x4::from_fn(|i| {
+        if count > 63 {
+            0
+        } else {
+            ((a[i] as u64) >> count) as i64
+        }
+    })
+}
+
+pub fn psrlvd(a: i32x4, count: i32x4) -> i32x4 {
+    i32x4::from_fn(|i| {
+        if count[i] > 31 || count[i] < 0 {
+            0
+        } else {
+            ((a[i] as u32) >> count[i]) as i32
+        }
+    })
+}
+
+pub fn psrlvd256(a: i32x8, count: i32x8) -> i32x8 {
+    i32x8::from_fn(|i| {
+        if count[i] > 31 || count[i] < 0 {
+            0
+        } else {
+            ((a[i] as u32) >> count[i]) as i32
+        }
+    })
+}
+
+pub fn psrlvq(a: i64x2, count: i64x2) -> i64x2 {
+    i64x2::from_fn(|i| {
+        if count[i] > 63 || count[i] < 0 {
+            0
+        } else {
+            ((a[i] as u64) >> count[i]) as i64
+        }
+    })
+}
+pub fn psrlvq256(a: i64x4, count: i64x4) -> i64x4 {
+    i64x4::from_fn(|i| {
+        if count[i] > 63 || count[i] < 0 {
+            0
+        } else {
+            ((a[i] as u64) >> count[i]) as i64
+        }
+    })
+}
+
+pub fn pshufb(a: u8x32, b: u8x32) -> u8x32 {
+    u8x32::from_fn(|i| {
+        if i < 16 {
+            if b[i] > 127 {
+                0
+            } else {
+                let index = (b[i] % 16) as u32;
+                a[index]
+            }
+        } else {
+            if b[i] > 127 {
+                0
+            } else {
+                let index = (b[i] % 16) as u32;
+                a[index + 16]
+            }
+        }
+    })
+}
+
+pub fn permd(a: u32x8, b: u32x8) -> u32x8 {
+    u32x8::from_fn(|i| {
+        let id = b[i] % 8;
+        a[id]
+    })
+}
+
+pub fn mpsadbw(a: u8x32, b: u8x32, imm8: i8) -> u16x16 {
+    u16x16::from_fn(|i| {
+        if i < 8 {
+            let a_offset = (((imm8 & 4) >> 2) * 4) as u32;
+            let b_offset = ((imm8 & 3) * 4) as u32;
+            let k = a_offset + i;
+            let l = b_offset;
+            ((a[k].wrapping_abs_diff(b[l]) as i8) as u8 as u16)
+                + ((a[k + 1].wrapping_abs_diff(b[l + 1]) as i8) as u8 as u16)
+                + ((a[k + 2].wrapping_abs_diff(b[l + 2]) as i8) as u8 as u16)
+                + ((a[k + 3].wrapping_abs_diff(b[l + 3]) as i8) as u8 as u16)
+        } else {
+            let i = i - 8;
+            let imm8 = imm8 >> 3;
+            let a_offset = (((imm8 & 4) >> 2) * 4) as u32;
+            let b_offset = ((imm8 & 3) * 4) as u32;
+            let k = a_offset + i;
+            let l = b_offset;
+            ((a[16 + k].wrapping_abs_diff(b[16 + l]) as i8) as u8 as u16)
+                + ((a[16 + k + 1].wrapping_abs_diff(b[16 + l + 1]) as i8) as u8 as u16)
+                + ((a[16 + k + 2].wrapping_abs_diff(b[16 + l + 2]) as i8) as u8 as u16)
+                + ((a[16 + k + 3].wrapping_abs_diff(b[16 + l + 3]) as i8) as u8 as u16)
+        }
+    })
+}
+
+pub fn vperm2i128(a: i64x4, b: i64x4, imm8: i8) -> i64x4 {
+    let a = i128x2::from_fn(|i| {
+        ((a[2 * i] as u64 as u128) + ((a[2 * i + 1] as u64 as u128) << 64)) as i128
+    });
+    let b = i128x2::from_fn(|i| {
+        ((b[2 * i] as u64 as u128) + ((b[2 * i + 1] as u64 as u128) << 64)) as i128
+    });
+    let imm8 = imm8 as u8 as u32 as i32;
+    let r = i128x2::from_fn(|i| {
+        let control = imm8 >> (i * 4);
+        if (control >> 3) % 2 == 1 {
+            0
+        } else {
+            match control % 4 {
+                0 => a[0],
+                1 => a[1],
+                2 => b[0],
+                3 => b[1],
+                _ => unreachable!(),
+            }
+        }
+    });
+    i64x4::from_fn(|i| {
+        let index = i >> 1;
+        let hilo = i.rem_euclid(2);
+        let val = r[index];
+        if hilo == 0 {
+            i64::cast(val)
+        } else {
+            i64::cast(val >> 64)
+        }
+    })
+}
+pub fn pmulhrsw(a: i16x16, b: i16x16) -> i16x16 {
+    i16x16::from_fn(|i| {
+        let temp = (a[i] as i32) * (b[i] as i32);
+        let temp = (temp >> 14).wrapping_add(1) >> 1;
+        temp as i16
+    })
+}
+
+pub fn psadbw(a: u8x32, b: u8x32) -> u64x4 {
+    let tmp = u8x32::from_fn(|i| a[i].wrapping_abs_diff(b[i]));
+    u64x4::from_fn(|i| {
+        (tmp[i * 8] as u16)
+            .wrapping_add(tmp[i * 8 + 1] as u16)
+            .wrapping_add(tmp[i * 8 + 2] as
u16) + .wrapping_add(tmp[i * 8 + 3] as u16) + .wrapping_add(tmp[i * 8 + 4] as u16) + .wrapping_add(tmp[i * 8 + 5] as u16) + .wrapping_add(tmp[i * 8 + 6] as u16) + .wrapping_add(tmp[i * 8 + 7] as u16) as u64 + }) +} diff --git a/testable-simd-models/src/core_arch/x86/models/avx_handwritten.rs b/testable-simd-models/src/core_arch/x86/models/avx_handwritten.rs new file mode 100644 index 0000000000000..ba61996851392 --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/models/avx_handwritten.rs @@ -0,0 +1,31 @@ +use crate::abstractions::simd::*; + +pub fn vperm2f128si256(a: i32x8, b: i32x8, imm8: i8) -> i32x8 { + let temp = i128x2::from_fn(|i| match (imm8 as u8) >> (i * 4) { + 0 => (a[4 * i] as i128) + 16 * (a[4 * i + 1] as i128), + 1 => (a[4 * i + 2] as i128) + 16 * (a[4 * i + 3] as i128), + 2 => (b[4 * i] as i128) + 16 * (b[4 * i + 1] as i128), + 3 => (b[4 * i + 2] as i128) + 16 * (b[4 * i + 3] as i128), + _ => unreachable!(), + }); + + i32x8::from_fn(|i| (temp[if i < 4 { 0 } else { 1 }] >> (i % 4)) as i32) +} + +pub fn ptestz256(a: i64x4, b: i64x4) -> i32 { + let c = i64x4::from_fn(|i| a[i] & b[i]); + if c == i64x4::ZERO() { + 1 + } else { + 0 + } +} + +pub fn ptestc256(a: i64x4, b: i64x4) -> i32 { + let c = i64x4::from_fn(|i| !a[i] & b[i]); + if c == i64x4::ZERO() { + 1 + } else { + 0 + } +} diff --git a/testable-simd-models/src/core_arch/x86/models/mod.rs b/testable-simd-models/src/core_arch/x86/models/mod.rs new file mode 100644 index 0000000000000..79b660019c07c --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/models/mod.rs @@ -0,0 +1,48 @@ +//! Rust models for x86 intrinsics. +//! +//! This module contains models for the intrinsics as they are defined in the Rust core. +//! Since this is supposed to model the Rust core, the implemented functions must +//! mirror the Rust implementations as closely as they can. +//! +//! For example, calls to simd functions like simd_add and simd_sub are left as is, +//! with their implementations defined in `crate::abstractions::simd`. Some other +//! operations like simd_cast or simd_shuffle might need a little modification +//! for correct compilation. +//! +//! Calls to transmute are replaced with either an explicit call to a `BitVec::from_ function`, +//! or with `.into()`. +//! +//! Sometimes, an intrinsic in Rust is implemented by directly using the corresponding +//! LLVM instruction via an `unsafe extern "C"` module. In those cases, the corresponding +//! function is defined in the `c_extern` module in each file, which contain manually +//! written implementations made by consulting the appropriate Intel documentation. +//! +//! In general, it is best to gain an idea of how an implementation should be written by looking +//! at how other functions are implemented. Also see `core::arch::x86` for [reference](https://github.com/rust-lang/stdarch/tree/master/crates/core_arch). 
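Because the module documentation above states the modeling conventions only in prose, a small before/after sketch may help. The model side is taken verbatim from `sse2.rs` later in this patch; the upstream side is paraphrased from memory and is not claimed to be the exact `core_arch` source.

```rust
// Upstream core_arch (paraphrased): an #[inline], target-feature-gated
// intrinsic whose body transmutes the result of a generic SIMD operation.
//
//     #[inline]
//     #[target_feature(enable = "sse2")]
//     pub fn _mm_add_epi8(a: __m128i, b: __m128i) -> __m128i {
//         unsafe { transmute(simd_add(a.as_i8x16(), b.as_i8x16())) }
//     }
//
// Model in this crate (as it appears in sse2.rs below): the same body, but a
// plain safe function in which `__m128i` is `BitVec<128>` and `transmute`,
// `simd_add` and `as_i8x16` come from `crate::abstractions`.
pub fn _mm_add_epi8(a: __m128i, b: __m128i) -> __m128i {
    transmute(simd_add(a.as_i8x16(), b.as_i8x16()))
}
```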
+ +pub mod avx; +pub mod avx2; +pub mod avx2_handwritten; +pub mod avx_handwritten; +pub mod sse; +pub mod sse2; +pub mod sse2_handwritten; +pub mod ssse3; +pub mod ssse3_handwritten; + +pub(crate) mod types { + use crate::abstractions::bitvec::*; + + #[allow(non_camel_case_types)] + pub type __m256i = BitVec<256>; + #[allow(non_camel_case_types)] + pub type __m256 = BitVec<256>; + #[allow(non_camel_case_types)] + pub type __m256d = BitVec<256>; + #[allow(non_camel_case_types)] + pub type __m128 = BitVec<128>; + #[allow(non_camel_case_types)] + pub type __m128i = BitVec<128>; + #[allow(non_camel_case_types)] + pub type __m128d = BitVec<128>; +} diff --git a/testable-simd-models/src/core_arch/x86/models/sse.rs b/testable-simd-models/src/core_arch/x86/models/sse.rs new file mode 100644 index 0000000000000..f975c2814438a --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/models/sse.rs @@ -0,0 +1,21 @@ +//! Streaming SIMD Extensions (SSE) +use super::types::*; +use crate::abstractions::simd::*; +use crate::abstractions::utilities::*; + +/// Returns vector of type __m128 with indeterminate elements.with indetermination elements. +/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically +/// picks some valid value and is not equivalent to [`mem::MaybeUninit`]. +/// In practice, this is typically equivalent to [`mem::zeroed`]. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_ps) +pub fn _mm_undefined_ps() -> __m128 { + transmute(f32x4::ZERO()) +} + +/// Construct a `__m128` with all elements initialized to zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_ps) +pub fn _mm_setzero_ps() -> __m128 { + transmute(f32x4::ZERO()) +} diff --git a/testable-simd-models/src/core_arch/x86/models/sse2.rs b/testable-simd-models/src/core_arch/x86/models/sse2.rs new file mode 100644 index 0000000000000..c9c90e3e9e267 --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/models/sse2.rs @@ -0,0 +1,1618 @@ +//! Streaming SIMD Extensions 2 (SSE2) +use super::sse2_handwritten::*; +use super::types::*; +use crate::abstractions::simd::*; +use crate::abstractions::utilities::*; + +/// Adds packed 8-bit integers in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi8) +pub fn _mm_add_epi8(a: __m128i, b: __m128i) -> __m128i { + transmute(simd_add(a.as_i8x16(), b.as_i8x16())) +} +/// Adds packed 16-bit integers in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi16) +pub fn _mm_add_epi16(a: __m128i, b: __m128i) -> __m128i { + transmute(simd_add(a.as_i16x8(), b.as_i16x8())) +} +/// Adds packed 32-bit integers in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi32) +pub fn _mm_add_epi32(a: __m128i, b: __m128i) -> __m128i { + transmute(simd_add(a.as_i32x4(), b.as_i32x4())) +} +/// Adds packed 64-bit integers in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi64) +pub fn _mm_add_epi64(a: __m128i, b: __m128i) -> __m128i { + transmute(simd_add(a.as_i64x2(), b.as_i64x2())) +} +/// Adds packed 8-bit integers in `a` and `b` using saturation. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi8) +pub fn _mm_adds_epi8(a: __m128i, b: __m128i) -> __m128i { + transmute(simd_saturating_add(a.as_i8x16(), b.as_i8x16())) +} +/// Adds packed 16-bit integers in `a` and `b` using saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi16) +pub fn _mm_adds_epi16(a: __m128i, b: __m128i) -> __m128i { + transmute(simd_saturating_add(a.as_i16x8(), b.as_i16x8())) +} +/// Adds packed unsigned 8-bit integers in `a` and `b` using saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu8) +pub fn _mm_adds_epu8(a: __m128i, b: __m128i) -> __m128i { + transmute(simd_saturating_add(a.as_u8x16(), b.as_u8x16())) +} +/// Adds packed unsigned 16-bit integers in `a` and `b` using saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu16) +pub fn _mm_adds_epu16(a: __m128i, b: __m128i) -> __m128i { + transmute(simd_saturating_add(a.as_u16x8(), b.as_u16x8())) +} +/// Averages packed unsigned 8-bit integers in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu8) +pub fn _mm_avg_epu8(a: __m128i, b: __m128i) -> __m128i { + { + let a = simd_cast::<16, _, u16>(a.as_u8x16()); + let b = simd_cast::<16, _, u16>(b.as_u8x16()); + let r = simd_shr(simd_add(simd_add(a, b), u16x16::splat(1)), u16x16::splat(1)); + transmute(simd_cast::<16, _, u8>(r)) + } +} +/// Averages packed unsigned 16-bit integers in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu16) +pub fn _mm_avg_epu16(a: __m128i, b: __m128i) -> __m128i { + { + let a = simd_cast::<8, _, u32>(a.as_u16x8()); + let b = simd_cast::<8, _, u32>(b.as_u16x8()); + let r = simd_shr(simd_add(simd_add(a, b), u32x8::splat(1)), u32x8::splat(1)); + transmute(simd_cast::<8, _, u16>(r)) + } +} +/// Multiplies and then horizontally add signed 16 bit integers in `a` and `b`. +/// +/// Multiplies packed signed 16-bit integers in `a` and `b`, producing +/// intermediate signed 32-bit integers. Horizontally add adjacent pairs of +/// intermediate 32-bit integers. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd_epi16) +pub fn _mm_madd_epi16(a: __m128i, b: __m128i) -> __m128i { + transmute(pmaddwd(a.as_i16x8(), b.as_i16x8())) +} +/// Compares packed 16-bit integers in `a` and `b`, and returns the packed +/// maximum values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi16) +pub fn _mm_max_epi16(a: __m128i, b: __m128i) -> __m128i { + { + let a = a.as_i16x8(); + let b = b.as_i16x8(); + transmute(simd_select(simd_gt(a, b), a, b)) + } +} +/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns the +/// packed maximum values. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu8) +pub fn _mm_max_epu8(a: __m128i, b: __m128i) -> __m128i { + { + let a = a.as_u8x16(); + let b = b.as_u8x16(); + transmute(simd_select(simd_gt(a, b), a, b)) + } +} +/// Compares packed 16-bit integers in `a` and `b`, and returns the packed +/// minimum values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi16) +pub fn _mm_min_epi16(a: __m128i, b: __m128i) -> __m128i { + { + let a = a.as_i16x8(); + let b = b.as_i16x8(); + transmute(simd_select(simd_lt(a, b), a, b)) + } +} +/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns the +/// packed minimum values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu8) +pub fn _mm_min_epu8(a: __m128i, b: __m128i) -> __m128i { + { + let a = a.as_u8x16(); + let b = b.as_u8x16(); + transmute(simd_select(simd_lt(a, b), a, b)) + } +} +/// Multiplies the packed 16-bit integers in `a` and `b`. +/// +/// The multiplication produces intermediate 32-bit integers, and returns the +/// high 16 bits of the intermediate integers. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epi16) +pub fn _mm_mulhi_epi16(a: __m128i, b: __m128i) -> __m128i { + { + let a = simd_cast::<8, _, i32>(a.as_i16x8()); + let b = simd_cast::<8, _, i32>(b.as_i16x8()); + let r = simd_shr(simd_mul(a, b), i32x8::splat(16)); + transmute(simd_cast::<8, i32, i16>(r)) + } +} +/// Multiplies the packed unsigned 16-bit integers in `a` and `b`. +/// +/// The multiplication produces intermediate 32-bit integers, and returns the +/// high 16 bits of the intermediate integers. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epu16) +pub fn _mm_mulhi_epu16(a: __m128i, b: __m128i) -> __m128i { + { + let a = simd_cast::<8, _, u32>(a.as_u16x8()); + let b = simd_cast::<8, _, u32>(b.as_u16x8()); + let r = simd_shr(simd_mul(a, b), u32x8::splat(16)); + transmute(simd_cast::<8, u32, u16>(r)) + } +} +/// Multiplies the packed 16-bit integers in `a` and `b`. +/// +/// The multiplication produces intermediate 32-bit integers, and returns the +/// low 16 bits of the intermediate integers. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi16) +pub fn _mm_mullo_epi16(a: __m128i, b: __m128i) -> __m128i { + transmute(simd_mul(a.as_i16x8(), b.as_i16x8())) +} +/// Multiplies the low unsigned 32-bit integers from each packed 64-bit element +/// in `a` and `b`. +/// +/// Returns the unsigned 64-bit results. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epu32) +pub fn _mm_mul_epu32(a: __m128i, b: __m128i) -> __m128i { + { + let a = a.as_u64x2(); + let b = b.as_u64x2(); + let mask = u64x2::splat(u32::MAX.into()); + transmute(simd_mul(simd_and(a, mask), simd_and(b, mask))) + } +} +/// Sum the absolute differences of packed unsigned 8-bit integers. +/// +/// Computes the absolute differences of packed unsigned 8-bit integers in `a` +/// and `b`, then horizontally sum each consecutive 8 differences to produce +/// two unsigned 16-bit integers, and pack these unsigned 16-bit integers in +/// the low 16 bits of 64-bit elements returned. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_epu8) +pub fn _mm_sad_epu8(a: __m128i, b: __m128i) -> __m128i { + transmute(psadbw(a.as_u8x16(), b.as_u8x16())) +} +/// Subtracts packed 8-bit integers in `b` from packed 8-bit integers in `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi8) +pub fn _mm_sub_epi8(a: __m128i, b: __m128i) -> __m128i { + transmute(simd_sub(a.as_i8x16(), b.as_i8x16())) +} +/// Subtracts packed 16-bit integers in `b` from packed 16-bit integers in `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi16) +pub fn _mm_sub_epi16(a: __m128i, b: __m128i) -> __m128i { + transmute(simd_sub(a.as_i16x8(), b.as_i16x8())) +} +/// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi32) +pub fn _mm_sub_epi32(a: __m128i, b: __m128i) -> __m128i { + transmute(simd_sub(a.as_i32x4(), b.as_i32x4())) +} +/// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi64) +pub fn _mm_sub_epi64(a: __m128i, b: __m128i) -> __m128i { + transmute(simd_sub(a.as_i64x2(), b.as_i64x2())) +} +/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a` +/// using saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi8) +pub fn _mm_subs_epi8(a: __m128i, b: __m128i) -> __m128i { + transmute(simd_saturating_sub(a.as_i8x16(), b.as_i8x16())) +} +/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a` +/// using saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi16) +pub fn _mm_subs_epi16(a: __m128i, b: __m128i) -> __m128i { + transmute(simd_saturating_sub(a.as_i16x8(), b.as_i16x8())) +} +/// Subtract packed unsigned 8-bit integers in `b` from packed unsigned 8-bit +/// integers in `a` using saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu8) +pub fn _mm_subs_epu8(a: __m128i, b: __m128i) -> __m128i { + transmute(simd_saturating_sub(a.as_u8x16(), b.as_u8x16())) +} +/// Subtract packed unsigned 16-bit integers in `b` from packed unsigned 16-bit +/// integers in `a` using saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu16) +pub fn _mm_subs_epu16(a: __m128i, b: __m128i) -> __m128i { + transmute(simd_saturating_sub(a.as_u16x8(), b.as_u16x8())) +} +/// Shifts `a` left by `IMM8` bytes while shifting in zeros. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_si128) +pub fn _mm_slli_si128(a: __m128i) -> __m128i { + static_assert_uimm_bits!(IMM8, 8); + _mm_slli_si128_impl::(a) +} + +fn _mm_slli_si128_impl(a: __m128i) -> __m128i { + const fn mask(shift: i32, i: u32) -> u32 { + let shift = shift as u32 & 0xff; + if shift > 15 { + i + } else { + 16 - shift + i + } + } + transmute::(simd_shuffle( + i8x16::ZERO(), + a.as_i8x16(), + [ + mask(IMM8, 0), + mask(IMM8, 1), + mask(IMM8, 2), + mask(IMM8, 3), + mask(IMM8, 4), + mask(IMM8, 5), + mask(IMM8, 6), + mask(IMM8, 7), + mask(IMM8, 8), + mask(IMM8, 9), + mask(IMM8, 10), + mask(IMM8, 11), + mask(IMM8, 12), + mask(IMM8, 13), + mask(IMM8, 14), + mask(IMM8, 15), + ], + )) +} + +/// Shifts `a` left by `IMM8` bytes while shifting in zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bslli_si128) +pub fn _mm_bslli_si128(a: __m128i) -> __m128i { + { + static_assert_uimm_bits!(IMM8, 8); + _mm_slli_si128_impl::(a) + } +} +/// Shifts `a` right by `IMM8` bytes while shifting in zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bsrli_si128) +pub fn _mm_bsrli_si128(a: __m128i) -> __m128i { + { + static_assert_uimm_bits!(IMM8, 8); + _mm_srli_si128_impl::(a) + } +} + +fn _mm_srli_si128_impl(a: __m128i) -> __m128i { + const fn mask(shift: i32, i: u32) -> u32 { + if (shift as u32) > 15 { + i + 16 + } else { + i + (shift as u32) + } + } + let x: i8x16 = simd_shuffle( + a.as_i8x16(), + i8x16::ZERO(), + [ + mask(IMM8, 0), + mask(IMM8, 1), + mask(IMM8, 2), + mask(IMM8, 3), + mask(IMM8, 4), + mask(IMM8, 5), + mask(IMM8, 6), + mask(IMM8, 7), + mask(IMM8, 8), + mask(IMM8, 9), + mask(IMM8, 10), + mask(IMM8, 11), + mask(IMM8, 12), + mask(IMM8, 13), + mask(IMM8, 14), + mask(IMM8, 15), + ], + ); + transmute(x) +} +/// Shifts packed 16-bit integers in `a` left by `IMM8` while shifting in zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi16) +pub fn _mm_slli_epi16(a: __m128i) -> __m128i { + static_assert_uimm_bits!(IMM8, 8); + { + if IMM8 >= 16 { + _mm_setzero_si128() + } else { + transmute(simd_shl(a.as_u16x8(), u16x8::splat(IMM8 as u16))) + } + } +} +/// Shifts packed 16-bit integers in `a` left by `count` while shifting in +/// zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi16) +pub fn _mm_sll_epi16(a: __m128i, count: __m128i) -> __m128i { + transmute(psllw(a.as_i16x8(), count.as_i16x8())) +} +/// Shifts packed 32-bit integers in `a` left by `IMM8` while shifting in zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi32) +pub fn _mm_slli_epi32(a: __m128i) -> __m128i { + static_assert_uimm_bits!(IMM8, 8); + { + if IMM8 >= 32 { + _mm_setzero_si128() + } else { + transmute(simd_shl(a.as_u32x4(), u32x4::splat(IMM8 as u32))) + } + } +} +/// Shifts packed 32-bit integers in `a` left by `count` while shifting in +/// zeros. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi32) +pub fn _mm_sll_epi32(a: __m128i, count: __m128i) -> __m128i { + transmute(pslld(a.as_i32x4(), count.as_i32x4())) +} +/// Shifts packed 64-bit integers in `a` left by `IMM8` while shifting in zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi64) +pub fn _mm_slli_epi64(a: __m128i) -> __m128i { + static_assert_uimm_bits!(IMM8, 8); + { + if IMM8 >= 64 { + _mm_setzero_si128() + } else { + transmute(simd_shl(a.as_u64x2(), u64x2::splat(IMM8 as u64))) + } + } +} +/// Shifts packed 64-bit integers in `a` left by `count` while shifting in +/// zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi64) +pub fn _mm_sll_epi64(a: __m128i, count: __m128i) -> __m128i { + transmute(psllq(a.as_i64x2(), count.as_i64x2())) +} +/// Shifts packed 16-bit integers in `a` right by `IMM8` while shifting in sign +/// bits. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi16) +pub fn _mm_srai_epi16(a: __m128i) -> __m128i { + static_assert_uimm_bits!(IMM8, 8); + transmute(simd_shr(a.as_i16x8(), i16x8::splat(IMM8.min(15) as i16))) +} +/// Shifts packed 16-bit integers in `a` right by `count` while shifting in sign +/// bits. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi16) +pub fn _mm_sra_epi16(a: __m128i, count: __m128i) -> __m128i { + transmute(psraw(a.as_i16x8(), count.as_i16x8())) +} +/// Shifts packed 32-bit integers in `a` right by `IMM8` while shifting in sign +/// bits. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi32) +pub fn _mm_srai_epi32(a: __m128i) -> __m128i { + static_assert_uimm_bits!(IMM8, 8); + transmute(simd_shr(a.as_i32x4(), i32x4::splat(IMM8.min(31)))) +} +/// Shifts packed 32-bit integers in `a` right by `count` while shifting in sign +/// bits. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi32) +pub fn _mm_sra_epi32(a: __m128i, count: __m128i) -> __m128i { + transmute(psrad(a.as_i32x4(), count.as_i32x4())) +} +/// Shifts `a` right by `IMM8` bytes while shifting in zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_si128) +pub fn _mm_srli_si128(a: __m128i) -> __m128i { + static_assert_uimm_bits!(IMM8, 8); + _mm_srli_si128_impl::(a) +} +/// Shifts packed 16-bit integers in `a` right by `IMM8` while shifting in +/// zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi16) +pub fn _mm_srli_epi16(a: __m128i) -> __m128i { + static_assert_uimm_bits!(IMM8, 8); + { + if IMM8 >= 16 { + _mm_setzero_si128() + } else { + transmute(simd_shr(a.as_u16x8(), u16x8::splat(IMM8 as u16))) + } + } +} +/// Shifts packed 16-bit integers in `a` right by `count` while shifting in +/// zeros. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi16) +pub fn _mm_srl_epi16(a: __m128i, count: __m128i) -> __m128i { + transmute(psrlw(a.as_i16x8(), count.as_i16x8())) +} +/// Shifts packed 32-bit integers in `a` right by `IMM8` while shifting in +/// zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi32) +pub fn _mm_srli_epi32(a: __m128i) -> __m128i { + static_assert_uimm_bits!(IMM8, 8); + { + if IMM8 >= 32 { + _mm_setzero_si128() + } else { + transmute(simd_shr(a.as_u32x4(), u32x4::splat(IMM8 as u32))) + } + } +} +/// Shifts packed 32-bit integers in `a` right by `count` while shifting in +/// zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi32) +pub fn _mm_srl_epi32(a: __m128i, count: __m128i) -> __m128i { + transmute(psrld(a.as_i32x4(), count.as_i32x4())) +} +/// Shifts packed 64-bit integers in `a` right by `IMM8` while shifting in +/// zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi64) +pub fn _mm_srli_epi64(a: __m128i) -> __m128i { + static_assert_uimm_bits!(IMM8, 8); + { + if IMM8 >= 64 { + _mm_setzero_si128() + } else { + transmute(simd_shr(a.as_u64x2(), u64x2::splat(IMM8 as u64))) + } + } +} +/// Shifts packed 64-bit integers in `a` right by `count` while shifting in +/// zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi64) +pub fn _mm_srl_epi64(a: __m128i, count: __m128i) -> __m128i { + transmute(psrlq(a.as_i64x2(), count.as_i64x2())) +} +/// Computes the bitwise AND of 128 bits (representing integer data) in `a` and +/// `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_si128) +pub fn _mm_and_si128(a: __m128i, b: __m128i) -> __m128i { + transmute(simd_and(a.as_i32x4(), b.as_i32x4())) +} +/// Computes the bitwise NOT of 128 bits (representing integer data) in `a` and +/// then AND with `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_si128) +pub fn _mm_andnot_si128(a: __m128i, b: __m128i) -> __m128i { + transmute(simd_and( + simd_xor(_mm_set1_epi8(-1).as_i32x4(), a.as_i32x4()), + b.as_i32x4(), + )) +} +/// Computes the bitwise OR of 128 bits (representing integer data) in `a` and +/// `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_si128) +pub fn _mm_or_si128(a: __m128i, b: __m128i) -> __m128i { + transmute(simd_or(a.as_i32x4(), b.as_i32x4())) +} +/// Computes the bitwise XOR of 128 bits (representing integer data) in `a` and +/// `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_si128) +pub fn _mm_xor_si128(a: __m128i, b: __m128i) -> __m128i { + transmute(simd_xor(a.as_i32x4(), b.as_i32x4())) +} +/// Compares packed 8-bit integers in `a` and `b` for equality. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi8) +pub fn _mm_cmpeq_epi8(a: __m128i, b: __m128i) -> __m128i { + transmute::(simd_eq(a.as_i8x16(), b.as_i8x16())) +} +/// Compares packed 16-bit integers in `a` and `b` for equality. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi16) +pub fn _mm_cmpeq_epi16(a: __m128i, b: __m128i) -> __m128i { + transmute::(simd_eq(a.as_i16x8(), b.as_i16x8())) +} +/// Compares packed 32-bit integers in `a` and `b` for equality. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi32) +pub fn _mm_cmpeq_epi32(a: __m128i, b: __m128i) -> __m128i { + transmute::(simd_eq(a.as_i32x4(), b.as_i32x4())) +} +/// Compares packed 8-bit integers in `a` and `b` for greater-than. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi8) +pub fn _mm_cmpgt_epi8(a: __m128i, b: __m128i) -> __m128i { + transmute::(simd_gt(a.as_i8x16(), b.as_i8x16())) +} +/// Compares packed 16-bit integers in `a` and `b` for greater-than. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi16) +pub fn _mm_cmpgt_epi16(a: __m128i, b: __m128i) -> __m128i { + transmute::(simd_gt(a.as_i16x8(), b.as_i16x8())) +} +/// Compares packed 32-bit integers in `a` and `b` for greater-than. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi32) +pub fn _mm_cmpgt_epi32(a: __m128i, b: __m128i) -> __m128i { + transmute::(simd_gt(a.as_i32x4(), b.as_i32x4())) +} +/// Compares packed 8-bit integers in `a` and `b` for less-than. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi8) +pub fn _mm_cmplt_epi8(a: __m128i, b: __m128i) -> __m128i { + transmute::(simd_lt(a.as_i8x16(), b.as_i8x16())) +} +/// Compares packed 16-bit integers in `a` and `b` for less-than. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi16) +pub fn _mm_cmplt_epi16(a: __m128i, b: __m128i) -> __m128i { + transmute::(simd_lt(a.as_i16x8(), b.as_i16x8())) +} +/// Compares packed 32-bit integers in `a` and `b` for less-than. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi32) +pub fn _mm_cmplt_epi32(a: __m128i, b: __m128i) -> __m128i { + transmute::(simd_lt(a.as_i32x4(), b.as_i32x4())) +} +/// Converts the lower two packed 32-bit integers in `a` to packed +/// double-precision (64-bit) floating-point elements. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_pd) +pub fn _mm_cvtepi32_pd(a: __m128i) -> __m128d { + { + let a = a.as_i32x4(); + transmute(simd_cast::<2, i32, f64>(simd_shuffle(a, a, [0, 1]))) + } +} +/// Returns `a` with its lower element replaced by `b` after converting it to +/// an `f64`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_sd) +pub fn _mm_cvtsi32_sd(a: __m128d, b: i32) -> __m128d { + transmute(simd_insert(a.as_f64x2(), 0, b as f64)) +} +/// Converts packed 32-bit integers in `a` to packed single-precision (32-bit) +/// floating-point elements. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_ps) +pub fn _mm_cvtepi32_ps(a: __m128i) -> __m128 { + transmute(simd_cast::<4, _, f32>(a.as_i32x4())) +} +/// Converts packed single-precision (32-bit) floating-point elements in `a` +/// to packed 32-bit integers. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_epi32) +// NOTE: Not modeled yet +// pub fn _mm_cvtps_epi32(a: __m128) -> __m128i { +// { transmute(cvtps2dq(a)) } +// } +/// Returns a vector whose lowest element is `a` and all higher elements are +/// `0`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_si128) +pub fn _mm_cvtsi32_si128(a: i32) -> __m128i { + transmute(i32x4::new(a, 0, 0, 0)) +} +/// Returns the lowest element of `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si32) +pub fn _mm_cvtsi128_si32(a: __m128i) -> i32 { + simd_extract(a.as_i32x4(), 0) +} +/// Sets packed 64-bit integers with the supplied values, from highest to +/// lowest. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64x) +pub fn _mm_set_epi64x(e1: i64, e0: i64) -> __m128i { + transmute(i64x2::new(e0, e1)) +} +/// Sets packed 32-bit integers with the supplied values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi32) +pub fn _mm_set_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i { + transmute(i32x4::new(e0, e1, e2, e3)) +} +/// Sets packed 16-bit integers with the supplied values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi16) +pub fn _mm_set_epi16( + e7: i16, + e6: i16, + e5: i16, + e4: i16, + e3: i16, + e2: i16, + e1: i16, + e0: i16, +) -> __m128i { + transmute(i16x8::new(e0, e1, e2, e3, e4, e5, e6, e7)) +} +/// Sets packed 8-bit integers with the supplied values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi8) +pub fn _mm_set_epi8( + e15: i8, + e14: i8, + e13: i8, + e12: i8, + e11: i8, + e10: i8, + e9: i8, + e8: i8, + e7: i8, + e6: i8, + e5: i8, + e4: i8, + e3: i8, + e2: i8, + e1: i8, + e0: i8, +) -> __m128i { + { + transmute(i8x16::new( + e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, + )) + } +} +/// Broadcasts 64-bit integer `a` to all elements. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64x) +pub fn _mm_set1_epi64x(a: i64) -> __m128i { + _mm_set_epi64x(a, a) +} +/// Broadcasts 32-bit integer `a` to all elements. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi32) +pub fn _mm_set1_epi32(a: i32) -> __m128i { + _mm_set_epi32(a, a, a, a) +} +/// Broadcasts 16-bit integer `a` to all elements. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi16) +pub fn _mm_set1_epi16(a: i16) -> __m128i { + _mm_set_epi16(a, a, a, a, a, a, a, a) +} +/// Broadcasts 8-bit integer `a` to all elements. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi8) +pub fn _mm_set1_epi8(a: i8) -> __m128i { + _mm_set_epi8(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a) +} +/// Sets packed 32-bit integers with the supplied values in reverse order. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi32) +pub fn _mm_setr_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i { + _mm_set_epi32(e0, e1, e2, e3) +} +/// Sets packed 16-bit integers with the supplied values in reverse order. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi16) +pub fn _mm_setr_epi16( + e7: i16, + e6: i16, + e5: i16, + e4: i16, + e3: i16, + e2: i16, + e1: i16, + e0: i16, +) -> __m128i { + _mm_set_epi16(e0, e1, e2, e3, e4, e5, e6, e7) +} +/// Sets packed 8-bit integers with the supplied values in reverse order. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi8) +pub fn _mm_setr_epi8( + e15: i8, + e14: i8, + e13: i8, + e12: i8, + e11: i8, + e10: i8, + e9: i8, + e8: i8, + e7: i8, + e6: i8, + e5: i8, + e4: i8, + e3: i8, + e2: i8, + e1: i8, + e0: i8, +) -> __m128i { + _mm_set_epi8( + e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, + ) +} +/// Returns a vector with all elements set to zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_si128) +pub fn _mm_setzero_si128() -> __m128i { + transmute(i32x4::ZERO()) +} +/// Returns a vector where the low element is extracted from `a` and its upper +/// element is zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_epi64) +pub fn _mm_move_epi64(a: __m128i) -> __m128i { + { + let r: i64x2 = simd_shuffle(a.as_i64x2(), i64x2::ZERO(), [0, 2]); + transmute(r) + } +} +/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers +/// using signed saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi16) +pub fn _mm_packs_epi16(a: __m128i, b: __m128i) -> __m128i { + transmute(packsswb(a.as_i16x8(), b.as_i16x8())) +} +/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers +/// using signed saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32) +pub fn _mm_packs_epi32(a: __m128i, b: __m128i) -> __m128i { + transmute(packssdw(a.as_i32x4(), b.as_i32x4())) +} +/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers +/// using unsigned saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi16) +pub fn _mm_packus_epi16(a: __m128i, b: __m128i) -> __m128i { + transmute(packuswb(a.as_i16x8(), b.as_i16x8())) +} +/// Returns the `imm8` element of `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi16) +pub fn _mm_extract_epi16(a: __m128i) -> i32 { + static_assert_uimm_bits!(IMM8, 3); + simd_extract(a.as_u16x8(), IMM8 as u32) as i32 +} +/// Returns a new vector where the `imm8` element of `a` is replaced with `i`. 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi16)
+pub fn _mm_insert_epi16<const IMM8: i32>(a: __m128i, i: i32) -> __m128i {
+    static_assert_uimm_bits!(IMM8, 3);
+    transmute(simd_insert(a.as_i16x8(), IMM8 as u32, i as i16))
+}
+/// Returns a mask of the most significant bit of each element in `a`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_epi8)
+pub fn _mm_movemask_epi8(a: __m128i) -> i32 {
+    {
+        let z = i8x16::ZERO();
+        let m: i8x16 = simd_lt(a.as_i8x16(), z);
+        simd_bitmask_little!(15, m, u16) as u32 as i32
+    }
+}
+/// Shuffles 32-bit integers in `a` using the control in `IMM8`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi32)
+pub fn _mm_shuffle_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
+    static_assert_uimm_bits!(IMM8, 8);
+    {
+        let a = a.as_i32x4();
+        let x: i32x4 = simd_shuffle(
+            a,
+            a,
+            [
+                IMM8 as u32 & 0b11,
+                (IMM8 as u32 >> 2) & 0b11,
+                (IMM8 as u32 >> 4) & 0b11,
+                (IMM8 as u32 >> 6) & 0b11,
+            ],
+        );
+        transmute(x)
+    }
+}
+/// Shuffles 16-bit integers in the high 64 bits of `a` using the control in
+/// `IMM8`.
+///
+/// Put the results in the high 64 bits of the returned vector, with the low 64
+/// bits being copied from `a`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflehi_epi16)
+pub fn _mm_shufflehi_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
+    static_assert_uimm_bits!(IMM8, 8);
+    {
+        let a = a.as_i16x8();
+        let x: i16x8 = simd_shuffle(
+            a,
+            a,
+            [
+                0,
+                1,
+                2,
+                3,
+                (IMM8 as u32 & 0b11) + 4,
+                ((IMM8 as u32 >> 2) & 0b11) + 4,
+                ((IMM8 as u32 >> 4) & 0b11) + 4,
+                ((IMM8 as u32 >> 6) & 0b11) + 4,
+            ],
+        );
+        transmute(x)
+    }
+}
+/// Shuffles 16-bit integers in the low 64 bits of `a` using the control in
+/// `IMM8`.
+///
+/// Put the results in the low 64 bits of the returned vector, with the high 64
+/// bits being copied from `a`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflelo_epi16)
+pub fn _mm_shufflelo_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
+    static_assert_uimm_bits!(IMM8, 8);
+    {
+        let a = a.as_i16x8();
+        let x: i16x8 = simd_shuffle(
+            a,
+            a,
+            [
+                IMM8 as u32 & 0b11,
+                (IMM8 as u32 >> 2) & 0b11,
+                (IMM8 as u32 >> 4) & 0b11,
+                (IMM8 as u32 >> 6) & 0b11,
+                4,
+                5,
+                6,
+                7,
+            ],
+        );
+        transmute(x)
+    }
+}
+/// Unpacks and interleaves 8-bit integers from the high half of `a` and `b`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi8)
+pub fn _mm_unpackhi_epi8(a: __m128i, b: __m128i) -> __m128i {
+    {
+        transmute::<i8x16, _>(simd_shuffle(
+            a.as_i8x16(),
+            b.as_i8x16(),
+            [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31],
+        ))
+    }
+}
+/// Unpacks and interleaves 16-bit integers from the high half of `a` and `b`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi16)
+pub fn _mm_unpackhi_epi16(a: __m128i, b: __m128i) -> __m128i {
+    {
+        let x = simd_shuffle(a.as_i16x8(), b.as_i16x8(), [4, 12, 5, 13, 6, 14, 7, 15]);
+        transmute::<i16x8, _>(x)
+    }
+}
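The three shuffle models above all decode `IMM8` the same way: four 2-bit fields, one per destination lane, read from the least significant bits upward. As a reading aid, here is a scalar sketch of that selection logic for `_mm_shuffle_epi32`; the function name and plain-array types are illustrative only and are not part of this crate:

```rust
/// Scalar reference for the `_mm_shuffle_epi32` selection logic: destination
/// lane `i` takes source lane `(imm8 >> (2 * i)) & 0b11`.
fn shuffle_epi32_reference(a: [i32; 4], imm8: u8) -> [i32; 4] {
    core::array::from_fn(|i| a[((imm8 >> (2 * i)) & 0b11) as usize])
}

#[test]
fn shuffle_epi32_reference_identity() {
    // 0b11_10_01_00 selects lanes 0, 1, 2, 3 in order, i.e. the identity shuffle.
    assert_eq!(
        shuffle_epi32_reference([10, 20, 30, 40], 0b11_10_01_00),
        [10, 20, 30, 40]
    );
}
```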
+/// Unpacks and interleaves 32-bit integers from the high half of `a` and `b`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi32)
+pub fn _mm_unpackhi_epi32(a: __m128i, b: __m128i) -> __m128i {
+    transmute::<i32x4, _>(simd_shuffle(a.as_i32x4(), b.as_i32x4(), [2, 6, 3, 7]))
+}
+/// Unpacks and interleaves 64-bit integers from the high half of `a` and `b`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi64)
+pub fn _mm_unpackhi_epi64(a: __m128i, b: __m128i) -> __m128i {
+    transmute::<i64x2, _>(simd_shuffle(a.as_i64x2(), b.as_i64x2(), [1, 3]))
+}
+/// Unpacks and interleaves 8-bit integers from the low half of `a` and `b`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi8)
+pub fn _mm_unpacklo_epi8(a: __m128i, b: __m128i) -> __m128i {
+    {
+        transmute::<i8x16, _>(simd_shuffle(
+            a.as_i8x16(),
+            b.as_i8x16(),
+            [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23],
+        ))
+    }
+}
+/// Unpacks and interleaves 16-bit integers from the low half of `a` and `b`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi16)
+pub fn _mm_unpacklo_epi16(a: __m128i, b: __m128i) -> __m128i {
+    {
+        let x = simd_shuffle(a.as_i16x8(), b.as_i16x8(), [0, 8, 1, 9, 2, 10, 3, 11]);
+        transmute::<i16x8, _>(x)
+    }
+}
+/// Unpacks and interleaves 32-bit integers from the low half of `a` and `b`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi32)
+pub fn _mm_unpacklo_epi32(a: __m128i, b: __m128i) -> __m128i {
+    transmute::<i32x4, _>(simd_shuffle(a.as_i32x4(), b.as_i32x4(), [0, 4, 1, 5]))
+}
+/// Unpacks and interleaves 64-bit integers from the low half of `a` and `b`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi64)
+pub fn _mm_unpacklo_epi64(a: __m128i, b: __m128i) -> __m128i {
+    transmute::<i64x2, _>(simd_shuffle(a.as_i64x2(), b.as_i64x2(), [0, 2]))
+}
+/// Returns a new vector with the low element of `a` replaced by the sum of the
+/// low elements of `a` and `b`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_sd)
+// NOTE: Not modeled yet
+// pub fn _mm_add_sd(a: __m128d, b: __m128d) -> __m128d {
+//     { transmute(simd_insert(a.as_f64x2(), 0, _mm_cvtsd_f64(a) + _mm_cvtsd_f64(b))) }
+// }
+/// Adds packed double-precision (64-bit) floating-point elements in `a` and
+/// `b`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_pd)
+// NOTE: Not modeled yet
+// pub fn _mm_add_pd(a: __m128d, b: __m128d) -> __m128d {
+//     { simd_add(a, b) }
+// }
+/// Returns a new vector with the low element of `a` replaced by the result of
+/// dividing the lower element of `a` by the lower element of `b`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_sd)
+// NOTE: Not modeled yet
+// pub fn _mm_div_sd(a: __m128d, b: __m128d) -> __m128d {
+//     { transmute(simd_insert(a.as_f64x2(), 0, _mm_cvtsd_f64(a) / _mm_cvtsd_f64(b))) }
+// }
+/// Divides packed double-precision (64-bit) floating-point elements in `a` by
+/// packed elements in `b`.
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_pd) +// NOTE: Not modeled yet +// pub fn _mm_div_pd(a: __m128d, b: __m128d) -> __m128d { +// { simd_div(a, b) } +// } +/// Returns a new vector with the low element of `a` replaced by the maximum +/// of the lower elements of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_sd) +// NOTE: Not modeled yet +// pub fn _mm_max_sd(a: __m128d, b: __m128d) -> __m128d { +// { maxsd(a, b) } +// } +/// Returns a new vector with the maximum values from corresponding elements in +/// `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pd) +// NOTE: Not modeled yet +// pub fn _mm_max_pd(a: __m128d, b: __m128d) -> __m128d { +// { maxpd(a, b) } +// } +/// Returns a new vector with the low element of `a` replaced by the minimum +/// of the lower elements of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_sd) +// NOTE: Not modeled yet +// pub fn _mm_min_sd(a: __m128d, b: __m128d) -> __m128d { +// { minsd(a, b) } +// } +/// Returns a new vector with the minimum values from corresponding elements in +/// `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pd) +// NOTE: Not modeled yet +// pub fn _mm_min_pd(a: __m128d, b: __m128d) -> __m128d { +// { minpd(a, b) } +// } +/// Returns a new vector with the low element of `a` replaced by multiplying the +/// low elements of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_sd) +// NOTE: Not modeled yet +// pub fn _mm_mul_sd(a: __m128d, b: __m128d) -> __m128d { +// { transmute(simd_insert(a.as_f64x2(), 0, _mm_cvtsd_f64(a) * _mm_cvtsd_f64(b))) } +// } +/// Multiplies packed double-precision (64-bit) floating-point elements in `a` +/// and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_pd) +// NOTE: Not modeled yet +// pub fn _mm_mul_pd(a: __m128d, b: __m128d) -> __m128d { +// { transmute(simd_mul(a.as_f64x2(), b.as_f64x2())) } +// } +/// Returns a new vector with the low element of `a` replaced by the square +/// root of the lower element `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_sd) +// NOTE: Not modeled yet +// pub fn _mm_sqrt_sd(a: __m128d, b: __m128d) -> __m128d { +// { simd_insert(a, 0, sqrtf64(_mm_cvtsd_f64(b))) } +// } +/// Returns a new vector with the square root of each of the values in `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_pd) +// NOTE: Not modeled yet +// pub fn _mm_sqrt_pd(a: __m128d) -> __m128d { +// { simd_fsqrt(a) } +// } +/// Returns a new vector with the low element of `a` replaced by subtracting the +/// low element by `b` from the low element of `a`. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_sd) +// NOTE: Not modeled yet +// pub fn _mm_sub_sd(a: __m128d, b: __m128d) -> __m128d { +// { transmute(simd_insert(a.as_f64x2(), 0, _mm_cvtsd_f64(a) - _mm_cvtsd_f64(b))) } +// } +/// Subtract packed double-precision (64-bit) floating-point elements in `b` +/// from `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_pd) +// NOTE: Not modeled yet +// pub fn _mm_sub_pd(a: __m128d, b: __m128d) -> __m128d { +// { simd_sub(a, b) } +// } +/// Computes the bitwise AND of packed double-precision (64-bit) floating-point +/// elements in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_pd) +pub fn _mm_and_pd(a: __m128d, b: __m128d) -> __m128d { + { + let a: __m128i = transmute(a); + let b: __m128i = transmute(b); + transmute(_mm_and_si128(a, b)) + } +} +/// Computes the bitwise NOT of `a` and then AND with `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_pd) +pub fn _mm_andnot_pd(a: __m128d, b: __m128d) -> __m128d { + { + let a: __m128i = transmute(a); + let b: __m128i = transmute(b); + transmute(_mm_andnot_si128(a, b)) + } +} +/// Computes the bitwise OR of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_pd) +pub fn _mm_or_pd(a: __m128d, b: __m128d) -> __m128d { + { + let a: __m128i = transmute(a); + let b: __m128i = transmute(b); + transmute(_mm_or_si128(a, b)) + } +} +/// Computes the bitwise XOR of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_pd) +pub fn _mm_xor_pd(a: __m128d, b: __m128d) -> __m128d { + { + let a: __m128i = transmute(a); + let b: __m128i = transmute(b); + transmute(_mm_xor_si128(a, b)) + } +} +/// Returns a new vector with the low element of `a` replaced by the equality +/// comparison of the lower elements of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_sd) +// NOTE: Not modeled yet +// pub fn _mm_cmpeq_sd(a: __m128d, b: __m128d) -> __m128d { +// { cmpsd(a, b, 0) } +// } +/// Returns a new vector with the low element of `a` replaced by the less-than +/// comparison of the lower elements of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_sd) +// NOTE: Not modeled yet +// pub fn _mm_cmplt_sd(a: __m128d, b: __m128d) -> __m128d { +// { cmpsd(a, b, 1) } +// } +/// Returns a new vector with the low element of `a` replaced by the +/// less-than-or-equal comparison of the lower elements of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_sd) +// NOTE: Not modeled yet +// pub fn _mm_cmple_sd(a: __m128d, b: __m128d) -> __m128d { +// { cmpsd(a, b, 2) } +// } +/// Returns a new vector with the low element of `a` replaced by the +/// greater-than comparison of the lower elements of `a` and `b`. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_sd) +// NOTE: Not modeled yet +// pub fn _mm_cmpgt_sd(a: __m128d, b: __m128d) -> __m128d { +// { transmute(simd_insert(_mm_cmplt_sd(b, a), 1, simd_extract(a, 1))) } +// } +/// Returns a new vector with the low element of `a` replaced by the +/// greater-than-or-equal comparison of the lower elements of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_sd) +// NOTE: Not modeled yet +// pub fn _mm_cmpge_sd(a: __m128d, b: __m128d) -> __m128d { +// { simd_insert(_mm_cmple_sd(b, a), 1, simd_extract(a, 1)) } +// } +/// Returns a new vector with the low element of `a` replaced by the result +/// of comparing both of the lower elements of `a` and `b` to `NaN`. If +/// neither are equal to `NaN` then `0xFFFFFFFFFFFFFFFF` is used and `0` +/// otherwise. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_sd) +// NOTE: Not modeled yet +// pub fn _mm_cmpord_sd(a: __m128d, b: __m128d) -> __m128d { +// { cmpsd(a, b, 7) } +// } +/// Returns a new vector with the low element of `a` replaced by the result of +/// comparing both of the lower elements of `a` and `b` to `NaN`. If either is +/// equal to `NaN` then `0xFFFFFFFFFFFFFFFF` is used and `0` otherwise. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_sd) +// NOTE: Not modeled yet +// pub fn _mm_cmpunord_sd(a: __m128d, b: __m128d) -> __m128d { +// { cmpsd(a, b, 3) } +// } +/// Returns a new vector with the low element of `a` replaced by the not-equal +/// comparison of the lower elements of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_sd) +// NOTE: Not modeled yet +// pub fn _mm_cmpneq_sd(a: __m128d, b: __m128d) -> __m128d { +// { cmpsd(a, b, 4) } +// } +/// Returns a new vector with the low element of `a` replaced by the +/// not-less-than comparison of the lower elements of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_sd) +// NOTE: Not modeled yet +// pub fn _mm_cmpnlt_sd(a: __m128d, b: __m128d) -> __m128d { +// { cmpsd(a, b, 5) } +// } +/// Returns a new vector with the low element of `a` replaced by the +/// not-less-than-or-equal comparison of the lower elements of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_sd) +// NOTE: Not modeled yet +// pub fn _mm_cmpnle_sd(a: __m128d, b: __m128d) -> __m128d { +// { cmpsd(a, b, 6) } +// } +/// Returns a new vector with the low element of `a` replaced by the +/// not-greater-than comparison of the lower elements of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_sd) +// NOTE: Not modeled yet +// pub fn _mm_cmpngt_sd(a: __m128d, b: __m128d) -> __m128d { +// { simd_insert(_mm_cmpnlt_sd(b, a), 1, simd_extract(a, 1)) } +// } +/// Returns a new vector with the low element of `a` replaced by the +/// not-greater-than-or-equal comparison of the lower elements of `a` and `b`. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_sd) +// NOTE: Not modeled yet +// pub fn _mm_cmpnge_sd(a: __m128d, b: __m128d) -> __m128d { +// { simd_insert(_mm_cmpnle_sd(b, a), 1, simd_extract(a, 1)) } +// } +/// Compares corresponding elements in `a` and `b` for equality. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_pd) +// NOTE: Not modeled yet +// pub fn _mm_cmpeq_pd(a: __m128d, b: __m128d) -> __m128d { +// { cmppd(a, b, 0) } +// } +/// Compares corresponding elements in `a` and `b` for less-than. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_pd) +// NOTE: Not modeled yet +// pub fn _mm_cmplt_pd(a: __m128d, b: __m128d) -> __m128d { +// { cmppd(a, b, 1) } +// } +/// Compares corresponding elements in `a` and `b` for less-than-or-equal +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_pd) +// NOTE: Not modeled yet +// pub fn _mm_cmple_pd(a: __m128d, b: __m128d) -> __m128d { +// { cmppd(a, b, 2) } +// } +/// Compares corresponding elements in `a` and `b` for greater-than. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_pd) +// NOTE: Not modeled yet +// pub fn _mm_cmpgt_pd(a: __m128d, b: __m128d) -> __m128d { +// _mm_cmplt_pd(b, a) +// } +/// Compares corresponding elements in `a` and `b` for greater-than-or-equal. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_pd) +// NOTE: Not modeled yet +// pub fn _mm_cmpge_pd(a: __m128d, b: __m128d) -> __m128d { +// _mm_cmple_pd(b, a) +// } +/// Compares corresponding elements in `a` and `b` to see if neither is `NaN`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_pd) +// NOTE: Not modeled yet +// pub fn _mm_cmpord_pd(a: __m128d, b: __m128d) -> __m128d { +// { cmppd(a, b, 7) } +// } +/// Compares corresponding elements in `a` and `b` to see if either is `NaN`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_pd) +// NOTE: Not modeled yet +// pub fn _mm_cmpunord_pd(a: __m128d, b: __m128d) -> __m128d { +// { cmppd(a, b, 3) } +// } +/// Compares corresponding elements in `a` and `b` for not-equal. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_pd) +// NOTE: Not modeled yet +// pub fn _mm_cmpneq_pd(a: __m128d, b: __m128d) -> __m128d { +// { cmppd(a, b, 4) } +// } +/// Compares corresponding elements in `a` and `b` for not-less-than. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_pd) +// NOTE: Not modeled yet +// pub fn _mm_cmpnlt_pd(a: __m128d, b: __m128d) -> __m128d { +// { cmppd(a, b, 5) } +// } +/// Compares corresponding elements in `a` and `b` for not-less-than-or-equal. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_pd) +// NOTE: Not modeled yet +// pub fn _mm_cmpnle_pd(a: __m128d, b: __m128d) -> __m128d { +// { cmppd(a, b, 6) } +// } +/// Compares corresponding elements in `a` and `b` for not-greater-than. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_pd) +// NOTE: Not modeled yet +// pub fn _mm_cmpngt_pd(a: __m128d, b: __m128d) -> __m128d { +// _mm_cmpnlt_pd(b, a) +// } +/// Compares corresponding elements in `a` and `b` for +/// not-greater-than-or-equal. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_pd) +// NOTE: Not modeled yet +// pub fn _mm_cmpnge_pd(a: __m128d, b: __m128d) -> __m128d { +// _mm_cmpnle_pd(b, a) +// } +/// Compares the lower element of `a` and `b` for equality. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_sd) +// NOTE: Not modeled yet +// pub fn _mm_comieq_sd(a: __m128d, b: __m128d) -> i32 { +// { comieqsd(a, b) } +// } +/// Compares the lower element of `a` and `b` for less-than. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_sd) +// NOTE: Not modeled yet +// pub fn _mm_comilt_sd(a: __m128d, b: __m128d) -> i32 { +// { comiltsd(a, b) } +// } +/// Compares the lower element of `a` and `b` for less-than-or-equal. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_sd) +// NOTE: Not modeled yet +// pub fn _mm_comile_sd(a: __m128d, b: __m128d) -> i32 { +// { comilesd(a, b) } +// } +/// Compares the lower element of `a` and `b` for greater-than. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_sd) +// NOTE: Not modeled yet +// pub fn _mm_comigt_sd(a: __m128d, b: __m128d) -> i32 { +// { comigtsd(a, b) } +// } +/// Compares the lower element of `a` and `b` for greater-than-or-equal. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_sd) +// NOTE: Not modeled yet +// pub fn _mm_comige_sd(a: __m128d, b: __m128d) -> i32 { +// { comigesd(a, b) } +// } +/// Compares the lower element of `a` and `b` for not-equal. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_sd) +// NOTE: Not modeled yet +// pub fn _mm_comineq_sd(a: __m128d, b: __m128d) -> i32 { +// { comineqsd(a, b) } +// } +/// Compares the lower element of `a` and `b` for equality. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomieq_sd) +// NOTE: Not modeled yet +// pub fn _mm_ucomieq_sd(a: __m128d, b: __m128d) -> i32 { +// { ucomieqsd(a, b) } +// } +/// Compares the lower element of `a` and `b` for less-than. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomilt_sd) +// NOTE: Not modeled yet +// pub fn _mm_ucomilt_sd(a: __m128d, b: __m128d) -> i32 { +// { ucomiltsd(a, b) } +// } +/// Compares the lower element of `a` and `b` for less-than-or-equal. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomile_sd) +// NOTE: Not modeled yet +// pub fn _mm_ucomile_sd(a: __m128d, b: __m128d) -> i32 { +// { ucomilesd(a, b) } +// } +/// Compares the lower element of `a` and `b` for greater-than. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomigt_sd) +// NOTE: Not modeled yet +// pub fn _mm_ucomigt_sd(a: __m128d, b: __m128d) -> i32 { +// { ucomigtsd(a, b) } +// } +/// Compares the lower element of `a` and `b` for greater-than-or-equal. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomige_sd) +// NOTE: Not modeled yet +// pub fn _mm_ucomige_sd(a: __m128d, b: __m128d) -> i32 { +// { ucomigesd(a, b) } +// } +/// Compares the lower element of `a` and `b` for not-equal. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomineq_sd) +// NOTE: Not modeled yet +// pub fn _mm_ucomineq_sd(a: __m128d, b: __m128d) -> i32 { +// { ucomineqsd(a, b) } +// } +/// Converts packed double-precision (64-bit) floating-point elements in `a` to +/// packed single-precision (32-bit) floating-point elements +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_ps) +pub fn _mm_cvtpd_ps(a: __m128d) -> __m128 { + { + let r = simd_cast::<2, _, f32>(a.as_f64x2()); + let zero = f32x2::ZERO(); + transmute::(simd_shuffle(r, zero, [0, 1, 2, 3])) + } +} +/// Converts packed single-precision (32-bit) floating-point elements in `a` to +/// packed +/// double-precision (64-bit) floating-point elements. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pd) +pub fn _mm_cvtps_pd(a: __m128) -> __m128d { + { + let a = a.as_f32x4(); + transmute(simd_cast::<2, f32, f64>(simd_shuffle(a, a, [0, 1]))) + } +} +/// Converts packed double-precision (64-bit) floating-point elements in `a` to +/// packed 32-bit integers. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_epi32) +// NOTE: Not modeled yet +// pub fn _mm_cvtpd_epi32(a: __m128d) -> __m128i { +// { transmute(cvtpd2dq(a)) } +// } +/// Converts the lower double-precision (64-bit) floating-point element in a to +/// a 32-bit integer. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si32) +// NOTE: Not modeled yet +// pub fn _mm_cvtsd_si32(a: __m128d) -> i32 { +// { cvtsd2si(a) } +// } +/// Converts the lower double-precision (64-bit) floating-point element in `b` +/// to a single-precision (32-bit) floating-point element, store the result in +/// the lower element of the return value, and copies the upper element from `a` +/// to the upper element the return value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_ss) +// NOTE: Not modeled yet +// pub fn _mm_cvtsd_ss(a: __m128, b: __m128d) -> __m128 { +// { cvtsd2ss(a, b) } +// } +/// Returns the lower double-precision (64-bit) floating-point element of `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_f64) +// NOTE: Not modeled yet +// pub fn _mm_cvtsd_f64(a: __m128d) -> f64 { +// { simd_extract(a, 0) } +// } +/// Converts the lower single-precision (32-bit) floating-point element in `b` +/// to a double-precision (64-bit) floating-point element, store the result in +/// the lower element of the return value, and copies the upper element from `a` +/// to the upper element the return value. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_sd) +// NOTE: Not modeled yet +// pub fn _mm_cvtss_sd(a: __m128d, b: __m128) -> __m128d { +// { cvtss2sd(a, b) } +// } +/// Converts packed double-precision (64-bit) floating-point elements in `a` to +/// packed 32-bit integers with truncation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_epi32) +// NOTE: Not modeled yet +// pub fn _mm_cvttpd_epi32(a: __m128d) -> __m128i { +// { transmute(cvttpd2dq(a)) } +// } +/// Converts the lower double-precision (64-bit) floating-point element in `a` +/// to a 32-bit integer with truncation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si32) +// NOTE: Not modeled yet +// pub fn _mm_cvttsd_si32(a: __m128d) -> i32 { +// { cvttsd2si(a) } +// } +/// Converts packed single-precision (32-bit) floating-point elements in `a` to +/// packed 32-bit integers with truncation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_epi32) +// NOTE: Not modeled yet +// pub fn _mm_cvttps_epi32(a: __m128) -> __m128i { +// { transmute(cvttps2dq(a)) } +// } +/// Copies double-precision (64-bit) floating-point element `a` to the lower +/// element of the packed 64-bit return value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_sd) +pub fn _mm_set_sd(a: f64) -> __m128d { + _mm_set_pd(0.0, a) +} +/// Broadcasts double-precision (64-bit) floating-point value a to all elements +/// of the return value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_pd) +pub fn _mm_set1_pd(a: f64) -> __m128d { + _mm_set_pd(a, a) +} +/// Broadcasts double-precision (64-bit) floating-point value a to all elements +/// of the return value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd1) +pub fn _mm_set_pd1(a: f64) -> __m128d { + _mm_set_pd(a, a) +} +/// Sets packed double-precision (64-bit) floating-point elements in the return +/// value with the supplied values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd) +pub fn _mm_set_pd(a: f64, b: f64) -> __m128d { + transmute(f64x2::new(b, a)) +} +/// Sets packed double-precision (64-bit) floating-point elements in the return +/// value with the supplied values in reverse order. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_pd) +pub fn _mm_setr_pd(a: f64, b: f64) -> __m128d { + _mm_set_pd(b, a) +} +/// Returns packed double-precision (64-bit) floating-point elements with all +/// zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_pd) +pub fn _mm_setzero_pd() -> __m128d { + transmute(f64x2::ZERO()) +} +/// Returns a mask of the most significant bit of each element in `a`. +/// +/// The mask is stored in the 2 least significant bits of the return value. +/// All other bits are set to `0`. 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pd)
+pub fn _mm_movemask_pd(a: __m128d) -> i32 {
+    {
+        let mask: i64x2 = simd_lt(transmute(a), i64x2::ZERO());
+        simd_bitmask_little!(1, mask, u8) as i32
+    }
+}
+/// Constructs a 128-bit floating-point vector of `[2 x double]` from two
+/// 128-bit vector parameters of `[2 x double]`, using the immediate-value
+/// parameter as a specifier.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pd)
+pub fn _mm_shuffle_pd<const MASK: i32>(a: __m128d, b: __m128d) -> __m128d {
+    static_assert_uimm_bits!(MASK, 8);
+    transmute(simd_shuffle(
+        a.as_f64x2(),
+        b.as_f64x2(),
+        [MASK as u32 & 0b1, ((MASK as u32 >> 1) & 0b1) + 2],
+    ))
+}
+/// Constructs a 128-bit floating-point vector of `[2 x double]`. The lower
+/// 64 bits are set to the lower 64 bits of the second parameter. The upper
+/// 64 bits are set to the upper 64 bits of the first parameter.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_sd)
+pub fn _mm_move_sd(a: __m128d, b: __m128d) -> __m128d {
+    _mm_setr_pd(simd_extract(b.as_f64x2(), 0), simd_extract(a.as_f64x2(), 1))
+}
+/// Casts a 128-bit floating-point vector of `[2 x double]` into a 128-bit
+/// floating-point vector of `[4 x float]`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_ps)
+pub fn _mm_castpd_ps(a: __m128d) -> __m128 {
+    transmute(a)
+}
+/// Casts a 128-bit floating-point vector of `[2 x double]` into a 128-bit
+/// integer vector.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_si128)
+pub fn _mm_castpd_si128(a: __m128d) -> __m128i {
+    transmute(a)
+}
+/// Casts a 128-bit floating-point vector of `[4 x float]` into a 128-bit
+/// floating-point vector of `[2 x double]`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_pd)
+pub fn _mm_castps_pd(a: __m128) -> __m128d {
+    transmute(a)
+}
+/// Casts a 128-bit floating-point vector of `[4 x float]` into a 128-bit
+/// integer vector.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_si128)
+pub fn _mm_castps_si128(a: __m128) -> __m128i {
+    transmute(a)
+}
+/// Casts a 128-bit integer vector into a 128-bit floating-point vector
+/// of `[2 x double]`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_pd)
+pub fn _mm_castsi128_pd(a: __m128i) -> __m128d {
+    transmute(a)
+}
+/// Casts a 128-bit integer vector into a 128-bit floating-point vector
+/// of `[4 x float]`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_ps)
+pub fn _mm_castsi128_ps(a: __m128i) -> __m128 {
+    transmute(a)
+}
+/// Returns a vector of type `__m128d` with indeterminate elements.
+/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
+/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
+/// In practice, this is typically equivalent to [`mem::zeroed`].
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_pd) +pub fn _mm_undefined_pd() -> __m128d { + transmute(f32x4::ZERO()) +} +/// Returns vector of type __m128i with indeterminate elements.with indetermination elements. +/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically +/// picks some valid value and is not equivalent to [`mem::MaybeUninit`]. +/// In practice, this is typically equivalent to [`mem::zeroed`]. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_si128) +pub fn _mm_undefined_si128() -> __m128i { + transmute(u32x4::ZERO()) +} +/// The resulting `__m128d` element is composed by the low-order values of +/// the two `__m128d` interleaved input elements, i.e.: +/// +/// * The `[127:64]` bits are copied from the `[127:64]` bits of the second input +/// * The `[63:0]` bits are copied from the `[127:64]` bits of the first input +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_pd) +pub fn _mm_unpackhi_pd(a: __m128d, b: __m128d) -> __m128d { + transmute(simd_shuffle(a.as_f64x2(), b.as_f64x2(), [1, 3])) +} +/// The resulting `__m128d` element is composed by the high-order values of +/// the two `__m128d` interleaved input elements, i.e.: +/// +/// * The `[127:64]` bits are copied from the `[63:0]` bits of the second input +/// * The `[63:0]` bits are copied from the `[63:0]` bits of the first input +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_pd) +pub fn _mm_unpacklo_pd(a: __m128d, b: __m128d) -> __m128d { + transmute(simd_shuffle(a.as_f64x2(), b.as_f64x2(), [0, 2])) +} diff --git a/testable-simd-models/src/core_arch/x86/models/sse2_handwritten.rs b/testable-simd-models/src/core_arch/x86/models/sse2_handwritten.rs new file mode 100644 index 0000000000000..217298286968c --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/models/sse2_handwritten.rs @@ -0,0 +1,196 @@ +use crate::abstractions::{bit::MachineInteger, simd::*}; +pub fn packsswb(a: i16x8, b: i16x8) -> i8x16 { + i8x16::from_fn(|i| { + if i < 8 { + if a[i] > (i8::MAX as i16) { + i8::MAX + } else if a[i] < (i8::MIN as i16) { + i8::MIN + } else { + a[i] as i8 + } + } else { + if b[i - 8] > (i8::MAX as i16) { + i8::MAX + } else if b[i - 8] < (i8::MIN as i16) { + i8::MIN + } else { + b[i - 8] as i8 + } + } + }) +} +pub fn pmaddwd(a: i16x8, b: i16x8) -> i32x4 { + i32x4::from_fn(|i| { + (a[2 * i] as i32) * (b[2 * i] as i32) + (a[2 * i + 1] as i32) * (b[2 * i + 1] as i32) + }) +} +pub fn psadbw(a: u8x16, b: u8x16) -> u64x2 { + let tmp = u8x16::from_fn(|i| a[i].wrapping_abs_diff(b[i])); + u64x2::from_fn(|i| { + (tmp[i * 8] as u16) + .wrapping_add(tmp[i * 8 + 1] as u16) + .wrapping_add(tmp[i * 8 + 2] as u16) + .wrapping_add(tmp[i * 8 + 3] as u16) + .wrapping_add(tmp[i * 8 + 4] as u16) + .wrapping_add(tmp[i * 8 + 5] as u16) + .wrapping_add(tmp[i * 8 + 6] as u16) + .wrapping_add(tmp[i * 8 + 7] as u16) as u64 + }) +} +pub fn psllw(a: i16x8, count: i16x8) -> i16x8 { + let count4: u64 = (count[0] as u16) as u64; + let count3: u64 = ((count[1] as u16) as u64) * 65536; + let count2: u64 = ((count[2] as u16) as u64) * 4294967296; + let count1: u64 = ((count[3] as u16) as u64) * 281474976710656; + let count = count1 + count2 + count3 + count4; + i16x8::from_fn(|i| { + if count > 15 { + 0 + } 
else {
+            ((a[i] as u16) << count) as i16
+        }
+    })
+}
+
+pub fn pslld(a: i32x4, count: i32x4) -> i32x4 {
+    let count: u64 = ((count[1] as u32) as u64) * 4294967296 + ((count[0] as u32) as u64);
+
+    i32x4::from_fn(|i| {
+        if count > 31 {
+            0
+        } else {
+            ((a[i] as u32) << count) as i32
+        }
+    })
+}
+
+pub fn psllq(a: i64x2, count: i64x2) -> i64x2 {
+    let count: u64 = count[0] as u64;
+
+    i64x2::from_fn(|i| {
+        if count > 63 {
+            0
+        } else {
+            ((a[i] as u64) << count) as i64
+        }
+    })
+}
+
+pub fn psraw(a: i16x8, count: i16x8) -> i16x8 {
+    let count: u64 = ((count[3] as u16) as u64) * 281474976710656
+        + ((count[2] as u16) as u64) * 4294967296
+        + ((count[1] as u16) as u64) * 65536
+        + ((count[0] as u16) as u64);
+
+    i16x8::from_fn(|i| {
+        if count > 15 {
+            if a[i] < 0 {
+                -1
+            } else {
+                0
+            }
+        } else {
+            a[i] >> count
+        }
+    })
+}
+
+pub fn psrad(a: i32x4, count: i32x4) -> i32x4 {
+    let count: u64 = ((count[1] as u32) as u64) * 4294967296 + ((count[0] as u32) as u64);
+
+    i32x4::from_fn(|i| {
+        if count > 31 {
+            if a[i] < 0 {
+                -1
+            } else {
+                0
+            }
+        } else {
+            a[i] >> count
+        }
+    })
+}
+
+pub fn psrlw(a: i16x8, count: i16x8) -> i16x8 {
+    let count: u64 = (count[3] as u16 as u64) * 281474976710656
+        + (count[2] as u16 as u64) * 4294967296
+        + (count[1] as u16 as u64) * 65536
+        + (count[0] as u16 as u64);
+
+    i16x8::from_fn(|i| {
+        if count > 15 {
+            0
+        } else {
+            ((a[i] as u16) >> count) as i16
+        }
+    })
+}
+
+pub fn psrld(a: i32x4, count: i32x4) -> i32x4 {
+    let count: u64 = (count[1] as u32 as u64) * 4294967296 + (count[0] as u32 as u64);
+
+    i32x4::from_fn(|i| {
+        if count > 31 {
+            0
+        } else {
+            ((a[i] as u32) >> count) as i32
+        }
+    })
+}
+
+pub fn psrlq(a: i64x2, count: i64x2) -> i64x2 {
+    let count: u64 = count[0] as u64;
+
+    i64x2::from_fn(|i| {
+        if count > 63 {
+            0
+        } else {
+            ((a[i] as u64) >> count) as i64
+        }
+    })
+}
+
+pub fn packssdw(a: i32x4, b: i32x4) -> i16x8 {
+    i16x8::from_fn(|i| {
+        if i < 4 {
+            if a[i] > (i16::MAX as i32) {
+                i16::MAX
+            } else if a[i] < (i16::MIN as i32) {
+                i16::MIN
+            } else {
+                a[i] as i16
+            }
+        } else {
+            if b[i - 4] > (i16::MAX as i32) {
+                i16::MAX
+            } else if b[i - 4] < (i16::MIN as i32) {
+                i16::MIN
+            } else {
+                b[i - 4] as i16
+            }
+        }
+    })
+}
+
+pub fn packuswb(a: i16x8, b: i16x8) -> u8x16 {
+    u8x16::from_fn(|i| {
+        if i < 8 {
+            if a[i] > (u8::MAX as i16) {
+                u8::MAX
+            } else if a[i] < (u8::MIN as i16) {
+                u8::MIN
+            } else {
+                a[i] as u8
+            }
+        } else {
+            if b[i - 8] > (u8::MAX as i16) {
+                u8::MAX
+            } else if b[i - 8] < (u8::MIN as i16) {
+                u8::MIN
+            } else {
+                b[i - 8] as u8
+            }
+        }
+    })
+}
diff --git a/testable-simd-models/src/core_arch/x86/models/ssse3.rs b/testable-simd-models/src/core_arch/x86/models/ssse3.rs
new file mode 100644
index 0000000000000..665e83460fca6
--- /dev/null
+++ b/testable-simd-models/src/core_arch/x86/models/ssse3.rs
@@ -0,0 +1,238 @@
+//! Supplemental Streaming SIMD Extensions 3 (SSSE3)
+use crate::abstractions::simd::*;
+use crate::abstractions::utilities::*;
+
+use super::sse2::*;
+use super::ssse3_handwritten::*;
+use super::types::*;
+
+/// Computes the absolute value of packed 8-bit signed integers in `a` and
+/// return the unsigned results.
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi8) +pub fn _mm_abs_epi8(a: __m128i) -> __m128i { + { + let a = a.as_i8x16(); + let zero = i8x16::ZERO(); + let r = simd_select(simd_lt(a, zero), simd_neg(a), a); + transmute(r) + } +} +/// Computes the absolute value of each of the packed 16-bit signed integers in +/// `a` and +/// return the 16-bit unsigned integer +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi16) +pub fn _mm_abs_epi16(a: __m128i) -> __m128i { + { + let a = a.as_i16x8(); + let zero = i16x8::ZERO(); + let r = simd_select(simd_lt(a, zero), simd_neg(a), a); + transmute(r) + } +} +/// Computes the absolute value of each of the packed 32-bit signed integers in +/// `a` and +/// return the 32-bit unsigned integer +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi32) +pub fn _mm_abs_epi32(a: __m128i) -> __m128i { + { + let a = a.as_i32x4(); + let zero = i32x4::ZERO(); + let r = simd_select(simd_lt(a, zero), simd_neg(a), a); + transmute(r) + } +} +/// Shuffles bytes from `a` according to the content of `b`. +/// +/// The last 4 bits of each byte of `b` are used as addresses +/// into the 16 bytes of `a`. +/// +/// In addition, if the highest significant bit of a byte of `b` +/// is set, the respective destination byte is set to 0. +/// +/// Picturing `a` and `b` as `[u8; 16]`, `_mm_shuffle_epi8` is +/// logically equivalent to: +/// +/// ``` +/// fn mm_shuffle_epi8(a: [u8; 16], b: [u8; 16]) -> [u8; 16] { +/// let mut r = [0u8; 16]; +/// for i in 0..16 { +/// // if the most significant bit of b is set, +/// // then the destination byte is set to 0. +/// if b[i] & 0x80 == 0u8 { +/// r[i] = a[(b[i] % 16) as usize]; +/// } +/// } +/// r +/// } +/// ``` +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi8) +pub fn _mm_shuffle_epi8(a: __m128i, b: __m128i) -> __m128i { + { + transmute(pshufb128(a.as_u8x16(), b.as_u8x16())) + } +} +/// Concatenate 16-byte blocks in `a` and `b` into a 32-byte temporary result, +/// shift the result right by `n` bytes, and returns the low 16 bytes. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_epi8) +pub fn _mm_alignr_epi8(a: __m128i, b: __m128i) -> __m128i { + static_assert_uimm_bits!(IMM8, 8); + if IMM8 > 32 { + return _mm_setzero_si128(); + } + let (a, b) = if IMM8 > 16 { + (_mm_setzero_si128(), a) + } else { + (a, b) + }; + const fn mask(shift: u32, i: u32) -> u32 { + if shift > 32 { + i + } else if shift > 16 { + shift - 16 + i + } else { + shift + i + } + } + { + let r: i8x16 = simd_shuffle( + b.as_i8x16(), + a.as_i8x16(), + [ + mask(IMM8 as u32, 0), + mask(IMM8 as u32, 1), + mask(IMM8 as u32, 2), + mask(IMM8 as u32, 3), + mask(IMM8 as u32, 4), + mask(IMM8 as u32, 5), + mask(IMM8 as u32, 6), + mask(IMM8 as u32, 7), + mask(IMM8 as u32, 8), + mask(IMM8 as u32, 9), + mask(IMM8 as u32, 10), + mask(IMM8 as u32, 11), + mask(IMM8 as u32, 12), + mask(IMM8 as u32, 13), + mask(IMM8 as u32, 14), + mask(IMM8 as u32, 15), + ], + ); + transmute(r) + } +} +/// Horizontally adds the adjacent pairs of values contained in 2 packed +/// 128-bit vectors of `[8 x i16]`. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi16) +pub fn _mm_hadd_epi16(a: __m128i, b: __m128i) -> __m128i { + { + transmute(phaddw128(a.as_i16x8(), b.as_i16x8())) + } +} +/// Horizontally adds the adjacent pairs of values contained in 2 packed +/// 128-bit vectors of `[8 x i16]`. Positive sums greater than 7FFFh are +/// saturated to 7FFFh. Negative sums less than 8000h are saturated to 8000h. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_epi16) +pub fn _mm_hadds_epi16(a: __m128i, b: __m128i) -> __m128i { + { + transmute(phaddsw128(a.as_i16x8(), b.as_i16x8())) + } +} +/// Horizontally adds the adjacent pairs of values contained in 2 packed +/// 128-bit vectors of `[4 x i32]`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi32) +pub fn _mm_hadd_epi32(a: __m128i, b: __m128i) -> __m128i { + { + transmute(phaddd128(a.as_i32x4(), b.as_i32x4())) + } +} +/// Horizontally subtract the adjacent pairs of values contained in 2 +/// packed 128-bit vectors of `[8 x i16]`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi16) +pub fn _mm_hsub_epi16(a: __m128i, b: __m128i) -> __m128i { + { + transmute(phsubw128(a.as_i16x8(), b.as_i16x8())) + } +} +/// Horizontally subtract the adjacent pairs of values contained in 2 +/// packed 128-bit vectors of `[8 x i16]`. Positive differences greater than +/// 7FFFh are saturated to 7FFFh. Negative differences less than 8000h are +/// saturated to 8000h. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_epi16) +pub fn _mm_hsubs_epi16(a: __m128i, b: __m128i) -> __m128i { + { + transmute(phsubsw128(a.as_i16x8(), b.as_i16x8())) + } +} +/// Horizontally subtract the adjacent pairs of values contained in 2 +/// packed 128-bit vectors of `[4 x i32]`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi32) +pub fn _mm_hsub_epi32(a: __m128i, b: __m128i) -> __m128i { + { + transmute(phsubd128(a.as_i32x4(), b.as_i32x4())) + } +} +/// Multiplies corresponding pairs of packed 8-bit unsigned integer +/// values contained in the first source operand and packed 8-bit signed +/// integer values contained in the second source operand, add pairs of +/// contiguous products with signed saturation, and writes the 16-bit sums to +/// the corresponding bits in the destination. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_epi16) +pub fn _mm_maddubs_epi16(a: __m128i, b: __m128i) -> __m128i { + { + transmute(pmaddubsw128(a.as_u8x16(), b.as_i8x16())) + } +} +/// Multiplies packed 16-bit signed integer values, truncate the 32-bit +/// product to the 18 most significant bits by right-shifting, round the +/// truncated value by adding 1, and write bits `[16:1]` to the destination. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_epi16) +pub fn _mm_mulhrs_epi16(a: __m128i, b: __m128i) -> __m128i { + { + transmute(pmulhrsw128(a.as_i16x8(), b.as_i16x8())) + } +} +/// Negates packed 8-bit integers in `a` when the corresponding signed 8-bit +/// integer in `b` is negative, and returns the result. 
+/// Elements in result are zeroed out when the corresponding element in `b` +/// is zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi8) +pub fn _mm_sign_epi8(a: __m128i, b: __m128i) -> __m128i { + { + transmute(psignb128(a.as_i8x16(), b.as_i8x16())) + } +} +/// Negates packed 16-bit integers in `a` when the corresponding signed 16-bit +/// integer in `b` is negative, and returns the results. +/// Elements in result are zeroed out when the corresponding element in `b` +/// is zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi16) +pub fn _mm_sign_epi16(a: __m128i, b: __m128i) -> __m128i { + { + transmute(psignw128(a.as_i16x8(), b.as_i16x8())) + } +} +/// Negates packed 32-bit integers in `a` when the corresponding signed 32-bit +/// integer in `b` is negative, and returns the results. +/// Element in result are zeroed out when the corresponding element in `b` +/// is zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi32) +pub fn _mm_sign_epi32(a: __m128i, b: __m128i) -> __m128i { + { + transmute(psignd128(a.as_i32x4(), b.as_i32x4())) + } +} diff --git a/testable-simd-models/src/core_arch/x86/models/ssse3_handwritten.rs b/testable-simd-models/src/core_arch/x86/models/ssse3_handwritten.rs new file mode 100644 index 0000000000000..4e911a83fb457 --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/models/ssse3_handwritten.rs @@ -0,0 +1,127 @@ +use crate::abstractions::simd::*; +pub fn pshufb128(a: u8x16, b: u8x16) -> u8x16 { + u8x16::from_fn(|i| if b[i] > 127 { 0 } else { a[(b[i] % 16) as u32] }) +} + +pub fn phaddw128(a: i16x8, b: i16x8) -> i16x8 { + i16x8::from_fn(|i| { + if i < 4 { + a[2 * i].wrapping_add(a[2 * i + 1]) + } else { + b[2 * (i - 4)].wrapping_add(b[2 * (i - 4) + 1]) + } + }) +} + +pub fn phaddsw128(a: i16x8, b: i16x8) -> i16x8 { + i16x8::from_fn(|i| { + if i < 4 { + a[2 * i].saturating_add(a[2 * i + 1]) + } else { + b[2 * (i - 4)].saturating_add(b[2 * (i - 4) + 1]) + } + }) +} + +pub fn phaddd128(a: i32x4, b: i32x4) -> i32x4 { + i32x4::from_fn(|i| { + if i < 2 { + a[2 * i].wrapping_add(a[2 * i + 1]) + } else { + b[2 * (i - 2)].wrapping_add(b[2 * (i - 2) + 1]) + } + }) +} + +pub fn phsubw128(a: i16x8, b: i16x8) -> i16x8 { + i16x8::from_fn(|i| { + if i < 4 { + a[2 * i].wrapping_sub(a[2 * i + 1]) + } else { + b[2 * (i - 4)].wrapping_sub(b[2 * (i - 4) + 1]) + } + }) +} + +pub fn phsubsw128(a: i16x8, b: i16x8) -> i16x8 { + i16x8::from_fn(|i| { + if i < 4 { + a[2 * i].saturating_sub(a[2 * i + 1]) + } else { + b[2 * (i - 4)].saturating_sub(b[2 * (i - 4) + 1]) + } + }) +} + +pub fn phsubd128(a: i32x4, b: i32x4) -> i32x4 { + i32x4::from_fn(|i| { + if i < 2 { + a[2 * i].wrapping_sub(a[2 * i + 1]) + } else { + b[2 * (i - 2)].wrapping_sub(b[2 * (i - 2) + 1]) + } + }) +} + +pub fn pmaddubsw128(a: u8x16, b: i8x16) -> i16x8 { + i16x8::from_fn(|i| { + ((a[2 * i] as u8 as u16 as i16) * (b[2 * i] as i8 as i16)) + .saturating_add((a[2 * i + 1] as u8 as u16 as i16) * (b[2 * i + 1] as i8 as i16)) + }) +} + +pub fn pmulhrsw128(a: i16x8, b: i16x8) -> i16x8 { + i16x8::from_fn(|i| { + let temp = (a[i] as i32) * (b[i] as i32); + let temp = (temp >> 14).wrapping_add(1) >> 1; + temp as i16 + }) +} + +pub fn psignb128(a: i8x16, b: i8x16) -> i8x16 { + i8x16::from_fn(|i| { + if b[i] < 0 { + if a[i] == i8::MIN { + a[i] + } else { + -a[i] + } + } else if b[i] > 0 { + 
a[i] + } else { + 0 + } + }) +} + +pub fn psignw128(a: i16x8, b: i16x8) -> i16x8 { + i16x8::from_fn(|i| { + if b[i] < 0 { + if a[i] == i16::MIN { + a[i] + } else { + -a[i] + } + } else if b[i] > 0 { + a[i] + } else { + 0 + } + }) +} + +pub fn psignd128(a: i32x4, b: i32x4) -> i32x4 { + i32x4::from_fn(|i| { + if b[i] < 0 { + if a[i] == i32::MIN { + a[i] + } else { + -a[i] + } + } else if b[i] > 0 { + a[i] + } else { + 0 + } + }) +} diff --git a/testable-simd-models/src/core_arch/x86/tests/avx.rs b/testable-simd-models/src/core_arch/x86/tests/avx.rs new file mode 100644 index 0000000000000..02b1d81173ad0 --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/tests/avx.rs @@ -0,0 +1,258 @@ +use super::types::*; +use super::upstream; +use crate::abstractions::bitvec::BitVec; +use crate::helpers::test::HasRandom; + +macro_rules! assert_feq { + ($lhs:expr, $rhs:expr) => { + assert!(($lhs.is_nan() && $rhs.is_nan()) || $lhs == $rhs) + }; +} + +/// Derives tests for a given intrinsics. Test that a given intrinsics and its model compute the same thing over random values (1000 by default). +macro_rules! mk { + ($([$N:literal])?$name:ident$({$(<$($c:literal),*>),*})?($($x:ident : $ty:ident),*)) => { + #[test] + fn $name() { + #[allow(unused)] + const N: usize = { + let n: usize = 1000; + $(let n: usize = $N;)? + n + }; + mk!(@[N]$name$($(<$($c),*>)*)?($($x : $ty),*)); + } + }; + (@[$N:ident]$name:ident$(<$($c:literal),*>)?($($x:ident : $ty:ident),*)) => { + for _ in 0..$N { + $(let $x = $ty::random();)* + assert_eq!(super::super::models::avx::$name$(::<$($c,)*>)?($($x.into(),)*), unsafe { + BitVec::from(upstream::$name$(::<$($c,)*>)?($($x.into(),)*)).into() + }); + } + }; + (@[$N:ident]$name:ident<$($c1:literal),*>$(<$($c:literal),*>)*($($x:ident : $ty:ident),*)) => { + let one = || { + mk!(@[$N]$name<$($c1),*>($($x : $ty),*)); + }; + one(); + mk!(@[$N]$name$(<$($c),*>)*($($x : $ty),*)); + } +} +mk!(_mm256_blendv_ps(a: __m256, b: __m256, c: __m256)); + +#[test] +fn _mm256_movemask_ps() { + let n = 1000; + + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx::_mm256_movemask_ps(a.into()), + unsafe { upstream::_mm256_movemask_ps(a.into()) } + ); + } +} + +#[test] +fn _mm256_movemask_pd() { + let n = 1000; + + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx::_mm256_movemask_pd(a.into()), + unsafe { upstream::_mm256_movemask_pd(a.into()) } + ); + } +} + +#[test] +fn _mm256_testz_si256() { + let n = 1000; + + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + let b: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx::_mm256_testz_si256(a.into(), b.into()), + unsafe { upstream::_mm256_testz_si256(a.into(), b.into()) } + ); + } +} + +#[test] +fn _mm256_testc_si256() { + let n = 1000; + + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + let b: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx::_mm256_testc_si256(a.into(), b.into()), + unsafe { upstream::_mm256_testc_si256(a.into(), b.into()) } + ); + } +} + +#[test] +fn _mm256_cvtsd_f64() { + let n = 1000; + + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_feq!( + super::super::models::avx::_mm256_cvtsd_f64(a.into()), + unsafe { upstream::_mm256_cvtsd_f64(a.into()) } + ); + } +} + +#[test] +fn _mm256_cvtsi256_si32() { + let n = 1000; + + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx::_mm256_cvtsi256_si32(a.into()), + unsafe 
{ upstream::_mm256_cvtsi256_si32(a.into()) } + ); + } +} + +#[test] +fn _mm256_cvtss_f32() { + let n = 1000; + + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_feq!( + super::super::models::avx::_mm256_cvtss_f32(a.into()), + unsafe { upstream::_mm256_cvtss_f32(a.into()) } + ); + } +} + +mk!(_mm256_setzero_ps()); +mk!(_mm256_setzero_si256()); +mk!(_mm256_set_epi8( + e00: i8, + e01: i8, + e02: i8, + e03: i8, + e04: i8, + e05: i8, + e06: i8, + e07: i8, + e08: i8, + e09: i8, + e10: i8, + e11: i8, + e12: i8, + e13: i8, + e14: i8, + e15: i8, + e16: i8, + e17: i8, + e18: i8, + e19: i8, + e20: i8, + e21: i8, + e22: i8, + e23: i8, + e24: i8, + e25: i8, + e26: i8, + e27: i8, + e28: i8, + e29: i8, + e30: i8, + e31: i8 +)); +mk!(_mm256_set_epi16( + e00: i16, + e01: i16, + e02: i16, + e03: i16, + e04: i16, + e05: i16, + e06: i16, + e07: i16, + e08: i16, + e09: i16, + e10: i16, + e11: i16, + e12: i16, + e13: i16, + e14: i16, + e15: i16 +)); +mk!(_mm256_set_epi32( + e0: i32, + e1: i32, + e2: i32, + e3: i32, + e4: i32, + e5: i32, + e6: i32, + e7: i32 +)); +mk!(_mm256_set_epi64x(a: i64, b: i64, c: i64, d: i64)); +mk!(_mm256_set1_epi8(a: i8)); +mk!(_mm256_set1_epi16(a: i16)); +mk!(_mm256_set1_epi32(a: i32)); +mk!(_mm256_set1_epi64x(a: i64)); +mk!(_mm256_set_pd(a: f64, b: f64, c: f64, d: f64)); +mk!(_mm256_set_ps(a: f32, b: f32, c: f32, d: f32, e: f32, f: f32, g: f32, h: f32)); +mk!(_mm256_setr_pd(a: f64, b: f64, c: f64, d: f64)); +mk!(_mm256_setr_ps(a: f32, b: f32, c: f32, d: f32, e: f32, f: f32, g: f32, h: f32)); +mk!(_mm256_setr_epi64x(a: i64, b: i64, c: i64, d: i64)); +mk!(_mm256_set1_pd(a: f64)); +mk!(_mm256_set1_ps(a: f32)); + +mk!(_mm256_and_pd(a: __m256d, b: __m256d)); +mk!(_mm256_and_ps(a: __m256, b: __m256)); +mk!(_mm256_or_pd(a: __m256d, b: __m256d)); +mk!(_mm256_or_ps(a: __m256, b: __m256)); +mk!(_mm256_andnot_pd(a: __m256d, b: __m256d)); +mk!(_mm256_andnot_ps(a: __m256, b: __m256)); +mk!(_mm256_blendv_pd(a: __m256d, b: __m256d, c: __m256d)); +mk!(_mm256_xor_pd(a: __m256d, b: __m256d)); +mk!(_mm256_xor_ps(a: __m256, b: __m256)); +mk!(_mm256_cvtepi32_pd(a: __m128i)); +mk!(_mm256_cvtepi32_ps(a: __m256i)); +mk!(_mm256_cvtpd_ps(a: __m256d)); +mk!(_mm256_cvtps_pd(a: __m128)); +mk!(_mm256_movehdup_ps(a: __m256)); +mk!(_mm256_moveldup_ps(a: __m256)); +mk!(_mm256_movedup_pd(a: __m256d)); +mk!(_mm256_unpackhi_pd(a: __m256d, b: __m256d)); +mk!(_mm256_unpackhi_ps(a: __m256, b: __m256)); +mk!(_mm256_unpacklo_pd(a: __m256d, b: __m256d)); +mk!(_mm256_unpacklo_ps(a: __m256, b: __m256)); +mk!(_mm256_setzero_pd()); +mk!(_mm256_castpd_ps(a: __m256d)); +mk!(_mm256_castps_pd(a: __m256)); +mk!(_mm256_castps_si256(a: __m256)); +mk!(_mm256_castsi256_ps(a: __m256i)); +mk!(_mm256_castpd_si256(a: __m256d)); +mk!(_mm256_castsi256_pd(a: __m256i)); +mk!(_mm256_castps256_ps128(a: __m256)); +mk!(_mm256_castpd256_pd128(a: __m256d)); +mk!(_mm256_castsi256_si128(a: __m256i)); +mk!(_mm256_castps128_ps256(a: __m128)); +mk!(_mm256_castpd128_pd256(a: __m128d)); +mk!(_mm256_castsi128_si256(a: __m128i)); +mk!(_mm256_zextps128_ps256(a: __m128)); +mk!(_mm256_zextsi128_si256(a: __m128i)); +mk!(_mm256_zextpd128_pd256(a: __m128d)); +mk!(_mm256_undefined_ps()); +mk!(_mm256_undefined_pd()); +mk!(_mm256_undefined_si256()); +mk!(_mm256_set_m128(hi: __m128, lo: __m128)); +mk!(_mm256_set_m128d(hi: __m128d, lo: __m128d)); +mk!(_mm256_set_m128i(hi: __m128i, lo: __m128i)); +mk!(_mm256_setr_m128(lo: __m128, hi: __m128)); +mk!(_mm256_setr_m128d(lo: __m128d, hi: __m128d)); +mk!(_mm256_setr_m128i(lo: __m128i, hi: __m128i)); diff --git 
a/testable-simd-models/src/core_arch/x86/tests/avx2.rs b/testable-simd-models/src/core_arch/x86/tests/avx2.rs new file mode 100644 index 0000000000000..dcabcbb58b1e0 --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/tests/avx2.rs @@ -0,0 +1,541 @@ +use super::types::*; +use super::upstream; +use crate::abstractions::bitvec::BitVec; +use crate::helpers::test::HasRandom; + +/// Derives tests for a given intrinsics. Test that a given intrinsics and its model compute the same thing over random values (1000 by default). +macro_rules! mk { + ($([$N:literal])?$name:ident$({$(<$($c:literal),*>),*})?($($x:ident : $ty:ident),*)) => { + #[test] + fn $name() { + #[allow(unused)] + const N: usize = { + let n: usize = 1000; + $(let n: usize = $N;)? + n + }; + mk!(@[N]$name$($(<$($c),*>)*)?($($x : $ty),*)); + } + }; + (@[$N:ident]$name:ident$(<$($c:literal),*>)?($($x:ident : $ty:ident),*)) => { + for _ in 0..$N { + $(let $x = $ty::random();)* + assert_eq!(super::super::models::avx2::$name$(::<$($c,)*>)?($($x.into(),)*), unsafe { + BitVec::from(upstream::$name$(::<$($c,)*>)?($($x.into(),)*)).into() + }); + } + }; + (@[$N:ident]$name:ident<$($c1:literal),*>$(<$($c:literal),*>)*($($x:ident : $ty:ident),*)) => { + let one = || { + mk!(@[$N]$name<$($c1),*>($($x : $ty),*)); + }; + one(); + mk!(@[$N]$name$(<$($c),*>)*($($x : $ty),*)); + } +} + +mk!(_mm256_abs_epi32(a: BitVec)); +mk!(_mm256_abs_epi16(a: BitVec)); +mk!(_mm256_abs_epi8(a: BitVec)); +mk!(_mm256_add_epi64(a: BitVec, b: BitVec)); +mk!(_mm256_add_epi32(a: BitVec, b: BitVec)); +mk!(_mm256_add_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_add_epi8(a: BitVec, b: BitVec)); +mk!(_mm256_adds_epi8(a: BitVec, b: BitVec)); +mk!(_mm256_adds_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_adds_epu8(a: BitVec, b: BitVec)); +mk!(_mm256_adds_epu16(a: BitVec, b: BitVec)); +mk!([100]_mm256_alignr_epi8{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec, b: BitVec)); 
+mk!([100]_mm256_permute2x128_si256{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec, b: BitVec)); +mk!(_mm256_blendv_epi8(a: BitVec, b: BitVec, mask: BitVec)); +mk!(_mm_broadcastb_epi8(a: BitVec)); +mk!(_mm256_broadcastb_epi8(a: BitVec)); +mk!(_mm_broadcastd_epi32(a: BitVec)); +mk!(_mm256_broadcastd_epi32(a: BitVec)); +mk!(_mm_broadcastq_epi64(a: BitVec)); +mk!(_mm256_broadcastq_epi64(a: BitVec)); +mk!(_mm_broadcastsi128_si256(a: BitVec)); +mk!(_mm256_broadcastsi128_si256(a: BitVec)); +mk!(_mm_broadcastw_epi16(a: BitVec)); +mk!(_mm256_broadcastw_epi16(a: BitVec)); +mk!(_mm256_cmpeq_epi64(a: BitVec, b: BitVec)); +mk!(_mm256_cmpeq_epi32(a: BitVec, b: BitVec)); +mk!(_mm256_cmpeq_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_cmpeq_epi8(a: BitVec, b: BitVec)); +mk!(_mm256_cmpgt_epi64(a: BitVec, b: BitVec)); +mk!(_mm256_cmpgt_epi32(a: BitVec, b: BitVec)); +mk!(_mm256_cmpgt_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_cmpgt_epi8(a: BitVec, b: BitVec)); +mk!(_mm256_cvtepi16_epi32(a: BitVec)); +mk!(_mm256_cvtepi16_epi64(a: BitVec)); +mk!(_mm256_cvtepi32_epi64(a: BitVec)); +mk!(_mm256_cvtepi8_epi16(a: BitVec)); +mk!(_mm256_cvtepi8_epi32(a: BitVec)); +mk!(_mm256_cvtepi8_epi64(a: BitVec)); +mk!(_mm256_cvtepu16_epi32(a: BitVec)); +mk!(_mm256_cvtepu16_epi64(a: BitVec)); +mk!(_mm256_cvtepu32_epi64(a: BitVec)); +mk!(_mm256_cvtepu8_epi16(a: BitVec)); +mk!(_mm256_cvtepu8_epi32(a: BitVec)); +mk!(_mm256_cvtepu8_epi64(a: BitVec)); +mk!(_mm256_extracti128_si256{<0>,<1>}(a: BitVec)); +mk!(_mm256_hadd_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_hadd_epi32(a: BitVec, b: BitVec)); +mk!(_mm256_hsub_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_hsub_epi32(a: BitVec, b: BitVec)); +mk!(_mm256_hsubs_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_inserti128_si256{<0>,<1>}(a: BitVec, b: BitVec)); +mk!(_mm256_madd_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_maddubs_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_max_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_max_epi32(a: BitVec, b: BitVec)); +mk!(_mm256_max_epi8(a: BitVec, b: BitVec)); +mk!(_mm256_max_epu16(a: BitVec, b: BitVec)); +mk!(_mm256_max_epu32(a: BitVec, b: BitVec)); +mk!(_mm256_max_epu8(a: BitVec, b: BitVec)); 
+mk!(_mm256_min_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_min_epi32(a: BitVec, b: BitVec)); +mk!(_mm256_min_epi8(a: BitVec, b: BitVec)); +mk!(_mm256_min_epu16(a: BitVec, b: BitVec)); +mk!(_mm256_min_epu32(a: BitVec, b: BitVec)); +mk!(_mm256_min_epu8(a: BitVec, b: BitVec)); +mk!(_mm256_mul_epi32(a: BitVec, b: BitVec)); +mk!(_mm256_mul_epu32(a: BitVec, b: BitVec)); +mk!(_mm256_mulhi_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_mulhi_epu16(a: BitVec, b: BitVec)); +mk!(_mm256_mullo_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_mullo_epi32(a: BitVec, b: BitVec)); +mk!(_mm256_mulhrs_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_or_si256(a: BitVec, b: BitVec)); +mk!(_mm256_packs_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_packs_epi32(a: BitVec, b: BitVec)); +mk!(_mm256_packus_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_packus_epi32(a: BitVec, b: BitVec)); +mk!(_mm256_permutevar8x32_epi32(a: BitVec, b: BitVec)); +#[test] +fn _mm256_movemask_epi8() { + let n = 1000; + + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_movemask_epi8(a.into()), + unsafe { upstream::_mm256_movemask_epi8(a.into()) } + ); + } +} +mk!([100]_mm256_mpsadbw_epu8{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec, b: BitVec)); + 
+mk!([100]_mm256_permute4x64_epi64{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec)); +mk!([100]_mm256_shuffle_epi32{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec)); 
+mk!([100]_mm256_shufflehi_epi16{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec)); +mk!([100]_mm256_shufflelo_epi16{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec)); +mk!(_mm256_sad_epu8(a: BitVec, b: BitVec)); +mk!(_mm256_shuffle_epi8(a: BitVec, b: BitVec)); +mk!(_mm256_sign_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_sign_epi32(a: BitVec, b: BitVec)); +mk!(_mm256_sign_epi8(a: BitVec, b: BitVec)); +mk!(_mm256_sll_epi16(a: BitVec, count: BitVec)); +mk!(_mm256_sll_epi32(a: BitVec, count: BitVec)); +mk!(_mm256_sll_epi64(a: BitVec, count: BitVec)); 
+mk!([100]_mm256_slli_epi16{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec)); +mk!([100]_mm256_slli_epi32{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec)); 
+mk!([100]_mm256_slli_epi64{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec)); +mk!([100]_mm256_slli_si256{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec)); 
+mk!([100]_mm256_bslli_epi128{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec)); +mk!(_mm_sllv_epi32(a: BitVec, count: BitVec)); +mk!(_mm256_sllv_epi32(a: BitVec, count: BitVec)); +mk!(_mm_sllv_epi64(a: BitVec, count: BitVec)); +mk!(_mm256_sllv_epi64(a: BitVec, count: BitVec)); +mk!(_mm256_sra_epi16(a: BitVec, count: BitVec)); +mk!(_mm256_sra_epi32(a: BitVec, count: BitVec)); +mk!([100]_mm256_srai_epi16{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec)); 
+mk!(_mm256_srai_epi32{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec)); +mk!(_mm_srav_epi32(a: BitVec, count: BitVec)); +mk!(_mm256_srav_epi32(a: BitVec, count: BitVec)); +mk!([100]_mm256_srli_si256{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec)); 
+mk!([100]_mm256_bsrli_epi128{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec)); +mk!(_mm256_srl_epi16(a: BitVec, count: BitVec)); +mk!(_mm256_srl_epi32(a: BitVec, count: BitVec)); +mk!(_mm256_srl_epi64(a: BitVec, count: BitVec)); +mk!([100]_mm256_srli_epi16{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec)); 
+mk!([100]_mm256_srli_epi32{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec)); +mk!([100]_mm256_srli_epi64{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec)); +mk!(_mm_srlv_epi32(a: BitVec, count: BitVec)); +mk!(_mm256_srlv_epi32(a: BitVec, count: BitVec)); +mk!(_mm_srlv_epi64(a: BitVec, count: BitVec)); +mk!(_mm256_srlv_epi64(a: BitVec, count: BitVec)); +mk!(_mm256_sub_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_sub_epi32(a: BitVec, b: BitVec)); +mk!(_mm256_sub_epi64(a: BitVec, b: BitVec)); +mk!(_mm256_sub_epi8(a: BitVec, b: BitVec)); +mk!(_mm256_subs_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_subs_epi8(a: BitVec, b: BitVec)); +mk!(_mm256_subs_epu16(a: BitVec, b: BitVec)); +mk!(_mm256_subs_epu8(a: BitVec, b: BitVec)); +mk!(_mm256_unpackhi_epi8(a: BitVec, b: BitVec)); 
+mk!(_mm256_unpacklo_epi8(a: BitVec, b: BitVec)); +mk!(_mm256_unpackhi_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_unpacklo_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_unpackhi_epi32(a: BitVec, b: BitVec)); +mk!(_mm256_unpacklo_epi32(a: BitVec, b: BitVec)); +mk!(_mm256_unpackhi_epi64(a: BitVec, b: BitVec)); +mk!(_mm256_unpacklo_epi64(a: BitVec, b: BitVec)); +mk!(_mm256_xor_si256(a: BitVec, b: BitVec)); + +#[test] +fn _mm256_extract_epi8() { + let n = 100; + + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<0>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<0>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<1>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<1>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<2>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<2>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<3>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<3>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<4>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<4>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<5>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<5>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<6>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<6>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<7>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<7>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<8>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<8>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<9>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<9>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<10>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<10>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<11>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<11>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<12>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<12>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<13>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<13>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<14>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<14>(a.into()) } + ); 
+ } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<15>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<15>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<16>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<16>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<17>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<17>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<18>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<18>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<19>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<19>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<20>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<20>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<21>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<21>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<22>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<22>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<23>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<23>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<24>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<24>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<25>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<25>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<26>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<26>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<27>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<27>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<28>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<28>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<29>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<29>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<30>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<30>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<31>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<31>(a.into()) } + ); + } +} + +#[test] 
+fn _mm256_extract_epi16() { + let n = 100; + + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi16::<0>(a.into()), + unsafe { upstream::_mm256_extract_epi16::<0>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi16::<1>(a.into()), + unsafe { upstream::_mm256_extract_epi16::<1>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi16::<2>(a.into()), + unsafe { upstream::_mm256_extract_epi16::<2>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi16::<3>(a.into()), + unsafe { upstream::_mm256_extract_epi16::<3>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi16::<4>(a.into()), + unsafe { upstream::_mm256_extract_epi16::<4>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi16::<5>(a.into()), + unsafe { upstream::_mm256_extract_epi16::<5>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi16::<6>(a.into()), + unsafe { upstream::_mm256_extract_epi16::<6>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi16::<7>(a.into()), + unsafe { upstream::_mm256_extract_epi16::<7>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi16::<8>(a.into()), + unsafe { upstream::_mm256_extract_epi16::<8>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi16::<9>(a.into()), + unsafe { upstream::_mm256_extract_epi16::<9>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi16::<10>(a.into()), + unsafe { upstream::_mm256_extract_epi16::<10>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi16::<11>(a.into()), + unsafe { upstream::_mm256_extract_epi16::<11>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi16::<12>(a.into()), + unsafe { upstream::_mm256_extract_epi16::<12>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi16::<13>(a.into()), + unsafe { upstream::_mm256_extract_epi16::<13>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi16::<14>(a.into()), + unsafe { upstream::_mm256_extract_epi16::<14>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi16::<15>(a.into()), + unsafe { upstream::_mm256_extract_epi16::<15>(a.into()) } + ); + } +} + +mk!(_mm256_and_si256(a: __m256i, b: __m256i)); +mk!(_mm256_andnot_si256(a: __m256i, b: __m256i)); +mk!(_mm256_avg_epu16(a: __m256i, b: __m256i)); 
+mk!(_mm256_avg_epu8(a: __m256i, b: __m256i));
+mk!(_mm_broadcastsd_pd(a: __m128d));
+mk!(_mm256_broadcastsd_pd(a: __m128d));
+mk!(_mm_broadcastss_ps(a: __m128));
+mk!(_mm256_broadcastss_ps(a: __m128));
diff --git a/testable-simd-models/src/core_arch/x86/tests/mod.rs b/testable-simd-models/src/core_arch/x86/tests/mod.rs
new file mode 100644
index 0000000000000..217ff55623dbf
--- /dev/null
+++ b/testable-simd-models/src/core_arch/x86/tests/mod.rs
@@ -0,0 +1,172 @@
+//! Tests for intrinsics defined in `crate::core_arch::x86::models`
+//!
+//! Each and every modelled intrinsic is tested against the Rust
+//! implementation here. For the most part, the tests work by
+//! generating random inputs, passing them as arguments to both the
+//! models in this crate and the corresponding intrinsics in the Rust
+//! core, and then comparing their outputs.
+//!
+//! To add a test for a modelled intrinsic, go to the appropriate file and
+//! use the `mk!` macro to define it.
+//!
+//! A `mk!` macro invocation looks like the following:
+//!
+//! `mk!([N]intrinsic_name{<C1>,<C2>,...}(arg1: type1, arg2: type2, ...));`
+//!
+//! For example, some valid invocations are
+//!
+//! `mk!([100]_mm256_extracti128_si256{<0>,<1>}(a: __m256i));`
+//! `mk!(_mm256_extracti128_si256{<0>,<1>}(a: __m256i));`
+//! `mk!(_mm256_abs_epi16(a: __m256i));`
+//!
+//! The number of random tests `[N]` is optional; if not provided, it defaults to 1000.
+//! The const values `{<C1>,<C2>,...}` are required if the intrinsic has const generic
+//! arguments and must be omitted otherwise. The intrinsic's name and its arguments are
+//! required in all cases.
+//!
+//! Note: this only works if the intrinsic returns a bit-vector or funarray. If it returns an
+//! integer, the test has to be written manually. It is recommended that a manually defined
+//! test follow the pattern of the tests generated by `mk!`. It is also recommended that, when
+//! the intrinsic takes constant arguments, every possible constant value (up to a maximum of
+//! 255) that can be passed to the intrinsic be used for testing. How many constant values to
+//! pass depends on whether the Rust intrinsic statically asserts that the constant argument
+//! fits within a certain number of bits.
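+//!
+//! As a concrete illustration of the manual pattern, this is the shape of the existing
+//! `_mm256_movemask_epi8` test in `avx2.rs` (an intrinsic that returns an `i32`); the
+//! `BitVec`, `HasRandom::random` and `upstream` helpers are the ones already used
+//! throughout these test modules:
+//!
+//! ```ignore
+//! #[test]
+//! fn _mm256_movemask_epi8() {
+//!     for _ in 0..1000 {
+//!         // Draw a random 256-bit input and hand it to both the model and the
+//!         // hardware intrinsic, then compare the two integer results.
+//!         let a: BitVec<256> = BitVec::random();
+//!         assert_eq!(
+//!             super::super::models::avx2::_mm256_movemask_epi8(a.into()),
+//!             unsafe { upstream::_mm256_movemask_epi8(a.into()) }
+//!         );
+//!     }
+//! }
+//! ```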
+
+mod avx;
+mod avx2;
+mod sse2;
+mod ssse3;
+use crate::abstractions::bitvec::*;
+
+pub(crate) mod types {
+    use crate::abstractions::bitvec::*;
+
+    #[allow(non_camel_case_types)]
+    pub type __m256i = BitVec<256>;
+    #[allow(non_camel_case_types)]
+    pub type __m256 = BitVec<256>;
+    #[allow(non_camel_case_types)]
+    pub type __m128i = BitVec<128>;
+    #[allow(non_camel_case_types)]
+    pub type __m256d = BitVec<256>;
+    #[allow(non_camel_case_types)]
+    pub type __m128 = BitVec<128>;
+    #[allow(non_camel_case_types)]
+    pub type __m128d = BitVec<128>;
+}
+
+pub(crate) mod upstream {
+    #[cfg(target_arch = "x86")]
+    pub use core::arch::x86::*;
+    #[cfg(target_arch = "x86_64")]
+    pub use core::arch::x86_64::*;
+}
+
+/// Conversions between the model's `BitVec` values and the hardware vector
+/// types, used to feed random test inputs to the upstream intrinsics and to
+/// read their results back for comparison.
+mod conversions {
+    use super::upstream::{
+        __m128, __m128d, __m128i, __m256, __m256d, __m256i, _mm256_castpd_si256,
+        _mm256_castps_si256, _mm256_castsi256_pd, _mm256_castsi256_ps, _mm256_loadu_si256,
+        _mm256_storeu_si256, _mm_castpd_si128, _mm_castps_si128, _mm_castsi128_pd,
+        _mm_castsi128_ps, _mm_loadu_si128, _mm_storeu_si128,
+    };
+    use super::BitVec;
+
+    impl From<BitVec<256>> for __m256i {
+        fn from(bv: BitVec<256>) -> __m256i {
+            let bv: &[u8] = &bv.to_vec()[..];
+            unsafe { _mm256_loadu_si256(bv.as_ptr() as *const _) }
+        }
+    }
+    impl From<BitVec<256>> for __m256 {
+        fn from(bv: BitVec<256>) -> __m256 {
+            let bv: &[u8] = &bv.to_vec()[..];
+            unsafe { _mm256_castsi256_ps(_mm256_loadu_si256(bv.as_ptr() as *const _)) }
+        }
+    }
+
+    impl From<BitVec<128>> for __m128i {
+        fn from(bv: BitVec<128>) -> __m128i {
+            let slice: &[u8] = &bv.to_vec()[..];
+            unsafe { _mm_loadu_si128(slice.as_ptr() as *const __m128i) }
+        }
+    }
+
+    impl From<BitVec<128>> for __m128 {
+        fn from(bv: BitVec<128>) -> __m128 {
+            let slice: &[u8] = &bv.to_vec()[..];
+            unsafe { _mm_castsi128_ps(_mm_loadu_si128(slice.as_ptr() as *const __m128i)) }
+        }
+    }
+
+    impl From<BitVec<128>> for __m128d {
+        fn from(bv: BitVec<128>) -> __m128d {
+            let slice: &[u8] = &bv.to_vec()[..];
+            unsafe { _mm_castsi128_pd(_mm_loadu_si128(slice.as_ptr() as *const __m128i)) }
+        }
+    }
+
+    impl From<BitVec<256>> for __m256d {
+        fn from(bv: BitVec<256>) -> __m256d {
+            let bv: &[u8] = &bv.to_vec()[..];
+            unsafe { _mm256_castsi256_pd(_mm256_loadu_si256(bv.as_ptr() as *const _)) }
+        }
+    }
+
+    impl From<__m256i> for BitVec<256> {
+        fn from(vec: __m256i) -> BitVec<256> {
+            let mut v = [0u8; 32];
+            unsafe {
+                _mm256_storeu_si256(v.as_mut_ptr() as *mut _, vec);
+            }
+            BitVec::from_slice(&v[..], 8)
+        }
+    }
+
+    impl From<__m256> for BitVec<256> {
+        fn from(vec: __m256) -> BitVec<256> {
+            let mut v = [0u8; 32];
+            unsafe {
+                _mm256_storeu_si256(v.as_mut_ptr() as *mut _, _mm256_castps_si256(vec));
+            }
+            BitVec::from_slice(&v[..], 8)
+        }
+    }
+
+    impl From<__m256d> for BitVec<256> {
+        fn from(vec: __m256d) -> BitVec<256> {
+            let mut v = [0u8; 32];
+            unsafe {
+                _mm256_storeu_si256(v.as_mut_ptr() as *mut _, _mm256_castpd_si256(vec));
+            }
+            BitVec::from_slice(&v[..], 8)
+        }
+    }
+
+    impl From<__m128i> for BitVec<128> {
+        fn from(vec: __m128i) -> BitVec<128> {
+            let mut v = [0u8; 16];
+            unsafe {
+                _mm_storeu_si128(v.as_mut_ptr() as *mut _, vec);
+            }
+            BitVec::from_slice(&v[..], 8)
+        }
+    }
+
+    impl From<__m128> for BitVec<128> {
+        fn from(vec: __m128) -> BitVec<128> {
+            let mut v = [0u8; 16];
+            unsafe {
+                _mm_storeu_si128(v.as_mut_ptr() as *mut _, _mm_castps_si128(vec));
+            }
+            BitVec::from_slice(&v[..], 8)
+        }
+    }
+
+    impl From<__m128d> for BitVec<128> {
+        fn from(vec: __m128d) -> BitVec<128> {
+            let mut v = [0u8; 16];
+            unsafe {
+                _mm_storeu_si128(v.as_mut_ptr() as *mut _, _mm_castpd_si128(vec));
+            }
+            BitVec::from_slice(&v[..], 8)
+        }
+    }
+}
diff --git a/testable-simd-models/src/core_arch/x86/tests/sse2.rs b/testable-simd-models/src/core_arch/x86/tests/sse2.rs
new file mode 100644
index 0000000000000..ed387f5938524
--- /dev/null
+++ b/testable-simd-models/src/core_arch/x86/tests/sse2.rs
@@ -0,0 +1,201 @@
+use super::types::*;
+use super::upstream;
+use crate::abstractions::bitvec::BitVec;
+use crate::helpers::test::HasRandom;
+
+/// Derives a test for a given intrinsic: checks that the intrinsic and its model compute the same result on random inputs (1000 by default).
+macro_rules! mk {
+    ($([$N:literal])?$name:ident$({$(<$($c:literal),*>),*})?($($x:ident : $ty:ident),*)) => {
+        #[test]
+        fn $name() {
+            #[allow(unused)]
+            const N: usize = {
+                let n: usize = 1000;
+                $(let n: usize = $N;)?
+                n
+            };
+            mk!(@[N]$name$($(<$($c),*>)*)?($($x : $ty),*));
+        }
+    };
+    (@[$N:ident]$name:ident$(<$($c:literal),*>)?($($x:ident : $ty:ident),*)) => {
+        for _ in 0..$N {
+            $(let $x = $ty::random();)*
+            assert_eq!(super::super::models::sse2::$name$(::<$($c,)*>)?($($x.into(),)*), unsafe {
+                BitVec::from(upstream::$name$(::<$($c,)*>)?($($x.into(),)*)).into()
+            });
+        }
+    };
+    (@[$N:ident]$name:ident<$($c1:literal),*>$(<$($c:literal),*>)*($($x:ident : $ty:ident),*)) => {
+        let one = || {
+            mk!(@[$N]$name<$($c1),*>($($x : $ty),*));
+        };
+        one();
+        mk!(@[$N]$name$(<$($c),*>)*($($x : $ty),*));
+    }
+}
+mk!(_mm_add_epi8(a: __m128i, b: __m128i));
+mk!(_mm_add_epi16(a: __m128i, b: __m128i));
+mk!(_mm_add_epi32(a: __m128i, b: __m128i));
+mk!(_mm_add_epi64(a: __m128i, b: __m128i));
+mk!(_mm_adds_epi8(a: __m128i, b: __m128i));
+mk!(_mm_adds_epi16(a: __m128i, b: __m128i));
+mk!(_mm_adds_epu8(a: __m128i, b: __m128i));
+mk!(_mm_adds_epu16(a: __m128i, b: __m128i));
+mk!(_mm_avg_epu8(a: __m128i, b: __m128i));
+mk!(_mm_avg_epu16(a: __m128i, b: __m128i));
+mk!(_mm_madd_epi16(a: __m128i, b: __m128i));
+mk!(_mm_max_epi16(a: __m128i, b: __m128i));
+mk!(_mm_max_epu8(a: __m128i, b: __m128i));
+mk!(_mm_min_epi16(a: __m128i, b: __m128i));
+mk!(_mm_min_epu8(a: __m128i, b: __m128i));
+mk!(_mm_mulhi_epi16(a: __m128i, b: __m128i));
+mk!(_mm_mulhi_epu16(a: __m128i, b: __m128i));
+mk!(_mm_mullo_epi16(a: __m128i, b: __m128i));
+mk!(_mm_mul_epu32(a: __m128i, b: __m128i));
+mk!(_mm_sad_epu8(a: __m128i, b: __m128i));
+mk!(_mm_sub_epi8(a: __m128i, b: __m128i));
+mk!(_mm_sub_epi16(a: __m128i, b: __m128i));
+mk!(_mm_sub_epi32(a: __m128i, b: __m128i));
+mk!(_mm_sub_epi64(a: __m128i, b: __m128i));
+mk!(_mm_subs_epi8(a: __m128i, b: __m128i));
+mk!(_mm_subs_epi16(a: __m128i, b: __m128i));
+mk!(_mm_subs_epu8(a: __m128i, b: __m128i));
+mk!(_mm_subs_epu16(a: __m128i, b: __m128i));
+
+mk!([100]_mm_slli_si128{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: __m128i)); +mk!([100]_mm_bslli_si128{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: __m128i)); 
+mk!([100]_mm_bsrli_si128{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: __m128i)); +mk!([100]_mm_slli_epi16{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: __m128i)); + +mk!([100]_mm_sll_epi16(a: __m128i, count: __m128i)); + 
+mk!([100]_mm_slli_epi32{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: __m128i)); + +mk!([100]_mm_sll_epi32(a: __m128i, count: __m128i)); + +mk!([100]_mm_slli_epi64{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: __m128i)); + +mk!([100]_mm_sll_epi64(a: __m128i, count: __m128i)); + 
+mk!([100]_mm_srai_epi16{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: __m128i)); + +mk!([100]_mm_sra_epi16(a: __m128i, count: __m128i)); + +mk!([100]_mm_srai_epi32{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: __m128i)); + +mk!([100]_mm_sra_epi32(a: __m128i, count: __m128i)); 
+mk!([100]_mm_srli_si128{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: __m128i)); +mk!([100]_mm_srli_epi16{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: __m128i)); + +mk!([100]_mm_srl_epi16(a: __m128i, count: __m128i)); + 
+mk!([100]_mm_srli_epi32{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: __m128i)); + +mk!([100]_mm_srl_epi32(a: __m128i, count: __m128i)); + +mk!([100]_mm_srli_epi64{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: __m128i)); + +mk!(_mm_srl_epi64(a: __m128i, count: __m128i)); +mk!(_mm_and_si128(a: __m128i, b: __m128i)); +mk!(_mm_andnot_si128(a: __m128i, b: __m128i)); +mk!(_mm_or_si128(a: __m128i, b: __m128i)); +mk!(_mm_xor_si128(a: __m128i, b: __m128i)); +mk!(_mm_cmpeq_epi8(a: __m128i, b: __m128i)); +mk!(_mm_cmpeq_epi16(a: __m128i, b: __m128i)); +mk!(_mm_cmpeq_epi32(a: __m128i, b: __m128i)); +mk!(_mm_cmpgt_epi8(a: __m128i, b: __m128i)); +mk!(_mm_cmpgt_epi16(a: __m128i, b: __m128i)); +mk!(_mm_cmpgt_epi32(a: __m128i, b: __m128i)); +mk!(_mm_cmplt_epi8(a: __m128i, b: __m128i)); 
+mk!(_mm_cmplt_epi16(a: __m128i, b: __m128i)); +mk!(_mm_cmplt_epi32(a: __m128i, b: __m128i)); +mk!(_mm_cvtsi32_si128(a: i32)); + +// mk!(_mm_cvtsi128_si32(a: __m128i)); + +mk!(_mm_set_epi64x(e1: i64, e0: i64)); +mk!(_mm_set_epi32(e3: i32, e2: i32, e1: i32, e0: i32)); +mk!(_mm_set_epi16( + e7: i16, + e6: i16, + e5: i16, + e4: i16, + e3: i16, + e2: i16, + e1: i16, + e0: i16 +)); +mk!(_mm_set_epi8( + e15: i8, + e14: i8, + e13: i8, + e12: i8, + e11: i8, + e10: i8, + e9: i8, + e8: i8, + e7: i8, + e6: i8, + e5: i8, + e4: i8, + e3: i8, + e2: i8, + e1: i8, + e0: i8 +)); +mk!(_mm_set1_epi64x(a: i64)); +mk!(_mm_set1_epi32(a: i32)); +mk!(_mm_set1_epi16(a: i16)); +mk!(_mm_set1_epi8(a: i8)); +mk!(_mm_setr_epi32(e3: i32, e2: i32, e1: i32, e0: i32)); +mk!(_mm_setr_epi16( + e7: i16, + e6: i16, + e5: i16, + e4: i16, + e3: i16, + e2: i16, + e1: i16, + e0: i16 +)); +mk!(_mm_setr_epi8( + e15: i8, + e14: i8, + e13: i8, + e12: i8, + e11: i8, + e10: i8, + e9: i8, + e8: i8, + e7: i8, + e6: i8, + e5: i8, + e4: i8, + e3: i8, + e2: i8, + e1: i8, + e0: i8 +)); +mk!(_mm_setzero_si128()); +mk!(_mm_move_epi64(a: __m128i)); +mk!(_mm_packs_epi16(a: __m128i, b: __m128i)); +mk!(_mm_packs_epi32(a: __m128i, b: __m128i)); +mk!(_mm_packus_epi16(a: __m128i, b: __m128i)); + +// mk!([100]_mm_extract_epi16(a: __m128i)); +mk!([100]_mm_insert_epi16{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>}(a: __m128i, i: i32)); + +// mk!([100]_mm_movemask_epi8(a: __m128i)); + +mk!([100]_mm_shuffle_epi32{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: __m128i)); 
+mk!([100]_mm_shufflehi_epi16{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: __m128i)); +mk!([100]_mm_shufflelo_epi16{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: __m128i)); +mk!(_mm_unpackhi_epi8(a: __m128i, b: __m128i)); +mk!(_mm_unpackhi_epi16(a: __m128i, b: __m128i)); +mk!(_mm_unpackhi_epi32(a: __m128i, b: __m128i)); +mk!(_mm_unpackhi_epi64(a: __m128i, b: __m128i)); +mk!(_mm_unpacklo_epi8(a: __m128i, b: __m128i)); +mk!(_mm_unpacklo_epi16(a: __m128i, b: __m128i)); +mk!(_mm_unpacklo_epi32(a: __m128i, b: __m128i)); +mk!(_mm_unpacklo_epi64(a: __m128i, b: __m128i)); +mk!(_mm_undefined_si128()); diff --git a/testable-simd-models/src/core_arch/x86/tests/ssse3.rs b/testable-simd-models/src/core_arch/x86/tests/ssse3.rs new file mode 100644 index 0000000000000..6382f953f2063 --- 
/dev/null
+++ b/testable-simd-models/src/core_arch/x86/tests/ssse3.rs
@@ -0,0 +1,51 @@
+use super::types::*;
+use super::upstream;
+use crate::abstractions::bitvec::BitVec;
+use crate::helpers::test::HasRandom;
+
+/// Derives tests for a given intrinsic. Tests that the intrinsic and its model compute the same thing over random values (1000 by default).
+macro_rules! mk {
+    ($([$N:literal])?$name:ident$({$(<$($c:literal),*>),*})?($($x:ident : $ty:ident),*)) => {
+        #[test]
+        fn $name() {
+            #[allow(unused)]
+            const N: usize = {
+                let n: usize = 1000;
+                $(let n: usize = $N;)?
+                n
+            };
+            mk!(@[N]$name$($(<$($c),*>)*)?($($x : $ty),*));
+        }
+    };
+    (@[$N:ident]$name:ident$(<$($c:literal),*>)?($($x:ident : $ty:ident),*)) => {
+        for _ in 0..$N {
+            $(let $x = $ty::random();)*
+            assert_eq!(super::super::models::ssse3::$name$(::<$($c,)*>)?($($x.into(),)*), unsafe {
+                BitVec::from(upstream::$name$(::<$($c,)*>)?($($x.into(),)*)).into()
+            });
+        }
+    };
+    (@[$N:ident]$name:ident<$($c1:literal),*>$(<$($c:literal),*>)*($($x:ident : $ty:ident),*)) => {
+        let one = || {
+            mk!(@[$N]$name<$($c1),*>($($x : $ty),*));
+        };
+        one();
+        mk!(@[$N]$name$(<$($c),*>)*($($x : $ty),*));
+    }
+}
+mk!(_mm_abs_epi8(a: __m128i));
+mk!(_mm_abs_epi16(a: __m128i));
+mk!(_mm_abs_epi32(a: __m128i));
+mk!(_mm_shuffle_epi8(a: __m128i, b: __m128i));
+mk!([100]_mm_alignr_epi8{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: __m128i, b: __m128i));
+mk!(_mm_hadd_epi16(a: __m128i, b: __m128i));
+mk!(_mm_hadds_epi16(a: __m128i, b: __m128i));
+mk!(_mm_hadd_epi32(a: __m128i, b: __m128i));
+mk!(_mm_hsub_epi16(a: __m128i, b: __m128i));
+mk!(_mm_hsubs_epi16(a: __m128i, b: __m128i));
+mk!(_mm_hsub_epi32(a: __m128i, b: __m128i));
+mk!(_mm_maddubs_epi16(a: __m128i, b: __m128i));
+mk!(_mm_mulhrs_epi16(a: __m128i, b: __m128i));
+mk!(_mm_sign_epi8(a: __m128i, b: __m128i));
+mk!(_mm_sign_epi16(a: __m128i, b: __m128i));
+mk!(_mm_sign_epi32(a: __m128i, b: __m128i));
diff --git a/testable-simd-models/src/helpers.rs b/testable-simd-models/src/helpers.rs
new file mode 100644
index 0000000000000..1a30bf251a877
--- /dev/null
+++ b/testable-simd-models/src/helpers.rs
@@ -0,0 +1,67 @@
+#[cfg(test)]
+pub mod test {
+    use crate::abstractions::{bit::Bit, bitvec::BitVec, funarr::FunArray};
+    use rand::prelude::*;
+
+    /// Helper trait to generate random values
+    pub trait HasRandom {
+        fn random() -> Self;
+    }
+    macro_rules! mk_has_random {
+        ($($ty:ty),*) => {
+            $(impl HasRandom for $ty {
+                fn random() -> Self {
+                    let mut rng = rand::rng();
+                    rng.random()
+                }
+            })*
+        };
+    }
+
+    mk_has_random!(bool);
+    mk_has_random!(i8, i16, i32, i64, i128);
+    mk_has_random!(u8, u16, u32, u64, u128);
+
+    impl HasRandom for isize {
+        fn random() -> Self {
+            i128::random() as isize
+        }
+    }
+    impl HasRandom for usize {
+        fn random() -> Self {
+            i128::random() as usize
+        }
+    }
+
+    impl HasRandom for f32 {
+        fn random() -> Self {
+            u32::random() as f32
+        }
+    }
+
+    impl HasRandom for f64 {
+        fn random() -> Self {
+            u64::random() as f64
+        }
+    }
+
+    impl HasRandom for Bit {
+        fn random() -> Self {
+            crate::abstractions::bit::Bit::from(bool::random())
+        }
+    }
+    impl<const N: u64> HasRandom for BitVec<N> {
+        fn random() -> Self {
+            Self::from_fn(|_| Bit::random())
+        }
+    }
+
+    impl<const N: u64, T: HasRandom> HasRandom for FunArray<N, T> {
+        fn random() -> Self {
+            FunArray::from_fn(|_| T::random())
+        }
+    }
+}
+
+#[cfg(test)]
+pub use test::*;
diff --git a/testable-simd-models/src/lib.rs b/testable-simd-models/src/lib.rs
new file mode 100644
index 0000000000000..13d6ba2e6e7cd
--- /dev/null
+++ b/testable-simd-models/src/lib.rs
@@ -0,0 +1,35 @@
+//! `testable-simd-models`: A Testable Rust Model of the SIMD Intrinsics in `core::arch`
+//!
+//! `testable-simd-models` is a simplified, self-contained model of the SIMD intrinsics in Rust’s `core::arch`
+//! module. It aims to provide a purely Rust-based specification of these operations, making them easier to
+//! understand, analyze, and formally verify. Unlike `core`, which may rely on platform-specific
+//! intrinsics and compiler magic, `testable-simd-models` expresses everything in plain Rust, prioritizing
+//! clarity and explicitness over efficiency.
+//!
+//! ## Key Features
+//!
+//! - **Partial Modeling**: `testable-simd-models` includes only a subset of `core::arch`, focusing on modeling
+//!   fundamental operations rather than providing a complete replacement.
+//! - **Exact Signatures**: Any item that exists in both `testable-simd-models` and `core` has the same type signature,
+//!   ensuring compatibility with formal verification efforts.
+//! - **Purely Functional Approach**: Where possible, `testable-simd-models` favors functional programming principles,
+//!   avoiding unnecessary mutation and side effects to facilitate formal reasoning.
+//! - **Explicit Implementations**: Even low-level operations, such as SIMD, are modeled explicitly using
+//!   Rust constructs like bit arrays and partial maps.
+//! - **Extra Abstractions**: `testable-simd-models` includes additional helper types and functions to support
+//!   modeling. These extra items are marked appropriately to distinguish them from `core` definitions.
+//!
+//! ## Intended Use
+//!
+//! `testable-simd-models` is designed as a reference model for formal verification and reasoning about Rust programs.
+//! By providing a readable, testable, well-specified version of `core::arch`'s behavior, it serves as a foundation for
+//! proof assistants and other verification tools.
+
+// This recursion limit is necessary for the `mk!` macro used in tests.
+// We test functions with const generics; the macro generates a test per possible (const-generic) control value.
+#![recursion_limit = "4096"]
+pub mod abstractions;
+pub mod core_arch;
+
+pub use core_arch as arch;
+pub mod helpers;
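
For orientation, here is roughly what `mk!` derives for a simple entry such as `mk!(_mm_abs_epi8(a: __m128i));`. This is an illustrative paraphrase of the macro expansion, not part of the patch itself (the real expansion inlines the assertion rather than naming intermediate locals), and it relies on the `upstream`, `models`, `BitVec`, and `HasRandom` items introduced above:

```rust
#[test]
fn _mm_abs_epi8() {
    // No `[N]` override was given in the `mk!` invocation, so the macro
    // defaults to 1000 random trials.
    const N: usize = 1000;
    for _ in 0..N {
        // Draw a random 128-bit input via the `HasRandom` helper trait.
        let a = __m128i::random();
        // Run the executable model and the real intrinsic on the same input...
        let model = super::super::models::ssse3::_mm_abs_epi8(a.into());
        let real = unsafe { upstream::_mm_abs_epi8(a.into()) };
        // ...and compare the results bit-for-bit via the `BitVec` representation.
        assert_eq!(model, BitVec::from(real).into());
    }
}
```

For const-generic intrinsics such as `_mm_alignr_epi8` or the `_mm_slli_*` family, the `{<0>,<1>,…,<255>}` list makes the macro emit one such loop per listed immediate value, and the leading `[100]` lowers the per-value iteration count to 100 to keep the total number of comparisons manageable.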