diff --git a/.gitignore b/.gitignore index 39ad701a8883f..82d2291fd22b0 100644 --- a/.gitignore +++ b/.gitignore @@ -56,3 +56,4 @@ goto-transcoder # already existing elements were commented out #/target +testable-simd-models/target diff --git a/library/Cargo.lock b/library/Cargo.lock index a9a611fe1ed56..3e34ee6173741 100644 --- a/library/Cargo.lock +++ b/library/Cargo.lock @@ -28,6 +28,7 @@ version = "0.0.0" dependencies = [ "compiler_builtins", "core", + "safety", ] [[package]] @@ -67,6 +68,9 @@ dependencies = [ [[package]] name = "core" version = "0.0.0" +dependencies = [ + "safety", +] [[package]] name = "coretests" @@ -200,6 +204,39 @@ dependencies = [ "unwind", ] +[[package]] +name = "proc-macro-error" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" +dependencies = [ + "proc-macro-error-attr", + "proc-macro2", + "quote", + "syn 1.0.109", + "version_check", +] + +[[package]] +name = "proc-macro-error-attr" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" +dependencies = [ + "proc-macro2", + "quote", + "version_check", +] + +[[package]] +name = "proc-macro2" +version = "1.0.95" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778" +dependencies = [ + "unicode-ident", +] + [[package]] name = "proc_macro" version = "0.0.0" @@ -216,6 +253,15 @@ dependencies = [ "cc", ] +[[package]] +name = "quote" +version = "1.0.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" +dependencies = [ + "proc-macro2", +] + [[package]] name = "r-efi" version = "5.3.0" @@ -300,6 +346,16 @@ dependencies = [ "std", ] +[[package]] +name = "safety" +version = "0.1.0" +dependencies = [ + "proc-macro-error", + "proc-macro2", + "quote", + "syn 2.0.104", +] + [[package]] name = "shlex" version = "1.3.0" @@ -329,6 +385,7 @@ dependencies = [ "rand", "rand_xorshift", "rustc-demangle", + "safety", "std_detect", "unwind", "wasi", @@ -345,6 +402,27 @@ dependencies = [ "libc", ] +[[package]] +name = "syn" +version = "1.0.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "unicode-ident", +] + +[[package]] +name = "syn" +version = "2.0.104" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "17b6f705963418cdb9927482fa304bc562ece2fdd4f616084c50b7023b435a40" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + [[package]] name = "sysroot" version = "0.0.0" @@ -365,6 +443,12 @@ dependencies = [ "std", ] +[[package]] +name = "unicode-ident" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" + [[package]] name = "unicode-width" version = "0.2.1" @@ -397,6 +481,12 @@ dependencies = [ "rustc-std-workspace-core", ] +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + [[package]] name = "wasi" version = "0.11.1+wasi-snapshot-preview1" diff --git a/testable-simd-models/Cargo.toml 
b/testable-simd-models/Cargo.toml new file mode 100644 index 0000000000000..6e2116fec82e0 --- /dev/null +++ b/testable-simd-models/Cargo.toml @@ -0,0 +1,16 @@ +[package] +name = "testable-simd-models" +version = "0.0.2" +authors = ["Cryspen"] +license = "Apache-2.0" +homepage = "https://github.com/cryspen/verify-rust-std/testable-simd-models" +edition = "2021" +repository = "https://github.com/cryspen/verify-rust-std/testable-simd-models" +readme = "README.md" + +[dependencies] +rand = "0.9" +pastey = "0.1.0" + +[lints.rust] +unexpected_cfgs = { level = "warn" } diff --git a/testable-simd-models/README.md b/testable-simd-models/README.md new file mode 100644 index 0000000000000..470c51072c8e5 --- /dev/null +++ b/testable-simd-models/README.md @@ -0,0 +1,226 @@ +# testable-simd-models + +This crate contains executable, independently testable specifications +for the SIMD intrinsics provided by the `core::arch` library in Rust. +The structure of this crate is based on [rust-lang/stdarch/crates/core_arch](https://github.com/rust-lang/stdarch/tree/master/crates/core_arch). + +## Code Structure +Within the `core_arch` folder in this crate, there is a different +folder for each architecture for which we have written models. +In particular, it contains folders for `x86` and `arm_shared`. +Each such folder has 2 sub-folders: `models` and `tests`. + +The `models` folder contains the models of the intrinsics, with +different files for different target features (e.g. `sse2`, `avx2` +etc.). The code in this folder is written using the various +abstractions implemented in `abstractions`, especially those in +`abstractions::simd`. These models are meant to closely +resemble their implementations within the Rust core itself. + +The `tests` folder contains the tests of these models, and is +structured the same way as `models`. Each file additionally includes +the definition of a macro that makes writing these tests easier. The +tests work by testing the models against the intrinsics in the Rust +core, trying out random inputs (generally 1000), and comparing their +outputs. + +All tests can be run by executing `cargo test`, and we expect this to be +run as part of CI. + +## Modeling a SIMD Intrinsic + +There are three kinds of SIMD intrinsics in `core::arch`. + +The first kind are builtin Rust compiler intrinsics, some of which are +in the [`intrinsics/simd.rs` file](https://github.com/model-checking/verify-rust-std/blob/main/library/core/src/intrinsics/simd.rs) +in the `core` crate, and others are in the [`simd.rs` file of `core_arch`](https://github.com/model-checking/verify-rust-std/blob/main/library/stdarch/crates/core_arch/src/simd.rs). +These builtin intrinsics define generic SIMD operations that the Rust compiler knows how to implement on each platform. + +The second kind are `extern` intrinsics that are links to definitions in LLVM. +See, for example, [this list](https://github.com/rust-lang/stdarch/blob/master/crates/core_arch/src/x86/avx2.rs#L3596C8-L3596C14) +of `extern` intrinsics used in the Intel x86 AVX2 library. +These extern intrinsics are typically platform-specific functions that map to low-level instructions. + +The third kind are `defined` intrinsics that are given proper definitions in Rust, and their code may +depend on the builtin intrinsics or the extern intrinsics. These defined intrinsics represent higher-level +operations that are wrappers around one or more assembly instructions. 
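+
+For orientation, the three kinds look roughly as follows in `stdarch` (a simplified sketch for
+illustration only; the attributes and the LLVM link name are indicative of the upstream code,
+not verbatim copies of it):
+
+```rust
+// 1. Builtin compiler intrinsic: generic, implemented by the compiler itself.
+#[rustc_intrinsic]
+pub unsafe fn simd_add<T>(x: T, y: T) -> T;
+
+// 2. Extern intrinsic: a declaration that links against an LLVM definition.
+unsafe extern "C" {
+    #[link_name = "llvm.x86.avx2.phadd.w"]
+    fn phaddw(a: i16x16, b: i16x16) -> i16x16;
+}
+
+// 3. Defined intrinsic: ordinary Rust code layered on top of the other two kinds.
+pub fn _mm256_hadd_epi16(a: __m256i, b: __m256i) -> __m256i {
+    unsafe { transmute(phaddw(a.as_i16x16(), b.as_i16x16())) }
+}
+```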
+ +### Modeling builtin intrinsics manually + +We model all three kinds of intrinsics, but in slightly different +ways. For the builtin intrinsics, we can write implementations once +and for all, and to this end, we use a library within the +`abstractions/simd.rs` file, where we copy the signatures of the +intrinsics from Rust but give them our own implementation. In +particular, we model each SIMD vector as an array of scalars, and +define each generic operation as functions over such arrays. This can +be seen as a reference implementation of the builtin intrinsics. + +Hence, for example, the SIMD add intrinsic `simd_add` is modeled as follows, +it takes two arrays of machine integers and adds them pointwise using a +`wrapping_add` operation: + +```rust +pub fn simd_add( + x: FunArray, + y: FunArray, +) -> FunArray { + FunArray::from_fn(|i| (x[i].wrapping_add(y[i]))) +} +``` + +Notably, we model a strongly typed version of `simd_add`, in contrast to the compiler +intrinsic, which is too generic and unimplementable in safe Rust: + +```rust +/// Adds two simd vectors elementwise. +/// +/// `T` must be a vector of integers or floats. +#[rustc_intrinsic] +#[rustc_nounwind] +pub unsafe fn simd_add(x: T, y: T) -> T; +``` + +The main rules for writing these models are that they should be simple and self-contained, +relying only on the libraries in `abstractions`, on builtin Rust language features, or +other testable models. In particular, they should not themselves directly call Rust libraries +or external crates, without going through the abstractions API. + + +### Modeling extern intrinsics manually + +For each file in `core::arch`, we split the code into extern +intrinsics that must be modeled by hand and defined intrinsics whose +models can be derived semi-automatically. The extern intrinsics are +placed in a module suffixed with `_handwritten`. Hence, for example, +the extern intrinsics used in `avx2.rs` can be found in `avx2_handwritten.rs`. + +Modeling extern intrinsics is similar to modeling the builtin ones, +in that the models are written by hand and treat the SIMD vectors +as arrays of machine integers. The main difference is that these intrinsics +are platform-specific and so their modeling requires looking at the Intel or ARM +documentation for the underlying operation. + +For example, the extern intrinsic `phaddw` used in `avx2` corresponds to an +Intel instruction called "Packed Horizontal Add" and is used in AVX2 intrinsics +like `_mm256_hadd_epi16` documented [here](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_epi16&ig_expand=3667_) +By inspecting the Intel documentation, we can write a Rust model for it +as follows + +```rust +pub fn phaddw(a: i16x16, b: i16x16) -> i16x16 { + i16x16::from_fn(|i| { + if i < 4 { + a[2 * i].wrapping_add(a[2 * i + 1]) + } else if i < 8 { + b[2 * (i - 4)].wrapping_add(b[2 * (i - 4) + 1]) + } else if i < 12 { + a[2 * (i - 4)].wrapping_add(a[2 * (i - 4) + 1]) + } else { + b[2 * (i - 8)].wrapping_add(b[2 * (i - 8) + 1]) + } + }) +} +``` + +### Modeling defined intrinsics semi-automatically + +To model a defined intrinsic, we essentially copy the Rust code of +the intrinsic from `core::arch` and adapt it to use our underlying abstractions. The +changes needed to the code are sometimes scriptable, and indeed most +of our models were generated from a script, but some changes are still +needed by hand. + +For example, let us say the intrinsic we are modeling is +`_mm256_bsrli_epi128` from the avx2 feature set. 
+ +1. We go to [rust-lang/stdarch/crates/core_arch/src/x86/](https://github.com/rust-lang/stdarch/tree/master/crates/core_arch/src/x86/), and find the implementation of the intrinsic in `avx2.rs`. + +2. We see that the implementation looks like this: +``` rust +/// Shifts 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_bsrli_epi128) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpsrldq, IMM8 = 1))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_bsrli_epi128(a: __m256i) -> __m256i { + static_assert_uimm_bits!(IMM8, 8); + const fn mask(shift: i32, i: u32) -> u32 { + let shift = shift as u32 & 0xff; + if shift > 15 || (15 - (i % 16)) < shift { + 0 + } else { + 32 + (i + shift) + } + } + unsafe { + let a = a.as_i8x32(); + let r: i8x32 = simd_shuffle!( + i8x32::ZERO, + a, + [ + mask(IMM8, 0), + mask(IMM8, 1), + mask(IMM8, 2), + mask(IMM8, 3), + ... + mask(IMM8, 31), + ], + ); + transmute(r) + } +} +``` + +Thus, we then go to `core_arch/x86/models/avx2.rs`, and add this implementation. +The only change it requires here is that the `simd_shuffle` macro is a function in our model, +and we discard all the function attributes. + +For other intrinsics, we sometimes need to make more changes. Since our model of the builtin intrinsics +is more precise concerning the type of their arguments compared to their Rust counterparts, we +sometimes need to add more type annotations in our defined models. We also remove all `unsafe` guards, +since our models are always in safe Rust. Otherwise, our code for the defined intrinsics looks very +similar to the upstream code in `core::arch`. + +3. Next, we add a test for this intrinsic in `core_arch/avx2/tests/avx2.rs`. For convenience purposes, we have defined a `mk!` macro, which can be used to automatically generate + tests. The test generated by the macro generates a number of random inputs (by default, 1000), and compares the output generated by the model + and that generated by the intrinsic in upstream `core::arch`. A valid test of the intrinsic above looks like this. + ```rust + mk!([100]_mm256_bsrli_epi128{<0>,<1>,<2>,<3>,...,<255>}(a: BitVec)); + ``` + The macro invocation has four parts. + 1. `mk!([100]...`: By default, the macro tests for a thousand randomly generated inputs. If needed, this can be modified, such as here, where the `[100]` is used so that + only 100 inputs are generated. + 2. `_mm256_bsrli_epi128`: This is the name of the intrinsic being tested, and is necessary in all cases. + 3. `{<0>,<1>,<2>,<3>,...,<255>}`: This part only appears when the intrinsic has a const generic argument, like the `IMM8` in this intrinsic. + As the name indicates, this constant argument is supposed to be at most 8 bits wide. + We can confirm this by looking at the implementation and spotting the `static_assert_uimm_bits!(IMM8, 8);` + line, which asserts that constant argument is positive and fits in 8 bits. Thus, we add `{<0>,<1>,<2>,<3>,...,<255>}` to test for each possible constant + value of the constant argument. + 4. `(a: BitVec)`: This part contains all the arguments of the intrinsic and their types. + + This summarizes the steps needed to use the `mk!` macro to generate a test. 
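+
+    For an intrinsic that takes no const-generic argument, the `{...}` part is simply dropped.
+    A hypothetical invocation, shown only to illustrate the shape (the intrinsic name is not
+    asserted to be modeled in this crate), would be:
+    ```rust
+    mk!(_mm256_add_epi16(a: BitVec, b: BitVec));
+    ```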
There is a caveat: in the case that the output of an intrinsic is _not_ + a bit-vector (and is instead, say, an integer like `i32`), then the macro will not work, and a manual test has to be written. You can see examples in the test files. + + + +## Contributing Models + +To contribute new models of intrinsics, we expect the author to follow +the above steps and provide comprehensive tests. It is important that +the model author looks carefully at both the Intel/ARM specifications +and the Rust `stdarch` implementation, because they may look quite different +from each other. + +In some cases, the Rust implementation may not be correct. +Indeed, the previous implementation of `_mm256_bsrli_epi128` (and a +similar intrinsic called `_mm512_bsrli_epi128`) in `stdarch` had a +bug, which we found during the process of modeling and testing this +intrinsic. This bug was [reported by +us](https://github.com/rust-lang/stdarch/issues/1822) using a failing +test case generated from the testable model and then fixed by [our +PR](https://github.com/rust-lang/stdarch/pull/1823) in the 2025-06-30 +version of `stdarch`. diff --git a/testable-simd-models/src/abstractions/bit.rs b/testable-simd-models/src/abstractions/bit.rs new file mode 100644 index 0000000000000..f8b67f2ca20f1 --- /dev/null +++ b/testable-simd-models/src/abstractions/bit.rs @@ -0,0 +1,248 @@ +//! # Bit Manipulation and Machine Integer Utilities +//! +//! This module provides utilities for working with individual bits and machine integer types. +//! It defines a [`Bit`] enum to represent a single bit (`0` or `1`) along with convenient +//! conversion implementations between `Bit`, [`bool`], and various primitive integer types. +//! +//! In addition, the module introduces the [`MachineInteger`] trait which abstracts over +//! integer types, providing associated constants: +//! +//! - `BITS`: The size of the integer type in bits. +//! - `SIGNED`: A flag indicating whether the type is signed. +//! +//! The [`Bit`] type includes methods for extracting the value of a specific bit from an integer. +//! For example, [`Bit::of_int`] returns the bit at a given position for a provided integer, +//! handling both positive and negative values (assuming a two's complement representation). +//! +//! # Examples +//! +//! ```rust +//! use testable_simd_models::abstractions::bit::{Bit, MachineInteger}; +//! +//! // Extract the 3rd bit (0-indexed) from an integer. +//! let bit = Bit::nth_bit(42, 2); +//! println!("The extracted bit is: {:?}", bit); +//! +//! // Convert Bit to a primitive integer type. +//! let num: u8 = bit.into(); +//! println!("As an integer: {}", num); +//! ``` +//! +//! [`bool`]: https://doc.rust-lang.org/std/primitive.bool.html +//! [`Bit::of_int`]: enum.Bit.html#method.of_int + +/// Represent a bit: `0` or `1`. 
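+///
+/// Bits support the usual boolean-style operators via `std::ops` (illustrative doctest):
+///
+/// ```rust
+/// use testable_simd_models::abstractions::bit::Bit;
+///
+/// assert_eq!(Bit::One & Bit::Zero, Bit::Zero);
+/// assert_eq!(Bit::One | Bit::Zero, Bit::One);
+/// assert_eq!(Bit::One ^ Bit::One, Bit::Zero);
+/// assert_eq!(!Bit::Zero, Bit::One);
+/// ```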
+#[derive(Copy, Clone, Eq, PartialEq, Debug)] +pub enum Bit { + Zero, + One, +} +impl std::ops::BitAnd for Bit { + type Output = Self; + fn bitand(self, rhs: Self) -> Self { + match self { + Bit::Zero => Bit::Zero, + Bit::One => rhs, + } + } +} + +impl std::ops::BitOr for Bit { + type Output = Self; + fn bitor(self, rhs: Self) -> Self { + match self { + Bit::Zero => rhs, + Bit::One => Bit::One, + } + } +} + +impl std::ops::BitXor for Bit { + type Output = Self; + fn bitxor(self, rhs: Self) -> Self { + match (self, rhs) { + (Bit::Zero, Bit::Zero) => Bit::Zero, + (Bit::One, Bit::One) => Bit::Zero, + _ => Bit::One, + } + } +} + +impl std::ops::Not for Bit { + type Output = Self; + fn not(self) -> Self { + match self { + Bit::One => Bit::Zero, + Bit::Zero => Bit::One, + } + } +} + +impl std::ops::Neg for Bit { + type Output = Self; + fn neg(self) -> Self { + match self { + Bit::One => Bit::Zero, + Bit::Zero => Bit::One, + } + } +} +macro_rules! generate_from_bit_impls { + ($($ty:ident),*) => { + $(impl From for $ty { + fn from(bit: Bit) -> Self { + bool::from(bit) as $ty + } + })* + }; +} +generate_from_bit_impls!(u8, u16, u32, u64, u128, i8, i16, i32, i64, i128); + +impl From for bool { + fn from(bit: Bit) -> Self { + match bit { + Bit::Zero => false, + Bit::One => true, + } + } +} + +impl From for Bit { + fn from(b: bool) -> Bit { + match b { + false => Bit::Zero, + true => Bit::One, + } + } +} + +/// A trait for integers and floats + +pub trait MachineNumeric { + /// The size of this integer type in bits. + const BITS: u32; + /// The signedness of this integer type. + const SIGNED: bool; + /// Element of the integer type with every bit as 0. + const ZEROS: Self; + /// Element of the integer type with every bit as 1. + const ONES: Self; + /// Minimum value of the integer type. + const MIN: Self; + /// Maximum value of the integer type. + const MAX: Self; + /// Raw transmutation of bits to u128 + fn to_u128(self) -> u128; + /// Raw transmutation of bits from u128 + fn from_u128(x: u128) -> Self; +} + +/// A trait for types that represent machine integers. +pub trait MachineInteger: MachineNumeric { + /// Implements functionality for `simd_add` in `crate::abstractions::simd`. + fn wrapping_add(self, rhs: Self) -> Self; + /// Implements functionality for `simd_sub` in `crate::abstractions::simd`. + fn wrapping_sub(self, rhs: Self) -> Self; + /// Implements functionality for `simd_mul` in `crate::abstractions::simd`. + fn overflowing_mul(self, rhs: Self) -> Self; + /// Implements functionality for `simd_saturating_add` in `crate::abstractions::simd`. + fn saturating_add(self, rhs: Self) -> Self; + /// Implements functionality for `simd_saturating_sub` in `crate::abstractions::simd`. + fn saturating_sub(self, rhs: Self) -> Self; + /// Implements functionality for `simd_abs_diff` in `crate::abstractions::simd`. + fn wrapping_abs_diff(self, rhs: Self) -> Self; + /// Implements functionality for `simd_abs` in `crate::abstractions::simd`. + fn wrapping_abs(self) -> Self; +} + +macro_rules! 
generate_imachine_integer_impls { + ($($ty:ident),*) => { + $( + impl MachineNumeric for $ty { + const BITS: u32 = $ty::BITS; + const SIGNED: bool = true; + const ZEROS: $ty = 0; + const ONES: $ty = -1; + const MIN: $ty = $ty::MIN; + const MAX: $ty = $ty::MAX; + fn to_u128(self) -> u128 {self as u128} + fn from_u128(x:u128) -> Self {x as $ty} + } + impl MachineInteger for $ty { + fn wrapping_add(self, rhs: Self) -> Self { self.wrapping_add(rhs) } + fn wrapping_sub(self, rhs: Self) -> Self { self.wrapping_sub(rhs) } + fn overflowing_mul(self, rhs: Self) -> Self { self.overflowing_mul(rhs).0 } + fn saturating_add(self, rhs: Self) -> Self { self.saturating_add(rhs)} + fn saturating_sub(self, rhs: Self) -> Self { self.saturating_sub(rhs) } + fn wrapping_abs_diff(self, rhs: Self) -> Self {if self > rhs {$ty::wrapping_sub(self, rhs)} else {$ty::wrapping_sub(rhs, self)}} + fn wrapping_abs(self) -> Self {if self == $ty::MIN {self} else {self.abs()}} + })* + }; +} + +macro_rules! generate_umachine_integer_impls { + ($($ty:ident),*) => { + $( + impl MachineNumeric for $ty { + const BITS: u32 = $ty::BITS; + const SIGNED: bool = false; + const ZEROS: $ty = 0; + const ONES: $ty = $ty::MAX; + const MIN: $ty = $ty::MIN; + const MAX: $ty = $ty::MAX; + fn to_u128(self) -> u128 {self as u128} + fn from_u128(x:u128) -> Self {x as $ty} + } + impl MachineInteger for $ty { + fn wrapping_add(self, rhs: Self) -> Self { self.wrapping_add(rhs) } + fn wrapping_sub(self, rhs: Self) -> Self { self.wrapping_sub(rhs) } + fn overflowing_mul(self, rhs: Self) -> Self { self.overflowing_mul(rhs).0 } + fn saturating_add(self, rhs: Self) -> Self { self.saturating_add(rhs)} + fn saturating_sub(self, rhs: Self) -> Self { self.saturating_sub(rhs)} + fn wrapping_abs_diff(self, rhs: Self) -> Self {if self > rhs {self - rhs} else {rhs - self}} + fn wrapping_abs(self) -> Self {self} + })* + }; +} +generate_imachine_integer_impls!(i8, i16, i32, i64, i128); +generate_umachine_integer_impls!(u8, u16, u32, u64, u128); + +impl MachineNumeric for f32 { + const BITS: u32 = 32; + const SIGNED: bool = false; + const ZEROS: f32 = 0.0; + const ONES: f32 = f32::from_bits(0xffffffffu32); + const MIN: f32 = f32::MIN; + const MAX: f32 = f32::MAX; + fn to_u128(self) -> u128 { + self.to_bits() as u128 + } + fn from_u128(x: u128) -> Self { + f32::from_bits(x as u32) + } +} + +impl MachineNumeric for f64 { + const BITS: u32 = 64; + const SIGNED: bool = false; + const ZEROS: f64 = 0.0; + const ONES: f64 = f64::from_bits(0xffffffffffffffffu64); + const MIN: f64 = f64::MIN; + const MAX: f64 = f64::MAX; + fn to_u128(self) -> u128 { + self.to_bits() as u128 + } + fn from_u128(x: u128) -> Self { + f64::from_bits(x as u64) + } +} + +impl Bit { + pub fn nth_bit(x: T, nth: usize) -> Self { + if (x.to_u128() >> nth) % 2 == 1 { + Self::One + } else { + Self::Zero + } + } +} diff --git a/testable-simd-models/src/abstractions/bitvec.rs b/testable-simd-models/src/abstractions/bitvec.rs new file mode 100644 index 0000000000000..ac73749482e37 --- /dev/null +++ b/testable-simd-models/src/abstractions/bitvec.rs @@ -0,0 +1,158 @@ +//! This module provides a specification-friendly bit vector type. +use super::bit::{Bit, MachineNumeric}; +use super::funarr::*; + +use std::fmt::Formatter; + +/// A fixed-size bit vector type. +/// +/// `BitVec` is a specification-friendly, fixed-length bit vector that internally +/// stores an array of [`Bit`] values, where each `Bit` represents a single binary digit (0 or 1). 
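+///
+/// A small round-trip sketch (illustrative; mirrors the crate-level example in `abstractions`):
+///
+/// ```rust
+/// use testable_simd_models::abstractions::bitvec::BitVec;
+///
+/// // Build an 8-bit vector from a machine integer and read it back.
+/// let bv = BitVec::<8>::from_int(0b1010_0110u8);
+/// let n: u8 = bv.to_int();
+/// assert_eq!(n, 0b1010_0110);
+/// ```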
+/// +/// This type provides several utility methods for constructing and converting bit vectors: +/// +/// The [`Debug`] implementation for `BitVec` pretty-prints the bits in groups of eight, +/// making the bit pattern more human-readable. The type also implements indexing, +/// allowing for easy access to individual bits. +#[derive(Copy, Clone, Eq, PartialEq)] +pub struct BitVec(FunArray); + +impl BitVec { + #[allow(non_snake_case)] + pub fn ZERO() -> Self { + Self::from_fn(|_| Bit::Zero) + } +} + +/// Pretty prints a bit slice by group of 8 +fn bit_slice_to_string(bits: &[Bit]) -> String { + bits.iter() + .map(|bit| match bit { + Bit::Zero => '0', + Bit::One => '1', + }) + .collect::>() + .chunks(8) + .map(|bits| bits.iter().collect::()) + .map(|s| format!("{s} ")) + .collect::() + .trim() + .into() +} + +impl core::fmt::Debug for BitVec { + fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), std::fmt::Error> { + write!(f, "{}", bit_slice_to_string(&self.0.as_vec())) + } +} + +impl core::ops::Index for BitVec { + type Output = Bit; + fn index(&self, index: u32) -> &Self::Output { + self.0.get(index) + } +} + +/// Convert a bit slice into an unsigned number. + +fn u128_int_from_bit_slice(bits: &[Bit]) -> u128 { + bits.iter() + .enumerate() + .map(|(i, bit)| u128::from(*bit) << i) + .sum::() +} + +/// Convert a bit slice into a machine integer of type `T`. +fn int_from_bit_slice(bits: &[Bit]) -> T { + debug_assert!(bits.len() <= T::BITS as usize); + let result = if T::SIGNED { + let is_negative = matches!(bits[T::BITS as usize - 1], Bit::One); + let s = u128_int_from_bit_slice(&bits[0..T::BITS as usize - 1]) as i128; + if is_negative { + s + (-2i128).pow(T::BITS - 1) + } else { + s + } + } else { + u128_int_from_bit_slice(bits) as i128 + }; + T::from_u128(result as u128) +} +impl BitVec { + /// Constructor for BitVec. `BitVec::::from_fn` constructs a bitvector out of a function that takes usizes smaller than `N` and produces bits. + pub fn from_fn Bit>(f: F) -> Self { + Self(FunArray::from_fn(f)) + } + /// Convert a slice of machine integers where only the `d` least significant bits are relevant. + pub fn from_slice(x: &[T], d: u32) -> Self { + Self::from_fn(|i| Bit::nth_bit::(x[(i / d) as usize], (i % d) as usize)) + } + + /// Construct a BitVec out of a machine integer. + pub fn from_int(n: T) -> Self { + Self::from_slice::(&[n], T::BITS as u32) + } + + /// Convert a BitVec into a machine integer of type `T`. + pub fn to_int(self) -> T { + int_from_bit_slice(&self.0.as_vec()) + } + + /// Convert a BitVec into a vector of machine integers of type `T`. + pub fn to_vec(&self) -> Vec { + self.0 + .as_vec() + .chunks(T::BITS as usize) + .map(int_from_bit_slice) + .collect() + } + + /// Generate a random BitVec. 
+ pub fn rand() -> Self { + use rand::prelude::*; + let random_source: Vec<_> = { + let mut rng = rand::rng(); + (0..N).map(|_| rng.random::()).collect() + }; + Self::from_fn(|i| random_source[i as usize].into()) + } +} + +impl BitVec { + pub fn chunked_shift( + self, + shl: FunArray, + ) -> BitVec { + fn chunked_shift( + bitvec: BitVec, + shl: FunArray, + ) -> BitVec { + BitVec::from_fn(|i| { + let nth_bit = i % CHUNK; + let nth_chunk = i / CHUNK; + let shift: i128 = if nth_chunk < SHIFTS { + shl[nth_chunk] + } else { + 0 + }; + let local_index = (nth_bit as i128).wrapping_sub(shift); + if local_index < CHUNK as i128 && local_index >= 0 { + let local_index = local_index as u32; + bitvec[nth_chunk * CHUNK + local_index] + } else { + Bit::Zero + } + }) + } + chunked_shift::(self, shl) + } + + /// Folds over the array, accumulating a result. + /// + /// # Arguments + /// * `init` - The initial value of the accumulator. + /// * `f` - A function combining the accumulator and each element. + pub fn fold(&self, init: A, f: fn(A, Bit) -> A) -> A { + self.0.fold(init, f) + } +} diff --git a/testable-simd-models/src/abstractions/funarr.rs b/testable-simd-models/src/abstractions/funarr.rs new file mode 100644 index 0000000000000..4026efb66c1f5 --- /dev/null +++ b/testable-simd-models/src/abstractions/funarr.rs @@ -0,0 +1,185 @@ +//! This module implements a fixed-size array wrapper with functional semantics +//! which are used in formulating abstractions. + +use crate::abstractions::bit::MachineNumeric; + +/// `FunArray` represents an array of `T` values of length `N`, where `N` is a compile-time constant. +/// Internally, it uses a fixed-length array of `Option` with a maximum capacity of 512 elements. +/// Unused elements beyond `N` are filled with `None`. +#[derive(Copy, Clone, Eq, PartialEq)] +pub struct FunArray([Option; 512]); + +impl FunArray { + /// Gets a reference to the element at index `i`. + pub fn get(&self, i: u32) -> &T { + self.0[i as usize].as_ref().unwrap() + } + /// Constructor for FunArray. `FunArray::from_fn` constructs a funarray out of a function that takes usizes smaller than `N` and produces an element of type T. + pub fn from_fn T>(f: F) -> Self { + // let vec = (0..N).map(f).collect(); + let arr = core::array::from_fn(|i| { + if (i as u32) < N { + Some(f(i as u32)) + } else { + None + } + }); + Self(arr) + } + + /// Converts the `FunArray` into a `Vec`. + pub fn as_vec(&self) -> Vec + where + T: Clone, + { + self.0[0..(N as usize)] + .iter() + .cloned() + .map(|x| x.unwrap()) + .collect() + } + + /// Folds over the array, accumulating a result. + /// + /// # Arguments + /// * `init` - The initial value of the accumulator. + /// * `f` - A function combining the accumulator and each element. 
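+    ///
+    /// An illustrative doctest (assumes the `FunArray` API defined in this file):
+    ///
+    /// ```rust
+    /// use testable_simd_models::abstractions::funarr::FunArray;
+    ///
+    /// // Sum the elements 0, 1, 2, 3.
+    /// let arr = FunArray::<4, u32>::from_fn(|i| i);
+    /// assert_eq!(arr.fold(0u32, |acc, x| acc + x), 6);
+    /// ```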
+ pub fn fold(&self, mut init: A, f: fn(A, T) -> A) -> A + where + T: Clone, + { + for i in 0..N { + init = f(init, self[i].clone()); + } + init + } +} + +impl FunArray { + #[allow(non_snake_case)] + pub fn ZERO() -> Self { + Self::from_fn(|_| T::ZEROS) + } +} + +impl TryFrom> for FunArray { + type Error = (); + fn try_from(v: Vec) -> Result { + if (v.len() as u32) < N { + Err(()) + } else { + Ok(Self::from_fn(|i| v[i as usize].clone())) + } + } +} + +impl core::fmt::Debug for FunArray { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + write!(f, "{:?}", self.as_vec()) + } +} + +impl core::ops::Index for FunArray { + type Output = T; + + fn index(&self, index: u32) -> &Self::Output { + self.get(index) + } +} + +impl FunArray<1, T> { + pub fn new(x: T) -> Self { + let v = [x]; + Self::from_fn(|i| v[i as usize]) + } +} + +impl FunArray<2, T> { + pub fn new(x0: T, x1: T) -> Self { + let v = [x0, x1]; + Self::from_fn(|i| v[i as usize]) + } +} + +impl FunArray<4, T> { + pub fn new(x0: T, x1: T, x2: T, x3: T) -> Self { + let v = [x0, x1, x2, x3]; + Self::from_fn(|i| v[i as usize]) + } +} + +impl FunArray<8, T> { + pub fn new(x0: T, x1: T, x2: T, x3: T, x4: T, x5: T, x6: T, x7: T) -> Self { + let v = [x0, x1, x2, x3, x4, x5, x6, x7]; + Self::from_fn(|i| v[i as usize]) + } +} + +impl FunArray<16, T> { + pub fn new( + x0: T, + x1: T, + x2: T, + x3: T, + x4: T, + x5: T, + x6: T, + x7: T, + x8: T, + x9: T, + x10: T, + x11: T, + x12: T, + x13: T, + x14: T, + x15: T, + ) -> Self { + let v = [ + x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, + ]; + Self::from_fn(|i| v[i as usize]) + } +} + +impl FunArray<32, T> { + pub fn new( + x0: T, + x1: T, + x2: T, + x3: T, + x4: T, + x5: T, + x6: T, + x7: T, + x8: T, + x9: T, + x10: T, + x11: T, + x12: T, + x13: T, + x14: T, + x15: T, + x16: T, + x17: T, + x18: T, + x19: T, + x20: T, + x21: T, + x22: T, + x23: T, + x24: T, + x25: T, + x26: T, + x27: T, + x28: T, + x29: T, + x30: T, + x31: T, + ) -> Self { + let v = [ + x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x18, + x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, x30, x31, + ]; + Self::from_fn(|i| v[i as usize]) + } +} diff --git a/testable-simd-models/src/abstractions/mod.rs b/testable-simd-models/src/abstractions/mod.rs new file mode 100644 index 0000000000000..4f840ab60235d --- /dev/null +++ b/testable-simd-models/src/abstractions/mod.rs @@ -0,0 +1,27 @@ +//! This module provides abstractions that are useful for writing +//! specifications for the intrinsics. Currently it provides two abstractions: bits and +//! bit vectors. +//! +//! # Examples +//! +//! Converting an integer to a bit vector and back: +//! +//! ```rust +//! use testable_simd_models::abstractions::{bit::{Bit, MachineInteger}, bitvec::BitVec}; +//! +//! // Create a BitVec from a machine integer (using the integer's bit-width) +//! let bv = BitVec::<16>::from_int(42u16); +//! println!("BitVec: {:?}", bv); +//! +//! // Convert the BitVec back into a machine integer +//! let n: u16 = bv.to_int(); +//! println!("Integer: {}", n); +//! +//! assert!(n == 42); +//! ``` + +pub mod bit; +pub mod bitvec; +pub mod funarr; +pub mod simd; +pub mod utilities; diff --git a/testable-simd-models/src/abstractions/simd.rs b/testable-simd-models/src/abstractions/simd.rs new file mode 100644 index 0000000000000..70e0556618288 --- /dev/null +++ b/testable-simd-models/src/abstractions/simd.rs @@ -0,0 +1,947 @@ +//! Models of SIMD compiler intrinsics. +//! +//! 
Operations are defined on FunArrs. + +use crate::abstractions::{bit::*, bitvec::*, funarr::*}; +use std::convert::*; +use std::ops::*; + +#[allow(dead_code)] +/// Derives interpretations functions, and type synonyms. +macro_rules! interpretations { +($n:literal; $($name:ident [$ty:ty; $m:literal]),*) => { + $( + #[doc = concat!(stringify!($ty), " vectors of size ", stringify!($m))] + #[allow(non_camel_case_types)] + pub type $name = FunArray<$m, $ty>; + pastey::paste! { + const _: () = { + impl BitVec<$n> { + #[doc = concat!("Conversion from ", stringify!($ty), " vectors of size ", stringify!($m), "to bit vectors of size ", stringify!($n))] + pub fn [< from_ $name >](iv: $name) -> BitVec<$n> { + let vec: Vec<$ty> = iv.as_vec(); + Self::from_slice(&vec[..], <$ty>::BITS as u32) + } + #[doc = concat!("Conversion from bit vectors of size ", stringify!($n), " to ", stringify!($ty), " vectors of size ", stringify!($m))] + pub fn [< to_ $name >](bv: BitVec<$n>) -> $name { + let vec: Vec<$ty> = bv.to_vec(); + $name::from_fn(|i| vec[i as usize]) + } + #[doc = concat!("Conversion from bit vectors of size ", stringify!($n), " to ", stringify!($ty), " vectors of size ", stringify!($m))] + pub fn [< as_ $name >](self) -> $name { + let vec: Vec<$ty> = self.to_vec(); + $name::from_fn(|i| vec[i as usize]) + } + + + } + + + impl From> for $name { + fn from(bv: BitVec<$n>) -> Self { + BitVec::[< to_ $name >](bv) + } + } + + impl From<$name> for BitVec<$n> { + fn from(iv: $name) -> Self { + BitVec::[< from_ $name >](iv) + } + } + + impl $name { + + pub fn splat(value: $ty) -> Self { + FunArray::from_fn(|_| value) + } + } + }; + } + )* +}; +} + +interpretations!(256; i32x8 [i32; 8], i64x4 [i64; 4], i16x16 [i16; 16], i128x2 [i128; 2], i8x32 [i8; 32], + u32x8 [u32; 8], u64x4 [u64; 4], u16x16 [u16; 16], u8x32 [u8; 32], f32x8 [f32; 8], f64x4 [f64; 4]); +interpretations!(128; i32x4 [i32; 4], i64x2 [i64; 2], i16x8 [i16; 8], i128x1 [i128; 1], i8x16 [i8; 16], + u32x4 [u32; 4], u64x2 [u64; 2], u16x8 [u16; 8], u8x16 [u8; 16], f32x4 [f32; 4], f64x2 [f64; 2]); + +interpretations!(512; u32x16 [u32; 16], u16x32 [u16; 32], i32x16 [i32; 16], i16x32 [i16; 32]); +interpretations!(64; i64x1 [i64; 1], i32x2 [i32; 2], i16x4 [i16; 4], i8x8 [i8; 8], u64x1 [u64; 1], u32x2 [u32; 2],u16x4 [u16; 4], u8x8 [u8; 8], f32x2 [f32; 2], f64x1 [f64; 1]); +interpretations!(32; i8x4 [i8; 4], u8x4 [u8; 4]); + +/// Inserts an element into a vector, returning the updated vector. +/// +/// # Safety +/// +/// `idx` must be in-bounds of the vector, ie. idx < N +pub fn simd_insert(x: FunArray, idx: u32, val: T) -> FunArray { + FunArray::from_fn(|i| if i == idx { val } else { x[i] }) +} + +/// Extracts an element from a vector. +/// +/// # Safety +/// +/// `idx` must be in-bounds of the vector, ie. idx < N +pub fn simd_extract(x: FunArray, idx: u32) -> T { + x.get(idx).clone() +} + +/// Adds two vectors elementwise with wrapping on overflow/underflow. +pub fn simd_add( + x: FunArray, + y: FunArray, +) -> FunArray { + FunArray::from_fn(|i| x[i].wrapping_add(y[i])) +} + +/// Subtracts `rhs` from `lhs` elementwise with wrapping on overflow/underflow. +pub fn simd_sub( + x: FunArray, + y: FunArray, +) -> FunArray { + FunArray::from_fn(|i| x[i].wrapping_sub(y[i])) +} + +/// Multiplies two vectors elementwise with wrapping on overflow/underflow. +pub fn simd_mul( + x: FunArray, + y: FunArray, +) -> FunArray { + FunArray::from_fn(|i| x[i].overflowing_mul(y[i])) +} + +/// Produces the elementwise absolute values. 
+/// For vectors of unsigned integers it returns the vector untouched. +/// If the element is the minimum value of a signed integer, it returns the element as is. +pub fn simd_abs(x: FunArray) -> FunArray { + FunArray::from_fn(|i| x[i].wrapping_abs()) +} + +/// Produces the elementwise absolute difference of two vectors. +/// Note: Absolute difference in this case is simply the element with the smaller value subtracted from the element with the larger value, with overflow/underflow. +/// For example, if the elements are i8, the absolute difference of 255 and -2 is -255. +pub fn simd_abs_diff( + x: FunArray, + y: FunArray, +) -> FunArray { + FunArray::from_fn(|i| x[i].wrapping_abs_diff(y[i])) +} + +/// Shifts vector left elementwise, with UB on overflow. +/// +/// # Safety +/// +/// Each element of `rhs` must be less than `::BITS`. +pub fn simd_shl( + x: FunArray, + y: FunArray, +) -> FunArray::Output> { + FunArray::from_fn(|i| x[i] << y[i]) +} + +/// Shifts vector right elementwise, with UB on overflow. +/// +/// Shifts `lhs` right by `rhs`, shifting in sign bits for signed types. +/// +/// # Safety +/// +/// Each element of `rhs` must be less than `::BITS`. + +pub fn simd_shr( + x: FunArray, + y: FunArray, +) -> FunArray::Output> { + FunArray::from_fn(|i| x[i] >> y[i]) +} + +/// "Ands" vectors elementwise. + +pub fn simd_and( + x: FunArray, + y: FunArray, +) -> FunArray::Output> { + FunArray::from_fn(|i| x[i] & y[i]) +} + +/// "Ors" vectors elementwise. + +pub fn simd_or( + x: FunArray, + y: FunArray, +) -> FunArray::Output> { + FunArray::from_fn(|i| x[i] | y[i]) +} + +/// "Exclusive ors" vectors elementwise. + +pub fn simd_xor( + x: FunArray, + y: FunArray, +) -> FunArray::Output> { + FunArray::from_fn(|i| x[i] ^ y[i]) +} + +pub trait CastsFrom { + fn cast(a: T) -> Self; +} +pub trait TruncateFrom { + /// Truncates into [`Self`] from a larger integer + fn truncate_from(v: T) -> Self; +} + +macro_rules! from_impls{ + ($([$ty1:ty, $ty2: ty]),*) => { + $( + impl CastsFrom<$ty2> for $ty1 { + fn cast(a: $ty2) -> $ty1 { + a as $ty1 + } + } + )* + }; +} +macro_rules! truncate_from_order { + ($t:ty, $($from:ty),+) => { + $( + impl TruncateFrom<$from> for $t { + #[inline] + fn truncate_from(v: $from) -> $t { v as $t } + } + )* + truncate_from_order!($($from),+); + }; + + ($t:ty) => {}; +} +truncate_from_order!(u8, u16, u32, u64, u128); +truncate_from_order!(i8, i16, i32, i64, i128); + +macro_rules! truncate_from_impls{ + ($([$ty1:ty, $ty2: ty]),*) => { + $( + impl CastsFrom<$ty2> for $ty1 { + fn cast(a: $ty2) -> $ty1 { + <$ty1>::truncate_from(a) + } + } + )* + }; +} + +macro_rules! symm_impls{ + ($([$ty1:ty, $ty2: ty]),*) => { + $( + impl CastsFrom<$ty2> for $ty1 { + fn cast(a: $ty2) -> $ty1 { + a as $ty1 + } + } + impl CastsFrom<$ty1> for $ty2 { + fn cast(a: $ty1) -> $ty2 { + a as $ty2 + } + } + )* + }; +} +macro_rules! 
self_impls{ + ($($ty1:ty),*) => { + $( + impl CastsFrom<$ty1> for $ty1 { + fn cast(a: $ty1) -> $ty1 { + a + } + } + + )* + }; +} +from_impls!( + [u16, u8], + [u32, u8], + [u32, u16], + [u64, u8], + [u64, u16], + [u64, u32], + [u128, u8], + [u128, u16], + [u128, u32], + [u128, u64], + [i16, i8], + [i32, i8], + [i32, i16], + [i64, i8], + [i64, i16], + [i64, i32], + [i128, i8], + [i128, i16], + [i128, i32], + [i128, i64], + [f64, u32], + [f64, i32], + [f32, u32], + [f32, i32], + [f32, f64], + [f64, f32] +); +truncate_from_impls!( + [u8, u16], + [u8, u32], + [u16, u32], + [u8, u64], + [u16, u64], + [u32, u64], + [u8, u128], + [u16, u128], + [u32, u128], + [u64, u128], + [i8, i16], + [i8, i32], + [i16, i32], + [i8, i64], + [i16, i64], + [i32, i64], + [i8, i128], + [i16, i128], + [i32, i128], + [i64, i128] +); + +symm_impls!([u8, i8], [u16, i16], [u32, i32], [u64, i64], [u128, i128]); + +self_impls!(u8, u16, u32, u64, u128, i8, i16, i32, i64, i128); + +// Would like to do the below instead of using the above macros, but currently this is an active issue in Rust (#31844) +// impl CastsFrom for U +// where +// U : From { +// fn cast(a: T) -> U { +// U::from(a) +// } +// } + +// impl CastsFrom for U +// where +// U : TruncateFrom { +// fn cast(a: T) -> U { +// U::truncate_from(a) +// } +// } + +/// Numerically casts a vector, elementwise. +/// +/// Casting can only happen between two integers of the same signedness. +/// +/// When casting from a wider number to a smaller number, the higher bits are removed. +/// Otherwise, it extends the number, following signedness. +pub fn simd_cast>(x: FunArray) -> FunArray { + FunArray::from_fn(|i| T2::cast(x[i])) +} + +/// Negates a vector elementwise. +/// +/// Rust panics for `-::Min` due to overflow, but here, it just returns the element as is. + +pub fn simd_neg::Output> + MachineInteger + Eq + Neg + Copy>( + x: FunArray, +) -> FunArray { + FunArray::from_fn(|i| { + if x[i] == T::MIN { + T::MIN + } else { + T::from(-x[i]) + } + }) +} +/// Tests elementwise equality of two vectors. +/// +/// Returns `0` (all zeros) for false and `!0` (all ones) for true. + +pub fn simd_eq( + x: FunArray, + y: FunArray, +) -> FunArray { + FunArray::from_fn(|i| if x[i] == y[i] { T::ONES } else { T::ZEROS }) +} + +/// Tests elementwise inequality equality of two vectors. +/// +/// Returns `0` (all zeros) for false and `!0` (all ones) for true. + +pub fn simd_ne( + x: FunArray, + y: FunArray, +) -> FunArray { + FunArray::from_fn(|i| if x[i] != y[i] { T::ONES } else { T::ZEROS }) +} + +/// Tests if `x` is less than `y`, elementwise. +/// +/// Returns `0` (all zeros) for false and `!0` (all ones) for true. + +pub fn simd_lt( + x: FunArray, + y: FunArray, +) -> FunArray { + FunArray::from_fn(|i| if x[i] < y[i] { T::ONES } else { T::ZEROS }) +} + +/// Tests if `x` is less than or equal to `y`, elementwise. +/// +/// Returns `0` (all zeros) for false and `!0` (all ones) for true. + +pub fn simd_le( + x: FunArray, + y: FunArray, +) -> FunArray { + FunArray::from_fn(|i| if x[i] <= y[i] { T::ONES } else { T::ZEROS }) +} + +/// Tests if `x` is greater than `y`, elementwise. +/// +/// Returns `0` (all zeros) for false and `!0` (all ones) for true. + +pub fn simd_gt( + x: FunArray, + y: FunArray, +) -> FunArray { + FunArray::from_fn(|i| if x[i] > y[i] { T::ONES } else { T::ZEROS }) +} + +/// Tests if `x` is greater than or equal to `y`, elementwise. +/// +/// Returns `0` (all zeros) for false and `!0` (all ones) for true. 
+ +pub fn simd_ge( + x: FunArray, + y: FunArray, +) -> FunArray { + FunArray::from_fn(|i| if x[i] >= y[i] { T::ONES } else { T::ZEROS }) +} + +/// Shuffles two vectors by the indices in idx. +/// +/// For safety, `N2 <= N1 + N3` must hold. +pub fn simd_shuffle( + x: FunArray, + y: FunArray, + idx: [u32; N2], +) -> FunArray { + FunArray::from_fn(|i| { + let i = idx[i as usize]; + if i < N1 { + x[i] + } else { + y[i - N1] + } + }) +} + +/// Adds two vectors elementwise, with saturation. + +pub fn simd_saturating_add( + x: FunArray, + y: FunArray, +) -> FunArray { + FunArray::from_fn(|i| x[i].saturating_add(y[i])) +} + +/// Subtracts `y` from `x` elementwise, with saturation. + +pub fn simd_saturating_sub( + x: FunArray, + y: FunArray, +) -> FunArray { + FunArray::from_fn(|i| x[i].saturating_sub(y[i])) +} + +/// Truncates an integer vector to a bitmask. +/// Macro for that expands to an expression which is equivalent to truncating an integer vector to a bitmask, as it would on little endian systems. +/// +/// The macro takes 3 arguments. +/// The first is the highest index of the vector. +/// The second is the vector itself, which should just contain `0` and `!0`. +/// The third is the type to which the truncation happens, which should be atleast as wide as the number of elements in the vector. +/// +/// Thus for example, to truncate the vector, +/// `let a : i32 = [!0, 0, 0, 0, 0, 0, 0, 0, !0, !0, 0, 0, 0, 0, !0, 0]` +/// to u16, you would call, +/// `simd_bitmask_little!(15, a, u16)` +/// to get, +/// `0b0100001100000001u16` +/// +/// # Safety +/// The second argument must be a vector of signed integer types. +/// The length of the vector must be 64 at most. + +// The numbers in here are powers of 2. If it is needed to extend the length of the vector, simply add more cases in the same manner. +// The reason for doing this is that the expression becomes easier to work with when compiled for a proof assistant. +macro_rules! 
simd_bitmask_little { + (63, $a:ident, $ty:ty) => { + 9223372036854775808 * ((if $a[63] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(62, $a, $ty) + }; + (62, $a:ident, $ty:ty) => { + 4611686018427387904 * ((if $a[62] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(61, $a, $ty) + }; + (61, $a:ident, $ty:ty) => { + 2305843009213693952 * ((if $a[61] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(60, $a, $ty) + }; + (60, $a:ident, $ty:ty) => { + 1152921504606846976 * ((if $a[60] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(59, $a, $ty) + }; + (59, $a:ident, $ty:ty) => { + 576460752303423488 * ((if $a[59] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(58, $a, $ty) + }; + (58, $a:ident, $ty:ty) => { + 288230376151711744 * ((if $a[58] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(57, $a, $ty) + }; + (57, $a:ident, $ty:ty) => { + 144115188075855872 * ((if $a[57] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(56, $a, $ty) + }; + (56, $a:ident, $ty:ty) => { + 72057594037927936 * ((if $a[56] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(55, $a, $ty) + }; + (55, $a:ident, $ty:ty) => { + 36028797018963968 * ((if $a[55] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(54, $a, $ty) + }; + (54, $a:ident, $ty:ty) => { + 18014398509481984 * ((if $a[54] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(53, $a, $ty) + }; + (53, $a:ident, $ty:ty) => { + 9007199254740992 * ((if $a[53] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(52, $a, $ty) + }; + (52, $a:ident, $ty:ty) => { + 4503599627370496 * ((if $a[52] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(51, $a, $ty) + }; + (51, $a:ident, $ty:ty) => { + 2251799813685248 * ((if $a[51] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(50, $a, $ty) + }; + (50, $a:ident, $ty:ty) => { + 1125899906842624 * ((if $a[50] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(49, $a, $ty) + }; + (49, $a:ident, $ty:ty) => { + 562949953421312 * ((if $a[49] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(48, $a, $ty) + }; + (48, $a:ident, $ty:ty) => { + 281474976710656 * ((if $a[48] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(47, $a, $ty) + }; + (47, $a:ident, $ty:ty) => { + 140737488355328 * ((if $a[47] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(46, $a, $ty) + }; + (46, $a:ident, $ty:ty) => { + 70368744177664 * ((if $a[46] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(45, $a, $ty) + }; + (45, $a:ident, $ty:ty) => { + 35184372088832 * ((if $a[45] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(44, $a, $ty) + }; + (44, $a:ident, $ty:ty) => { + 17592186044416 * ((if $a[44] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(43, $a, $ty) + }; + (43, $a:ident, $ty:ty) => { + 8796093022208 * ((if $a[43] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(42, $a, $ty) + }; + (42, $a:ident, $ty:ty) => { + 4398046511104 * ((if $a[42] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(41, $a, $ty) + }; + (41, $a:ident, $ty:ty) => { + 2199023255552 * ((if $a[41] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(40, $a, $ty) + }; + (40, $a:ident, $ty:ty) => { + 1099511627776 * ((if $a[40] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(39, $a, $ty) + }; + (39, $a:ident, $ty:ty) => { + 549755813888 * ((if $a[39] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(38, $a, $ty) + }; + (38, $a:ident, $ty:ty) => { + 274877906944 * ((if $a[38] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(37, $a, $ty) + }; + (37, 
$a:ident, $ty:ty) => { + 137438953472 * ((if $a[37] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(36, $a, $ty) + }; + (36, $a:ident, $ty:ty) => { + 68719476736 * ((if $a[36] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(35, $a, $ty) + }; + (35, $a:ident, $ty:ty) => { + 34359738368 * ((if $a[35] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(34, $a, $ty) + }; + (34, $a:ident, $ty:ty) => { + 17179869184 * ((if $a[34] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(33, $a, $ty) + }; + (33, $a:ident, $ty:ty) => { + 8589934592 * ((if $a[33] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(32, $a, $ty) + }; + (32, $a:ident, $ty:ty) => { + 4294967296 * ((if $a[32] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(31, $a, $ty) + }; + (31, $a:ident, $ty:ty) => { + 2147483648 * ((if $a[31] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(30, $a, $ty) + }; + (30, $a:ident, $ty:ty) => { + 1073741824 * ((if $a[30] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(29, $a, $ty) + }; + (29, $a:ident, $ty:ty) => { + 536870912 * ((if $a[29] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(28, $a, $ty) + }; + (28, $a:ident, $ty:ty) => { + 268435456 * ((if $a[28] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(27, $a, $ty) + }; + (27, $a:ident, $ty:ty) => { + 134217728 * ((if $a[27] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(26, $a, $ty) + }; + (26, $a:ident, $ty:ty) => { + 67108864 * ((if $a[26] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(25, $a, $ty) + }; + (25, $a:ident, $ty:ty) => { + 33554432 * ((if $a[25] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(24, $a, $ty) + }; + (24, $a:ident, $ty:ty) => { + 16777216 * ((if $a[24] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(23, $a, $ty) + }; + (23, $a:ident, $ty:ty) => { + 8388608 * ((if $a[23] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(22, $a, $ty) + }; + (22, $a:ident, $ty:ty) => { + 4194304 * ((if $a[22] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(21, $a, $ty) + }; + (21, $a:ident, $ty:ty) => { + 2097152 * ((if $a[21] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(20, $a, $ty) + }; + (20, $a:ident, $ty:ty) => { + 1048576 * ((if $a[20] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(19, $a, $ty) + }; + (19, $a:ident, $ty:ty) => { + 524288 * ((if $a[19] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(18, $a, $ty) + }; + (18, $a:ident, $ty:ty) => { + 262144 * ((if $a[18] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(17, $a, $ty) + }; + (17, $a:ident, $ty:ty) => { + 131072 * ((if $a[17] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(16, $a, $ty) + }; + (16, $a:ident, $ty:ty) => { + 65536 * ((if $a[16] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(15, $a, $ty) + }; + (15, $a:ident, $ty:ty) => { + 32768 * ((if $a[15] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(14, $a, $ty) + }; + (14, $a:ident, $ty:ty) => { + 16384 * ((if $a[14] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(13, $a, $ty) + }; + (13, $a:ident, $ty:ty) => { + 8192 * ((if $a[13] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(12, $a, $ty) + }; + (12, $a:ident, $ty:ty) => { + 4096 * ((if $a[12] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(11, $a, $ty) + }; + (11, $a:ident, $ty:ty) => { + 2048 * ((if $a[11] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(10, $a, $ty) + }; + (10, $a:ident, $ty:ty) => { + 1024 * ((if $a[10] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(9, $a, $ty) + }; + (9, $a:ident, $ty:ty) => { + 512 * ((if 
$a[9] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(8, $a, $ty) + }; + (8, $a:ident, $ty:ty) => { + 256 * ((if $a[8] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(7, $a, $ty) + }; + (7, $a:ident, $ty:ty) => { + 128 * ((if $a[7] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(6, $a, $ty) + }; + (6, $a:ident, $ty:ty) => { + 64 * ((if $a[6] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(5, $a, $ty) + }; + (5, $a:ident, $ty:ty) => { + 32 * ((if $a[5] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(4, $a, $ty) + }; + (4, $a:ident, $ty:ty) => { + 16 * ((if $a[4] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(3, $a, $ty) + }; + (3, $a:ident, $ty:ty) => { + 8 * ((if $a[3] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(2, $a, $ty) + }; + (2, $a:ident, $ty:ty) => { + 4 * ((if $a[2] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(1, $a, $ty) + }; + (1, $a:ident, $ty:ty) => { + 2 * ((if $a[1] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(0, $a, $ty) + }; + (0, $a:ident, $ty:ty) => { + ((if $a[0] < 0 { 1 } else { 0 }) as $ty) + }; +} +pub(crate) use simd_bitmask_little; + +/// Truncates an integer vector to a bitmask. +/// Macro for that expands to an expression which is equivalent to truncating an integer vector to a bitmask, as it would on big endian systems. +/// +/// The macro takes 3 arguments. +/// The first is the highest index of the vector. +/// The second is the vector itself, which should just contain `0` and `!0`. +/// The third is the type to which the truncation happens, which should be atleast as wide as the number of elements in the vector. +/// +/// Thus for example, to truncate the vector, +/// `let a : i32 = [!0, 0, 0, 0, 0, 0, 0, 0, !0, !0, 0, 0, 0, 0, !0, 0]` +/// to u16, you would call, +/// `simd_bitmask_big!(15, a, u16)` +/// to get, +/// `0b1000000011000010u16` +/// +/// # Safety +/// The second argument must be a vector of signed integer types. + +#[allow(unused)] +macro_rules! 
simd_bitmask_big { + (63, $a:ident, $ty:ty) => { + 1 * ((if $a[63] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(62, $a, $ty) + }; + (62, $a:ident, $ty:ty) => { + 2 * ((if $a[62] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(61, $a, $ty) + }; + (61, $a:ident, $ty:ty) => { + 4 * ((if $a[61] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(60, $a, $ty) + }; + (60, $a:ident, $ty:ty) => { + 8 * ((if $a[60] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(59, $a, $ty) + }; + (59, $a:ident, $ty:ty) => { + 16 * ((if $a[59] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(58, $a, $ty) + }; + (58, $a:ident, $ty:ty) => { + 32 * ((if $a[58] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(57, $a, $ty) + }; + (57, $a:ident, $ty:ty) => { + 64 * ((if $a[57] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(56, $a, $ty) + }; + (56, $a:ident, $ty:ty) => { + 128 * ((if $a[56] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(55, $a, $ty) + }; + (55, $a:ident, $ty:ty) => { + 256 * ((if $a[55] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(54, $a, $ty) + }; + (54, $a:ident, $ty:ty) => { + 512 * ((if $a[54] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(53, $a, $ty) + }; + (53, $a:ident, $ty:ty) => { + 1024 * ((if $a[53] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(52, $a, $ty) + }; + (52, $a:ident, $ty:ty) => { + 2048 * ((if $a[52] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(51, $a, $ty) + }; + (51, $a:ident, $ty:ty) => { + 4096 * ((if $a[51] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(50, $a, $ty) + }; + (50, $a:ident, $ty:ty) => { + 8192 * ((if $a[50] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(49, $a, $ty) + }; + (49, $a:ident, $ty:ty) => { + 16384 * ((if $a[49] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(48, $a, $ty) + }; + (48, $a:ident, $ty:ty) => { + 32768 * ((if $a[48] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(47, $a, $ty) + }; + (47, $a:ident, $ty:ty) => { + 65536 * ((if $a[47] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(46, $a, $ty) + }; + (46, $a:ident, $ty:ty) => { + 131072 * ((if $a[46] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(45, $a, $ty) + }; + (45, $a:ident, $ty:ty) => { + 262144 * ((if $a[45] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(44, $a, $ty) + }; + (44, $a:ident, $ty:ty) => { + 524288 * ((if $a[44] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(43, $a, $ty) + }; + (43, $a:ident, $ty:ty) => { + 1048576 * ((if $a[43] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(42, $a, $ty) + }; + (42, $a:ident, $ty:ty) => { + 2097152 * ((if $a[42] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(41, $a, $ty) + }; + (41, $a:ident, $ty:ty) => { + 4194304 * ((if $a[41] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(40, $a, $ty) + }; + (40, $a:ident, $ty:ty) => { + 8388608 * ((if $a[40] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(39, $a, $ty) + }; + (39, $a:ident, $ty:ty) => { + 16777216 * ((if $a[39] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(38, $a, $ty) + }; + (38, $a:ident, $ty:ty) => { + 33554432 * ((if $a[38] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(37, $a, $ty) + }; + (37, $a:ident, $ty:ty) => { + 67108864 * ((if $a[37] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(36, $a, $ty) + }; + (36, $a:ident, $ty:ty) => { + 134217728 * ((if $a[36] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(35, $a, $ty) + }; + (35, $a:ident, $ty:ty) => { + 268435456 * ((if $a[35] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(34, $a, $ty) + }; + (34, $a:ident, $ty:ty) => { + 536870912 * ((if $a[34] < 0 { 
1 } else { 0 }) as $ty) + simd_bitmask_big!(33, $a, $ty) + }; + (33, $a:ident, $ty:ty) => { + 1073741824 * ((if $a[33] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(32, $a, $ty) + }; + (32, $a:ident, $ty:ty) => { + 2147483648 * ((if $a[32] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(31, $a, $ty) + }; + (31, $a:ident, $ty:ty) => { + 4294967296 * ((if $a[31] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(30, $a, $ty) + }; + (30, $a:ident, $ty:ty) => { + 8589934592 * ((if $a[30] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(29, $a, $ty) + }; + (29, $a:ident, $ty:ty) => { + 17179869184 * ((if $a[29] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(28, $a, $ty) + }; + (28, $a:ident, $ty:ty) => { + 34359738368 * ((if $a[28] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(27, $a, $ty) + }; + (27, $a:ident, $ty:ty) => { + 68719476736 * ((if $a[27] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(26, $a, $ty) + }; + (26, $a:ident, $ty:ty) => { + 137438953472 * ((if $a[26] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(25, $a, $ty) + }; + (25, $a:ident, $ty:ty) => { + 274877906944 * ((if $a[25] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(24, $a, $ty) + }; + (24, $a:ident, $ty:ty) => { + 549755813888 * ((if $a[24] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(23, $a, $ty) + }; + (23, $a:ident, $ty:ty) => { + 1099511627776 * ((if $a[23] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(22, $a, $ty) + }; + (22, $a:ident, $ty:ty) => { + 2199023255552 * ((if $a[22] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(21, $a, $ty) + }; + (21, $a:ident, $ty:ty) => { + 4398046511104 * ((if $a[21] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(20, $a, $ty) + }; + (20, $a:ident, $ty:ty) => { + 8796093022208 * ((if $a[20] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(19, $a, $ty) + }; + (19, $a:ident, $ty:ty) => { + 17592186044416 * ((if $a[19] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(18, $a, $ty) + }; + (18, $a:ident, $ty:ty) => { + 35184372088832 * ((if $a[18] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(17, $a, $ty) + }; + (17, $a:ident, $ty:ty) => { + 70368744177664 * ((if $a[17] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(16, $a, $ty) + }; + (16, $a:ident, $ty:ty) => { + 140737488355328 * ((if $a[16] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(15, $a, $ty) + }; + (15, $a:ident, $ty:ty) => { + 281474976710656 * ((if $a[15] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(14, $a, $ty) + }; + (14, $a:ident, $ty:ty) => { + 562949953421312 * ((if $a[14] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(13, $a, $ty) + }; + (13, $a:ident, $ty:ty) => { + 1125899906842624 * ((if $a[13] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_big!(12, $a, $ty) + }; + (12, $a:ident, $ty:ty) => { + 2251799813685248 * ((if $a[12] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_big!(11, $a, $ty) + }; + (11, $a:ident, $ty:ty) => { + 4503599627370496 * ((if $a[11] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_big!(10, $a, $ty) + }; + (10, $a:ident, $ty:ty) => { + 9007199254740992 * ((if $a[10] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(9, $a, $ty) + }; + (9, $a:ident, $ty:ty) => { + 18014398509481984 * ((if $a[9] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(8, $a, $ty) + }; + (8, $a:ident, $ty:ty) => { + 36028797018963968 * ((if $a[8] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(7, $a, $ty) + }; + (7, $a:ident, $ty:ty) => { + 72057594037927936 * ((if $a[7] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(6, $a, $ty) + }; + (6, $a:ident, $ty:ty) => { 
+ 144115188075855872 * ((if $a[6] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_big!(5, $a, $ty) + }; + (5, $a:ident, $ty:ty) => { + 288230376151711744 * ((if $a[5] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_big!(4, $a, $ty) + }; + (4, $a:ident, $ty:ty) => { + 576460752303423488 * ((if $a[4] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_big!(3, $a, $ty) + }; + (3, $a:ident, $ty:ty) => { + 1152921504606846976 * ((if $a[3] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_big!(2, $a, $ty) + }; + (2, $a:ident, $ty:ty) => { + 2305843009213693952 * ((if $a[2] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_big!(1, $a, $ty) + }; + (1, $a:ident, $ty:ty) => { + 4611686018427387904 * ((if $a[1] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_big!(0, $a, $ty) + }; + (0, $a:ident, $ty:ty) => { + 9223372036854775808 * ((if $a[0] < 0 { 1 } else { 0 }) as $ty) + }; +} +#[allow(unused)] +pub(crate) use simd_bitmask_big; + +/// Selects elements from a mask. +/// +/// For each element, if the corresponding value in `mask` is `!0`, select the element from +/// `if_true`. If the corresponding value in `mask` is `0`, select the element from +/// `if_false`. +/// +/// # Safety +/// `mask` must only contain `0` and `!0`. + +pub fn simd_select( + mask: FunArray, + if_true: FunArray, + if_false: FunArray, +) -> FunArray { + FunArray::from_fn(|i| { + if mask[i] == T1::ONES { + if_true[i] + } else { + if_false[i] + } + }) +} diff --git a/testable-simd-models/src/abstractions/utilities.rs b/testable-simd-models/src/abstractions/utilities.rs new file mode 100644 index 0000000000000..86e1c0ba52de1 --- /dev/null +++ b/testable-simd-models/src/abstractions/utilities.rs @@ -0,0 +1,59 @@ +/// Converts one type to another +pub fn transmute>(a: T) -> U { + a.into() +} + +#[allow(unused)] +#[macro_export] +macro_rules! static_assert { + ($e:expr) => { + const { + assert!($e); + } + }; + ($e:expr, $msg:expr) => { + const { + assert!($e, $msg); + } + }; +} + +#[allow(unused_macros)] +#[macro_export] +macro_rules! static_assert_uimm_bits { + ($imm:ident, $bits:expr) => { + // `0 <= $imm` produces a warning if the immediate has an unsigned type + #[allow(unused_comparisons)] + { + static_assert!( + 0 <= $imm && $imm < (1 << $bits), + concat!( + stringify!($imm), + " doesn't fit in ", + stringify!($bits), + " bits", + ) + ) + } + }; +} + +#[allow(unused_macros)] +#[macro_export] +macro_rules! 
static_assert_simm_bits {
+    ($imm:ident, $bits:expr) => {
+        static_assert!(
+            (-1 << ($bits - 1)) - 1 < $imm && $imm < (1 << ($bits - 1)),
+            concat!(
+                stringify!($imm),
+                " doesn't fit in ",
+                stringify!($bits),
+                " bits",
+            )
+        )
+    };
+}
+
+pub use static_assert;
+pub use static_assert_simm_bits;
+pub use static_assert_uimm_bits;
diff --git a/testable-simd-models/src/core_arch.rs b/testable-simd-models/src/core_arch.rs
new file mode 100644
index 0000000000000..19e643885f4ce
--- /dev/null
+++ b/testable-simd-models/src/core_arch.rs
@@ -0,0 +1,5 @@
+/// This is a (partial) mirror of [`core::arch`]
+pub mod x86;
+pub use x86 as x86_64;
+
+pub mod arm_shared;
diff --git a/testable-simd-models/src/core_arch/arm_shared/mod.rs b/testable-simd-models/src/core_arch/arm_shared/mod.rs
new file mode 100644
index 0000000000000..6e2272ec0e50a
--- /dev/null
+++ b/testable-simd-models/src/core_arch/arm_shared/mod.rs
@@ -0,0 +1,4 @@
+pub mod models;
+#[cfg(test)]
+#[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
+pub mod tests;
diff --git a/testable-simd-models/src/core_arch/arm_shared/models/mod.rs b/testable-simd-models/src/core_arch/arm_shared/models/mod.rs
new file mode 100644
index 0000000000000..fb7844c6d0441
--- /dev/null
+++ b/testable-simd-models/src/core_arch/arm_shared/models/mod.rs
@@ -0,0 +1,44 @@
+//! Rust models for ARM intrinsics.
+//!
+//! This module contains models for the intrinsics as they are defined in the Rust core.
+//! Since this is supposed to model the Rust core, the implemented functions must
+//! mirror the Rust implementations as closely as possible.
+//!
+//! For example, calls to simd functions like simd_add and simd_sub are left as-is,
+//! with their implementations defined in `crate::abstractions::simd`. Some other
+//! operations like simd_cast or simd_shuffle might need a little modification
+//! for correct compilation.
+//!
+//! Calls to transmute are replaced with either an explicit call to a `BitVec::from_*` function,
+//! or with `.into()`.
+//!
+//! Sometimes, an intrinsic in Rust is implemented by directly using the corresponding
+//! LLVM instruction via an `unsafe extern "C"` module. In those cases, the corresponding
+//! function is defined in the `c_extern` module in each file, which contains manually
+//! written implementations made by consulting the appropriate Arm documentation.
+//!
+//! In general, it is best to gain an idea of how an implementation should be written by looking
+//! at how other functions are implemented. See also [`core::arch::arm`](https://github.com/rust-lang/stdarch/tree/master/crates/core_arch) for reference.
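To make the `c_extern` convention described above concrete, here is a minimal, hypothetical sketch of how an intrinsic whose upstream implementation is an LLVM builtin can be modeled with a hand-written replacement. It is not part of the diff, and it uses plain arrays instead of the crate's `FunArray`/`BitVec` abstractions; the helper and function names are illustrative only.

```rust
// Hypothetical sketch only; the real models are written against the crate's
// own vector abstractions, but the overall shape is the same.
mod c_extern {
    // Hand-written stand-in for what upstream implements as an
    // `unsafe extern "C"` LLVM builtin: lane-wise saturating addition.
    pub fn saturating_add_i8x8(a: [i8; 8], b: [i8; 8]) -> [i8; 8] {
        core::array::from_fn(|i| a[i].saturating_add(b[i]))
    }
}

// The model mirrors the shape of the upstream intrinsic and simply delegates
// to the hand-written helper, so it can later be tested against the real
// `vqadd_s8` on an Arm target.
pub fn vqadd_s8_model(a: [i8; 8], b: [i8; 8]) -> [i8; 8] {
    c_extern::saturating_add_i8x8(a, b)
}
```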
+#![allow(unused)] +#[allow(non_camel_case_types)] +mod types { + use crate::abstractions::simd::*; + pub type int32x4_t = i32x4; + pub type int64x1_t = i64x1; + pub type int64x2_t = i64x2; + pub type int16x8_t = i16x8; + pub type int8x16_t = i8x16; + pub type uint32x4_t = u32x4; + pub type uint64x1_t = u64x1; + pub type uint64x2_t = u64x2; + pub type uint16x8_t = u16x8; + pub type uint8x16_t = u8x16; + pub type int32x2_t = i32x2; + pub type int16x4_t = i16x4; + pub type int8x8_t = i8x8; + pub type uint32x2_t = u32x2; + pub type uint16x4_t = u16x4; + pub type uint8x8_t = u8x8; +} + +pub mod neon; diff --git a/testable-simd-models/src/core_arch/arm_shared/models/neon.rs b/testable-simd-models/src/core_arch/arm_shared/models/neon.rs new file mode 100644 index 0000000000000..794fd25285b47 --- /dev/null +++ b/testable-simd-models/src/core_arch/arm_shared/models/neon.rs @@ -0,0 +1,873 @@ +use super::types::*; +use crate::abstractions::simd::*; + +pub fn vaba_s16(a: int16x4_t, b: int16x4_t, c: int16x4_t) -> int16x4_t { + simd_add(a, vabd_s16(b, c)) +} + +pub fn vaba_s32(a: int32x2_t, b: int32x2_t, c: int32x2_t) -> int32x2_t { + simd_add(a, vabd_s32(b, c)) +} + +pub fn vaba_s8(a: int8x8_t, b: int8x8_t, c: int8x8_t) -> int8x8_t { + simd_add(a, vabd_s8(b, c)) +} + +pub fn vaba_u16(a: uint16x4_t, b: uint16x4_t, c: uint16x4_t) -> uint16x4_t { + simd_add(a, vabd_u16(b, c)) +} + +pub fn vaba_u32(a: uint32x2_t, b: uint32x2_t, c: uint32x2_t) -> uint32x2_t { + simd_add(a, vabd_u32(b, c)) +} + +pub fn vaba_u8(a: uint8x8_t, b: uint8x8_t, c: uint8x8_t) -> uint8x8_t { + simd_add(a, vabd_u8(b, c)) +} + +pub fn vabal_u8(a: uint16x8_t, b: uint8x8_t, c: uint8x8_t) -> uint16x8_t { + let d: uint8x8_t = vabd_u8(b, c); + simd_add(a, simd_cast(d)) +} + +pub fn vabal_u16(a: uint32x4_t, b: uint16x4_t, c: uint16x4_t) -> uint32x4_t { + let d: uint16x4_t = vabd_u16(b, c); + simd_add(a, simd_cast(d)) +} + +pub fn vabal_u32(a: uint64x2_t, b: uint32x2_t, c: uint32x2_t) -> uint64x2_t { + let d: uint32x2_t = vabd_u32(b, c); + simd_add(a, simd_cast(d)) +} + +pub fn vabaq_s16(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t { + simd_add(a, vabdq_s16(b, c)) +} + +pub fn vabaq_s32(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t { + simd_add(a, vabdq_s32(b, c)) +} + +pub fn vabaq_s8(a: int8x16_t, b: int8x16_t, c: int8x16_t) -> int8x16_t { + simd_add(a, vabdq_s8(b, c)) +} + +pub fn vabaq_u16(a: uint16x8_t, b: uint16x8_t, c: uint16x8_t) -> uint16x8_t { + simd_add(a, vabdq_u16(b, c)) +} + +pub fn vabaq_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t { + simd_add(a, vabdq_u32(b, c)) +} + +pub fn vabaq_u8(a: uint8x16_t, b: uint8x16_t, c: uint8x16_t) -> uint8x16_t { + simd_add(a, vabdq_u8(b, c)) +} + +pub fn vabd_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { + simd_abs_diff(a, b) +} + +pub fn vabdq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { + simd_abs_diff(a, b) +} + +pub fn vabd_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { + simd_abs_diff(a, b) +} + +pub fn vabdq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { + simd_abs_diff(a, b) +} + +pub fn vabd_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { + simd_abs_diff(a, b) +} + +pub fn vabdq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + simd_abs_diff(a, b) +} + +pub fn vabd_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + simd_abs_diff(a, b) +} + +pub fn vabdq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + simd_abs_diff(a, b) +} + +pub fn vabd_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + simd_abs_diff(a, b) +} + +pub fn vabdq_u16(a: uint16x8_t, b: 
uint16x8_t) -> uint16x8_t { + simd_abs_diff(a, b) +} + +pub fn vabd_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + simd_abs_diff(a, b) +} + +pub fn vabdq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + simd_abs_diff(a, b) +} + +pub fn vabdl_u8(a: uint8x8_t, b: uint8x8_t) -> uint16x8_t { + simd_cast(vabd_u8(a, b)) +} + +pub fn vabdl_u16(a: uint16x4_t, b: uint16x4_t) -> uint32x4_t { + simd_cast(vabd_u16(a, b)) +} + +pub fn vabdl_u32(a: uint32x2_t, b: uint32x2_t) -> uint64x2_t { + simd_cast(vabd_u32(a, b)) +} + +pub fn vabs_s8(a: int8x8_t) -> int8x8_t { + simd_abs(a) +} + +pub fn vabsq_s8(a: int8x16_t) -> int8x16_t { + simd_abs(a) +} + +pub fn vabs_s16(a: int16x4_t) -> int16x4_t { + simd_abs(a) +} + +pub fn vabsq_s16(a: int16x8_t) -> int16x8_t { + simd_abs(a) +} + +pub fn vabs_s32(a: int32x2_t) -> int32x2_t { + simd_abs(a) +} + +pub fn vabsq_s32(a: int32x4_t) -> int32x4_t { + simd_abs(a) +} + +pub fn vadd_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { + simd_add(a, b) +} + +pub fn vadd_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { + simd_add(a, b) +} + +pub fn vadd_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { + simd_add(a, b) +} + +pub fn vadd_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + simd_add(a, b) +} + +pub fn vadd_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + simd_add(a, b) +} + +pub fn vadd_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + simd_add(a, b) +} + +pub fn vaddq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { + simd_add(a, b) +} + +pub fn vaddq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + simd_add(a, b) +} + +pub fn vaddq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t { + simd_add(a, b) +} + +pub fn vaddq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { + simd_add(a, b) +} + +pub fn vaddq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + simd_add(a, b) +} + +pub fn vaddq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + simd_add(a, b) +} + +pub fn vaddq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { + simd_add(a, b) +} + +pub fn vaddq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + simd_add(a, b) +} + +pub fn vaddhn_high_s16(r: int8x8_t, a: int16x8_t, b: int16x8_t) -> int8x16_t { + let x = simd_cast(simd_shr(simd_add(a, b), int16x8_t::splat(8))); + simd_shuffle(r, x, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) +} + +pub fn vaddhn_high_s32(r: int16x4_t, a: int32x4_t, b: int32x4_t) -> int16x8_t { + let x = simd_cast(simd_shr(simd_add(a, b), int32x4_t::splat(16))); + simd_shuffle(r, x, [0, 1, 2, 3, 4, 5, 6, 7]) +} + +pub fn vaddhn_high_s64(r: int32x2_t, a: int64x2_t, b: int64x2_t) -> int32x4_t { + let x = simd_cast(simd_shr(simd_add(a, b), int64x2_t::splat(32))); + simd_shuffle(r, x, [0, 1, 2, 3]) +} + +pub fn vaddhn_high_u16(r: uint8x8_t, a: uint16x8_t, b: uint16x8_t) -> uint8x16_t { + let x = simd_cast(simd_shr(simd_add(a, b), uint16x8_t::splat(8))); + simd_shuffle(r, x, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) +} + +pub fn vaddhn_high_u32(r: uint16x4_t, a: uint32x4_t, b: uint32x4_t) -> uint16x8_t { + let x = simd_cast(simd_shr(simd_add(a, b), uint32x4_t::splat(16))); + simd_shuffle(r, x, [0, 1, 2, 3, 4, 5, 6, 7]) +} + +pub fn vaddhn_high_u64(r: uint32x2_t, a: uint64x2_t, b: uint64x2_t) -> uint32x4_t { + let x = simd_cast(simd_shr(simd_add(a, b), uint64x2_t::splat(32))); + simd_shuffle(r, x, [0, 1, 2, 3]) +} + +pub fn vaddhn_s16(a: int16x8_t, b: int16x8_t) -> int8x8_t { + simd_cast(simd_shr(simd_add(a, b), int16x8_t::splat(8))) +} + +pub fn vaddhn_s32(a: int32x4_t, b: int32x4_t) -> int16x4_t { + 
simd_cast(simd_shr(simd_add(a, b), int32x4_t::splat(16))) +} + +pub fn vaddhn_s64(a: int64x2_t, b: int64x2_t) -> int32x2_t { + simd_cast(simd_shr(simd_add(a, b), int64x2_t::splat(32))) +} + +pub fn vaddhn_u16(a: uint16x8_t, b: uint16x8_t) -> uint8x8_t { + simd_cast(simd_shr(simd_add(a, b), uint16x8_t::splat(8))) +} + +pub fn vaddhn_u32(a: uint32x4_t, b: uint32x4_t) -> uint16x4_t { + simd_cast(simd_shr(simd_add(a, b), uint32x4_t::splat(16))) +} + +pub fn vaddhn_u64(a: uint64x2_t, b: uint64x2_t) -> uint32x2_t { + simd_cast(simd_shr(simd_add(a, b), uint64x2_t::splat(32))) +} + +pub fn vaddl_high_s16(a: int16x8_t, b: int16x8_t) -> int32x4_t { + let a: int16x4_t = simd_shuffle(a, a, [4, 5, 6, 7]); + let b: int16x4_t = simd_shuffle(b, b, [4, 5, 6, 7]); + let a: int32x4_t = simd_cast(a); + let b: int32x4_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddl_high_s32(a: int32x4_t, b: int32x4_t) -> int64x2_t { + let a: int32x2_t = simd_shuffle(a, a, [2, 3]); + let b: int32x2_t = simd_shuffle(b, b, [2, 3]); + let a: int64x2_t = simd_cast(a); + let b: int64x2_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddl_high_s8(a: int8x16_t, b: int8x16_t) -> int16x8_t { + let a: int8x8_t = simd_shuffle(a, a, [8, 9, 10, 11, 12, 13, 14, 15]); + let b: int8x8_t = simd_shuffle(b, b, [8, 9, 10, 11, 12, 13, 14, 15]); + let a: int16x8_t = simd_cast(a); + let b: int16x8_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddl_high_u16(a: uint16x8_t, b: uint16x8_t) -> uint32x4_t { + let a: uint16x4_t = simd_shuffle(a, a, [4, 5, 6, 7]); + let b: uint16x4_t = simd_shuffle(b, b, [4, 5, 6, 7]); + let a: uint32x4_t = simd_cast(a); + let b: uint32x4_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddl_high_u32(a: uint32x4_t, b: uint32x4_t) -> uint64x2_t { + let a: uint32x2_t = simd_shuffle(a, a, [2, 3]); + let b: uint32x2_t = simd_shuffle(b, b, [2, 3]); + let a: uint64x2_t = simd_cast(a); + let b: uint64x2_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddl_high_u8(a: uint8x16_t, b: uint8x16_t) -> uint16x8_t { + let a: uint8x8_t = simd_shuffle(a, a, [8, 9, 10, 11, 12, 13, 14, 15]); + let b: uint8x8_t = simd_shuffle(b, b, [8, 9, 10, 11, 12, 13, 14, 15]); + let a: uint16x8_t = simd_cast(a); + let b: uint16x8_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddl_s16(a: int16x4_t, b: int16x4_t) -> int32x4_t { + let a: int32x4_t = simd_cast(a); + let b: int32x4_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddl_s32(a: int32x2_t, b: int32x2_t) -> int64x2_t { + let a: int64x2_t = simd_cast(a); + let b: int64x2_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddl_s8(a: int8x8_t, b: int8x8_t) -> int16x8_t { + let a: int16x8_t = simd_cast(a); + let b: int16x8_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddl_u16(a: uint16x4_t, b: uint16x4_t) -> uint32x4_t { + let a: uint32x4_t = simd_cast(a); + let b: uint32x4_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddl_u32(a: uint32x2_t, b: uint32x2_t) -> uint64x2_t { + let a: uint64x2_t = simd_cast(a); + let b: uint64x2_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddl_u8(a: uint8x8_t, b: uint8x8_t) -> uint16x8_t { + let a: uint16x8_t = simd_cast(a); + let b: uint16x8_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddw_high_s16(a: int32x4_t, b: int16x8_t) -> int32x4_t { + let b: int16x4_t = simd_shuffle(b, b, [4, 5, 6, 7]); + let b: int32x4_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddw_high_s32(a: int64x2_t, b: int32x4_t) -> int64x2_t { + let b: int32x2_t = simd_shuffle(b, b, [2, 3]); + let b: int64x2_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddw_high_s8(a: 
int16x8_t, b: int8x16_t) -> int16x8_t { + let b: int8x8_t = simd_shuffle(b, b, [8, 9, 10, 11, 12, 13, 14, 15]); + let b: int16x8_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddw_high_u16(a: uint32x4_t, b: uint16x8_t) -> uint32x4_t { + let b: uint16x4_t = simd_shuffle(b, b, [4, 5, 6, 7]); + let b: uint32x4_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddw_high_u32(a: uint64x2_t, b: uint32x4_t) -> uint64x2_t { + let b: uint32x2_t = simd_shuffle(b, b, [2, 3]); + let b: uint64x2_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddw_high_u8(a: uint16x8_t, b: uint8x16_t) -> uint16x8_t { + let b: uint8x8_t = simd_shuffle(b, b, [8, 9, 10, 11, 12, 13, 14, 15]); + let b: uint16x8_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddw_s16(a: int32x4_t, b: int16x4_t) -> int32x4_t { + let b: int32x4_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddw_s32(a: int64x2_t, b: int32x2_t) -> int64x2_t { + let b: int64x2_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddw_s8(a: int16x8_t, b: int8x8_t) -> int16x8_t { + let b: int16x8_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddw_u16(a: uint32x4_t, b: uint16x4_t) -> uint32x4_t { + let b: uint32x4_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddw_u32(a: uint64x2_t, b: uint32x2_t) -> uint64x2_t { + let b: uint64x2_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddw_u8(a: uint16x8_t, b: uint8x8_t) -> uint16x8_t { + let b: uint16x8_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vand_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { + simd_and(a, b) +} + +pub fn vandq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { + simd_and(a, b) +} + +pub fn vand_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { + simd_and(a, b) +} + +pub fn vandq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { + simd_and(a, b) +} + +pub fn vand_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { + simd_and(a, b) +} + +pub fn vandq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + simd_and(a, b) +} + +pub fn vand_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t { + simd_and(a, b) +} + +pub fn vandq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t { + simd_and(a, b) +} + +pub fn vand_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + simd_and(a, b) +} + +pub fn vandq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + simd_and(a, b) +} + +pub fn vand_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + simd_and(a, b) +} + +pub fn vandq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + simd_and(a, b) +} + +pub fn vand_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + simd_and(a, b) +} + +pub fn vandq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + simd_and(a, b) +} + +pub fn vand_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t { + simd_and(a, b) +} + +pub fn vandq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { + simd_and(a, b) +} + +pub fn vbic_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { + let c = int16x4_t::splat(-1); + simd_and(simd_xor(b, c), a) +} + +pub fn vbic_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { + let c = int32x2_t::splat(-1); + simd_and(simd_xor(b, c), a) +} + +pub fn vbic_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t { + let c = int64x1_t::splat(-1); + simd_and(simd_xor(b, c), a) +} + +pub fn vbic_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { + let c = int8x8_t::splat(-1); + simd_and(simd_xor(b, c), a) +} + +pub fn vbicq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { + let c = int16x8_t::splat(-1); + simd_and(simd_xor(b, c), a) +} + +pub fn vbicq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + let c = int32x4_t::splat(-1); + simd_and(simd_xor(b, c), a) +} + +pub fn 
vbicq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t { + let c = int64x2_t::splat(-1); + simd_and(simd_xor(b, c), a) +} + +pub fn vbicq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { + let c = int8x16_t::splat(-1); + simd_and(simd_xor(b, c), a) +} + +pub fn vbic_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + let c = int16x4_t::splat(-1); + simd_and(simd_xor(b, simd_cast(c)), a) +} + +pub fn vbic_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + let c = int32x2_t::splat(-1); + simd_and(simd_xor(b, simd_cast(c)), a) +} + +pub fn vbic_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t { + let c = int64x1_t::splat(-1); + simd_and(simd_xor(b, simd_cast(c)), a) +} + +pub fn vbic_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + let c = int8x8_t::splat(-1); + simd_and(simd_xor(b, simd_cast(c)), a) +} + +pub fn vbicq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + let c = int16x8_t::splat(-1); + simd_and(simd_xor(b, simd_cast(c)), a) +} + +pub fn vbicq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + let c = int32x4_t::splat(-1); + simd_and(simd_xor(b, simd_cast(c)), a) +} + +pub fn vbicq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { + let c = int64x2_t::splat(-1); + simd_and(simd_xor(b, simd_cast(c)), a) +} + +pub fn vbicq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + let c = int8x16_t::splat(-1); + simd_and(simd_xor(b, simd_cast(c)), a) +} + +pub fn vbsl_s16(a: uint16x4_t, b: int16x4_t, c: int16x4_t) -> int16x4_t { + let not = int16x4_t::splat(-1); + simd_cast(simd_or( + simd_and(a, simd_cast(b)), + simd_and(simd_xor(a, simd_cast(not)), simd_cast(c)), + )) +} + +pub fn vbsl_s32(a: uint32x2_t, b: int32x2_t, c: int32x2_t) -> int32x2_t { + let not = int32x2_t::splat(-1); + simd_cast(simd_or( + simd_and(a, simd_cast(b)), + simd_and(simd_xor(a, simd_cast(not)), simd_cast(c)), + )) +} + +pub fn vbsl_s64(a: uint64x1_t, b: int64x1_t, c: int64x1_t) -> int64x1_t { + let not = int64x1_t::splat(-1); + simd_cast(simd_or( + simd_and(a, simd_cast(b)), + simd_and(simd_xor(a, simd_cast(not)), simd_cast(c)), + )) +} + +pub fn vbsl_s8(a: uint8x8_t, b: int8x8_t, c: int8x8_t) -> int8x8_t { + let not = int8x8_t::splat(-1); + simd_cast(simd_or( + simd_and(a, simd_cast(b)), + simd_and(simd_xor(a, simd_cast(not)), simd_cast(c)), + )) +} + +pub fn vbslq_s16(a: uint16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t { + let not = int16x8_t::splat(-1); + simd_cast(simd_or( + simd_and(a, simd_cast(b)), + simd_and(simd_xor(a, simd_cast(not)), simd_cast(c)), + )) +} + +pub fn vbslq_s32(a: uint32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t { + let not = int32x4_t::splat(-1); + simd_cast(simd_or( + simd_and(a, simd_cast(b)), + simd_and(simd_xor(a, simd_cast(not)), simd_cast(c)), + )) +} + +pub fn vbslq_s64(a: uint64x2_t, b: int64x2_t, c: int64x2_t) -> int64x2_t { + let not = int64x2_t::splat(-1); + simd_cast(simd_or( + simd_and(a, simd_cast(b)), + simd_and(simd_xor(a, simd_cast(not)), simd_cast(c)), + )) +} + +pub fn vbslq_s8(a: uint8x16_t, b: int8x16_t, c: int8x16_t) -> int8x16_t { + let not = int8x16_t::splat(-1); + simd_cast(simd_or( + simd_and(a, simd_cast(b)), + simd_and(simd_xor(a, simd_cast(not)), simd_cast(c)), + )) +} + +pub fn vbsl_u16(a: uint16x4_t, b: uint16x4_t, c: uint16x4_t) -> uint16x4_t { + let not = int16x4_t::splat(-1); + simd_or( + simd_and(a, simd_cast(b)), + simd_and(simd_xor(a, simd_cast(not)), c), + ) +} + +pub fn vbsl_u32(a: uint32x2_t, b: uint32x2_t, c: uint32x2_t) -> uint32x2_t { + let not = int32x2_t::splat(-1); + simd_or( + simd_and(a, simd_cast(b)), + simd_and(simd_xor(a, 
simd_cast(not)), c), + ) +} + +pub fn vbsl_u64(a: uint64x1_t, b: uint64x1_t, c: uint64x1_t) -> uint64x1_t { + let not = int64x1_t::splat(-1); + simd_or( + simd_and(a, simd_cast(b)), + simd_and(simd_xor(a, simd_cast(not)), c), + ) +} + +pub fn vbsl_u8(a: uint8x8_t, b: uint8x8_t, c: uint8x8_t) -> uint8x8_t { + let not = int8x8_t::splat(-1); + simd_or( + simd_and(a, simd_cast(b)), + simd_and(simd_xor(a, simd_cast(not)), c), + ) +} + +pub fn vbslq_u16(a: uint16x8_t, b: uint16x8_t, c: uint16x8_t) -> uint16x8_t { + let not = int16x8_t::splat(-1); + simd_or( + simd_and(a, simd_cast(b)), + simd_and(simd_xor(a, simd_cast(not)), c), + ) +} + +pub fn vbslq_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t { + let not = int32x4_t::splat(-1); + simd_or( + simd_and(a, simd_cast(b)), + simd_and(simd_xor(a, simd_cast(not)), c), + ) +} + +pub fn vbslq_u64(a: uint64x2_t, b: uint64x2_t, c: uint64x2_t) -> uint64x2_t { + let not = int64x2_t::splat(-1); + simd_or( + simd_and(a, simd_cast(b)), + simd_and(simd_xor(a, simd_cast(not)), c), + ) +} + +pub fn vbslq_u8(a: uint8x16_t, b: uint8x16_t, c: uint8x16_t) -> uint8x16_t { + let not = int8x16_t::splat(-1); + simd_or( + simd_and(a, simd_cast(b)), + simd_and(simd_xor(a, simd_cast(not)), c), + ) +} + +pub fn vceq_s8(a: int8x8_t, b: int8x8_t) -> uint8x8_t { + simd_cast(simd_eq(a, b)) +} + +pub fn vceqq_s8(a: int8x16_t, b: int8x16_t) -> uint8x16_t { + simd_cast(simd_eq(a, b)) +} + +pub fn vceq_s16(a: int16x4_t, b: int16x4_t) -> uint16x4_t { + simd_cast(simd_eq(a, b)) +} + +pub fn vceqq_s16(a: int16x8_t, b: int16x8_t) -> uint16x8_t { + simd_cast(simd_eq(a, b)) +} + +pub fn vceq_s32(a: int32x2_t, b: int32x2_t) -> uint32x2_t { + simd_cast(simd_eq(a, b)) +} + +pub fn vceqq_s32(a: int32x4_t, b: int32x4_t) -> uint32x4_t { + simd_cast(simd_eq(a, b)) +} + +pub fn vceq_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + simd_eq(a, b) +} + +pub fn vceqq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + simd_eq(a, b) +} + +pub fn vceq_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + simd_eq(a, b) +} + +pub fn vceqq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + simd_eq(a, b) +} + +pub fn vceq_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + simd_eq(a, b) +} + +pub fn vceqq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + simd_eq(a, b) +} + +pub fn vcge_s8(a: int8x8_t, b: int8x8_t) -> uint8x8_t { + simd_cast(simd_ge(a, b)) +} + +pub fn vcgeq_s8(a: int8x16_t, b: int8x16_t) -> uint8x16_t { + simd_cast(simd_ge(a, b)) +} + +pub fn vcge_s16(a: int16x4_t, b: int16x4_t) -> uint16x4_t { + simd_cast(simd_ge(a, b)) +} + +pub fn vcgeq_s16(a: int16x8_t, b: int16x8_t) -> uint16x8_t { + simd_cast(simd_ge(a, b)) +} + +pub fn vcge_s32(a: int32x2_t, b: int32x2_t) -> uint32x2_t { + simd_cast(simd_ge(a, b)) +} + +pub fn vcgeq_s32(a: int32x4_t, b: int32x4_t) -> uint32x4_t { + simd_cast(simd_ge(a, b)) +} + +pub fn vcge_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + simd_ge(a, b) +} + +pub fn vcgeq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + simd_ge(a, b) +} + +pub fn vcge_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + simd_ge(a, b) +} + +pub fn vcgeq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + simd_ge(a, b) +} + +pub fn vcge_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + simd_ge(a, b) +} + +pub fn vcgeq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + simd_ge(a, b) +} + +pub fn vcgt_s8(a: int8x8_t, b: int8x8_t) -> uint8x8_t { + simd_cast(simd_gt(a, b)) +} + +pub fn vcgtq_s8(a: int8x16_t, b: int8x16_t) -> uint8x16_t { + simd_cast(simd_gt(a, b)) +} + 
+pub fn vcgt_s16(a: int16x4_t, b: int16x4_t) -> uint16x4_t {
+    simd_cast(simd_gt(a, b))
+}
+
+pub fn vcgtq_s16(a: int16x8_t, b: int16x8_t) -> uint16x8_t {
+    simd_cast(simd_gt(a, b))
+}
+
+pub fn vcgt_s32(a: int32x2_t, b: int32x2_t) -> uint32x2_t {
+    simd_cast(simd_gt(a, b))
+}
+
+pub fn vcgtq_s32(a: int32x4_t, b: int32x4_t) -> uint32x4_t {
+    simd_cast(simd_gt(a, b))
+}
+
+pub fn vcgt_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
+    simd_gt(a, b)
+}
+
+pub fn vcgtq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+    simd_gt(a, b)
+}
+
+pub fn vcgt_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
+    simd_gt(a, b)
+}
+
+pub fn vcgtq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
+    simd_gt(a, b)
+}
+
+pub fn vcgt_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
+    simd_gt(a, b)
+}
+
+pub fn vcgtq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
+    simd_gt(a, b)
+}
+
+pub fn vcle_s8(a: int8x8_t, b: int8x8_t) -> uint8x8_t {
+    simd_cast(simd_le(a, b))
+}
+
+pub fn vcleq_s8(a: int8x16_t, b: int8x16_t) -> uint8x16_t {
+    simd_cast(simd_le(a, b))
+}
+
+pub fn vcle_s16(a: int16x4_t, b: int16x4_t) -> uint16x4_t {
+    simd_cast(simd_le(a, b))
+}
+
+pub fn vcleq_s16(a: int16x8_t, b: int16x8_t) -> uint16x8_t {
+    simd_cast(simd_le(a, b))
+}
+
+pub fn vcle_s32(a: int32x2_t, b: int32x2_t) -> uint32x2_t {
+    simd_cast(simd_le(a, b))
+}
+
+pub fn vcleq_s32(a: int32x4_t, b: int32x4_t) -> uint32x4_t {
+    simd_cast(simd_le(a, b))
+}
+
+pub fn vcle_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
+    simd_le(a, b)
+}
+
+pub fn vcleq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+    simd_le(a, b)
+}
+
+pub fn vcle_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
+    simd_le(a, b)
+}
+
+pub fn vcleq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
+    simd_le(a, b)
+}
+
+pub fn vcle_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
+    simd_le(a, b)
+}
+
+pub fn vcleq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
+    simd_le(a, b)
+}
diff --git a/testable-simd-models/src/core_arch/arm_shared/tests/mod.rs b/testable-simd-models/src/core_arch/arm_shared/tests/mod.rs
new file mode 100644
index 0000000000000..7ec0df1263b7f
--- /dev/null
+++ b/testable-simd-models/src/core_arch/arm_shared/tests/mod.rs
@@ -0,0 +1,112 @@
+//! Tests for intrinsics defined in `crate::core_arch::models::arm_shared`
+//!
+//! Each and every modelled intrinsic is tested against the Rust
+//! implementation here. For the most part, the tests work by
+//! generating random inputs, passing them as arguments
+//! both to the models in this crate and to the corresponding intrinsics
+//! in the Rust core, and then comparing their outputs.
+//!
+//! To add a test for a modelled intrinsic, go to the appropriate file and
+//! use the `mk!` macro to define it.
+//!
+//! A `mk!` macro invocation looks like the following,
+//! `mk!([<number of tests>]<intrinsic name>{<<const value 1>, <const value 2>, ...>}(<argument>: <type>, ...))`
+//!
+//! For example, some valid invocations are
+//!
+//! `mk!([100]_mm256_extracti128_si256{<0>,<1>}(a: BitVec));`
+//! `mk!(_mm256_extracti128_si256{<0>,<1>}(a: BitVec));`
+//! `mk!(_mm256_abs_epi16(a: BitVec));`
+//!
+//! The number of random tests is optional. If not provided, it is taken to be 1000 by default.
+//! The const values are necessary if the function has constant arguments, but should be omitted if not.
+//! The function name and the function arguments are necessary in all cases.
+//!
+//! Note: This only works if the function returns a bit-vector or `FunArray`. If it returns an integer, the
+//! test has to be written manually. It is recommended that the manually defined test follows
+//! the pattern of tests defined via the `mk!` invocation. It is also recommended that, when
+//! the intrinsic takes constant arguments, each and every possible constant value
+//! (up to a maximum of 255) that can be passed to the function be used for testing. The number
+//! of constant values to test depends on whether the Rust intrinsic statically asserts that the
+//! constant argument fits within a certain number of bits.
+
+pub mod neon;
+
+#[allow(non_camel_case_types)]
+mod types {
+    use crate::abstractions::simd::*;
+    pub type int32x4_t = i32x4;
+    pub type int64x1_t = i64x1;
+    pub type int64x2_t = i64x2;
+    pub type int16x8_t = i16x8;
+    pub type int8x16_t = i8x16;
+    pub type uint32x4_t = u32x4;
+    pub type uint64x1_t = u64x1;
+    pub type uint64x2_t = u64x2;
+    pub type uint16x8_t = u16x8;
+    pub type uint8x16_t = u8x16;
+    pub type int32x2_t = i32x2;
+    pub type int16x4_t = i16x4;
+    pub type int8x8_t = i8x8;
+    pub type uint32x2_t = u32x2;
+    pub type uint16x4_t = u16x4;
+    pub type uint8x8_t = u8x8;
+}
+
+pub(crate) mod upstream {
+    #[cfg(target_arch = "aarch64")]
+    pub use core::arch::aarch64::*;
+    #[cfg(target_arch = "arm")]
+    pub use core::arch::arm::*;
+}
+
+#[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
+pub mod conversions {
+    use super::upstream::*;
+
+    use super::types;
+    use crate::abstractions::bitvec::BitVec;
+    use crate::abstractions::funarr::FunArray;
+
+    macro_rules! convert{
+        ($($ty1:ident [$ty2:ty ; $n:literal]),*) => {
+            $(
+                impl From<$ty1> for types::$ty1 {
+                    fn from (arg: $ty1) -> types::$ty1 {
+                        let stuff = unsafe { *(&arg as *const $ty1 as *const [$ty2; $n])};
+                        FunArray::from_fn(|i|
+                            stuff[i as usize]
+                        )
+                    }
+                }
+                impl From<types::$ty1> for $ty1 {
+                    fn from (arg: types::$ty1) -> $ty1 {
+                        let bv: &[u8] = &(BitVec::from(arg)).to_vec()[..];
+                        unsafe {
+                            *(bv.as_ptr() as *const [$ty2; $n] as *const _)
+                        }
+                    }
+                }
+            )*
+        }
+    }
+
+    convert!(
+        int32x4_t [i32; 4],
+        int64x1_t [i64; 1],
+        int64x2_t [i64; 2],
+        int16x8_t [i16; 8],
+        int8x16_t [i8; 16],
+        uint32x4_t [u32; 4],
+        uint64x1_t [u64; 1],
+        uint64x2_t [u64; 2],
+        uint16x8_t [u16; 8],
+        uint8x16_t [u8; 16],
+        int32x2_t [i32; 2],
+        int16x4_t [i16; 4],
+        int8x8_t [i8; 8],
+        uint32x2_t [u32; 2],
+        uint16x4_t [u16; 4],
+        uint8x8_t [u8; 8]
+    );
+}
diff --git a/testable-simd-models/src/core_arch/arm_shared/tests/neon.rs b/testable-simd-models/src/core_arch/arm_shared/tests/neon.rs
new file mode 100644
index 0000000000000..e07d385f656f6
--- /dev/null
+++ b/testable-simd-models/src/core_arch/arm_shared/tests/neon.rs
@@ -0,0 +1,218 @@
+#[cfg(test)]
+use super::upstream;
+use crate::abstractions::funarr::FunArray;
+use crate::helpers::test::HasRandom;
+/// Derives a test for a given intrinsic, checking that the intrinsic and its model compute the same thing over random values (1000 by default).
+macro_rules! mk {
+    ($([$N:literal])?$name:ident$({$(<$($c:literal),*>),*})?($($x:ident : $ty:ident),*)) => {
+        #[test]
+        fn $name() {
+            #[allow(unused)]
+            const N: usize = {
+                let n: usize = 1000;
+                $(let n: usize = $N;)?
+ n + }; + mk!(@[N]$name$($(<$($c),*>)*)?($($x : $ty),*)); + } + }; + (@[$N:ident]$name:ident$(<$($c:literal),*>)?($($x:ident : $ty:ident),*)) => { + for _ in 0..$N { + $(let $x = $ty::random();)* + assert_eq!(super::super::models::neon::$name$(::<$($c,)*>)?($($x.into(),)*), unsafe { + FunArray::from(upstream::$name$(::<$($c,)*>)?($($x.into(),)*)).into() + }); + } + }; + (@[$N:ident]$name:ident<$($c1:literal),*>$(<$($c:literal),*>)*($($x:ident : $ty:ident),*)) => { + let one = || { + mk!(@[$N]$name<$($c1),*>($($x : $ty),*)); + }; + one(); + mk!(@[$N]$name$(<$($c),*>)*($($x : $ty),*)); + } + +} + +use super::types::*; +mk!(vaba_s16(a: int16x4_t, b: int16x4_t, c: int16x4_t)); +mk!(vaba_s32(a: int32x2_t, b: int32x2_t, c: int32x2_t)); +mk!(vaba_s8(a: int8x8_t, b: int8x8_t, c: int8x8_t)); +mk!(vaba_u16(a: uint16x4_t, b: uint16x4_t, c: uint16x4_t)); +mk!(vaba_u32(a: uint32x2_t, b: uint32x2_t, c: uint32x2_t)); +mk!(vaba_u8(a: uint8x8_t, b: uint8x8_t, c: uint8x8_t)); +mk!(vabal_u8(a: uint16x8_t, b: uint8x8_t, c: uint8x8_t)); +mk!(vabal_u16(a: uint32x4_t, b: uint16x4_t, c: uint16x4_t)); +mk!(vabal_u32(a: uint64x2_t, b: uint32x2_t, c: uint32x2_t)); +mk!(vabaq_s16(a: int16x8_t, b: int16x8_t, c: int16x8_t)); +mk!(vabaq_s32(a: int32x4_t, b: int32x4_t, c: int32x4_t)); +mk!(vabaq_s8(a: int8x16_t, b: int8x16_t, c: int8x16_t)); +mk!(vabaq_u16(a: uint16x8_t, b: uint16x8_t, c: uint16x8_t)); +mk!(vabaq_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t)); +mk!(vabaq_u8(a: uint8x16_t, b: uint8x16_t, c: uint8x16_t)); +mk!(vabd_s8(a: int8x8_t, b: int8x8_t)); +mk!(vabdq_s8(a: int8x16_t, b: int8x16_t)); +mk!(vabd_s16(a: int16x4_t, b: int16x4_t)); +mk!(vabdq_s16(a: int16x8_t, b: int16x8_t)); +mk!(vabd_s32(a: int32x2_t, b: int32x2_t)); +mk!(vabdq_s32(a: int32x4_t, b: int32x4_t)); +mk!(vabd_u8(a: uint8x8_t, b: uint8x8_t)); +mk!(vabdq_u8(a: uint8x16_t, b: uint8x16_t)); +mk!(vabd_u16(a: uint16x4_t, b: uint16x4_t)); +mk!(vabdq_u16(a: uint16x8_t, b: uint16x8_t)); +mk!(vabd_u32(a: uint32x2_t, b: uint32x2_t)); +mk!(vabdq_u32(a: uint32x4_t, b: uint32x4_t)); +mk!(vabdl_u8(a: uint8x8_t, b: uint8x8_t)); +mk!(vabdl_u16(a: uint16x4_t, b: uint16x4_t)); +mk!(vabdl_u32(a: uint32x2_t, b: uint32x2_t)); +mk!(vabs_s8(a: int8x8_t)); +mk!(vabsq_s8(a: int8x16_t)); +mk!(vabs_s16(a: int16x4_t)); +mk!(vabsq_s16(a: int16x8_t)); +mk!(vabs_s32(a: int32x2_t)); +mk!(vabsq_s32(a: int32x4_t)); +mk!(vadd_s16(a: int16x4_t, b: int16x4_t)); +mk!(vadd_s32(a: int32x2_t, b: int32x2_t)); +mk!(vadd_s8(a: int8x8_t, b: int8x8_t)); +mk!(vadd_u16(a: uint16x4_t, b: uint16x4_t)); +mk!(vadd_u32(a: uint32x2_t, b: uint32x2_t)); +mk!(vadd_u8(a: uint8x8_t, b: uint8x8_t)); +mk!(vaddq_s16(a: int16x8_t, b: int16x8_t)); +mk!(vaddq_s32(a: int32x4_t, b: int32x4_t)); +mk!(vaddq_s64(a: int64x2_t, b: int64x2_t)); +mk!(vaddq_s8(a: int8x16_t, b: int8x16_t)); +mk!(vaddq_u16(a: uint16x8_t, b: uint16x8_t)); +mk!(vaddq_u32(a: uint32x4_t, b: uint32x4_t)); +mk!(vaddq_u64(a: uint64x2_t, b: uint64x2_t)); +mk!(vaddq_u8(a: uint8x16_t, b: uint8x16_t)); +mk!(vaddhn_high_s16(r: int8x8_t, a: int16x8_t, b: int16x8_t)); +mk!(vaddhn_high_s32(r: int16x4_t, a: int32x4_t, b: int32x4_t)); +mk!(vaddhn_high_s64(r: int32x2_t, a: int64x2_t, b: int64x2_t)); +mk!(vaddhn_high_u16(r: uint8x8_t, a: uint16x8_t, b: uint16x8_t)); +mk!(vaddhn_high_u32(r: uint16x4_t, a: uint32x4_t, b: uint32x4_t)); +mk!(vaddhn_high_u64(r: uint32x2_t, a: uint64x2_t, b: uint64x2_t)); +mk!(vaddhn_s16(a: int16x8_t, b: int16x8_t)); +mk!(vaddhn_s32(a: int32x4_t, b: int32x4_t)); +mk!(vaddhn_s64(a: int64x2_t, b: int64x2_t)); 
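An editorial aside on the note in the tests-module documentation above: intrinsics that return a plain integer cannot be covered by `mk!` and need a hand-written test following the same generate-random-inputs-and-compare pattern. The sketch below is hypothetical and self-contained; it uses a toy model, a stand-in reference function, and a small inline generator rather than the crate's `HasRandom` and `upstream` helpers.

```rust
// Hypothetical sketch of a manually written test for an integer-returning
// operation, mirroring the structure of the `mk!`-generated tests.
fn model_sum_u8(a: [u8; 8]) -> u16 {
    // Toy "model": widening horizontal add across lanes.
    a.iter().map(|&x| x as u16).sum()
}

fn reference_sum_u8(a: [u8; 8]) -> u16 {
    // Stand-in for the upstream intrinsic the model would be compared against.
    a.iter().fold(0u16, |acc, &x| acc + x as u16)
}

#[test]
fn manual_integer_returning_test() {
    // Small deterministic generator so the sketch has no external dependencies.
    let mut state: u32 = 0x1234_5678;
    let mut next_u8 = move || {
        state = state.wrapping_mul(1_664_525).wrapping_add(1_013_904_223);
        (state >> 24) as u8
    };
    for _ in 0..1000 {
        let a: [u8; 8] = core::array::from_fn(|_| next_u8());
        assert_eq!(model_sum_u8(a), reference_sum_u8(a));
    }
}
```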
+mk!(vaddhn_u16(a: uint16x8_t, b: uint16x8_t)); +mk!(vaddhn_u32(a: uint32x4_t, b: uint32x4_t)); +mk!(vaddhn_u64(a: uint64x2_t, b: uint64x2_t)); +mk!(vaddl_high_s16(a: int16x8_t, b: int16x8_t)); +mk!(vaddl_high_s32(a: int32x4_t, b: int32x4_t)); +mk!(vaddl_high_s8(a: int8x16_t, b: int8x16_t)); +mk!(vaddl_high_u16(a: uint16x8_t, b: uint16x8_t)); +mk!(vaddl_high_u32(a: uint32x4_t, b: uint32x4_t)); +mk!(vaddl_high_u8(a: uint8x16_t, b: uint8x16_t)); +mk!(vaddl_s16(a: int16x4_t, b: int16x4_t)); +mk!(vaddl_s32(a: int32x2_t, b: int32x2_t)); +mk!(vaddl_s8(a: int8x8_t, b: int8x8_t)); +mk!(vaddl_u16(a: uint16x4_t, b: uint16x4_t)); +mk!(vaddl_u32(a: uint32x2_t, b: uint32x2_t)); +mk!(vaddl_u8(a: uint8x8_t, b: uint8x8_t)); +mk!(vaddw_high_s16(a: int32x4_t, b: int16x8_t)); +mk!(vaddw_high_s32(a: int64x2_t, b: int32x4_t)); +mk!(vaddw_high_s8(a: int16x8_t, b: int8x16_t)); +mk!(vaddw_high_u16(a: uint32x4_t, b: uint16x8_t)); +mk!(vaddw_high_u32(a: uint64x2_t, b: uint32x4_t)); +mk!(vaddw_high_u8(a: uint16x8_t, b: uint8x16_t)); +mk!(vaddw_s16(a: int32x4_t, b: int16x4_t)); +mk!(vaddw_s32(a: int64x2_t, b: int32x2_t)); +mk!(vaddw_s8(a: int16x8_t, b: int8x8_t)); +mk!(vaddw_u16(a: uint32x4_t, b: uint16x4_t)); +mk!(vaddw_u32(a: uint64x2_t, b: uint32x2_t)); +mk!(vaddw_u8(a: uint16x8_t, b: uint8x8_t)); +mk!(vand_s8(a: int8x8_t, b: int8x8_t)); +mk!(vandq_s8(a: int8x16_t, b: int8x16_t)); +mk!(vand_s16(a: int16x4_t, b: int16x4_t)); +mk!(vandq_s16(a: int16x8_t, b: int16x8_t)); +mk!(vand_s32(a: int32x2_t, b: int32x2_t)); +mk!(vandq_s32(a: int32x4_t, b: int32x4_t)); +mk!(vand_s64(a: int64x1_t, b: int64x1_t)); +mk!(vandq_s64(a: int64x2_t, b: int64x2_t)); +mk!(vand_u8(a: uint8x8_t, b: uint8x8_t)); +mk!(vandq_u8(a: uint8x16_t, b: uint8x16_t)); +mk!(vand_u16(a: uint16x4_t, b: uint16x4_t)); +mk!(vandq_u16(a: uint16x8_t, b: uint16x8_t)); +mk!(vand_u32(a: uint32x2_t, b: uint32x2_t)); +mk!(vandq_u32(a: uint32x4_t, b: uint32x4_t)); +mk!(vand_u64(a: uint64x1_t, b: uint64x1_t)); +mk!(vandq_u64(a: uint64x2_t, b: uint64x2_t)); +mk!(vbic_s16(a: int16x4_t, b: int16x4_t)); +mk!(vbic_s32(a: int32x2_t, b: int32x2_t)); +mk!(vbic_s8(a: int8x8_t, b: int8x8_t)); +mk!(vbicq_s16(a: int16x8_t, b: int16x8_t)); +mk!(vbicq_s32(a: int32x4_t, b: int32x4_t)); +mk!(vbicq_s64(a: int64x2_t, b: int64x2_t)); +mk!(vbicq_s8(a: int8x16_t, b: int8x16_t)); +mk!(vbic_u16(a: uint16x4_t, b: uint16x4_t)); +mk!(vbic_u32(a: uint32x2_t, b: uint32x2_t)); +mk!(vbic_u64(a: uint64x1_t, b: uint64x1_t)); +mk!(vbic_u8(a: uint8x8_t, b: uint8x8_t)); +mk!(vbicq_u16(a: uint16x8_t, b: uint16x8_t)); +mk!(vbicq_u32(a: uint32x4_t, b: uint32x4_t)); +mk!(vbicq_u64(a: uint64x2_t, b: uint64x2_t)); +mk!(vbicq_u8(a: uint8x16_t, b: uint8x16_t)); +mk!(vbsl_s16(a: uint16x4_t, b: int16x4_t, c: int16x4_t)); +mk!(vbsl_s32(a: uint32x2_t, b: int32x2_t, c: int32x2_t)); +mk!(vbsl_s64(a: uint64x1_t, b: int64x1_t, c: int64x1_t)); +mk!(vbsl_s8(a: uint8x8_t, b: int8x8_t, c: int8x8_t)); +mk!(vbslq_s16(a: uint16x8_t, b: int16x8_t, c: int16x8_t)); +mk!(vbslq_s32(a: uint32x4_t, b: int32x4_t, c: int32x4_t)); +mk!(vbslq_s64(a: uint64x2_t, b: int64x2_t, c: int64x2_t)); +mk!(vbslq_s8(a: uint8x16_t, b: int8x16_t, c: int8x16_t)); +mk!(vbsl_u16(a: uint16x4_t, b: uint16x4_t, c: uint16x4_t)); +mk!(vbsl_u32(a: uint32x2_t, b: uint32x2_t, c: uint32x2_t)); +mk!(vbsl_u64(a: uint64x1_t, b: uint64x1_t, c: uint64x1_t)); +mk!(vbsl_u8(a: uint8x8_t, b: uint8x8_t, c: uint8x8_t)); +mk!(vbslq_u16(a: uint16x8_t, b: uint16x8_t, c: uint16x8_t)); +mk!(vbslq_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t)); +mk!(vbslq_u64(a: uint64x2_t, 
b: uint64x2_t, c: uint64x2_t)); +mk!(vbslq_u8(a: uint8x16_t, b: uint8x16_t, c: uint8x16_t)); +mk!(vceq_s8(a: int8x8_t, b: int8x8_t)); +mk!(vceqq_s8(a: int8x16_t, b: int8x16_t)); +mk!(vceq_s16(a: int16x4_t, b: int16x4_t)); +mk!(vceqq_s16(a: int16x8_t, b: int16x8_t)); +mk!(vceq_s32(a: int32x2_t, b: int32x2_t)); +mk!(vceqq_s32(a: int32x4_t, b: int32x4_t)); +mk!(vceq_u8(a: uint8x8_t, b: uint8x8_t)); +mk!(vceqq_u8(a: uint8x16_t, b: uint8x16_t)); +mk!(vceq_u16(a: uint16x4_t, b: uint16x4_t)); +mk!(vceqq_u16(a: uint16x8_t, b: uint16x8_t)); +mk!(vceq_u32(a: uint32x2_t, b: uint32x2_t)); +mk!(vceqq_u32(a: uint32x4_t, b: uint32x4_t)); +mk!(vcge_s8(a: int8x8_t, b: int8x8_t)); +mk!(vcgeq_s8(a: int8x16_t, b: int8x16_t)); +mk!(vcge_s16(a: int16x4_t, b: int16x4_t)); +mk!(vcgeq_s16(a: int16x8_t, b: int16x8_t)); +mk!(vcge_s32(a: int32x2_t, b: int32x2_t)); +mk!(vcgeq_s32(a: int32x4_t, b: int32x4_t)); +mk!(vcge_u8(a: uint8x8_t, b: uint8x8_t)); +mk!(vcgeq_u8(a: uint8x16_t, b: uint8x16_t)); +mk!(vcge_u16(a: uint16x4_t, b: uint16x4_t)); +mk!(vcgeq_u16(a: uint16x8_t, b: uint16x8_t)); +mk!(vcge_u32(a: uint32x2_t, b: uint32x2_t)); +mk!(vcgeq_u32(a: uint32x4_t, b: uint32x4_t)); +mk!(vcgt_s8(a: int8x8_t, b: int8x8_t)); +mk!(vcgtq_s8(a: int8x16_t, b: int8x16_t)); +mk!(vcgt_s16(a: int16x4_t, b: int16x4_t)); +mk!(vcgtq_s16(a: int16x8_t, b: int16x8_t)); +mk!(vcgt_s32(a: int32x2_t, b: int32x2_t)); +mk!(vcgtq_s32(a: int32x4_t, b: int32x4_t)); +mk!(vcgt_u8(a: uint8x8_t, b: uint8x8_t)); +mk!(vcgtq_u8(a: uint8x16_t, b: uint8x16_t)); +mk!(vcgt_u16(a: uint16x4_t, b: uint16x4_t)); +mk!(vcgtq_u16(a: uint16x8_t, b: uint16x8_t)); +mk!(vcgt_u32(a: uint32x2_t, b: uint32x2_t)); +mk!(vcgtq_u32(a: uint32x4_t, b: uint32x4_t)); +mk!(vcle_s8(a: int8x8_t, b: int8x8_t)); +mk!(vcleq_s8(a: int8x16_t, b: int8x16_t)); +mk!(vcle_s16(a: int16x4_t, b: int16x4_t)); +mk!(vcleq_s16(a: int16x8_t, b: int16x8_t)); +mk!(vcle_s32(a: int32x2_t, b: int32x2_t)); +mk!(vcleq_s32(a: int32x4_t, b: int32x4_t)); +mk!(vcle_u8(a: uint8x8_t, b: uint8x8_t)); +mk!(vcleq_u8(a: uint8x16_t, b: uint8x16_t)); +mk!(vcle_u16(a: uint16x4_t, b: uint16x4_t)); +mk!(vcleq_u16(a: uint16x8_t, b: uint16x8_t)); +mk!(vcle_u32(a: uint32x2_t, b: uint32x2_t)); +mk!(vcleq_u32(a: uint32x4_t, b: uint32x4_t)); diff --git a/testable-simd-models/src/core_arch/x86/mod.rs b/testable-simd-models/src/core_arch/x86/mod.rs new file mode 100644 index 0000000000000..3c5cd51d9c56b --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/mod.rs @@ -0,0 +1,4 @@ +pub mod models; +#[cfg(test)] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +mod tests; diff --git a/testable-simd-models/src/core_arch/x86/models/avx.rs b/testable-simd-models/src/core_arch/x86/models/avx.rs new file mode 100644 index 0000000000000..8e2fb37319d36 --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/models/avx.rs @@ -0,0 +1,1828 @@ +//! Advanced Vector Extensions (AVX) +//! +//! The references are: +//! +//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2: +//! Instruction Set Reference, A-Z][intel64_ref]. - [AMD64 Architecture +//! Programmer's Manual, Volume 3: General-Purpose and System +//! Instructions][amd64_ref]. +//! +//! [Wikipedia][wiki] provides a quick overview of the instructions available. +//! +//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf +//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf +//! 
[wiki]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions + +use super::avx_handwritten::*; +use super::sse::*; +use super::sse2::*; +use super::types::*; +use crate::abstractions::simd::*; +use crate::abstractions::utilities::*; + +/// Adds packed double-precision (64-bit) floating-point elements +/// in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_add_pd) +// NOTE: Not modeled yet +// pub fn _mm256_add_pd(a: __m256d, b: __m256d) -> __m256d { +// { transmute(simd_add(a.as_f64x4(), b.as_f64x4())) } +// } + +/// Adds packed single-precision (32-bit) floating-point elements in `a` and +/// `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_add_ps) +// NOTE: Not modeled yet +// pub fn _mm256_add_ps(a: __m256, b: __m256) -> __m256 { +// { transmute(simd_add(a.as_f32x8(), b.as_f32x8())) } +// } + +/// Computes the bitwise AND of a packed double-precision (64-bit) +/// floating-point elements in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_and_pd) +pub fn _mm256_and_pd(a: __m256d, b: __m256d) -> __m256d { + { + let a: u64x4 = transmute(a); + let b: u64x4 = transmute(b); + transmute(simd_and(a, b)) + } +} +/// Computes the bitwise AND of packed single-precision (32-bit) floating-point +/// elements in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_and_ps) +pub fn _mm256_and_ps(a: __m256, b: __m256) -> __m256 { + { + let a: u32x8 = transmute(a); + let b: u32x8 = transmute(b); + transmute(simd_and(a, b)) + } +} +/// Computes the bitwise OR packed double-precision (64-bit) floating-point +/// elements in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_or_pd) +pub fn _mm256_or_pd(a: __m256d, b: __m256d) -> __m256d { + { + let a: u64x4 = transmute(a); + let b: u64x4 = transmute(b); + transmute(simd_or(a, b)) + } +} +/// Computes the bitwise OR packed single-precision (32-bit) floating-point +/// elements in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_or_ps) +pub fn _mm256_or_ps(a: __m256, b: __m256) -> __m256 { + { + let a: u32x8 = transmute(a); + let b: u32x8 = transmute(b); + transmute(simd_or(a, b)) + } +} +/// Shuffles double-precision (64-bit) floating-point elements within 128-bit +/// lanes using the control in `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_shuffle_pd) +pub fn _mm256_shuffle_pd(a: __m256d, b: __m256d) -> __m256d { + static_assert_uimm_bits!(MASK, 8); + { + transmute(simd_shuffle( + a.as_f64x4(), + b.as_f64x4(), + [ + MASK as u32 & 0b1, + ((MASK as u32 >> 1) & 0b1) + 4, + ((MASK as u32 >> 2) & 0b1) + 2, + ((MASK as u32 >> 3) & 0b1) + 6, + ], + )) + } +} +/// Shuffles single-precision (32-bit) floating-point elements in `a` within +/// 128-bit lanes using the control in `imm8`. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_shuffle_ps) +pub fn _mm256_shuffle_ps(a: __m256, b: __m256) -> __m256 { + static_assert_uimm_bits!(MASK, 8); + { + transmute(simd_shuffle( + a.as_f32x8(), + b.as_f32x8(), + [ + MASK as u32 & 0b11, + (MASK as u32 >> 2) & 0b11, + ((MASK as u32 >> 4) & 0b11) + 8, + ((MASK as u32 >> 6) & 0b11) + 8, + (MASK as u32 & 0b11) + 4, + ((MASK as u32 >> 2) & 0b11) + 4, + ((MASK as u32 >> 4) & 0b11) + 12, + ((MASK as u32 >> 6) & 0b11) + 12, + ], + )) + } +} +/// Computes the bitwise NOT of packed double-precision (64-bit) floating-point +/// elements in `a`, and then AND with `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_andnot_pd) +pub fn _mm256_andnot_pd(a: __m256d, b: __m256d) -> __m256d { + { + let a: u64x4 = transmute(a); + let b: u64x4 = transmute(b); + transmute(simd_and(simd_xor(u64x4::splat(!(0_u64)), a), b)) + } +} +/// Computes the bitwise NOT of packed single-precision (32-bit) floating-point +/// elements in `a` +/// and then AND with `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_andnot_ps) +pub fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 { + { + let a: u32x8 = transmute(a); + let b: u32x8 = transmute(b); + transmute(simd_and(simd_xor(u32x8::splat(!(0_u32)), a), b)) + } +} +/// Compares packed double-precision (64-bit) floating-point elements +/// in `a` and `b`, and returns packed maximum values +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_max_pd) +// NOTE: Not modeled yet +// pub fn _mm256_max_pd(a: __m256d, b: __m256d) -> __m256d { +// { vmaxpd(a, b) } +// } + +/// Compares packed single-precision (32-bit) floating-point elements in `a` +/// and `b`, and returns packed maximum values +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_max_ps) +// NOTE: Not modeled yet +// pub fn _mm256_max_ps(a: __m256, b: __m256) -> __m256 { +// { vmaxps(a, b) } +// } + +/// Compares packed double-precision (64-bit) floating-point elements +/// in `a` and `b`, and returns packed minimum values +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_min_pd) +// NOTE: Not modeled yet +// pub fn _mm256_min_pd(a: __m256d, b: __m256d) -> __m256d { +// { vminpd(a, b) } +// } + +/// Compares packed single-precision (32-bit) floating-point elements in `a` +/// and `b`, and returns packed minimum values +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_min_ps) +// NOTE: Not modeled yet +// pub fn _mm256_min_ps(a: __m256, b: __m256) -> __m256 { +// { vminps(a, b) } +// } + +/// Multiplies packed double-precision (64-bit) floating-point elements +/// in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_mul_pd) +// NOTE: Not modeled yet +// pub fn _mm256_mul_pd(a: __m256d, b: __m256d) -> __m256d { +// { transmute(simd_mul(a.as_f64x4(), b.as_f64x4())) } +// } + +/// Multiplies packed single-precision (32-bit) floating-point elements in `a` and +/// `b`. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_mul_ps) +// NOTE: Not modeled yet +// pub fn _mm256_mul_ps(a: __m256, b: __m256) -> __m256 { +// { transmute(simd_mul(a.as_f32x8(), b.as_f32x8())) } +// } + +/// Alternatively adds and subtracts packed double-precision (64-bit) +/// floating-point elements in `a` to/from packed elements in `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_addsub_pd) +// NOTE: Not modeled yet +// pub fn _mm256_addsub_pd(a: __m256d, b: __m256d) -> __m256d { +// { +// let a = a.as_f64x4(); +// let b = b.as_f64x4(); +// let add = simd_add(a, b); +// let sub = simd_sub(a, b); +// simd_shuffle(add, sub, [4, 1, 6, 3]) +// } +// } + +/// Alternatively adds and subtracts packed single-precision (32-bit) +/// floating-point elements in `a` to/from packed elements in `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_addsub_ps) +// NOTE: Not modeled yet +// pub fn _mm256_addsub_ps(a: __m256, b: __m256) -> __m256 { +// { +// let a = a.as_f32x8(); +// let b = b.as_f32x8(); +// let add = simd_add(a, b); +// let sub = simd_sub(a, b); +// simd_shuffle(add, sub, [8, 1, 10, 3, 12, 5, 14, 7]) +// } +// } + +/// Subtracts packed double-precision (64-bit) floating-point elements in `b` +/// from packed elements in `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_sub_pd) +// NOTE: Not modeled yet +// pub fn _mm256_sub_pd(a: __m256d, b: __m256d) -> __m256d { +// { simd_sub(a, b) } +// } + +/// Subtracts packed single-precision (32-bit) floating-point elements in `b` +/// from packed elements in `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_sub_ps) +// NOTE: Not modeled yet +// pub fn _mm256_sub_ps(a: __m256, b: __m256) -> __m256 { +// { simd_sub(a, b) } +// } + +/// Computes the division of each of the 8 packed 32-bit floating-point elements +/// in `a` by the corresponding packed elements in `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_div_ps) +// NOTE: Not modeled yet +// pub fn _mm256_div_ps(a: __m256, b: __m256) -> __m256 { +// { simd_div(a, b) } +// } + +/// Computes the division of each of the 4 packed 64-bit floating-point elements +/// in `a` by the corresponding packed elements in `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_div_pd) +// NOTE: Not modeled yet +// pub fn _mm256_div_pd(a: __m256d, b: __m256d) -> __m256d { +// { simd_div(a, b) } +// } + +/// Rounds packed double-precision (64-bit) floating point elements in `a` +/// according to the flag `ROUNDING`. The value of `ROUNDING` may be as follows: +/// +/// - `0x00`: Round to the nearest whole number. +/// - `0x01`: Round down, toward negative infinity. +/// - `0x02`: Round up, toward positive infinity. +/// - `0x03`: Truncate the values. +/// +/// For a complete list of options, check [the LLVM docs][llvm_docs]. 
+/// +/// [llvm_docs]: https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382 +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_round_pd) +// NOTE: Not modeled yet +// pub fn _mm256_round_pd(a: __m256d) -> __m256d { +// static_assert_uimm_bits!(ROUNDING, 4); +// { roundpd256(a, ROUNDING) } +// } + +/// Rounds packed double-precision (64-bit) floating point elements in `a` +/// toward positive infinity. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_ceil_pd) +// NOTE: Not modeled yet +// pub fn _mm256_ceil_pd(a: __m256d) -> __m256d { +// { simd_ceil(a) } +// } + +/// Rounds packed double-precision (64-bit) floating point elements in `a` +/// toward negative infinity. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_floor_pd) +// NOTE: Not modeled yet +// pub fn _mm256_floor_pd(a: __m256d) -> __m256d { +// { simd_floor(a) } +// } + +/// Rounds packed single-precision (32-bit) floating point elements in `a` +/// according to the flag `ROUNDING`. The value of `ROUNDING` may be as follows: +/// +/// - `0x00`: Round to the nearest whole number. +/// - `0x01`: Round down, toward negative infinity. +/// - `0x02`: Round up, toward positive infinity. +/// - `0x03`: Truncate the values. +/// +/// For a complete list of options, check [the LLVM docs][llvm_docs]. +/// +/// [llvm_docs]: https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382 +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_round_ps) +// NOTE: Not modeled yet +// pub fn _mm256_round_ps(a: __m256) -> __m256 { +// static_assert_uimm_bits!(ROUNDING, 4); +// { roundps256(a, ROUNDING) } +// } + +/// Rounds packed single-precision (32-bit) floating point elements in `a` +/// toward positive infinity. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_ceil_ps) +// NOTE: Not modeled yet +// pub fn _mm256_ceil_ps(a: __m256) -> __m256 { +// { simd_ceil(a) } +// } + +/// Rounds packed single-precision (32-bit) floating point elements in `a` +/// toward negative infinity. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_floor_ps) +// NOTE: Not modeled yet +// pub fn _mm256_floor_ps(a: __m256) -> __m256 { +// { simd_floor(a) } +// } + +/// Returns the square root of packed single-precision (32-bit) floating point +/// elements in `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_sqrt_ps) +// NOTE: Not modeled yet +// pub fn _mm256_sqrt_ps(a: __m256) -> __m256 { +// { simd_fsqrt(a) } +// } + +/// Returns the square root of packed double-precision (64-bit) floating point +/// elements in `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_sqrt_pd) +// NOTE: Not modeled yet +// pub fn _mm256_sqrt_pd(a: __m256d) -> __m256d { +// { simd_fsqrt(a) } +// } + +/// Blends packed double-precision (64-bit) floating-point elements from +/// `a` and `b` using control mask `imm8`. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_blend_pd) +pub fn _mm256_blend_pd(a: __m256d, b: __m256d) -> __m256d { + static_assert_uimm_bits!(IMM4, 4); + { + transmute(simd_shuffle( + a.as_f64x4(), + b.as_f64x4(), + [ + ((IMM4 as u32 >> 0) & 1) * 4 + 0, + ((IMM4 as u32 >> 1) & 1) * 4 + 1, + ((IMM4 as u32 >> 2) & 1) * 4 + 2, + ((IMM4 as u32 >> 3) & 1) * 4 + 3, + ], + )) + } +} +/// Blends packed single-precision (32-bit) floating-point elements from +/// `a` and `b` using control mask `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_blend_ps) +pub fn _mm256_blend_ps(a: __m256, b: __m256) -> __m256 { + static_assert_uimm_bits!(IMM8, 8); + { + transmute(simd_shuffle( + a.as_f32x8(), + b.as_f32x8(), + [ + ((IMM8 as u32 >> 0) & 1) * 8 + 0, + ((IMM8 as u32 >> 1) & 1) * 8 + 1, + ((IMM8 as u32 >> 2) & 1) * 8 + 2, + ((IMM8 as u32 >> 3) & 1) * 8 + 3, + ((IMM8 as u32 >> 4) & 1) * 8 + 4, + ((IMM8 as u32 >> 5) & 1) * 8 + 5, + ((IMM8 as u32 >> 6) & 1) * 8 + 6, + ((IMM8 as u32 >> 7) & 1) * 8 + 7, + ], + )) + } +} +/// Blends packed double-precision (64-bit) floating-point elements from +/// `a` and `b` using `c` as a mask. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_blendv_pd) +pub fn _mm256_blendv_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d { + { + let mask: i64x4 = simd_lt(transmute::<_, i64x4>(c), i64x4::ZERO()); + transmute(simd_select(mask, b.as_f64x4(), a.as_f64x4())) + } +} +/// Blends packed single-precision (32-bit) floating-point elements from +/// `a` and `b` using `c` as a mask. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_blendv_ps) +pub fn _mm256_blendv_ps(a: __m256, b: __m256, c: __m256) -> __m256 { + { + let mask: i32x8 = simd_lt(transmute::<_, i32x8>(c), i32x8::ZERO()); + transmute(simd_select(mask, b.as_f32x8(), a.as_f32x8())) + } +} +/// Conditionally multiplies the packed single-precision (32-bit) floating-point +/// elements in `a` and `b` using the high 4 bits in `imm8`, +/// sum the four products, and conditionally return the sum +/// using the low 4 bits of `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_dp_ps) +// NOTE: Not modeled yet +// pub fn _mm256_dp_ps(a: __m256, b: __m256) -> __m256 { +// static_assert_uimm_bits!(IMM8, 8); +// { vdpps(a, b, IMM8 as i8) } +// } + +/// Horizontal addition of adjacent pairs in the two packed vectors +/// of 4 64-bit floating points `a` and `b`. +/// In the result, sums of elements from `a` are returned in even locations, +/// while sums of elements from `b` are returned in odd locations. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_hadd_pd) +// NOTE: Not modeled yet +// pub fn _mm256_hadd_pd(a: __m256d, b: __m256d) -> __m256d { +// { vhaddpd(a, b) } +// } + +/// Horizontal addition of adjacent pairs in the two packed vectors +/// of 8 32-bit floating points `a` and `b`. +/// In the result, sums of elements from `a` are returned in locations of +/// indices 0, 1, 4, 5; while sums of elements from `b` are locations +/// 2, 3, 6, 7. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_ps) +// NOTE: Not modeled yet +// pub fn _mm256_hadd_ps(a: __m256, b: __m256) -> __m256 { +// { vhaddps(a, b) } +// } + +/// Horizontal subtraction of adjacent pairs in the two packed vectors +/// of 4 64-bit floating points `a` and `b`. +/// In the result, differences of elements from `a` are returned in even locations, +/// while differences of elements from `b` are returned in odd locations. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_pd) +// NOTE: Not modeled yet +// pub fn _mm256_hsub_pd(a: __m256d, b: __m256d) -> __m256d { +// { vhsubpd(a, b) } +// } + +/// Horizontal subtraction of adjacent pairs in the two packed vectors +/// of 8 32-bit floating points `a` and `b`. +/// In the result, differences of elements from `a` are returned in locations of +/// indices 0, 1, 4, 5; while differences of elements from `b` are in locations +/// 2, 3, 6, 7. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_ps) +// NOTE: Not modeled yet +// pub fn _mm256_hsub_ps(a: __m256, b: __m256) -> __m256 { +// { vhsubps(a, b) } +// } + +/// Computes the bitwise XOR of packed double-precision (64-bit) floating-point +/// elements in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_xor_pd) +pub fn _mm256_xor_pd(a: __m256d, b: __m256d) -> __m256d { + { + let a: u64x4 = transmute(a); + let b: u64x4 = transmute(b); + transmute(simd_xor(a, b)) + } +} +/// Computes the bitwise XOR of packed single-precision (32-bit) floating-point +/// elements in `a` and `b`.
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_xor_ps) +pub fn _mm256_xor_ps(a: __m256, b: __m256) -> __m256 { + { + let a: u32x8 = transmute(a); + let b: u32x8 = transmute(b); + transmute(simd_xor(a, b)) + } +} +/// Equal (ordered, non-signaling) +pub const _CMP_EQ_OQ: i32 = 0x00; +/// Less-than (ordered, signaling) +pub const _CMP_LT_OS: i32 = 0x01; +/// Less-than-or-equal (ordered, signaling) +pub const _CMP_LE_OS: i32 = 0x02; +/// Unordered (non-signaling) +pub const _CMP_UNORD_Q: i32 = 0x03; +/// Not-equal (unordered, non-signaling) +pub const _CMP_NEQ_UQ: i32 = 0x04; +/// Not-less-than (unordered, signaling) +pub const _CMP_NLT_US: i32 = 0x05; +/// Not-less-than-or-equal (unordered, signaling) +pub const _CMP_NLE_US: i32 = 0x06; +/// Ordered (non-signaling) +pub const _CMP_ORD_Q: i32 = 0x07; +/// Equal (unordered, non-signaling) +pub const _CMP_EQ_UQ: i32 = 0x08; +/// Not-greater-than-or-equal (unordered, signaling) +pub const _CMP_NGE_US: i32 = 0x09; +/// Not-greater-than (unordered, signaling) +pub const _CMP_NGT_US: i32 = 0x0a; +/// False (ordered, non-signaling) +pub const _CMP_FALSE_OQ: i32 = 0x0b; +/// Not-equal (ordered, non-signaling) +pub const _CMP_NEQ_OQ: i32 = 0x0c; +/// Greater-than-or-equal (ordered, signaling) +pub const _CMP_GE_OS: i32 = 0x0d; +/// Greater-than (ordered, signaling) +pub const _CMP_GT_OS: i32 = 0x0e; +/// True (unordered, non-signaling) +pub const _CMP_TRUE_UQ: i32 = 0x0f; +/// Equal (ordered, signaling) +pub const _CMP_EQ_OS: i32 = 0x10; +/// Less-than (ordered, non-signaling) +pub const _CMP_LT_OQ: i32 = 0x11; +/// Less-than-or-equal (ordered, non-signaling) +pub const _CMP_LE_OQ: i32 = 0x12; +/// Unordered (signaling) +pub const _CMP_UNORD_S: i32 = 0x13; +/// Not-equal (unordered, signaling) +pub const _CMP_NEQ_US: i32 = 0x14; +/// Not-less-than (unordered, non-signaling) +pub const _CMP_NLT_UQ: i32 = 0x15; +/// Not-less-than-or-equal (unordered, non-signaling) +pub const _CMP_NLE_UQ: i32 = 0x16; +/// Ordered (signaling) +pub const _CMP_ORD_S: i32 = 0x17; +/// Equal (unordered, signaling) +pub const _CMP_EQ_US: i32 = 0x18; +/// Not-greater-than-or-equal (unordered, non-signaling) +pub const _CMP_NGE_UQ: i32 = 0x19; +/// Not-greater-than (unordered, non-signaling) +pub const _CMP_NGT_UQ: i32 = 0x1a; +/// False (ordered, signaling) +pub const _CMP_FALSE_OS: i32 = 0x1b; +/// Not-equal (ordered, signaling) +pub const _CMP_NEQ_OS: i32 = 0x1c; +/// Greater-than-or-equal (ordered, non-signaling) +pub const _CMP_GE_OQ: i32 = 0x1d; +/// Greater-than (ordered, non-signaling) +pub const _CMP_GT_OQ: i32 = 0x1e; +/// True (unordered, signaling) +pub const _CMP_TRUE_US: i32 = 0x1f; +/// Compares packed double-precision (64-bit) floating-point +/// elements in `a` and `b` based on the comparison operand +/// specified by `IMM5`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm_cmp_pd) +// NOTE: Not modeled yet +// pub fn _mm_cmp_pd(a: __m128d, b: __m128d) -> __m128d { +// static_assert_uimm_bits!(IMM5, 5); +// { vcmppd(a, b, const { IMM5 as i8 }) } +// } + +/// Compares packed double-precision (64-bit) floating-point +/// elements in `a` and `b` based on the comparison operand +/// specified by `IMM5`. 
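The XOR models above reinterpret the float lanes as unsigned integers before calling `simd_xor`, because XOR is defined on bit patterns rather than on floating-point values. A scalar sketch of the same idea using `to_bits`/`from_bits` (illustrative only, not the crate's API):

```rust
/// Scalar reference for the lane-wise behaviour of the `_mm256_xor_ps`
/// model: reinterpret each f32 as raw bits, XOR, reinterpret back.
fn xor_f32(a: f32, b: f32) -> f32 {
    f32::from_bits(a.to_bits() ^ b.to_bits())
}

fn main() {
    // XOR-ing with a sign-bit mask flips the sign without touching the
    // other bits, a common use of the *_xor_ps intrinsics.
    let sign_mask = f32::from_bits(0x8000_0000);
    assert_eq!(xor_f32(1.5, sign_mask), -1.5);
    // XOR-ing a value with itself gives the all-zero bit pattern.
    assert_eq!(xor_f32(42.0, 42.0).to_bits(), 0);
}
```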
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_cmp_pd) +// NOTE: Not modeled yet +// pub fn _mm256_cmp_pd(a: __m256d, b: __m256d) -> __m256d { +// static_assert_uimm_bits!(IMM5, 5); +// { vcmppd256(a, b, IMM5 as u8) } +// } + +/// Compares packed single-precision (32-bit) floating-point +/// elements in `a` and `b` based on the comparison operand +/// specified by `IMM5`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm_cmp_ps) +// NOTE: Not modeled yet +// pub fn _mm_cmp_ps(a: __m128, b: __m128) -> __m128 { +// static_assert_uimm_bits!(IMM5, 5); +// { vcmpps(a, b, const { IMM5 as i8 }) } +// } + +/// Compares packed single-precision (32-bit) floating-point +/// elements in `a` and `b` based on the comparison operand +/// specified by `IMM5`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_cmp_ps) +// NOTE: Not modeled yet +// pub fn _mm256_cmp_ps(a: __m256, b: __m256) -> __m256 { +// static_assert_uimm_bits!(IMM5, 5); +// { vcmpps256(a, b, const { IMM5 as u8 }) } +// } + +/// Compares the lower double-precision (64-bit) floating-point element in +/// `a` and `b` based on the comparison operand specified by `IMM5`, +/// store the result in the lower element of returned vector, +/// and copies the upper element from `a` to the upper element of returned +/// vector. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm_cmp_sd) +// NOTE: Not modeled yet +// pub fn _mm_cmp_sd(a: __m128d, b: __m128d) -> __m128d { +// static_assert_uimm_bits!(IMM5, 5); +// { vcmpsd(a, b, IMM5 as i8) } +// } + +/// Compares the lower single-precision (32-bit) floating-point element in +/// `a` and `b` based on the comparison operand specified by `IMM5`, +/// store the result in the lower element of returned vector, +/// and copies the upper 3 packed elements from `a` to the upper elements of +/// returned vector. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm_cmp_ss) +// NOTE: Not modeled yet +// pub fn _mm_cmp_ss(a: __m128, b: __m128) -> __m128 { +// static_assert_uimm_bits!(IMM5, 5); +// { vcmpss(a, b, IMM5 as i8) } +// } + +/// Converts packed 32-bit integers in `a` to packed double-precision (64-bit) +/// floating-point elements. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_cvtepi32_pd) +pub fn _mm256_cvtepi32_pd(a: __m128i) -> __m256d { + transmute(simd_cast::<4, i32, f64>(a.as_i32x4())) +} +/// Converts packed 32-bit integers in `a` to packed single-precision (32-bit) +/// floating-point elements. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_cvtepi32_ps) +pub fn _mm256_cvtepi32_ps(a: __m256i) -> __m256 { + transmute(simd_cast::<8, _, f32>(a.as_i32x8())) +} +/// Converts packed double-precision (64-bit) floating-point elements in `a` +/// to packed single-precision (32-bit) floating-point elements. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_cvtpd_ps) +pub fn _mm256_cvtpd_ps(a: __m256d) -> __m128 { + transmute(simd_cast::<4, _, f32>(a.as_f64x4())) +} +/// Converts packed single-precision (32-bit) floating-point elements in `a` +/// to packed 32-bit integers. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_cvtps_epi32) +// NOTE: Not modeled yet +// pub fn _mm256_cvtps_epi32(a: __m256) -> __m256i { +// { transmute(vcvtps2dq(a)) } +// } + +/// Converts packed single-precision (32-bit) floating-point elements in `a` +/// to packed double-precision (64-bit) floating-point elements. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_cvtps_pd) +pub fn _mm256_cvtps_pd(a: __m128) -> __m256d { + transmute(simd_cast::<4, _, f64>(a.as_f32x4())) +} +/// Returns the first element of the input vector of `[4 x double]`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_cvtsd_f64) +pub fn _mm256_cvtsd_f64(a: __m256d) -> f64 { + simd_extract(a.as_f64x4(), 0) +} + +/// Converts packed double-precision (64-bit) floating-point elements in `a` +/// to packed 32-bit integers with truncation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_cvttpd_epi32) +// NOTE: Not modeled yet +// pub fn _mm256_cvttpd_epi32(a: __m256d) -> __m128i { +// { transmute(vcvttpd2dq(a)) } +// } + +/// Converts packed double-precision (64-bit) floating-point elements in `a` +/// to packed 32-bit integers. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_cvtpd_epi32) +// NOTE: Not modeled yet +// pub fn _mm256_cvtpd_epi32(a: __m256d) -> __m128i { +// { transmute(vcvtpd2dq(a)) } +// } + +/// Converts packed single-precision (32-bit) floating-point elements in `a` +/// to packed 32-bit integers with truncation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_cvttps_epi32) +// NOTE: Not modeled yet +// pub fn _mm256_cvttps_epi32(a: __m256) -> __m256i { +// { transmute(vcvttps2dq(a)) } +// } + +/// Extracts 128 bits (composed of 4 packed single-precision (32-bit) +/// floating-point elements) from `a`, selected with `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_extractf128_ps) +pub fn _mm256_extractf128_ps(a: __m256) -> __m128 { + static_assert_uimm_bits!(IMM1, 1); + { + transmute(simd_shuffle( + a.as_f32x8(), + _mm256_undefined_ps().as_f32x8(), + [[0, 1, 2, 3], [4, 5, 6, 7]][IMM1 as usize], + )) + } +} +/// Extracts 128 bits (composed of 2 packed double-precision (64-bit) +/// floating-point elements) from `a`, selected with `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_extractf128_pd) +pub fn _mm256_extractf128_pd(a: __m256d) -> __m128d { + static_assert_uimm_bits!(IMM1, 1); + transmute(simd_shuffle( + a.as_f64x4(), + _mm256_undefined_pd().as_f64x4(), + [[0, 1], [2, 3]][IMM1 as usize], + )) +} +/// Extracts 128 bits (composed of integer data) from `a`, selected with `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_extractf128_si256) +pub fn _mm256_extractf128_si256(a: __m256i) -> __m128i { + static_assert_uimm_bits!(IMM1, 1); + { + let dst: i64x2 = simd_shuffle(a.as_i64x4(), i64x4::ZERO(), [[0, 1], [2, 3]][IMM1 as usize]); + transmute(dst) + } +} +/// Extracts a 32-bit integer from `a`, selected with `INDEX`. 
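The `_mm256_extractf128_*` models above reduce to picking either the low or the high 128-bit half, i.e. lanes `0..4` or `4..8` of an 8-lane single-precision vector. A standalone scalar sketch (helper name and arrays are illustrative):

```rust
/// Scalar reference for the `_mm256_extractf128_ps` model: IMM1 = 0
/// keeps lanes 0..4 (the low 128 bits), IMM1 = 1 keeps lanes 4..8.
fn extract_half(imm1: usize, a: [f32; 8]) -> [f32; 4] {
    let base = imm1 * 4;
    [a[base], a[base + 1], a[base + 2], a[base + 3]]
}

fn main() {
    let a = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0];
    assert_eq!(extract_half(0, a), [0.0, 1.0, 2.0, 3.0]);
    assert_eq!(extract_half(1, a), [4.0, 5.0, 6.0, 7.0]);
}
```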
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_extract_epi32) +pub fn _mm256_extract_epi32(a: __m256i) -> i32 { + static_assert_uimm_bits!(INDEX, 3); + simd_extract(a.as_i32x8(), INDEX as u32) +} +/// Returns the first element of the input vector of `[8 x i32]`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_cvtsi256_si32) +pub fn _mm256_cvtsi256_si32(a: __m256i) -> i32 { + simd_extract(a.as_i32x8(), 0) +} +/// Zeroes the contents of all XMM or YMM registers. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_zeroall) +// NOTE: Not modeled yet +// pub fn _mm256_zeroall() { +// { vzeroall() } +// } + +/// Zeroes the upper 128 bits of all YMM registers; +/// the lower 128-bits of the registers are unmodified. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_zeroupper) +// NOTE: Not modeled yet +// pub fn _mm256_zeroupper() { +// { vzeroupper() } +// } + +/// Shuffles single-precision (32-bit) floating-point elements in `a` +/// within 128-bit lanes using the control in `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_permutevar_ps) +// NOTE: Not modeled yet +// pub fn _mm256_permutevar_ps(a: __m256, b: __m256i) -> __m256 { +// { vpermilps256(a, b.as_i32x8()) } +// } + +/// Shuffles single-precision (32-bit) floating-point elements in `a` +/// using the control in `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm_permutevar_ps) +// NOTE: Not modeled yet +// pub fn _mm_permutevar_ps(a: __m128, b: __m128i) -> __m128 { +// { vpermilps(a, b.as_i32x4()) } +// } + +/// Shuffles single-precision (32-bit) floating-point elements in `a` +/// within 128-bit lanes using the control in `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_permute_ps) +pub fn _mm256_permute_ps(a: __m256) -> __m256 { + static_assert_uimm_bits!(IMM8, 8); + { + transmute(simd_shuffle( + a.as_f32x8(), + _mm256_undefined_ps().as_f32x8(), + [ + (IMM8 as u32 >> 0) & 0b11, + (IMM8 as u32 >> 2) & 0b11, + (IMM8 as u32 >> 4) & 0b11, + (IMM8 as u32 >> 6) & 0b11, + ((IMM8 as u32 >> 0) & 0b11) + 4, + ((IMM8 as u32 >> 2) & 0b11) + 4, + ((IMM8 as u32 >> 4) & 0b11) + 4, + ((IMM8 as u32 >> 6) & 0b11) + 4, + ], + )) + } +} +/// Shuffles single-precision (32-bit) floating-point elements in `a` +/// using the control in `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm_permute_ps) +pub fn _mm_permute_ps(a: __m128) -> __m128 { + static_assert_uimm_bits!(IMM8, 8); + { + transmute(simd_shuffle( + a.as_f32x4(), + _mm_undefined_ps().as_f32x4(), + [ + (IMM8 as u32 >> 0) & 0b11, + (IMM8 as u32 >> 2) & 0b11, + (IMM8 as u32 >> 4) & 0b11, + (IMM8 as u32 >> 6) & 0b11, + ], + )) + } +} + +/// Shuffles double-precision (64-bit) floating-point elements in `a` +/// within 256-bit lanes using the control in `b`. 
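The `_mm256_permute_ps`/`_mm_permute_ps` models above decode `IMM8` as four 2-bit selectors, one per destination lane, applied independently to each 128-bit lane (the `+ 4` offsets handle the upper lane of the 256-bit form). A scalar sketch of the decoding for a single lane (illustrative only, not the crate's API):

```rust
/// Scalar reference for the in-lane shuffle performed by the
/// `_mm_permute_ps` / `_mm256_permute_ps` models: each 2-bit field of
/// `imm8` selects one of the four elements of a 128-bit lane.
fn permute4(imm8: u8, lane: [f32; 4]) -> [f32; 4] {
    let mut out = [0.0; 4];
    for j in 0..4 {
        let sel = ((imm8 >> (2 * j)) & 0b11) as usize;
        out[j] = lane[sel];
    }
    out
}

fn main() {
    let lane = [10.0, 11.0, 12.0, 13.0];
    // 0b00_01_10_11: fields are read from the low bits up, so this
    // immediate reverses the lane.
    assert_eq!(permute4(0b0001_1011, lane), [13.0, 12.0, 11.0, 10.0]);
}
```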
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_permutevar_pd) +// NOTE: Not modeled yet +// pub fn _mm256_permutevar_pd(a: __m256d, b: __m256i) -> __m256d { +// { vpermilpd256(a, b.as_i64x4()) } +// } + +/// Shuffles double-precision (64-bit) floating-point elements in `a` +/// using the control in `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm_permutevar_pd) +// NOTE: Not modeled yet +// pub fn _mm_permutevar_pd(a: __m128d, b: __m128i) -> __m128d { +// { vpermilpd(a, b.as_i64x2()) } +// } + +/// Shuffles double-precision (64-bit) floating-point elements in `a` +/// within 128-bit lanes using the control in `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_permute_pd) +pub fn _mm256_permute_pd(a: __m256d) -> __m256d { + static_assert_uimm_bits!(IMM4, 4); + { + transmute(simd_shuffle( + a.as_f64x4(), + _mm256_undefined_pd().as_f64x4(), + [ + ((IMM4 as u32 >> 0) & 1), + ((IMM4 as u32 >> 1) & 1), + ((IMM4 as u32 >> 2) & 1) + 2, + ((IMM4 as u32 >> 3) & 1) + 2, + ], + )) + } +} +/// Shuffles double-precision (64-bit) floating-point elements in `a` +/// using the control in `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm_permute_pd) +pub fn _mm_permute_pd(a: __m128d) -> __m128d { + static_assert_uimm_bits!(IMM2, 2); + { + transmute(simd_shuffle( + a.as_f64x2(), + _mm_undefined_pd().as_f64x2(), + [(IMM2 as u32) & 1, (IMM2 as u32 >> 1) & 1], + )) + } +} +/// Shuffles 256 bits (composed of 8 packed single-precision (32-bit) +/// floating-point elements) selected by `imm8` from `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_permute2f128_ps) +// NOTE: Not modeled yet +// pub fn _mm256_permute2f128_ps(a: __m256, b: __m256) -> __m256 { +// static_assert_uimm_bits!(IMM8, 8); +// { vperm2f128ps256(a, b, IMM8 as i8) } +// } +/// Shuffles 256 bits (composed of 4 packed double-precision (64-bit) +/// floating-point elements) selected by `imm8` from `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_permute2f128_pd) +// NOTE: Not modeled yet +// pub fn _mm256_permute2f128_pd(a: __m256d, b: __m256d) -> __m256d { +// static_assert_uimm_bits!(IMM8, 8); +// { vperm2f128pd256(a, b, IMM8 as i8) } +// } +/// Shuffles 128-bits (composed of integer data) selected by `imm8` +/// from `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_permute2f128_si256) +pub fn _mm256_permute2f128_si256(a: __m256i, b: __m256i) -> __m256i { + static_assert_uimm_bits!(IMM8, 8); + transmute(vperm2f128si256(a.as_i32x8(), b.as_i32x8(), IMM8 as i8)) +} +/// Broadcasts a single-precision (32-bit) floating-point element from memory +/// to all elements of the returned vector. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_broadcast_ss) +pub fn _mm256_broadcast_ss(f: &f32) -> __m256 { + _mm256_set1_ps(*f) +} +/// Broadcasts a single-precision (32-bit) floating-point element from memory +/// to all elements of the returned vector. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm_broadcast_ss) +// NOTE: Not modeled yet +// pub fn _mm_broadcast_ss(f: &f32) -> __m128 { +// _mm_set1_ps(*f) +// } +/// Broadcasts a double-precision (64-bit) floating-point element from memory +/// to all elements of the returned vector. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_broadcast_sd) +// NOTE: Not modeled yet +// pub fn _mm256_broadcast_sd(f: &f64) -> __m256d { +// _mm256_set1_pd(*f) +// } +/// Broadcasts 128 bits from memory (composed of 4 packed single-precision +/// (32-bit) floating-point elements) to all elements of the returned vector. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_broadcast_ps) +pub fn _mm256_broadcast_ps(a: &__m128) -> __m256 { + { + transmute(simd_shuffle( + (*a).as_f32x4(), + _mm_setzero_ps().as_f32x4(), + [0, 1, 2, 3, 0, 1, 2, 3], + )) + } +} +/// Broadcasts 128 bits from memory (composed of 2 packed double-precision +/// (64-bit) floating-point elements) to all elements of the returned vector. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_broadcast_pd) +pub fn _mm256_broadcast_pd(a: &__m128d) -> __m256d { + transmute(simd_shuffle( + (*a).as_f64x2(), + _mm_setzero_pd().as_f64x2(), + [0, 1, 0, 1], + )) +} +/// Copies `a` to result, then inserts 128 bits (composed of 4 packed +/// single-precision (32-bit) floating-point elements) from `b` into result +/// at the location specified by `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_insertf128_ps) +pub fn _mm256_insertf128_ps(a: __m256, b: __m128) -> __m256 { + static_assert_uimm_bits!(IMM1, 1); + { + transmute(simd_shuffle( + a.as_f32x8(), + _mm256_castps128_ps256(b).as_f32x8(), + [[8, 9, 10, 11, 4, 5, 6, 7], [0, 1, 2, 3, 8, 9, 10, 11]][IMM1 as usize], + )) + } +} +/// Copies `a` to result, then inserts 128 bits (composed of 2 packed +/// double-precision (64-bit) floating-point elements) from `b` into result +/// at the location specified by `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_insertf128_pd) +pub fn _mm256_insertf128_pd(a: __m256d, b: __m128d) -> __m256d { + static_assert_uimm_bits!(IMM1, 1); + { + transmute(simd_shuffle( + a.as_f64x4(), + _mm256_castpd128_pd256(b).as_f64x4(), + [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize], + )) + } +} +/// Copies `a` to result, then inserts 128 bits from `b` into result +/// at the location specified by `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_insertf128_si256) +pub fn _mm256_insertf128_si256(a: __m256i, b: __m128i) -> __m256i { + static_assert_uimm_bits!(IMM1, 1); + { + let dst: i64x4 = simd_shuffle( + a.as_i64x4(), + _mm256_castsi128_si256(b).as_i64x4(), + [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize], + ); + transmute(dst) + } +} +/// Copies `a` to result, and inserts the 8-bit integer `i` into result +/// at the location specified by `index`. 
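The `_mm256_insertf128_*` models above copy `a` and overwrite the 128-bit half selected by `IMM1` with `b`, which is what the two hard-coded index tables express. A standalone scalar sketch for the double-precision case (helper name is illustrative):

```rust
/// Scalar reference for the `_mm256_insertf128_pd` model: copy `a`,
/// then overwrite the half selected by `imm1` with the two lanes of `b`.
fn insert_half(imm1: usize, a: [f64; 4], b: [f64; 2]) -> [f64; 4] {
    let mut out = a;
    out[imm1 * 2] = b[0];
    out[imm1 * 2 + 1] = b[1];
    out
}

fn main() {
    let a = [0.0, 1.0, 2.0, 3.0];
    let b = [8.0, 9.0];
    assert_eq!(insert_half(0, a, b), [8.0, 9.0, 2.0, 3.0]);
    assert_eq!(insert_half(1, a, b), [0.0, 1.0, 8.0, 9.0]);
}
```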
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_insert_epi8) +pub fn _mm256_insert_epi8(a: __m256i, i: i8) -> __m256i { + static_assert_uimm_bits!(INDEX, 5); + transmute(simd_insert(a.as_i8x32(), INDEX as u32, i)) +} +/// Copies `a` to result, and inserts the 16-bit integer `i` into result +/// at the location specified by `index`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_insert_epi16) +pub fn _mm256_insert_epi16(a: __m256i, i: i16) -> __m256i { + static_assert_uimm_bits!(INDEX, 4); + transmute(simd_insert(a.as_i16x16(), INDEX as u32, i)) +} +/// Copies `a` to result, and inserts the 32-bit integer `i` into result +/// at the location specified by `index`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_insert_epi32) +pub fn _mm256_insert_epi32(a: __m256i, i: i32) -> __m256i { + static_assert_uimm_bits!(INDEX, 3); + transmute(simd_insert(a.as_i32x8(), INDEX as u32, i)) +} +/// Duplicate odd-indexed single-precision (32-bit) floating-point elements +/// from `a`, and returns the results. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_movehdup_ps) +pub fn _mm256_movehdup_ps(a: __m256) -> __m256 { + transmute(simd_shuffle( + a.as_f32x8(), + a.as_f32x8(), + [1, 1, 3, 3, 5, 5, 7, 7], + )) +} +/// Duplicate even-indexed single-precision (32-bit) floating-point elements +/// from `a`, and returns the results. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_moveldup_ps) +pub fn _mm256_moveldup_ps(a: __m256) -> __m256 { + transmute(simd_shuffle( + a.as_f32x8(), + a.as_f32x8(), + [0, 0, 2, 2, 4, 4, 6, 6], + )) +} +/// Duplicate even-indexed double-precision (64-bit) floating-point elements +/// from `a`, and returns the results. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_movedup_pd) +pub fn _mm256_movedup_pd(a: __m256d) -> __m256d { + transmute(simd_shuffle(a.as_f64x4(), a.as_f64x4(), [0, 0, 2, 2])) +} +/// Computes the approximate reciprocal of packed single-precision (32-bit) +/// floating-point elements in `a`, and returns the results. The maximum +/// relative error for this approximation is less than 1.5*2^-12. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_rcp_ps) +// NOTE: Not modeled yet +// pub fn _mm256_rcp_ps(a: __m256) -> __m256 { +// { vrcpps(a) } +// } +/// Computes the approximate reciprocal square root of packed single-precision +/// (32-bit) floating-point elements in `a`, and returns the results. +/// The maximum relative error for this approximation is less than 1.5*2^-12. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_rsqrt_ps) +// NOTE: Not modeled yet +// pub fn _mm256_rsqrt_ps(a: __m256) -> __m256 { +// { vrsqrtps(a) } +// } +/// Unpacks and interleave double-precision (64-bit) floating-point elements +/// from the high half of each 128-bit lane in `a` and `b`. 
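The `movehdup`/`moveldup` models above duplicate the odd- or even-indexed member of each adjacent pair, which is all the fixed shuffle masks `[1, 1, 3, 3, ...]` and `[0, 0, 2, 2, ...]` encode. A scalar sketch (illustrative only):

```rust
/// Scalar reference for the duplication pattern used by the
/// `_mm256_movehdup_ps` and `_mm256_moveldup_ps` models.
fn dup_lanes(a: [f32; 8], odd: bool) -> [f32; 8] {
    let mut out = [0.0; 8];
    for i in 0..8 {
        // Both members of each pair (0,1), (2,3), ... receive the
        // odd (or even) member of that pair.
        let src = if odd { i | 1 } else { i & !1 };
        out[i] = a[src];
    }
    out
}

fn main() {
    let a = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0];
    assert_eq!(dup_lanes(a, true), [1.0, 1.0, 3.0, 3.0, 5.0, 5.0, 7.0, 7.0]);
    assert_eq!(dup_lanes(a, false), [0.0, 0.0, 2.0, 2.0, 4.0, 4.0, 6.0, 6.0]);
}
```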
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_unpackhi_pd) +pub fn _mm256_unpackhi_pd(a: __m256d, b: __m256d) -> __m256d { + transmute(simd_shuffle(a.as_f64x4(), b.as_f64x4(), [1, 5, 3, 7])) +} +/// Unpacks and interleave single-precision (32-bit) floating-point elements +/// from the high half of each 128-bit lane in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_unpackhi_ps) +pub fn _mm256_unpackhi_ps(a: __m256, b: __m256) -> __m256 { + transmute(simd_shuffle( + a.as_f32x8(), + b.as_f32x8(), + [2, 10, 3, 11, 6, 14, 7, 15], + )) +} +/// Unpacks and interleave double-precision (64-bit) floating-point elements +/// from the low half of each 128-bit lane in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_unpacklo_pd) +pub fn _mm256_unpacklo_pd(a: __m256d, b: __m256d) -> __m256d { + transmute(simd_shuffle(a.as_f64x4(), b.as_f64x4(), [0, 4, 2, 6])) +} +/// Unpacks and interleave single-precision (32-bit) floating-point elements +/// from the low half of each 128-bit lane in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_unpacklo_ps) +pub fn _mm256_unpacklo_ps(a: __m256, b: __m256) -> __m256 { + transmute(simd_shuffle( + a.as_f32x8(), + b.as_f32x8(), + [0, 8, 1, 9, 4, 12, 5, 13], + )) +} +/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and +/// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0. +/// Computes the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if +/// the result is zero, otherwise set `CF` to 0. Return the `ZF` value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_testz_si256) +pub fn _mm256_testz_si256(a: __m256i, b: __m256i) -> i32 { + ptestz256(a.as_i64x4(), b.as_i64x4()) +} +/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and +/// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0. +/// Computes the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if +/// the result is zero, otherwise set `CF` to 0. Return the `CF` value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_testc_si256) +pub fn _mm256_testc_si256(a: __m256i, b: __m256i) -> i32 { + ptestc256(a.as_i64x4(), b.as_i64x4()) +} + +/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and +/// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0. +/// Computes the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if +/// the result is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and +/// `CF` values are zero, otherwise return 0. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_testnzc_si256) +// NOTE: Not modeled yet +// pub fn _mm256_testnzc_si256(a: __m256i, b: __m256i) -> i32 { +// { ptestnzc256(a.as_i64x4(), b.as_i64x4()) } +// } + +/// Computes the bitwise AND of 256 bits (representing double-precision (64-bit) +/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit +/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the +/// intermediate value is zero, otherwise set `ZF` to 0. 
Compute the bitwise +/// NOT of `a` and then AND with `b`, producing an intermediate value, and set +/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value +/// is zero, otherwise set `CF` to 0. Return the `ZF` value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_testz_pd) +// NOTE: Not modeled yet +// pub fn _mm256_testz_pd(a: __m256d, b: __m256d) -> i32 { +// { vtestzpd256(a, b) } +// } + +/// Computes the bitwise AND of 256 bits (representing double-precision (64-bit) +/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit +/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the +/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise +/// NOT of `a` and then AND with `b`, producing an intermediate value, and set +/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value +/// is zero, otherwise set `CF` to 0. Return the `CF` value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_testc_pd) +// NOTE: Not modeled yet +// pub fn _mm256_testc_pd(a: __m256d, b: __m256d) -> i32 { +// { vtestcpd256(a, b) } +// } + +/// Computes the bitwise AND of 256 bits (representing double-precision (64-bit) +/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit +/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the +/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise +/// NOT of `a` and then AND with `b`, producing an intermediate value, and set +/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value +/// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values +/// are zero, otherwise return 0. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_testnzc_pd) +// NOTE: Not modeled yet +// pub fn _mm256_testnzc_pd(a: __m256d, b: __m256d) -> i32 { +// { vtestnzcpd256(a, b) } +// } + +/// Computes the bitwise AND of 128 bits (representing double-precision (64-bit) +/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit +/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the +/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise +/// NOT of `a` and then AND with `b`, producing an intermediate value, and set +/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value +/// is zero, otherwise set `CF` to 0. Return the `ZF` value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm_testz_pd) +// NOTE: Not modeled yet +// pub fn _mm_testz_pd(a: __m128d, b: __m128d) -> i32 { +// { vtestzpd(a, b) } +// } + +/// Computes the bitwise AND of 128 bits (representing double-precision (64-bit) +/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit +/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the +/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise +/// NOT of `a` and then AND with `b`, producing an intermediate value, and set +/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value +/// is zero, otherwise set `CF` to 0. Return the `CF` value. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm_testc_pd) +// NOTE: Not modeled yet +// pub fn _mm_testc_pd(a: __m128d, b: __m128d) -> i32 { +// { vtestcpd(a, b) } +// } + +/// Computes the bitwise AND of 128 bits (representing double-precision (64-bit) +/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit +/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the +/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise +/// NOT of `a` and then AND with `b`, producing an intermediate value, and set +/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value +/// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values +/// are zero, otherwise return 0. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm_testnzc_pd) +// NOTE: Not modeled yet +// pub fn _mm_testnzc_pd(a: __m128d, b: __m128d) -> i32 { +// { vtestnzcpd(a, b) } +// } + +/// Computes the bitwise AND of 256 bits (representing single-precision (32-bit) +/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit +/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the +/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise +/// NOT of `a` and then AND with `b`, producing an intermediate value, and set +/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value +/// is zero, otherwise set `CF` to 0. Return the `ZF` value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_testz_ps) +// NOTE: Not modeled yet +// pub fn _mm256_testz_ps(a: __m256, b: __m256) -> i32 { +// { vtestzps256(a, b) } +// } + +/// Computes the bitwise AND of 256 bits (representing single-precision (32-bit) +/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit +/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the +/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise +/// NOT of `a` and then AND with `b`, producing an intermediate value, and set +/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value +/// is zero, otherwise set `CF` to 0. Return the `CF` value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_testc_ps) +// NOTE: Not modeled yet +// pub fn _mm256_testc_ps(a: __m256, b: __m256) -> i32 { +// { vtestcps256(a, b) } +// } + +/// Computes the bitwise AND of 256 bits (representing single-precision (32-bit) +/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit +/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the +/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise +/// NOT of `a` and then AND with `b`, producing an intermediate value, and set +/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value +/// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values +/// are zero, otherwise return 0. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_testnzc_ps) +// NOTE: Not modeled yet +// pub fn _mm256_testnzc_ps(a: __m256, b: __m256) -> i32 { +// { vtestnzcps256(a, b) } +// } + +/// Computes the bitwise AND of 128 bits (representing single-precision (32-bit) +/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit +/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the +/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise +/// NOT of `a` and then AND with `b`, producing an intermediate value, and set +/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value +/// is zero, otherwise set `CF` to 0. Return the `ZF` value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm_testz_ps) +// NOTE: Not modeled yet +// pub fn _mm_testz_ps(a: __m128, b: __m128) -> i32 { +// { vtestzps(a, b) } +// } + +/// Computes the bitwise AND of 128 bits (representing single-precision (32-bit) +/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit +/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the +/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise +/// NOT of `a` and then AND with `b`, producing an intermediate value, and set +/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value +/// is zero, otherwise set `CF` to 0. Return the `CF` value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm_testc_ps) +// NOTE: Not modeled yet +// pub fn _mm_testc_ps(a: __m128, b: __m128) -> i32 { +// { vtestcps(a, b) } +// } + +/// Computes the bitwise AND of 128 bits (representing single-precision (32-bit) +/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit +/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the +/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise +/// NOT of `a` and then AND with `b`, producing an intermediate value, and set +/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value +/// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values +/// are zero, otherwise return 0. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm_testnzc_ps) +// NOTE: Not modeled yet +// pub fn _mm_testnzc_ps(a: __m128, b: __m128) -> i32 { +// { vtestnzcps(a, b) } +// } + +/// Sets each bit of the returned mask based on the most significant bit of the +/// corresponding packed double-precision (64-bit) floating-point element in +/// `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_movemask_pd) +pub fn _mm256_movemask_pd(a: __m256d) -> i32 { + { + let mask: i64x4 = simd_lt(a.as_i64x4(), i64x4::ZERO()); + simd_bitmask_little!(3, mask, u8) as i32 + } +} +/// Sets each bit of the returned mask based on the most significant bit of the +/// corresponding packed single-precision (32-bit) floating-point element in +/// `a`. 
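The movemask models above compare the lanes against zero as signed integers, which is equivalent to reading each lane's sign bit, and then pack those bits into the low bits of the result. A scalar sketch for the 4-lane double-precision case (helper name is illustrative, not the crate's API):

```rust
/// Scalar reference for the `_mm256_movemask_pd` model: bit i of the
/// result is the sign bit (most significant bit) of lane i.
fn movemask_pd(a: [f64; 4]) -> i32 {
    let mut mask = 0;
    for (i, x) in a.iter().enumerate() {
        if x.to_bits() >> 63 == 1 {
            mask |= 1 << i;
        }
    }
    mask
}

fn main() {
    // Negative zero still has its sign bit set, so it contributes a bit.
    assert_eq!(movemask_pd([1.0, -2.0, -0.0, 3.0]), 0b0110);
}
```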
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_movemask_ps) +pub fn _mm256_movemask_ps(a: __m256) -> i32 { + { + let mask: i32x8 = simd_lt(transmute(a), i32x8::ZERO()); + simd_bitmask_little!(7, mask, u8) as i32 + } +} +/// Returns vector of type __m256d with all elements set to zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_setzero_pd) +pub fn _mm256_setzero_pd() -> __m256d { + transmute(f64x4::ZERO()) +} +/// Returns vector of type __m256 with all elements set to zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_setzero_ps) +pub fn _mm256_setzero_ps() -> __m256 { + transmute(f32x8::ZERO()) +} +/// Returns vector of type __m256i with all elements set to zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_setzero_si256) +pub fn _mm256_setzero_si256() -> __m256i { + transmute(i64x4::ZERO()) +} +/// Sets packed double-precision (64-bit) floating-point elements in returned +/// vector with the supplied values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_set_pd) +pub fn _mm256_set_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d { + _mm256_setr_pd(d, c, b, a) +} +/// Sets packed single-precision (32-bit) floating-point elements in returned +/// vector with the supplied values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_set_ps) +pub fn _mm256_set_ps(a: f32, b: f32, c: f32, d: f32, e: f32, f: f32, g: f32, h: f32) -> __m256 { + _mm256_setr_ps(h, g, f, e, d, c, b, a) +} +/// Sets packed 8-bit integers in returned vector with the supplied values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_set_epi8) +pub fn _mm256_set_epi8( + e00: i8, + e01: i8, + e02: i8, + e03: i8, + e04: i8, + e05: i8, + e06: i8, + e07: i8, + e08: i8, + e09: i8, + e10: i8, + e11: i8, + e12: i8, + e13: i8, + e14: i8, + e15: i8, + e16: i8, + e17: i8, + e18: i8, + e19: i8, + e20: i8, + e21: i8, + e22: i8, + e23: i8, + e24: i8, + e25: i8, + e26: i8, + e27: i8, + e28: i8, + e29: i8, + e30: i8, + e31: i8, +) -> __m256i { + _mm256_setr_epi8( + e31, e30, e29, e28, e27, e26, e25, e24, e23, e22, e21, e20, e19, e18, e17, e16, e15, e14, + e13, e12, e11, e10, e09, e08, e07, e06, e05, e04, e03, e02, e01, e00, + ) +} +/// Sets packed 16-bit integers in returned vector with the supplied values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_set_epi16) +pub fn _mm256_set_epi16( + e00: i16, + e01: i16, + e02: i16, + e03: i16, + e04: i16, + e05: i16, + e06: i16, + e07: i16, + e08: i16, + e09: i16, + e10: i16, + e11: i16, + e12: i16, + e13: i16, + e14: i16, + e15: i16, +) -> __m256i { + _mm256_setr_epi16( + e15, e14, e13, e12, e11, e10, e09, e08, e07, e06, e05, e04, e03, e02, e01, e00, + ) +} +/// Sets packed 32-bit integers in returned vector with the supplied values. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_set_epi32) +pub fn _mm256_set_epi32( + e0: i32, + e1: i32, + e2: i32, + e3: i32, + e4: i32, + e5: i32, + e6: i32, + e7: i32, +) -> __m256i { + _mm256_setr_epi32(e7, e6, e5, e4, e3, e2, e1, e0) +} +/// Sets packed 64-bit integers in returned vector with the supplied values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_set_epi64x) +pub fn _mm256_set_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i { + _mm256_setr_epi64x(d, c, b, a) +} +/// Sets packed double-precision (64-bit) floating-point elements in returned +/// vector with the supplied values in reverse order. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_setr_pd) +pub fn _mm256_setr_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d { + transmute(f64x4::new(a, b, c, d)) +} +/// Sets packed single-precision (32-bit) floating-point elements in returned +/// vector with the supplied values in reverse order. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_setr_ps) +pub fn _mm256_setr_ps(a: f32, b: f32, c: f32, d: f32, e: f32, f: f32, g: f32, h: f32) -> __m256 { + transmute(f32x8::new(a, b, c, d, e, f, g, h)) +} +/// Sets packed 8-bit integers in returned vector with the supplied values in +/// reverse order. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_setr_epi8) +pub fn _mm256_setr_epi8( + e00: i8, + e01: i8, + e02: i8, + e03: i8, + e04: i8, + e05: i8, + e06: i8, + e07: i8, + e08: i8, + e09: i8, + e10: i8, + e11: i8, + e12: i8, + e13: i8, + e14: i8, + e15: i8, + e16: i8, + e17: i8, + e18: i8, + e19: i8, + e20: i8, + e21: i8, + e22: i8, + e23: i8, + e24: i8, + e25: i8, + e26: i8, + e27: i8, + e28: i8, + e29: i8, + e30: i8, + e31: i8, +) -> __m256i { + { + transmute(i8x32::new( + e00, e01, e02, e03, e04, e05, e06, e07, e08, e09, e10, e11, e12, e13, e14, e15, e16, + e17, e18, e19, e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31, + )) + } +} +/// Sets packed 16-bit integers in returned vector with the supplied values in +/// reverse order. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_setr_epi16) +pub fn _mm256_setr_epi16( + e00: i16, + e01: i16, + e02: i16, + e03: i16, + e04: i16, + e05: i16, + e06: i16, + e07: i16, + e08: i16, + e09: i16, + e10: i16, + e11: i16, + e12: i16, + e13: i16, + e14: i16, + e15: i16, +) -> __m256i { + { + transmute(i16x16::new( + e00, e01, e02, e03, e04, e05, e06, e07, e08, e09, e10, e11, e12, e13, e14, e15, + )) + } +} +/// Sets packed 32-bit integers in returned vector with the supplied values in +/// reverse order. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_setr_epi32) +pub fn _mm256_setr_epi32( + e0: i32, + e1: i32, + e2: i32, + e3: i32, + e4: i32, + e5: i32, + e6: i32, + e7: i32, +) -> __m256i { + transmute(i32x8::new(e0, e1, e2, e3, e4, e5, e6, e7)) +} +/// Sets packed 64-bit integers in returned vector with the supplied values in +/// reverse order. 
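The `_mm256_set_*` constructors above list their arguments from the highest lane down and simply forward to the `_mm256_setr_*` variants, which store their arguments in lane (memory) order. A plain-Rust analogue of that relationship (names are illustrative):

```rust
/// `setr` stores its arguments in lane order; `set` lists them from the
/// highest lane down and is just `setr` with the arguments reversed.
fn setr_epi32(args: [i32; 8]) -> [i32; 8] {
    args
}

fn set_epi32(mut args: [i32; 8]) -> [i32; 8] {
    args.reverse();
    setr_epi32(args)
}

fn main() {
    // Both calls build the vector whose lane i holds the value i.
    assert_eq!(
        set_epi32([7, 6, 5, 4, 3, 2, 1, 0]),
        setr_epi32([0, 1, 2, 3, 4, 5, 6, 7])
    );
}
```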
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_setr_epi64x) +pub fn _mm256_setr_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i { + transmute(i64x4::new(a, b, c, d)) +} +/// Broadcasts double-precision (64-bit) floating-point value `a` to all +/// elements of returned vector. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_set1_pd) +pub fn _mm256_set1_pd(a: f64) -> __m256d { + _mm256_setr_pd(a, a, a, a) +} +/// Broadcasts single-precision (32-bit) floating-point value `a` to all +/// elements of returned vector. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_set1_ps) +pub fn _mm256_set1_ps(a: f32) -> __m256 { + _mm256_setr_ps(a, a, a, a, a, a, a, a) +} +/// Broadcasts 8-bit integer `a` to all elements of returned vector. +/// This intrinsic may generate the `vpbroadcastb`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_set1_epi8) +pub fn _mm256_set1_epi8(a: i8) -> __m256i { + _mm256_setr_epi8( + a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, + a, a, + ) +} +/// Broadcasts 16-bit integer `a` to all elements of returned vector. +/// This intrinsic may generate the `vpbroadcastw`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_set1_epi16) +pub fn _mm256_set1_epi16(a: i16) -> __m256i { + _mm256_setr_epi16(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a) +} +/// Broadcasts 32-bit integer `a` to all elements of returned vector. +/// This intrinsic may generate the `vpbroadcastd`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_set1_epi32) +pub fn _mm256_set1_epi32(a: i32) -> __m256i { + _mm256_setr_epi32(a, a, a, a, a, a, a, a) +} +/// Broadcasts 64-bit integer `a` to all elements of returned vector. +/// This intrinsic may generate the `vpbroadcastq`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_set1_epi64x) +pub fn _mm256_set1_epi64x(a: i64) -> __m256i { + _mm256_setr_epi64x(a, a, a, a) +} +/// Cast vector of type __m256d to type __m256. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_castpd_ps) +pub fn _mm256_castpd_ps(a: __m256d) -> __m256 { + transmute(a) +} +/// Cast vector of type __m256 to type __m256d. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_castps_pd) +pub fn _mm256_castps_pd(a: __m256) -> __m256d { + transmute(a) +} +/// Casts vector of type __m256 to type __m256i. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_castps_si256) +pub fn _mm256_castps_si256(a: __m256) -> __m256i { + transmute(a) +} +/// Casts vector of type __m256i to type __m256. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_castsi256_ps) +pub fn _mm256_castsi256_ps(a: __m256i) -> __m256 { + transmute(a) +} +/// Casts vector of type __m256d to type __m256i. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_castpd_si256) +pub fn _mm256_castpd_si256(a: __m256d) -> __m256i { + transmute(a) +} +/// Casts vector of type __m256i to type __m256d. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_castsi256_pd) +pub fn _mm256_castsi256_pd(a: __m256i) -> __m256d { + transmute(a) +} +/// Casts vector of type __m256 to type __m128. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_castps256_ps128) +pub fn _mm256_castps256_ps128(a: __m256) -> __m128 { + transmute(simd_shuffle(a.as_f32x8(), a.as_f32x8(), [0, 1, 2, 3])) +} +/// Casts vector of type __m256d to type __m128d. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_castpd256_pd128) +pub fn _mm256_castpd256_pd128(a: __m256d) -> __m128d { + transmute(simd_shuffle(a.as_f64x4(), a.as_f64x4(), [0, 1])) +} +/// Casts vector of type __m256i to type __m128i. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_castsi256_si128) +pub fn _mm256_castsi256_si128(a: __m256i) -> __m128i { + { + let a = a.as_i64x4(); + let dst: i64x2 = simd_shuffle(a, a, [0, 1]); + transmute(dst) + } +} +/// Casts vector of type __m128 to type __m256; +/// the upper 128 bits of the result are undefined. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_castps128_ps256) +pub fn _mm256_castps128_ps256(a: __m128) -> __m256 { + { + transmute(simd_shuffle( + a.as_f32x4(), + _mm_undefined_ps().as_f32x4(), + [0, 1, 2, 3, 4, 4, 4, 4], + )) + } +} +/// Casts vector of type __m128d to type __m256d; +/// the upper 128 bits of the result are undefined. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_castpd128_pd256) +pub fn _mm256_castpd128_pd256(a: __m128d) -> __m256d { + transmute(simd_shuffle( + a.as_f64x2(), + _mm_undefined_pd().as_f64x2(), + [0, 1, 2, 2], + )) +} +/// Casts vector of type __m128i to type __m256i; +/// the upper 128 bits of the result are undefined. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_castsi128_si256) +pub fn _mm256_castsi128_si256(a: __m128i) -> __m256i { + { + let a = a.as_i64x2(); + let undefined = i64x2::ZERO(); + let dst: i64x4 = simd_shuffle(a, undefined, [0, 1, 2, 2]); + transmute(dst) + } +} +/// Constructs a 256-bit floating-point vector of `[8 x float]` from a +/// 128-bit floating-point vector of `[4 x float]`. The lower 128 bits contain +/// the value of the source vector. The upper 128 bits are set to zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_zextps128_ps256) +pub fn _mm256_zextps128_ps256(a: __m128) -> __m256 { + { + transmute(simd_shuffle( + a.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + [0, 1, 2, 3, 4, 5, 6, 7], + )) + } +} +/// Constructs a 256-bit integer vector from a 128-bit integer vector. +/// The lower 128 bits contain the value of the source vector. The upper +/// 128 bits are set to zero. 
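The `zext` models above guarantee that the upper 128 bits of the result are zero, whereas the corresponding `cast` models leave them unspecified (Intel documents them as undefined). A scalar sketch of the zero-extension (illustrative only, not the crate's API):

```rust
/// Scalar reference for the `_mm256_zextps128_ps256` model: the low
/// four lanes come from the source, the high four lanes are zeroed.
fn zext128_256(a: [f32; 4]) -> [f32; 8] {
    let mut out = [0.0; 8];
    out[..4].copy_from_slice(&a);
    out
}

fn main() {
    assert_eq!(
        zext128_256([1.0, 2.0, 3.0, 4.0]),
        [1.0, 2.0, 3.0, 4.0, 0.0, 0.0, 0.0, 0.0]
    );
}
```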
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zextsi128_si256) +pub fn _mm256_zextsi128_si256(a: __m128i) -> __m256i { + { + let b = i64x2::ZERO(); + let dst: i64x4 = simd_shuffle(a.as_i64x2(), b, [0, 1, 2, 3]); + transmute(dst) + } +} +/// Constructs a 256-bit floating-point vector of `[4 x double]` from a +/// 128-bit floating-point vector of `[2 x double]`. The lower 128 bits +/// contain the value of the source vector. The upper 128 bits are set +/// to zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zextpd128_pd256) +pub fn _mm256_zextpd128_pd256(a: __m128d) -> __m256d { + { + transmute(simd_shuffle( + a.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + [0, 1, 2, 3], + )) + } +} +/// Returns vector of type `__m256` with indeterminate elements. +/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically +/// picks some valid value and is not equivalent to [`mem::MaybeUninit`]. +/// In practice, this is typically equivalent to [`mem::zeroed`]. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_undefined_ps) +pub fn _mm256_undefined_ps() -> __m256 { + transmute(f32x8::ZERO()) +} +/// Returns vector of type `__m256d` with indeterminate elements. +/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically +/// picks some valid value and is not equivalent to [`mem::MaybeUninit`]. +/// In practice, this is typically equivalent to [`mem::zeroed`]. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_undefined_pd) +pub fn _mm256_undefined_pd() -> __m256d { + transmute(f32x8::ZERO()) +} +/// Returns vector of type __m256i with indeterminate elements. +/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically +/// picks some valid value and is not equivalent to [`mem::MaybeUninit`]. +/// In practice, this is typically equivalent to [`mem::zeroed`]. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_undefined_si256) +pub fn _mm256_undefined_si256() -> __m256i { + transmute(i32x8::ZERO()) +} +/// Sets packed __m256 returned vector with the supplied values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_m128) +pub fn _mm256_set_m128(hi: __m128, lo: __m128) -> __m256 { + transmute(simd_shuffle( + lo.as_i32x4(), + hi.as_i32x4(), + [0, 1, 2, 3, 4, 5, 6, 7], + )) +} +/// Sets packed __m256d returned vector with the supplied values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_m128d) +pub fn _mm256_set_m128d(hi: __m128d, lo: __m128d) -> __m256d { + { + let hi: __m128 = transmute(hi); + let lo: __m128 = transmute(lo); + transmute(_mm256_set_m128(hi, lo)) + } +} +/// Sets packed __m256i returned vector with the supplied values.
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_set_m128i) +pub fn _mm256_set_m128i(hi: __m128i, lo: __m128i) -> __m256i { + { + let hi: __m128 = transmute(hi); + let lo: __m128 = transmute(lo); + transmute(_mm256_set_m128(hi, lo)) + } +} +/// Sets packed __m256 returned vector with the supplied values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_setr_m128) +pub fn _mm256_setr_m128(lo: __m128, hi: __m128) -> __m256 { + _mm256_set_m128(hi, lo) +} +/// Sets packed __m256d returned vector with the supplied values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_setr_m128d) +pub fn _mm256_setr_m128d(lo: __m128d, hi: __m128d) -> __m256d { + _mm256_set_m128d(hi, lo) +} +/// Sets packed __m256i returned vector with the supplied values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_setr_m128i) +pub fn _mm256_setr_m128i(lo: __m128i, hi: __m128i) -> __m256i { + _mm256_set_m128i(hi, lo) +} +/// Returns the first element of the input vector of `[8 x float]`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_cvtss_f32) +pub fn _mm256_cvtss_f32(a: __m256) -> f32 { + simd_extract(a.as_f32x8(), 0) +} diff --git a/testable-simd-models/src/core_arch/x86/models/avx2.rs b/testable-simd-models/src/core_arch/x86/models/avx2.rs new file mode 100644 index 0000000000000..2626d04635bd6 --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/models/avx2.rs @@ -0,0 +1,1873 @@ +//! Advanced Vector Extensions 2 (AVX) +//! +//! +//! This module contains models for AVX2 intrinsics. +//! AVX2 expands most AVX commands to 256-bit wide vector registers and +//! adds [FMA](https://en.wikipedia.org/wiki/Fused_multiply-accumulate). +//! +//! The references are: +//! +//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2: +//! Instruction Set Reference, A-Z][intel64_ref]. +//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and +//! System Instructions][amd64_ref]. +//! +//! Wikipedia's [AVX][wiki_avx] and [FMA][wiki_fma] pages provide a quick +//! overview of the instructions available. +//! +//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf +//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf +//! [wiki_avx]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions +//! [wiki_fma]: https://en.wikipedia.org/wiki/Fused_multiply-accumulate +use crate::abstractions::simd::*; +use crate::abstractions::utilities::*; + +use super::avx::*; +use super::avx2_handwritten::*; +use super::sse::*; +use super::sse2::*; +use super::types::*; + +/// Computes the absolute values of packed 32-bit integers in `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi32) +pub fn _mm256_abs_epi32(a: __m256i) -> __m256i { + { + let a = a.as_i32x8(); + let r = simd_select(simd_lt(a, i32x8::ZERO()), simd_neg(a), a); + transmute(r) + } +} +/// Computes the absolute values of packed 16-bit integers in `a`. 
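The `abs` models above are branch-free selects: lanes that compare less than zero are replaced by their negation, other lanes pass through. A scalar sketch of the per-lane rule, assuming the crate's `simd_neg` wraps like the underlying instruction (helper name is illustrative):

```rust
/// Scalar reference for the absolute value used by the
/// `_mm256_abs_epi32` model: negate negative lanes, keep the rest.
/// The negation wraps, so i32::MIN maps to itself, which is also how
/// the PABSD/VPABSD instruction is documented to behave.
fn abs_lane(x: i32) -> i32 {
    if x < 0 { x.wrapping_neg() } else { x }
}

fn main() {
    assert_eq!(abs_lane(-7), 7);
    assert_eq!(abs_lane(5), 5);
    assert_eq!(abs_lane(i32::MIN), i32::MIN);
}
```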
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi16) +pub fn _mm256_abs_epi16(a: __m256i) -> __m256i { + { + let a = a.as_i16x16(); + let r = simd_select(simd_lt(a, i16x16::ZERO()), simd_neg(a), a); + transmute(r) + } +} +/// Computes the absolute values of packed 8-bit integers in `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi8) +pub fn _mm256_abs_epi8(a: __m256i) -> __m256i { + { + let a = a.as_i8x32(); + let r = simd_select(simd_lt(a, i8x32::ZERO()), simd_neg(a), a); + transmute(r) + } +} +/// Adds packed 64-bit integers in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi64) +pub fn _mm256_add_epi64(a: __m256i, b: __m256i) -> __m256i { + { + transmute(simd_add(a.as_i64x4(), b.as_i64x4())) + } +} +/// Adds packed 32-bit integers in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi32) +pub fn _mm256_add_epi32(a: __m256i, b: __m256i) -> __m256i { + { + transmute(simd_add(a.as_i32x8(), b.as_i32x8())) + } +} +/// Adds packed 16-bit integers in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi16) +pub fn _mm256_add_epi16(a: __m256i, b: __m256i) -> __m256i { + { + transmute(simd_add(a.as_i16x16(), b.as_i16x16())) + } +} +/// Adds packed 8-bit integers in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi8) +pub fn _mm256_add_epi8(a: __m256i, b: __m256i) -> __m256i { + { + transmute(simd_add(a.as_i8x32(), b.as_i8x32())) + } +} +/// Adds packed 8-bit integers in `a` and `b` using saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epi8) +pub fn _mm256_adds_epi8(a: __m256i, b: __m256i) -> __m256i { + { + transmute(simd_saturating_add(a.as_i8x32(), b.as_i8x32())) + } +} +/// Adds packed 16-bit integers in `a` and `b` using saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epi16) +pub fn _mm256_adds_epi16(a: __m256i, b: __m256i) -> __m256i { + { + transmute(simd_saturating_add(a.as_i16x16(), b.as_i16x16())) + } +} +/// Adds packed unsigned 8-bit integers in `a` and `b` using saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epu8) +pub fn _mm256_adds_epu8(a: __m256i, b: __m256i) -> __m256i { + { + transmute(simd_saturating_add(a.as_u8x32(), b.as_u8x32())) + } +} +/// Adds packed unsigned 16-bit integers in `a` and `b` using saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epu16) +pub fn _mm256_adds_epu16(a: __m256i, b: __m256i) -> __m256i { + { + transmute(simd_saturating_add(a.as_u16x16(), b.as_u16x16())) + } +} +/// Concatenates pairs of 16-byte blocks in `a` and `b` into a 32-byte temporary +/// result, shifts the result right by `n` bytes, and returns the low 16 bytes. 
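+///
+/// Illustrative per-lane sketch (not from the upstream docs; the helper name
+/// and array types below are made up for exposition). Each 128-bit lane
+/// behaves roughly like:
+///
+/// ```
+/// fn alignr_lane(a: [u8; 16], b: [u8; 16], shift: usize) -> [u8; 16] {
+///     // concatenate with `b` as the low 16 bytes and `a` as the high 16 bytes
+///     let mut concat = [0u8; 32];
+///     concat[..16].copy_from_slice(&b);
+///     concat[16..].copy_from_slice(&a);
+///     let mut r = [0u8; 16];
+///     for i in 0..16 {
+///         // shift right by `shift` bytes, filling with zeros past the end
+///         r[i] = if i + shift < 32 { concat[i + shift] } else { 0 };
+///     }
+///     r
+/// }
+/// ```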
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_alignr_epi8) +pub fn _mm256_alignr_epi8(a: __m256i, b: __m256i) -> __m256i { + static_assert_uimm_bits!(IMM8, 8); + if IMM8 >= 32 { + return _mm256_setzero_si256(); + } + let (a, b) = if IMM8 > 16 { + (_mm256_setzero_si256(), a) + } else { + (a, b) + }; + { + if IMM8 == 16 { + return transmute(a); + } + } + const fn mask(shift: u32, i: u32) -> u32 { + let shift = shift % 16; + let mod_i = i % 16; + if mod_i < (16 - shift) { + i + shift + } else { + i + 16 + shift + } + } + { + let r: i8x32 = simd_shuffle( + b.as_i8x32(), + a.as_i8x32(), + [ + mask(IMM8 as u32, 0), + mask(IMM8 as u32, 1), + mask(IMM8 as u32, 2), + mask(IMM8 as u32, 3), + mask(IMM8 as u32, 4), + mask(IMM8 as u32, 5), + mask(IMM8 as u32, 6), + mask(IMM8 as u32, 7), + mask(IMM8 as u32, 8), + mask(IMM8 as u32, 9), + mask(IMM8 as u32, 10), + mask(IMM8 as u32, 11), + mask(IMM8 as u32, 12), + mask(IMM8 as u32, 13), + mask(IMM8 as u32, 14), + mask(IMM8 as u32, 15), + mask(IMM8 as u32, 16), + mask(IMM8 as u32, 17), + mask(IMM8 as u32, 18), + mask(IMM8 as u32, 19), + mask(IMM8 as u32, 20), + mask(IMM8 as u32, 21), + mask(IMM8 as u32, 22), + mask(IMM8 as u32, 23), + mask(IMM8 as u32, 24), + mask(IMM8 as u32, 25), + mask(IMM8 as u32, 26), + mask(IMM8 as u32, 27), + mask(IMM8 as u32, 28), + mask(IMM8 as u32, 29), + mask(IMM8 as u32, 30), + mask(IMM8 as u32, 31), + ], + ); + transmute(r) + } +} +/// Computes the bitwise AND of 256 bits (representing integer data) +/// in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_and_si256) +pub fn _mm256_and_si256(a: __m256i, b: __m256i) -> __m256i { + { + transmute(simd_and(a.as_i64x4(), b.as_i64x4())) + } +} +/// Computes the bitwise NOT of 256 bits (representing integer data) +/// in `a` and then AND with `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_andnot_si256) +pub fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i { + { + let all_ones = _mm256_set1_epi8(-1); + transmute(simd_and( + simd_xor(a.as_i64x4(), all_ones.as_i64x4()), + b.as_i64x4(), + )) + } +} +/// Averages packed unsigned 16-bit integers in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_avg_epu16) +pub fn _mm256_avg_epu16(a: __m256i, b: __m256i) -> __m256i { + { + let a = simd_cast::<16, _, u32>(a.as_u16x16()); + let b = simd_cast::<16, _, u32>(b.as_u16x16()); + let r = simd_shr(simd_add(simd_add(a, b), u32x16::splat(1)), u32x16::splat(1)); + transmute(simd_cast::<16, _, u16>(r)) + } +} +/// Averages packed unsigned 8-bit integers in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_avg_epu8) +pub fn _mm256_avg_epu8(a: __m256i, b: __m256i) -> __m256i { + { + let a = simd_cast::<32, _, u16>(a.as_u8x32()); + let b = simd_cast::<32, _, u16>(b.as_u8x32()); + let r = simd_shr(simd_add(simd_add(a, b), u16x32::splat(1)), u16x32::splat(1)); + transmute(simd_cast::<32, _, u8>(r)) + } +} +/// Blends packed 32-bit integers from `a` and `b` using control mask `IMM4`. 
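+///
+/// Illustrative sketch of the selection rule (the helper name is made up, not
+/// part of the model): bit `i` of the 4-bit mask picks `b[i]`, otherwise `a[i]`.
+///
+/// ```
+/// fn blend_epi32(a: [i32; 4], b: [i32; 4], imm4: u8) -> [i32; 4] {
+///     let mut r = [0i32; 4];
+///     for i in 0..4 {
+///         r[i] = if (imm4 >> i) & 1 == 1 { b[i] } else { a[i] };
+///     }
+///     r
+/// }
+/// ```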
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi32) +pub fn _mm_blend_epi32(a: __m128i, b: __m128i) -> __m128i { + static_assert_uimm_bits!(IMM4, 4); + { + let a = a.as_i32x4(); + let b = b.as_i32x4(); + let r: i32x4 = simd_shuffle( + a, + b, + [ + [0, 4, 0, 4][IMM4 as usize & 0b11], + [1, 1, 5, 5][IMM4 as usize & 0b11], + [2, 6, 2, 6][(IMM4 as usize >> 2) & 0b11], + [3, 3, 7, 7][(IMM4 as usize >> 2) & 0b11], + ], + ); + transmute(r) + } +} +/// Blends packed 32-bit integers from `a` and `b` using control mask `IMM8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_epi32) +pub fn _mm256_blend_epi32(a: __m256i, b: __m256i) -> __m256i { + static_assert_uimm_bits!(IMM8, 8); + { + let a = a.as_i32x8(); + let b = b.as_i32x8(); + let r: i32x8 = simd_shuffle( + a, + b, + [ + [0, 8, 0, 8][IMM8 as usize & 0b11], + [1, 1, 9, 9][IMM8 as usize & 0b11], + [2, 10, 2, 10][(IMM8 as usize >> 2) & 0b11], + [3, 3, 11, 11][(IMM8 as usize >> 2) & 0b11], + [4, 12, 4, 12][(IMM8 as usize >> 4) & 0b11], + [5, 5, 13, 13][(IMM8 as usize >> 4) & 0b11], + [6, 14, 6, 14][(IMM8 as usize >> 6) & 0b11], + [7, 7, 15, 15][(IMM8 as usize >> 6) & 0b11], + ], + ); + transmute(r) + } +} +/// Blends packed 16-bit integers from `a` and `b` using control mask `IMM8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_epi16) +pub fn _mm256_blend_epi16(a: __m256i, b: __m256i) -> __m256i { + static_assert_uimm_bits!(IMM8, 8); + { + let a = a.as_i16x16(); + let b = b.as_i16x16(); + let r: i16x16 = simd_shuffle( + a, + b, + [ + [0, 16, 0, 16][IMM8 as usize & 0b11], + [1, 1, 17, 17][IMM8 as usize & 0b11], + [2, 18, 2, 18][(IMM8 as usize >> 2) & 0b11], + [3, 3, 19, 19][(IMM8 as usize >> 2) & 0b11], + [4, 20, 4, 20][(IMM8 as usize >> 4) & 0b11], + [5, 5, 21, 21][(IMM8 as usize >> 4) & 0b11], + [6, 22, 6, 22][(IMM8 as usize >> 6) & 0b11], + [7, 7, 23, 23][(IMM8 as usize >> 6) & 0b11], + [8, 24, 8, 24][IMM8 as usize & 0b11], + [9, 9, 25, 25][IMM8 as usize & 0b11], + [10, 26, 10, 26][(IMM8 as usize >> 2) & 0b11], + [11, 11, 27, 27][(IMM8 as usize >> 2) & 0b11], + [12, 28, 12, 28][(IMM8 as usize >> 4) & 0b11], + [13, 13, 29, 29][(IMM8 as usize >> 4) & 0b11], + [14, 30, 14, 30][(IMM8 as usize >> 6) & 0b11], + [15, 15, 31, 31][(IMM8 as usize >> 6) & 0b11], + ], + ); + transmute(r) + } +} +/// Blends packed 8-bit integers from `a` and `b` using `mask`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blendv_epi8) +pub fn _mm256_blendv_epi8(a: __m256i, b: __m256i, mask: __m256i) -> __m256i { + { + let mask: i8x32 = simd_lt(mask.as_i8x32(), i8x32::ZERO()); + transmute(simd_select(mask, b.as_i8x32(), a.as_i8x32())) + } +} +/// Broadcasts the low packed 8-bit integer from `a` to all elements of +/// the 128-bit returned value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastb_epi8) +pub fn _mm_broadcastb_epi8(a: __m128i) -> __m128i { + { + let ret = simd_shuffle(a.as_i8x16(), i8x16::ZERO(), [0_u32; 16]); + transmute::(ret) + } +} +/// Broadcasts the low packed 8-bit integer from `a` to all elements of +/// the 256-bit returned value. 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastb_epi8)
+pub fn _mm256_broadcastb_epi8(a: __m128i) -> __m256i {
+    {
+        let ret = simd_shuffle(a.as_i8x16(), i8x16::ZERO(), [0_u32; 32]);
+        transmute::<i8x32, _>(ret)
+    }
+}
+/// Broadcasts the low packed 32-bit integer from `a` to all elements of
+/// the 128-bit returned value.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastd_epi32)
+pub fn _mm_broadcastd_epi32(a: __m128i) -> __m128i {
+    {
+        let ret = simd_shuffle(a.as_i32x4(), i32x4::ZERO(), [0_u32; 4]);
+        transmute::<i32x4, _>(ret)
+    }
+}
+/// Broadcasts the low packed 32-bit integer from `a` to all elements of
+/// the 256-bit returned value.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastd_epi32)
+pub fn _mm256_broadcastd_epi32(a: __m128i) -> __m256i {
+    {
+        let ret = simd_shuffle(a.as_i32x4(), i32x4::ZERO(), [0_u32; 8]);
+        transmute::<i32x8, _>(ret)
+    }
+}
+/// Broadcasts the low packed 64-bit integer from `a` to all elements of
+/// the 128-bit returned value.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastq_epi64)
+pub fn _mm_broadcastq_epi64(a: __m128i) -> __m128i {
+    {
+        let ret = simd_shuffle(a.as_i64x2(), a.as_i64x2(), [0_u32; 2]);
+        transmute::<i64x2, _>(ret)
+    }
+}
+/// Broadcasts the low packed 64-bit integer from `a` to all elements of
+/// the 256-bit returned value.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastq_epi64)
+pub fn _mm256_broadcastq_epi64(a: __m128i) -> __m256i {
+    {
+        let ret = simd_shuffle(a.as_i64x2(), a.as_i64x2(), [0_u32; 4]);
+        transmute::<i64x4, _>(ret)
+    }
+}
+/// Broadcasts the low double-precision (64-bit) floating-point element
+/// from `a` to all elements of the 128-bit returned value.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastsd_pd)
+pub fn _mm_broadcastsd_pd(a: __m128d) -> __m128d {
+    {
+        transmute(simd_shuffle(
+            a.as_f64x2(),
+            _mm_setzero_pd().as_f64x2(),
+            [0_u32; 2],
+        ))
+    }
+}
+/// Broadcasts the low double-precision (64-bit) floating-point element
+/// from `a` to all elements of the 256-bit returned value.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastsd_pd)
+pub fn _mm256_broadcastsd_pd(a: __m128d) -> __m256d {
+    {
+        transmute(simd_shuffle(
+            a.as_f64x2(),
+            _mm_setzero_pd().as_f64x2(),
+            [0_u32; 4],
+        ))
+    }
+}
+/// Broadcasts 128 bits of integer data from a to all 128-bit lanes in
+/// the 256-bit returned value.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastsi128_si256)
+pub fn _mm_broadcastsi128_si256(a: __m128i) -> __m256i {
+    {
+        let ret = simd_shuffle(a.as_i64x2(), i64x2::ZERO(), [0, 1, 0, 1]);
+        transmute::<i64x4, _>(ret)
+    }
+}
+/// Broadcasts 128 bits of integer data from a to all 128-bit lanes in
+/// the 256-bit returned value.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastsi128_si256)
+pub fn _mm256_broadcastsi128_si256(a: __m128i) -> __m256i {
+    {
+        let ret = simd_shuffle(a.as_i64x2(), i64x2::ZERO(), [0, 1, 0, 1]);
+        transmute::<i64x4, _>(ret)
+    }
+}
+/// Broadcasts the low single-precision (32-bit) floating-point element
+/// from `a` to all elements of the 128-bit returned value.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastss_ps)
+pub fn _mm_broadcastss_ps(a: __m128) -> __m128 {
+    {
+        transmute(simd_shuffle(
+            a.as_f32x4(),
+            _mm_setzero_ps().as_f32x4(),
+            [0_u32; 4],
+        ))
+    }
+}
+/// Broadcasts the low single-precision (32-bit) floating-point element
+/// from `a` to all elements of the 256-bit returned value.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastss_ps)
+pub fn _mm256_broadcastss_ps(a: __m128) -> __m256 {
+    {
+        transmute(simd_shuffle(
+            a.as_f32x4(),
+            _mm_setzero_ps().as_f32x4(),
+            [0_u32; 8],
+        ))
+    }
+}
+/// Broadcasts the low packed 16-bit integer from a to all elements of
+/// the 128-bit returned value
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastw_epi16)
+pub fn _mm_broadcastw_epi16(a: __m128i) -> __m128i {
+    {
+        let ret = simd_shuffle(a.as_i16x8(), i16x8::ZERO(), [0_u32; 8]);
+        transmute::<i16x8, _>(ret)
+    }
+}
+/// Broadcasts the low packed 16-bit integer from a to all elements of
+/// the 256-bit returned value
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastw_epi16)
+pub fn _mm256_broadcastw_epi16(a: __m128i) -> __m256i {
+    {
+        let ret = simd_shuffle(a.as_i16x8(), i16x8::ZERO(), [0_u32; 16]);
+        transmute::<i16x16, _>(ret)
+    }
+}
+/// Compares packed 64-bit integers in `a` and `b` for equality.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi64)
+pub fn _mm256_cmpeq_epi64(a: __m256i, b: __m256i) -> __m256i {
+    {
+        transmute::<i64x4, _>(simd_eq(a.as_i64x4(), b.as_i64x4()))
+    }
+}
+/// Compares packed 32-bit integers in `a` and `b` for equality.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi32)
+pub fn _mm256_cmpeq_epi32(a: __m256i, b: __m256i) -> __m256i {
+    {
+        transmute::<i32x8, _>(simd_eq(a.as_i32x8(), b.as_i32x8()))
+    }
+}
+/// Compares packed 16-bit integers in `a` and `b` for equality.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi16)
+pub fn _mm256_cmpeq_epi16(a: __m256i, b: __m256i) -> __m256i {
+    {
+        transmute::<i16x16, _>(simd_eq(a.as_i16x16(), b.as_i16x16()))
+    }
+}
+/// Compares packed 8-bit integers in `a` and `b` for equality.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi8)
+pub fn _mm256_cmpeq_epi8(a: __m256i, b: __m256i) -> __m256i {
+    {
+        transmute::<i8x32, _>(simd_eq(a.as_i8x32(), b.as_i8x32()))
+    }
+}
+/// Compares packed 64-bit integers in `a` and `b` for greater-than.
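+///
+/// Note that, as in the scalar sketch below (illustrative only, the helper
+/// name is made up), each output element is an all-ones mask rather than a
+/// boolean `1`:
+///
+/// ```
+/// fn cmpgt_i64(a: i64, b: i64) -> i64 {
+///     if a > b { -1 } else { 0 }
+/// }
+/// ```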
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi64)
+pub fn _mm256_cmpgt_epi64(a: __m256i, b: __m256i) -> __m256i {
+    {
+        transmute::<i64x4, _>(simd_gt(a.as_i64x4(), b.as_i64x4()))
+    }
+}
+/// Compares packed 32-bit integers in `a` and `b` for greater-than.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi32)
+pub fn _mm256_cmpgt_epi32(a: __m256i, b: __m256i) -> __m256i {
+    {
+        transmute::<i32x8, _>(simd_gt(a.as_i32x8(), b.as_i32x8()))
+    }
+}
+/// Compares packed 16-bit integers in `a` and `b` for greater-than.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi16)
+pub fn _mm256_cmpgt_epi16(a: __m256i, b: __m256i) -> __m256i {
+    {
+        transmute::<i16x16, _>(simd_gt(a.as_i16x16(), b.as_i16x16()))
+    }
+}
+/// Compares packed 8-bit integers in `a` and `b` for greater-than.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi8)
+pub fn _mm256_cmpgt_epi8(a: __m256i, b: __m256i) -> __m256i {
+    {
+        transmute::<i8x32, _>(simd_gt(a.as_i8x32(), b.as_i8x32()))
+    }
+}
+/// Sign-extend 16-bit integers to 32-bit integers.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi16_epi32)
+pub fn _mm256_cvtepi16_epi32(a: __m128i) -> __m256i {
+    {
+        transmute::<i32x8, _>(simd_cast(a.as_i16x8()))
+    }
+}
+/// Sign-extend 16-bit integers to 64-bit integers.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi16_epi64)
+pub fn _mm256_cvtepi16_epi64(a: __m128i) -> __m256i {
+    {
+        let a = a.as_i16x8();
+        let v64: i16x4 = simd_shuffle(a, a, [0, 1, 2, 3]);
+        transmute::<i64x4, _>(simd_cast(v64))
+    }
+}
+/// Sign-extend 32-bit integers to 64-bit integers.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi32_epi64)
+pub fn _mm256_cvtepi32_epi64(a: __m128i) -> __m256i {
+    {
+        transmute::<i64x4, _>(simd_cast(a.as_i32x4()))
+    }
+}
+/// Sign-extend 8-bit integers to 16-bit integers.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi8_epi16)
+pub fn _mm256_cvtepi8_epi16(a: __m128i) -> __m256i {
+    {
+        transmute::<i16x16, _>(simd_cast(a.as_i8x16()))
+    }
+}
+/// Sign-extend 8-bit integers to 32-bit integers.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi8_epi32)
+pub fn _mm256_cvtepi8_epi32(a: __m128i) -> __m256i {
+    {
+        let a = a.as_i8x16();
+        let v64: i8x8 = simd_shuffle(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
+        transmute::<i32x8, _>(simd_cast(v64))
+    }
+}
+/// Sign-extend 8-bit integers to 64-bit integers.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi8_epi64)
+pub fn _mm256_cvtepi8_epi64(a: __m128i) -> __m256i {
+    {
+        let a = a.as_i8x16();
+        let v32: i8x4 = simd_shuffle(a, a, [0, 1, 2, 3]);
+        transmute::<i64x4, _>(simd_cast(v32))
+    }
+}
+/// Zeroes extend packed unsigned 16-bit integers in `a` to packed 32-bit
+/// integers, and stores the results in `dst`.
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu16_epi32) +pub fn _mm256_cvtepu16_epi32(a: __m128i) -> __m256i { + { + transmute(simd_cast::<8, _, u32>(a.as_u16x8())) + } +} +/// Zero-extend the lower four unsigned 16-bit integers in `a` to 64-bit +/// integers. The upper four elements of `a` are unused. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu16_epi64) +pub fn _mm256_cvtepu16_epi64(a: __m128i) -> __m256i { + { + let a = a.as_u16x8(); + let v64: u16x4 = simd_shuffle(a, a, [0, 1, 2, 3]); + transmute(simd_cast::<4, _, u64>(v64)) + } +} +/// Zero-extend unsigned 32-bit integers in `a` to 64-bit integers. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu32_epi64) +pub fn _mm256_cvtepu32_epi64(a: __m128i) -> __m256i { + { + transmute(simd_cast::<4, _, u64>(a.as_u32x4())) + } +} +/// Zero-extend unsigned 8-bit integers in `a` to 16-bit integers. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu8_epi16) +pub fn _mm256_cvtepu8_epi16(a: __m128i) -> __m256i { + { + transmute(simd_cast::<16, _, u16>(a.as_u8x16())) + } +} +/// Zero-extend the lower eight unsigned 8-bit integers in `a` to 32-bit +/// integers. The upper eight elements of `a` are unused. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu8_epi32) +pub fn _mm256_cvtepu8_epi32(a: __m128i) -> __m256i { + { + let a = a.as_u8x16(); + let v64: u8x8 = simd_shuffle(a, a, [0, 1, 2, 3, 4, 5, 6, 7]); + transmute(simd_cast::<8, _, u32>(v64)) + } +} +/// Zero-extend the lower four unsigned 8-bit integers in `a` to 64-bit +/// integers. The upper twelve elements of `a` are unused. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu8_epi64) +pub fn _mm256_cvtepu8_epi64(a: __m128i) -> __m256i { + { + let a = a.as_u8x16(); + let v32: u8x4 = simd_shuffle(a, a, [0, 1, 2, 3]); + transmute(simd_cast::<4, _, u64>(v32)) + } +} +/// Extracts 128 bits (of integer data) from `a` selected with `IMM1`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extracti128_si256) +pub fn _mm256_extracti128_si256(a: __m256i) -> __m128i { + static_assert_uimm_bits!(IMM1, 1); + { + let a = a.as_i64x4(); + let b = i64x4::ZERO(); + let dst: i64x2 = simd_shuffle(a, b, [[0, 1], [2, 3]][IMM1 as usize]); + transmute(dst) + } +} +/// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_epi16) +pub fn _mm256_hadd_epi16(a: __m256i, b: __m256i) -> __m256i { + { + transmute(phaddw(a.as_i16x16(), b.as_i16x16())) + } +} +/// Horizontally adds adjacent pairs of 32-bit integers in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_epi32) +pub fn _mm256_hadd_epi32(a: __m256i, b: __m256i) -> __m256i { + { + transmute(phaddd(a.as_i32x8(), b.as_i32x8())) + } +} +/// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b` +/// using saturation. 
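+///
+/// Illustrative per-lane sketch (the helper name is made up): within each
+/// 128-bit lane, pairs from `a` fill the low four results and pairs from `b`
+/// the high four, each sum saturating to the `i16` range.
+///
+/// ```
+/// fn hadds_lane(a: [i16; 8], b: [i16; 8]) -> [i16; 8] {
+///     let mut r = [0i16; 8];
+///     for i in 0..4 {
+///         r[i] = a[2 * i].saturating_add(a[2 * i + 1]);
+///         r[i + 4] = b[2 * i].saturating_add(b[2 * i + 1]);
+///     }
+///     r
+/// }
+/// ```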
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadds_epi16) +pub fn _mm256_hadds_epi16(a: __m256i, b: __m256i) -> __m256i { + { + transmute(phaddsw(a.as_i16x16(), b.as_i16x16())) + } +} +/// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_epi16) +pub fn _mm256_hsub_epi16(a: __m256i, b: __m256i) -> __m256i { + { + transmute(phsubw(a.as_i16x16(), b.as_i16x16())) + } +} +/// Horizontally subtract adjacent pairs of 32-bit integers in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_epi32) +pub fn _mm256_hsub_epi32(a: __m256i, b: __m256i) -> __m256i { + { + transmute(phsubd(a.as_i32x8(), b.as_i32x8())) + } +} +/// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b` +/// using saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsubs_epi16) +pub fn _mm256_hsubs_epi16(a: __m256i, b: __m256i) -> __m256i { + { + transmute(phsubsw(a.as_i16x16(), b.as_i16x16())) + } +} +/// Copies `a` to `dst`, then insert 128 bits (of integer data) from `b` at the +/// location specified by `IMM1`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_inserti128_si256) +pub fn _mm256_inserti128_si256(a: __m256i, b: __m128i) -> __m256i { + static_assert_uimm_bits!(IMM1, 1); + { + let a = a.as_i64x4(); + let b = _mm256_castsi128_si256(b).as_i64x4(); + let dst: i64x4 = simd_shuffle(a, b, [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize]); + transmute(dst) + } +} +/// Multiplies packed signed 16-bit integers in `a` and `b`, producing +/// intermediate signed 32-bit integers. Horizontally add adjacent pairs +/// of intermediate 32-bit integers. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_madd_epi16) +pub fn _mm256_madd_epi16(a: __m256i, b: __m256i) -> __m256i { + { + transmute(pmaddwd(a.as_i16x16(), b.as_i16x16())) + } +} +/// Vertically multiplies each unsigned 8-bit integer from `a` with the +/// corresponding signed 8-bit integer from `b`, producing intermediate +/// signed 16-bit integers. Horizontally add adjacent pairs of intermediate +/// signed 16-bit integers +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maddubs_epi16) +pub fn _mm256_maddubs_epi16(a: __m256i, b: __m256i) -> __m256i { + { + transmute(pmaddubsw(a.as_u8x32(), b.as_u8x32())) + } +} +/// Compares packed 16-bit integers in `a` and `b`, and returns the packed +/// maximum values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epi16) +pub fn _mm256_max_epi16(a: __m256i, b: __m256i) -> __m256i { + { + let a = a.as_i16x16(); + let b = b.as_i16x16(); + transmute(simd_select(simd_gt(a, b), a, b)) + } +} +/// Compares packed 32-bit integers in `a` and `b`, and returns the packed +/// maximum values. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epi32) +pub fn _mm256_max_epi32(a: __m256i, b: __m256i) -> __m256i { + { + let a = a.as_i32x8(); + let b = b.as_i32x8(); + transmute(simd_select(simd_gt(a, b), a, b)) + } +} +/// Compares packed 8-bit integers in `a` and `b`, and returns the packed +/// maximum values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epi8) +pub fn _mm256_max_epi8(a: __m256i, b: __m256i) -> __m256i { + { + let a = a.as_i8x32(); + let b = b.as_i8x32(); + transmute(simd_select(simd_gt(a, b), a, b)) + } +} +/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns +/// the packed maximum values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epu16) +pub fn _mm256_max_epu16(a: __m256i, b: __m256i) -> __m256i { + { + let a = a.as_u16x16(); + let b = b.as_u16x16(); + transmute(simd_select(simd_gt(a, b), a, b)) + } +} +/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns +/// the packed maximum values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epu32) +pub fn _mm256_max_epu32(a: __m256i, b: __m256i) -> __m256i { + { + let a = a.as_u32x8(); + let b = b.as_u32x8(); + transmute(simd_select(simd_gt(a, b), a, b)) + } +} +/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns +/// the packed maximum values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epu8) +pub fn _mm256_max_epu8(a: __m256i, b: __m256i) -> __m256i { + { + let a = a.as_u8x32(); + let b = b.as_u8x32(); + transmute(simd_select(simd_gt(a, b), a, b)) + } +} +/// Compares packed 16-bit integers in `a` and `b`, and returns the packed +/// minimum values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epi16) +pub fn _mm256_min_epi16(a: __m256i, b: __m256i) -> __m256i { + { + let a = a.as_i16x16(); + let b = b.as_i16x16(); + transmute(simd_select(simd_lt(a, b), a, b)) + } +} +/// Compares packed 32-bit integers in `a` and `b`, and returns the packed +/// minimum values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epi32) +pub fn _mm256_min_epi32(a: __m256i, b: __m256i) -> __m256i { + { + let a = a.as_i32x8(); + let b = b.as_i32x8(); + transmute(simd_select(simd_lt(a, b), a, b)) + } +} +/// Compares packed 8-bit integers in `a` and `b`, and returns the packed +/// minimum values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epi8) +pub fn _mm256_min_epi8(a: __m256i, b: __m256i) -> __m256i { + { + let a = a.as_i8x32(); + let b = b.as_i8x32(); + transmute(simd_select(simd_lt(a, b), a, b)) + } +} +/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns +/// the packed minimum values. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epu16) +pub fn _mm256_min_epu16(a: __m256i, b: __m256i) -> __m256i { + { + let a = a.as_u16x16(); + let b = b.as_u16x16(); + transmute(simd_select(simd_lt(a, b), a, b)) + } +} +/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns +/// the packed minimum values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epu32) +pub fn _mm256_min_epu32(a: __m256i, b: __m256i) -> __m256i { + { + let a = a.as_u32x8(); + let b = b.as_u32x8(); + transmute(simd_select(simd_lt(a, b), a, b)) + } +} +/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns +/// the packed minimum values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epu8) +pub fn _mm256_min_epu8(a: __m256i, b: __m256i) -> __m256i { + { + let a = a.as_u8x32(); + let b = b.as_u8x32(); + transmute(simd_select(simd_lt(a, b), a, b)) + } +} +/// Creates mask from the most significant bit of each 8-bit element in `a`, +/// return the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movemask_epi8) +pub fn _mm256_movemask_epi8(a: __m256i) -> i32 { + { + let z = i8x32::ZERO(); + let m: i8x32 = simd_lt(a.as_i8x32(), z); + simd_bitmask_little!(31, m, u32) as i32 + } +} +/// Computes the sum of absolute differences (SADs) of quadruplets of unsigned +/// 8-bit integers in `a` compared to those in `b`, and stores the 16-bit +/// results in dst. Eight SADs are performed for each 128-bit lane using one +/// quadruplet from `b` and eight quadruplets from `a`. One quadruplet is +/// selected from `b` starting at on the offset specified in `imm8`. Eight +/// quadruplets are formed from sequential 8-bit integers selected from `a` +/// starting at the offset specified in `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mpsadbw_epu8) +pub fn _mm256_mpsadbw_epu8(a: __m256i, b: __m256i) -> __m256i { + static_assert_uimm_bits!(IMM8, 8); + { + transmute(mpsadbw(a.as_u8x32(), b.as_u8x32(), IMM8 as i8)) + } +} +/// Multiplies the low 32-bit integers from each packed 64-bit element in +/// `a` and `b` +/// +/// Returns the 64-bit results. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_epi32) +pub fn _mm256_mul_epi32(a: __m256i, b: __m256i) -> __m256i { + { + let a = simd_cast::<4, _, i64>(simd_cast::<4, _, i32>(a.as_i64x4())); + let b = simd_cast::<4, _, i64>(simd_cast::<4, _, i32>(b.as_i64x4())); + transmute(simd_mul(a, b)) + } +} +/// Multiplies the low unsigned 32-bit integers from each packed 64-bit +/// element in `a` and `b` +/// +/// Returns the unsigned 64-bit results. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_epu32) +pub fn _mm256_mul_epu32(a: __m256i, b: __m256i) -> __m256i { + { + let a = a.as_u64x4(); + let b = b.as_u64x4(); + let mask = u64x4::splat(u32::MAX.into()); + transmute(simd_mul(simd_and(a, mask), simd_and(b, mask))) + } +} +/// Multiplies the packed 16-bit integers in `a` and `b`, producing +/// intermediate 32-bit integers and returning the high 16 bits of the +/// intermediate integers. 
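+///
+/// Per element this amounts to the following scalar computation (sketch only;
+/// the helper name is not part of the model):
+///
+/// ```
+/// fn mulhi_i16(a: i16, b: i16) -> i16 {
+///     // exact 32-bit product, keep the upper 16 bits
+///     (((a as i32) * (b as i32)) >> 16) as i16
+/// }
+/// ```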
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhi_epi16) +pub fn _mm256_mulhi_epi16(a: __m256i, b: __m256i) -> __m256i { + { + let a = simd_cast::<16, _, i32>(a.as_i16x16()); + let b = simd_cast::<16, _, i32>(b.as_i16x16()); + let r = simd_shr(simd_mul(a, b), i32x16::splat(16)); + transmute(simd_cast::<16, i32, i16>(r)) + } +} +/// Multiplies the packed unsigned 16-bit integers in `a` and `b`, producing +/// intermediate 32-bit integers and returning the high 16 bits of the +/// intermediate integers. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhi_epu16) +pub fn _mm256_mulhi_epu16(a: __m256i, b: __m256i) -> __m256i { + { + let a = simd_cast::<16, _, u32>(a.as_u16x16()); + let b = simd_cast::<16, _, u32>(b.as_u16x16()); + let r = simd_shr(simd_mul(a, b), u32x16::splat(16)); + transmute(simd_cast::<16, u32, u16>(r)) + } +} +/// Multiplies the packed 16-bit integers in `a` and `b`, producing +/// intermediate 32-bit integers, and returns the low 16 bits of the +/// intermediate integers +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mullo_epi16) +pub fn _mm256_mullo_epi16(a: __m256i, b: __m256i) -> __m256i { + { + transmute(simd_mul(a.as_i16x16(), b.as_i16x16())) + } +} +/// Multiplies the packed 32-bit integers in `a` and `b`, producing +/// intermediate 64-bit integers, and returns the low 32 bits of the +/// intermediate integers +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mullo_epi32) +pub fn _mm256_mullo_epi32(a: __m256i, b: __m256i) -> __m256i { + { + transmute(simd_mul(a.as_i32x8(), b.as_i32x8())) + } +} +/// Multiplies packed 16-bit integers in `a` and `b`, producing +/// intermediate signed 32-bit integers. Truncate each intermediate +/// integer to the 18 most significant bits, round by adding 1, and +/// return bits `[16:1]`. 
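+///
+/// A scalar sketch of the rounding step (illustrative only; the helper name is
+/// made up):
+///
+/// ```
+/// fn mulhrs_i16(a: i16, b: i16) -> i16 {
+///     // 32-bit product, drop the low 14 bits, round by adding 1, take bits [16:1]
+///     (((((a as i32) * (b as i32)) >> 14) + 1) >> 1) as i16
+/// }
+/// ```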
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhrs_epi16) +pub fn _mm256_mulhrs_epi16(a: __m256i, b: __m256i) -> __m256i { + { + transmute(pmulhrsw(a.as_i16x16(), b.as_i16x16())) + } +} +/// Computes the bitwise OR of 256 bits (representing integer data) in `a` +/// and `b` +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_or_si256) +pub fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i { + { + transmute(simd_or(a.as_i32x8(), b.as_i32x8())) + } +} +/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers +/// using signed saturation +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi16) +pub fn _mm256_packs_epi16(a: __m256i, b: __m256i) -> __m256i { + { + transmute(packsswb(a.as_i16x16(), b.as_i16x16())) + } +} +/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers +/// using signed saturation +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi32) +pub fn _mm256_packs_epi32(a: __m256i, b: __m256i) -> __m256i { + { + transmute(packssdw(a.as_i32x8(), b.as_i32x8())) + } +} +/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers +/// using unsigned saturation +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi16) +pub fn _mm256_packus_epi16(a: __m256i, b: __m256i) -> __m256i { + { + transmute(packuswb(a.as_i16x16(), b.as_i16x16())) + } +} +/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers +/// using unsigned saturation +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi32) +pub fn _mm256_packus_epi32(a: __m256i, b: __m256i) -> __m256i { + { + transmute(packusdw(a.as_i32x8(), b.as_i32x8())) + } +} +/// Permutes packed 32-bit integers from `a` according to the content of `b`. +/// +/// The last 3 bits of each integer of `b` are used as addresses into the 8 +/// integers of `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar8x32_epi32) +pub fn _mm256_permutevar8x32_epi32(a: __m256i, b: __m256i) -> __m256i { + { + transmute(permd(a.as_u32x8(), b.as_u32x8())) + } +} +/// Permutes 64-bit integers from `a` using control mask `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute4x64_epi64) +pub fn _mm256_permute4x64_epi64(a: __m256i) -> __m256i { + static_assert_uimm_bits!(IMM8, 8); + { + let zero = i64x4::ZERO(); + let r: i64x4 = simd_shuffle( + a.as_i64x4(), + zero, + [ + IMM8 as u32 & 0b11, + (IMM8 as u32 >> 2) & 0b11, + (IMM8 as u32 >> 4) & 0b11, + (IMM8 as u32 >> 6) & 0b11, + ], + ); + transmute(r) + } +} +/// Shuffles 128-bits of integer data selected by `imm8` from `a` and `b`. 
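+///
+/// Illustrative sketch of how the control byte is decoded (the helper name and
+/// the `u128`-pair representation are made up for exposition): each nibble
+/// selects one of the four source halves, and its bit 3 zeroes that half
+/// instead.
+///
+/// ```
+/// fn permute2x128(a: [u128; 2], b: [u128; 2], imm8: u8) -> [u128; 2] {
+///     let pick = |ctl: u8| -> u128 {
+///         if ctl & 0b1000 != 0 {
+///             0
+///         } else {
+///             [a[0], a[1], b[0], b[1]][(ctl & 0b11) as usize]
+///         }
+///     };
+///     [pick(imm8 & 0x0f), pick(imm8 >> 4)]
+/// }
+/// ```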
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2x128_si256) +pub fn _mm256_permute2x128_si256(a: __m256i, b: __m256i) -> __m256i { + static_assert_uimm_bits!(IMM8, 8); + { + transmute(vperm2i128(a.as_i64x4(), b.as_i64x4(), IMM8 as i8)) + } +} +/// Shuffles 64-bit floating-point elements in `a` across lanes using the +/// control in `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute4x64_pd) +// NOTE: Not modeled yet +// pub fn _mm256_permute4x64_pd(a: __m256d) -> __m256d { +// static_assert_uimm_bits!(IMM8, 8); +// { +// transmute(simd_shuffle( +// a, _mm256_undefined_pd(), [IMM8 as u32 & 0b11, (IMM8 as u32 >> 2) & 0b11, +// (IMM8 as u32 >> 4) & 0b11, (IMM8 as u32 >> 6) & 0b11,], +// )) +// } +// } + +/// Shuffles eight 32-bit floating-point elements in `a` across lanes using +/// the corresponding 32-bit integer index in `idx`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar8x32_ps) +// NOTE: Not modeled yet +// pub fn _mm256_permutevar8x32_ps(a: __m256, idx: __m256i) -> __m256 { +// { permps(a, idx.as_i32x8()) } +// } + +/// Computes the absolute differences of packed unsigned 8-bit integers in `a` +/// and `b`, then horizontally sum each consecutive 8 differences to +/// produce four unsigned 16-bit integers, and pack these unsigned 16-bit +/// integers in the low 16 bits of the 64-bit return value +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sad_epu8) +pub fn _mm256_sad_epu8(a: __m256i, b: __m256i) -> __m256i { + { + transmute(psadbw(a.as_u8x32(), b.as_u8x32())) + } +} +/// Shuffles bytes from `a` according to the content of `b`. +/// +/// For each of the 128-bit low and high halves of the vectors, the last +/// 4 bits of each byte of `b` are used as addresses into the respective +/// low or high 16 bytes of `a`. That is, the halves are shuffled separately. +/// +/// In addition, if the highest significant bit of a byte of `b` is set, the +/// respective destination byte is set to 0. +/// +/// Picturing `a` and `b` as `[u8; 32]`, `_mm256_shuffle_epi8` is logically +/// equivalent to: +/// +/// ``` +/// fn mm256_shuffle_epi8(a: [u8; 32], b: [u8; 32]) -> [u8; 32] { +/// let mut r = [0; 32]; +/// for i in 0..16 { +/// // if the most significant bit of b is set, +/// // then the destination byte is set to 0. +/// if b[i] & 0x80 == 0u8 { +/// r[i] = a[(b[i] % 16) as usize]; +/// } +/// if b[i + 16] & 0x80 == 0u8 { +/// r[i + 16] = a[(b[i + 16] % 16 + 16) as usize]; +/// } +/// } +/// r +/// } +/// ``` +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_epi8) +pub fn _mm256_shuffle_epi8(a: __m256i, b: __m256i) -> __m256i { + { + transmute(pshufb(a.as_u8x32(), b.as_u8x32())) + } +} +/// Shuffles 32-bit integers in 128-bit lanes of `a` using the control in +/// `imm8`. 
+/// +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_epi32) +pub fn _mm256_shuffle_epi32(a: __m256i) -> __m256i { + static_assert_uimm_bits!(MASK, 8); + { + let r: i32x8 = simd_shuffle( + a.as_i32x8(), + a.as_i32x8(), + [ + MASK as u32 & 0b11, + (MASK as u32 >> 2) & 0b11, + (MASK as u32 >> 4) & 0b11, + (MASK as u32 >> 6) & 0b11, + (MASK as u32 & 0b11) + 4, + ((MASK as u32 >> 2) & 0b11) + 4, + ((MASK as u32 >> 4) & 0b11) + 4, + ((MASK as u32 >> 6) & 0b11) + 4, + ], + ); + transmute(r) + } +} +/// Shuffles 16-bit integers in the high 64 bits of 128-bit lanes of `a` using +/// the control in `imm8`. The low 64 bits of 128-bit lanes of `a` are copied +/// to the output. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shufflehi_epi16) +pub fn _mm256_shufflehi_epi16(a: __m256i) -> __m256i { + static_assert_uimm_bits!(IMM8, 8); + { + let a = a.as_i16x16(); + let r: i16x16 = simd_shuffle( + a, + a, + [ + 0, + 1, + 2, + 3, + 4 + (IMM8 as u32 & 0b11), + 4 + ((IMM8 as u32 >> 2) & 0b11), + 4 + ((IMM8 as u32 >> 4) & 0b11), + 4 + ((IMM8 as u32 >> 6) & 0b11), + 8, + 9, + 10, + 11, + 12 + (IMM8 as u32 & 0b11), + 12 + ((IMM8 as u32 >> 2) & 0b11), + 12 + ((IMM8 as u32 >> 4) & 0b11), + 12 + ((IMM8 as u32 >> 6) & 0b11), + ], + ); + transmute(r) + } +} +/// Shuffles 16-bit integers in the low 64 bits of 128-bit lanes of `a` using +/// the control in `imm8`. The high 64 bits of 128-bit lanes of `a` are copied +/// to the output. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shufflelo_epi16) +pub fn _mm256_shufflelo_epi16(a: __m256i) -> __m256i { + static_assert_uimm_bits!(IMM8, 8); + { + let a = a.as_i16x16(); + let r: i16x16 = simd_shuffle( + a, + a, + [ + 0 + (IMM8 as u32 & 0b11), + 0 + ((IMM8 as u32 >> 2) & 0b11), + 0 + ((IMM8 as u32 >> 4) & 0b11), + 0 + ((IMM8 as u32 >> 6) & 0b11), + 4, + 5, + 6, + 7, + 8 + (IMM8 as u32 & 0b11), + 8 + ((IMM8 as u32 >> 2) & 0b11), + 8 + ((IMM8 as u32 >> 4) & 0b11), + 8 + ((IMM8 as u32 >> 6) & 0b11), + 12, + 13, + 14, + 15, + ], + ); + transmute(r) + } +} +/// Negates packed 16-bit integers in `a` when the corresponding signed +/// 16-bit integer in `b` is negative, and returns the results. +/// Results are zeroed out when the corresponding element in `b` is zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sign_epi16) +pub fn _mm256_sign_epi16(a: __m256i, b: __m256i) -> __m256i { + { + transmute(psignw(a.as_i16x16(), b.as_i16x16())) + } +} +/// Negates packed 32-bit integers in `a` when the corresponding signed +/// 32-bit integer in `b` is negative, and returns the results. +/// Results are zeroed out when the corresponding element in `b` is zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sign_epi32) +pub fn _mm256_sign_epi32(a: __m256i, b: __m256i) -> __m256i { + { + transmute(psignd(a.as_i32x8(), b.as_i32x8())) + } +} +/// Negates packed 8-bit integers in `a` when the corresponding signed +/// 8-bit integer in `b` is negative, and returns the results. +/// Results are zeroed out when the corresponding element in `b` is zero. 
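+///
+/// Per element this is roughly the following scalar rule (sketch only; the
+/// helper name is made up):
+///
+/// ```
+/// fn sign_i8(a: i8, b: i8) -> i8 {
+///     if b < 0 {
+///         a.wrapping_neg() // i8::MIN stays i8::MIN under two's-complement negation
+///     } else if b == 0 {
+///         0
+///     } else {
+///         a
+///     }
+/// }
+/// ```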
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sign_epi8) +pub fn _mm256_sign_epi8(a: __m256i, b: __m256i) -> __m256i { + { + transmute(psignb(a.as_i8x32(), b.as_i8x32())) + } +} +/// Shifts packed 16-bit integers in `a` left by `count` while +/// shifting in zeros, and returns the result +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sll_epi16) +pub fn _mm256_sll_epi16(a: __m256i, count: __m128i) -> __m256i { + { + transmute(psllw(a.as_i16x16(), count.as_i16x8())) + } +} +/// Shifts packed 32-bit integers in `a` left by `count` while +/// shifting in zeros, and returns the result +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sll_epi32) +pub fn _mm256_sll_epi32(a: __m256i, count: __m128i) -> __m256i { + { + transmute(pslld(a.as_i32x8(), count.as_i32x4())) + } +} +/// Shifts packed 64-bit integers in `a` left by `count` while +/// shifting in zeros, and returns the result +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sll_epi64) +pub fn _mm256_sll_epi64(a: __m256i, count: __m128i) -> __m256i { + { + transmute(psllq(a.as_i64x4(), count.as_i64x2())) + } +} +/// Shifts packed 16-bit integers in `a` left by `IMM8` while +/// shifting in zeros, return the results; +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_epi16) +pub fn _mm256_slli_epi16(a: __m256i) -> __m256i { + static_assert_uimm_bits!(IMM8, 8); + { + if IMM8 >= 16 { + _mm256_setzero_si256() + } else { + transmute(simd_shl(a.as_u16x16(), u16x16::splat(IMM8 as u16))) + } + } +} +/// Shifts packed 32-bit integers in `a` left by `IMM8` while +/// shifting in zeros, return the results; +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_epi32) +pub fn _mm256_slli_epi32(a: __m256i) -> __m256i { + { + static_assert_uimm_bits!(IMM8, 8); + if IMM8 >= 32 { + _mm256_setzero_si256() + } else { + transmute(simd_shl(a.as_u32x8(), u32x8::splat(IMM8 as u32))) + } + } +} +/// Shifts packed 64-bit integers in `a` left by `IMM8` while +/// shifting in zeros, return the results; +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_epi64) +pub fn _mm256_slli_epi64(a: __m256i) -> __m256i { + { + static_assert_uimm_bits!(IMM8, 8); + if IMM8 >= 64 { + _mm256_setzero_si256() + } else { + transmute(simd_shl(a.as_u64x4(), u64x4::splat(IMM8 as u64))) + } + } +} +/// Shifts 128-bit lanes in `a` left by `imm8` bytes while shifting in zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_si256) +pub fn _mm256_slli_si256(a: __m256i) -> __m256i { + static_assert_uimm_bits!(IMM8, 8); + _mm256_bslli_epi128::(a) +} + +/// Shifts 128-bit lanes in `a` left by `imm8` bytes while shifting in zeros. 
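+///
+/// Illustrative per-lane sketch (the helper name is made up): bytes move
+/// towards the high end of each 128-bit lane and zeros are shifted in at the
+/// low end.
+///
+/// ```
+/// fn bslli_lane(a: [u8; 16], shift: usize) -> [u8; 16] {
+///     let mut r = [0u8; 16];
+///     for i in 0..16 {
+///         if i >= shift {
+///             r[i] = a[i - shift];
+///         }
+///     }
+///     r
+/// }
+/// ```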
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_bslli_epi128) +pub fn _mm256_bslli_epi128(a: __m256i) -> __m256i { + static_assert_uimm_bits!(IMM8, 8); + const fn mask(shift: i32, i: u32) -> u32 { + let shift = shift as u32 & 0xff; + if shift > 15 || i % 16 < shift { + 0 + } else { + 32 + (i - shift) + } + } + { + let a = a.as_i8x32(); + let r: i8x32 = simd_shuffle( + i8x32::ZERO(), + a, + [ + mask(IMM8, 0), + mask(IMM8, 1), + mask(IMM8, 2), + mask(IMM8, 3), + mask(IMM8, 4), + mask(IMM8, 5), + mask(IMM8, 6), + mask(IMM8, 7), + mask(IMM8, 8), + mask(IMM8, 9), + mask(IMM8, 10), + mask(IMM8, 11), + mask(IMM8, 12), + mask(IMM8, 13), + mask(IMM8, 14), + mask(IMM8, 15), + mask(IMM8, 16), + mask(IMM8, 17), + mask(IMM8, 18), + mask(IMM8, 19), + mask(IMM8, 20), + mask(IMM8, 21), + mask(IMM8, 22), + mask(IMM8, 23), + mask(IMM8, 24), + mask(IMM8, 25), + mask(IMM8, 26), + mask(IMM8, 27), + mask(IMM8, 28), + mask(IMM8, 29), + mask(IMM8, 30), + mask(IMM8, 31), + ], + ); + transmute(r) + } +} +/// Shifts packed 32-bit integers in `a` left by the amount +/// specified by the corresponding element in `count` while +/// shifting in zeros, and returns the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sllv_epi32) +pub fn _mm_sllv_epi32(a: __m128i, count: __m128i) -> __m128i { + { + transmute(psllvd(a.as_i32x4(), count.as_i32x4())) + } +} +/// Shifts packed 32-bit integers in `a` left by the amount +/// specified by the corresponding element in `count` while +/// shifting in zeros, and returns the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sllv_epi32) +pub fn _mm256_sllv_epi32(a: __m256i, count: __m256i) -> __m256i { + { + transmute(psllvd256(a.as_i32x8(), count.as_i32x8())) + } +} +/// Shifts packed 64-bit integers in `a` left by the amount +/// specified by the corresponding element in `count` while +/// shifting in zeros, and returns the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sllv_epi64) +pub fn _mm_sllv_epi64(a: __m128i, count: __m128i) -> __m128i { + { + transmute(psllvq(a.as_i64x2(), count.as_i64x2())) + } +} +/// Shifts packed 64-bit integers in `a` left by the amount +/// specified by the corresponding element in `count` while +/// shifting in zeros, and returns the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sllv_epi64) +pub fn _mm256_sllv_epi64(a: __m256i, count: __m256i) -> __m256i { + { + transmute(psllvq256(a.as_i64x4(), count.as_i64x4())) + } +} +/// Shifts packed 16-bit integers in `a` right by `count` while +/// shifting in sign bits. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sra_epi16) +pub fn _mm256_sra_epi16(a: __m256i, count: __m128i) -> __m256i { + { + transmute(psraw(a.as_i16x16(), count.as_i16x8())) + } +} +/// Shifts packed 32-bit integers in `a` right by `count` while +/// shifting in sign bits. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sra_epi32) +pub fn _mm256_sra_epi32(a: __m256i, count: __m128i) -> __m256i { + { + transmute(psrad(a.as_i32x8(), count.as_i32x4())) + } +} +/// Shifts packed 16-bit integers in `a` right by `IMM8` while +/// shifting in sign bits. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srai_epi16) +pub fn _mm256_srai_epi16(a: __m256i) -> __m256i { + static_assert_uimm_bits!(IMM8, 8); + { + transmute(simd_shr(a.as_i16x16(), i16x16::splat(IMM8.min(15) as i16))) + } +} +/// Shifts packed 32-bit integers in `a` right by `IMM8` while +/// shifting in sign bits. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srai_epi32) +pub fn _mm256_srai_epi32(a: __m256i) -> __m256i { + static_assert_uimm_bits!(IMM8, 8); + { + transmute(simd_shr(a.as_i32x8(), i32x8::splat(IMM8.min(31)))) + } +} +/// Shifts packed 32-bit integers in `a` right by the amount specified by the +/// corresponding element in `count` while shifting in sign bits. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srav_epi32) +pub fn _mm_srav_epi32(a: __m128i, count: __m128i) -> __m128i { + { + transmute(psravd(a.as_i32x4(), count.as_i32x4())) + } +} +/// Shifts packed 32-bit integers in `a` right by the amount specified by the +/// corresponding element in `count` while shifting in sign bits. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srav_epi32) +pub fn _mm256_srav_epi32(a: __m256i, count: __m256i) -> __m256i { + { + transmute(psravd256(a.as_i32x8(), count.as_i32x8())) + } +} +/// Shifts 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_si256) +pub fn _mm256_srli_si256(a: __m256i) -> __m256i { + static_assert_uimm_bits!(IMM8, 8); + _mm256_bsrli_epi128::(a) +} +/// Shifts 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_bsrli_epi128) +pub fn _mm256_bsrli_epi128(a: __m256i) -> __m256i { + static_assert_uimm_bits!(IMM8, 8); + const fn mask(shift: i32, i: u32) -> u32 { + let shift = shift as u32 & 0xff; + if shift > 15 || (15 - (i % 16)) < shift { + 0 + } else { + 32 + (i + shift) + } + } + { + let a = a.as_i8x32(); + let r: i8x32 = simd_shuffle( + i8x32::ZERO(), + a, + [ + mask(IMM8, 0), + mask(IMM8, 1), + mask(IMM8, 2), + mask(IMM8, 3), + mask(IMM8, 4), + mask(IMM8, 5), + mask(IMM8, 6), + mask(IMM8, 7), + mask(IMM8, 8), + mask(IMM8, 9), + mask(IMM8, 10), + mask(IMM8, 11), + mask(IMM8, 12), + mask(IMM8, 13), + mask(IMM8, 14), + mask(IMM8, 15), + mask(IMM8, 16), + mask(IMM8, 17), + mask(IMM8, 18), + mask(IMM8, 19), + mask(IMM8, 20), + mask(IMM8, 21), + mask(IMM8, 22), + mask(IMM8, 23), + mask(IMM8, 24), + mask(IMM8, 25), + mask(IMM8, 26), + mask(IMM8, 27), + mask(IMM8, 28), + mask(IMM8, 29), + mask(IMM8, 30), + mask(IMM8, 31), + ], + ); + transmute(r) + } +} +/// Shifts packed 16-bit integers in `a` right by `count` while shifting in +/// zeros. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srl_epi16) +pub fn _mm256_srl_epi16(a: __m256i, count: __m128i) -> __m256i { + { + transmute(psrlw(a.as_i16x16(), count.as_i16x8())) + } +} +/// Shifts packed 32-bit integers in `a` right by `count` while shifting in +/// zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srl_epi32) +pub fn _mm256_srl_epi32(a: __m256i, count: __m128i) -> __m256i { + { + transmute(psrld(a.as_i32x8(), count.as_i32x4())) + } +} +/// Shifts packed 64-bit integers in `a` right by `count` while shifting in +/// zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srl_epi64) +pub fn _mm256_srl_epi64(a: __m256i, count: __m128i) -> __m256i { + { + transmute(psrlq(a.as_i64x4(), count.as_i64x2())) + } +} +/// Shifts packed 16-bit integers in `a` right by `IMM8` while shifting in +/// zeros +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_epi16) +pub fn _mm256_srli_epi16(a: __m256i) -> __m256i { + static_assert_uimm_bits!(IMM8, 8); + { + if IMM8 >= 16 { + _mm256_setzero_si256() + } else { + transmute(simd_shr(a.as_u16x16(), u16x16::splat(IMM8 as u16))) + } + } +} +/// Shifts packed 32-bit integers in `a` right by `IMM8` while shifting in +/// zeros +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_epi32) +pub fn _mm256_srli_epi32(a: __m256i) -> __m256i { + static_assert_uimm_bits!(IMM8, 8); + { + if IMM8 >= 32 { + _mm256_setzero_si256() + } else { + transmute(simd_shr(a.as_u32x8(), u32x8::splat(IMM8 as u32))) + } + } +} +/// Shifts packed 64-bit integers in `a` right by `IMM8` while shifting in +/// zeros +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_epi64) +pub fn _mm256_srli_epi64(a: __m256i) -> __m256i { + static_assert_uimm_bits!(IMM8, 8); + { + if IMM8 >= 64 { + _mm256_setzero_si256() + } else { + transmute(simd_shr(a.as_u64x4(), u64x4::splat(IMM8 as u64))) + } + } +} +/// Shifts packed 32-bit integers in `a` right by the amount specified by +/// the corresponding element in `count` while shifting in zeros, +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srlv_epi32) +pub fn _mm_srlv_epi32(a: __m128i, count: __m128i) -> __m128i { + { + transmute(psrlvd(a.as_i32x4(), count.as_i32x4())) + } +} +/// Shifts packed 32-bit integers in `a` right by the amount specified by +/// the corresponding element in `count` while shifting in zeros, +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srlv_epi32) +pub fn _mm256_srlv_epi32(a: __m256i, count: __m256i) -> __m256i { + { + transmute(psrlvd256(a.as_i32x8(), count.as_i32x8())) + } +} +/// Shifts packed 64-bit integers in `a` right by the amount specified by +/// the corresponding element in `count` while shifting in zeros, +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srlv_epi64) +pub fn _mm_srlv_epi64(a: __m128i, count: __m128i) -> __m128i { + { + transmute(psrlvq(a.as_i64x2(), count.as_i64x2())) + } +} +/// Shifts packed 64-bit integers in `a` right by the amount specified by +/// the 
corresponding element in `count` while shifting in zeros, +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srlv_epi64) +pub fn _mm256_srlv_epi64(a: __m256i, count: __m256i) -> __m256i { + { + transmute(psrlvq256(a.as_i64x4(), count.as_i64x4())) + } +} +/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a` +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi16) +pub fn _mm256_sub_epi16(a: __m256i, b: __m256i) -> __m256i { + { + transmute(simd_sub(a.as_i16x16(), b.as_i16x16())) + } +} +/// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a` +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi32) +pub fn _mm256_sub_epi32(a: __m256i, b: __m256i) -> __m256i { + { + transmute(simd_sub(a.as_i32x8(), b.as_i32x8())) + } +} +/// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a` +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi64) +pub fn _mm256_sub_epi64(a: __m256i, b: __m256i) -> __m256i { + { + transmute(simd_sub(a.as_i64x4(), b.as_i64x4())) + } +} +/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a` +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi8) +pub fn _mm256_sub_epi8(a: __m256i, b: __m256i) -> __m256i { + { + transmute(simd_sub(a.as_i8x32(), b.as_i8x32())) + } +} +/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in +/// `a` using saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epi16) +pub fn _mm256_subs_epi16(a: __m256i, b: __m256i) -> __m256i { + { + transmute(simd_saturating_sub(a.as_i16x16(), b.as_i16x16())) + } +} +/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in +/// `a` using saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epi8) +pub fn _mm256_subs_epi8(a: __m256i, b: __m256i) -> __m256i { + { + transmute(simd_saturating_sub(a.as_i8x32(), b.as_i8x32())) + } +} +/// Subtract packed unsigned 16-bit integers in `b` from packed 16-bit +/// integers in `a` using saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epu16) +pub fn _mm256_subs_epu16(a: __m256i, b: __m256i) -> __m256i { + { + transmute(simd_saturating_sub(a.as_u16x16(), b.as_u16x16())) + } +} +/// Subtract packed unsigned 8-bit integers in `b` from packed 8-bit +/// integers in `a` using saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epu8) +pub fn _mm256_subs_epu8(a: __m256i, b: __m256i) -> __m256i { + { + transmute(simd_saturating_sub(a.as_u8x32(), b.as_u8x32())) + } +} +/// Unpacks and interleave 8-bit integers from the high half of each +/// 128-bit lane in `a` and `b`. 
+/// +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi8) +pub fn _mm256_unpackhi_epi8(a: __m256i, b: __m256i) -> __m256i { + { + #[rustfmt::skip] + let r: i8x32 = simd_shuffle( + a.as_i8x32(), b.as_i8x32(), [8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, + 14, 46, 15, 47, 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, + 63,] + ); + transmute(r) + } +} +/// Unpacks and interleave 8-bit integers from the low half of each +/// 128-bit lane of `a` and `b`. +/// +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi8) +pub fn _mm256_unpacklo_epi8(a: __m256i, b: __m256i) -> __m256i { + { + #[rustfmt::skip] + let r: i8x32 = simd_shuffle( + a.as_i8x32(), b.as_i8x32(), [0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, + 7, 39, 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55,] + ); + transmute(r) + } +} +/// Unpacks and interleave 16-bit integers from the high half of each +/// 128-bit lane of `a` and `b`. +/// +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi16) +pub fn _mm256_unpackhi_epi16(a: __m256i, b: __m256i) -> __m256i { + { + let r: i16x16 = simd_shuffle( + a.as_i16x16(), + b.as_i16x16(), + [4, 20, 5, 21, 6, 22, 7, 23, 12, 28, 13, 29, 14, 30, 15, 31], + ); + transmute(r) + } +} +/// Unpacks and interleave 16-bit integers from the low half of each +/// 128-bit lane of `a` and `b`. +/// +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi16) +pub fn _mm256_unpacklo_epi16(a: __m256i, b: __m256i) -> __m256i { + { + let r: i16x16 = simd_shuffle( + a.as_i16x16(), + b.as_i16x16(), + [0, 16, 1, 17, 2, 18, 3, 19, 8, 24, 9, 25, 10, 26, 11, 27], + ); + transmute(r) + } +} +/// Unpacks and interleave 32-bit integers from the high half of each +/// 128-bit lane of `a` and `b`. +/// +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi32) +pub fn _mm256_unpackhi_epi32(a: __m256i, b: __m256i) -> __m256i { + { + let r: i32x8 = simd_shuffle(a.as_i32x8(), b.as_i32x8(), [2, 10, 3, 11, 6, 14, 7, 15]); + transmute(r) + } +} +/// Unpacks and interleave 32-bit integers from the low half of each +/// 128-bit lane of `a` and `b`. +/// +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi32) +pub fn _mm256_unpacklo_epi32(a: __m256i, b: __m256i) -> __m256i { + { + let r: i32x8 = simd_shuffle(a.as_i32x8(), b.as_i32x8(), [0, 8, 1, 9, 4, 12, 5, 13]); + transmute(r) + } +} +/// Unpacks and interleave 64-bit integers from the high half of each +/// 128-bit lane of `a` and `b`. +/// +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi64) +pub fn _mm256_unpackhi_epi64(a: __m256i, b: __m256i) -> __m256i { + { + let r: i64x4 = simd_shuffle(a.as_i64x4(), b.as_i64x4(), [1, 5, 3, 7]); + transmute(r) + } +} +/// Unpacks and interleave 64-bit integers from the low half of each +/// 128-bit lane of `a` and `b`. 
+/// +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi64) +pub fn _mm256_unpacklo_epi64(a: __m256i, b: __m256i) -> __m256i { + { + let r: i64x4 = simd_shuffle(a.as_i64x4(), b.as_i64x4(), [0, 4, 2, 6]); + transmute(r) + } +} +/// Computes the bitwise XOR of 256 bits (representing integer data) +/// in `a` and `b` +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_xor_si256) +pub fn _mm256_xor_si256(a: __m256i, b: __m256i) -> __m256i { + { + transmute(simd_xor(a.as_i64x4(), b.as_i64x4())) + } +} +/// Extracts an 8-bit integer from `a`, selected with `INDEX`. Returns a 32-bit +/// integer containing the zero-extended integer data. +/// +/// See [LLVM commit D20468](https://reviews.llvm.org/D20468). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extract_epi8) +pub fn _mm256_extract_epi8(a: __m256i) -> i32 { + static_assert_uimm_bits!(INDEX, 5); + { + simd_extract(a.as_u8x32(), INDEX as u32) as i32 + } +} +/// Extracts a 16-bit integer from `a`, selected with `INDEX`. Returns a 32-bit +/// integer containing the zero-extended integer data. +/// +/// See [LLVM commit D20468](https://reviews.llvm.org/D20468). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extract_epi16) +pub fn _mm256_extract_epi16(a: __m256i) -> i32 { + static_assert_uimm_bits!(INDEX, 4); + { + simd_extract(a.as_u16x16(), INDEX as u32) as i32 + } +} diff --git a/testable-simd-models/src/core_arch/x86/models/avx2_handwritten.rs b/testable-simd-models/src/core_arch/x86/models/avx2_handwritten.rs new file mode 100644 index 0000000000000..43f0a840b54bd --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/models/avx2_handwritten.rs @@ -0,0 +1,620 @@ +use crate::abstractions::{bit::MachineInteger, simd::*}; +pub fn phaddw(a: i16x16, b: i16x16) -> i16x16 { + i16x16::from_fn(|i| { + if i < 4 { + a[2 * i].wrapping_add(a[2 * i + 1]) + } else if i < 8 { + b[2 * (i - 4)].wrapping_add(b[2 * (i - 4) + 1]) + } else if i < 12 { + a[2 * (i - 4)].wrapping_add(a[2 * (i - 4) + 1]) + } else { + b[2 * (i - 8)].wrapping_add(b[2 * (i - 8) + 1]) + } + }) +} + +pub fn phaddd(a: i32x8, b: i32x8) -> i32x8 { + i32x8::from_fn(|i| { + if i < 2 { + a[2 * i].wrapping_add(a[2 * i + 1]) + } else if i < 4 { + b[2 * (i - 2)].wrapping_add(b[2 * (i - 2) + 1]) + } else if i < 6 { + a[2 * (i - 2)].wrapping_add(a[2 * (i - 2) + 1]) + } else { + b[2 * (i - 4)].wrapping_add(b[2 * (i - 4) + 1]) + } + }) +} + +pub fn phaddsw(a: i16x16, b: i16x16) -> i16x16 { + i16x16::from_fn(|i| { + if i < 4 { + a[2 * i].saturating_add(a[2 * i + 1]) + } else if i < 8 { + b[2 * (i - 4)].saturating_add(b[2 * (i - 4) + 1]) + } else if i < 12 { + a[2 * (i - 4)].saturating_add(a[2 * (i - 4) + 1]) + } else { + b[2 * (i - 8)].saturating_add(b[2 * (i - 8) + 1]) + } + }) +} + +pub fn phsubw(a: i16x16, b: i16x16) -> i16x16 { + i16x16::from_fn(|i| { + if i < 4 { + a[2 * i].wrapping_sub(a[2 * i + 1]) + } else if i < 8 { + b[2 * (i - 4)].wrapping_sub(b[2 * (i - 4) + 1]) + } else if i < 12 { + a[2 * (i - 4)].wrapping_sub(a[2 * (i - 4) + 1]) + } else { + b[2 * (i - 8)].wrapping_sub(b[2 * (i - 8) + 1]) + } + }) +} + +pub fn phsubd(a: i32x8, b: i32x8) -> i32x8 { + i32x8::from_fn(|i| { + if i < 2 { + a[2 * i].wrapping_sub(a[2 * i + 1]) + } else if i < 4 { + b[2 * (i - 2)].wrapping_sub(b[2 * (i - 2) + 1]) + } else 
if i < 6 { + a[2 * (i - 2)].wrapping_sub(a[2 * (i - 2) + 1]) + } else { + b[2 * (i - 4)].wrapping_sub(b[2 * (i - 4) + 1]) + } + }) +} + +pub fn phsubsw(a: i16x16, b: i16x16) -> i16x16 { + i16x16::from_fn(|i| { + if i < 4 { + a[2 * i].saturating_sub(a[2 * i + 1]) + } else if i < 8 { + b[2 * (i - 4)].saturating_sub(b[2 * (i - 4) + 1]) + } else if i < 12 { + a[2 * (i - 4)].saturating_sub(a[2 * (i - 4) + 1]) + } else { + b[2 * (i - 8)].saturating_sub(b[2 * (i - 8) + 1]) + } + }) +} +pub fn pmaddwd(a: i16x16, b: i16x16) -> i32x8 { + i32x8::from_fn(|i| { + (a[2 * i] as i32) * (b[2 * i] as i32) + (a[2 * i + 1] as i32) * (b[2 * i + 1] as i32) + }) +} + +pub fn pmaddubsw(a: u8x32, b: u8x32) -> i16x16 { + i16x16::from_fn(|i| { + ((a[2 * i] as u8 as u16 as i16) * (b[2 * i] as i8 as i16)) + .saturating_add((a[2 * i + 1] as u8 as u16 as i16) * (b[2 * i + 1] as i8 as i16)) + }) +} +pub fn packsswb(a: i16x16, b: i16x16) -> i8x32 { + i8x32::from_fn(|i| { + if i < 8 { + if a[i] > (i8::MAX as i16) { + i8::MAX + } else if a[i] < (i8::MIN as i16) { + i8::MIN + } else { + a[i] as i8 + } + } else if i < 16 { + if b[i - 8] > (i8::MAX as i16) { + i8::MAX + } else if b[i - 8] < (i8::MIN as i16) { + i8::MIN + } else { + b[i - 8] as i8 + } + } else if i < 24 { + if a[i - 8] > (i8::MAX as i16) { + i8::MAX + } else if a[i - 8] < (i8::MIN as i16) { + i8::MIN + } else { + a[i - 8] as i8 + } + } else { + if b[i - 16] > (i8::MAX as i16) { + i8::MAX + } else if b[i - 16] < (i8::MIN as i16) { + i8::MIN + } else { + b[i - 16] as i8 + } + } + }) +} + +pub fn packssdw(a: i32x8, b: i32x8) -> i16x16 { + i16x16::from_fn(|i| { + if i < 4 { + if a[i] > (i16::MAX as i32) { + i16::MAX + } else if a[i] < (i16::MIN as i32) { + i16::MIN + } else { + a[i] as i16 + } + } else if i < 8 { + if b[i - 4] > (i16::MAX as i32) { + i16::MAX + } else if b[i - 4] < (i16::MIN as i32) { + i16::MIN + } else { + b[i - 4] as i16 + } + } else if i < 12 { + if a[i - 4] > (i16::MAX as i32) { + i16::MAX + } else if a[i - 4] < (i16::MIN as i32) { + i16::MIN + } else { + a[i - 4] as i16 + } + } else { + if b[i - 8] > (i16::MAX as i32) { + i16::MAX + } else if b[i - 8] < (i16::MIN as i32) { + i16::MIN + } else { + b[i - 8] as i16 + } + } + }) +} + +pub fn packuswb(a: i16x16, b: i16x16) -> u8x32 { + u8x32::from_fn(|i| { + if i < 8 { + if a[i] > (u8::MAX as i16) { + u8::MAX + } else if a[i] < (u8::MIN as i16) { + u8::MIN + } else { + a[i] as u8 + } + } else if i < 16 { + if b[i - 8] > (u8::MAX as i16) { + u8::MAX + } else if b[i - 8] < (u8::MIN as i16) { + u8::MIN + } else { + b[i - 8] as u8 + } + } else if i < 24 { + if a[i - 8] > (u8::MAX as i16) { + u8::MAX + } else if a[i - 8] < (u8::MIN as i16) { + u8::MIN + } else { + a[i - 8] as u8 + } + } else { + if b[i - 16] > (u8::MAX as i16) { + u8::MAX + } else if b[i - 16] < (u8::MIN as i16) { + u8::MIN + } else { + b[i - 16] as u8 + } + } + }) +} + +pub fn packusdw(a: i32x8, b: i32x8) -> u16x16 { + u16x16::from_fn(|i| { + if i < 4 { + if a[i] > (u16::MAX as i32) { + u16::MAX + } else if a[i] < (u16::MIN as i32) { + u16::MIN + } else { + a[i] as u16 + } + } else if i < 8 { + if b[i - 4] > (u16::MAX as i32) { + u16::MAX + } else if b[i - 4] < (u16::MIN as i32) { + u16::MIN + } else { + b[i - 4] as u16 + } + } else if i < 12 { + if a[i - 4] > (u16::MAX as i32) { + u16::MAX + } else if a[i - 4] < (u16::MIN as i32) { + u16::MIN + } else { + a[i - 4] as u16 + } + } else { + if b[i - 8] > (u16::MAX as i32) { + u16::MAX + } else if b[i - 8] < (u16::MIN as i32) { + u16::MIN + } else { + b[i - 8] as u16 + } + } + }) +} + 
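The saturating pack helpers above are exactly the kind of model that the crate's `tests` folder (described in the README) checks against `core::arch` on random inputs. As a minimal illustration, here is a hypothetical sanity check for `packsswb`, written only against APIs visible in this diff (`from_fn`, lane indexing, the fixed-width vector types); the crate's real test macros are not shown, and the module placement inside `avx2_handwritten.rs` is an assumption.

```rust
// Hypothetical sketch, not part of the patch: a hand-rolled check that the
// `packsswb` model saturates and orders its 128-bit lanes as documented.
#[cfg(test)]
mod packsswb_sanity {
    use super::*;

    #[test]
    fn saturates_and_interleaves_lanes() {
        // 300 overflows an i8 and must clamp to i8::MAX; -300 clamps to i8::MIN.
        let a = i16x16::from_fn(|_| 300);
        let b = i16x16::from_fn(|_| -300);
        let r = packsswb(a, b);
        for i in 0u32..32 {
            // Lanes 0..8 and 16..24 are packed from `a`, the others from `b`.
            let expected = if i < 8 || (16..24).contains(&i) {
                i8::MAX
            } else {
                i8::MIN
            };
            assert_eq!(r[i], expected);
        }
    }
}
```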
+pub fn psignb(a: i8x32, b: i8x32) -> i8x32 {
+    i8x32::from_fn(|i| {
+        if b[i] < 0 {
+            if a[i] == i8::MIN {
+                a[i]
+            } else {
+                -a[i]
+            }
+        } else if b[i] > 0 {
+            a[i]
+        } else {
+            0
+        }
+    })
+}
+pub fn psignw(a: i16x16, b: i16x16) -> i16x16 {
+    i16x16::from_fn(|i| {
+        if b[i] < 0 {
+            if a[i] == i16::MIN {
+                a[i]
+            } else {
+                -a[i]
+            }
+        } else if b[i] > 0 {
+            a[i]
+        } else {
+            0
+        }
+    })
+}
+
+pub fn psignd(a: i32x8, b: i32x8) -> i32x8 {
+    i32x8::from_fn(|i| {
+        if b[i] < 0 {
+            if a[i] == i32::MIN {
+                a[i]
+            } else {
+                -a[i]
+            }
+        } else if b[i] > 0 {
+            a[i]
+        } else {
+            0
+        }
+    })
+}
+
+pub fn psllw(a: i16x16, count: i16x8) -> i16x16 {
+    let count4 = (count[0] as u16) as u64;
+    let count3 = ((count[1] as u16) as u64) * 65536;
+    let count2 = ((count[2] as u16) as u64) * 4294967296;
+    let count1 = ((count[3] as u16) as u64) * 281474976710656;
+    let count = count1 + count2 + count3 + count4;
+    i16x16::from_fn(|i| {
+        if count > 15 {
+            0
+        } else {
+            ((a[i] as u16) << count) as i16
+        }
+    })
+}
+
+pub fn pslld(a: i32x8, count: i32x4) -> i32x8 {
+    let count = ((count[1] as u32) as u64) * 4294967296 + ((count[0] as u32) as u64);
+
+    i32x8::from_fn(|i| {
+        if count > 31 {
+            0
+        } else {
+            ((a[i] as u32) << count) as i32
+        }
+    })
+}
+pub fn psllq(a: i64x4, count: i64x2) -> i64x4 {
+    // The shift count and the lanes are 64-bit; truncating to u32 would drop the high bits.
+    let count = count[0] as u64;
+
+    i64x4::from_fn(|i| {
+        if count > 63 {
+            0
+        } else {
+            ((a[i] as u64) << count) as i64
+        }
+    })
+}
+
+pub fn psllvd(a: i32x4, count: i32x4) -> i32x4 {
+    i32x4::from_fn(|i| {
+        if count[i] > 31 || count[i] < 0 {
+            0
+        } else {
+            ((a[i] as u32) << count[i]) as i32
+        }
+    })
+}
+pub fn psllvd256(a: i32x8, count: i32x8) -> i32x8 {
+    i32x8::from_fn(|i| {
+        if count[i] > 31 || count[i] < 0 {
+            0
+        } else {
+            ((a[i] as u32) << count[i]) as i32
+        }
+    })
+}
+
+pub fn psllvq(a: i64x2, count: i64x2) -> i64x2 {
+    i64x2::from_fn(|i| {
+        if count[i] > 63 || count[i] < 0 {
+            0
+        } else {
+            ((a[i] as u64) << count[i]) as i64
+        }
+    })
+}
+pub fn psllvq256(a: i64x4, count: i64x4) -> i64x4 {
+    i64x4::from_fn(|i| {
+        if count[i] > 63 || count[i] < 0 {
+            0
+        } else {
+            ((a[i] as u64) << count[i]) as i64
+        }
+    })
+}
+
+pub fn psraw(a: i16x16, count: i16x8) -> i16x16 {
+    let count = ((count[3] as u16) as u64) * 281474976710656
+        + ((count[2] as u16) as u64) * 4294967296
+        + ((count[1] as u16) as u64) * 65536
+        + ((count[0] as u16) as u64);
+
+    i16x16::from_fn(|i| {
+        if count > 15 {
+            if a[i] < 0 {
+                -1
+            } else {
+                0
+            }
+        } else {
+            a[i] >> count
+        }
+    })
+}
+
+pub fn psrad(a: i32x8, count: i32x4) -> i32x8 {
+    let count = ((count[1] as u32) as u64) * 4294967296 + ((count[0] as u32) as u64);
+
+    i32x8::from_fn(|i| {
+        if count > 31 {
+            if a[i] < 0 {
+                -1
+            } else {
+                0
+            }
+        } else {
+            // arithmetic shift right, shifting in sign bits
+            a[i] >> count
+        }
+    })
+}
+
+pub fn psravd(a: i32x4, count: i32x4) -> i32x4 {
+    i32x4::from_fn(|i| {
+        if count[i] > 31 || count[i] < 0 {
+            if a[i] < 0 {
+                -1
+            } else {
+                0
+            }
+        } else {
+            a[i] >> count[i]
+        }
+    })
+}
+
+pub fn psravd256(a: i32x8, count: i32x8) -> i32x8 {
+    i32x8::from_fn(|i| {
+        if count[i] > 31 || count[i] < 0 {
+            if a[i] < 0 {
+                -1
+            } else {
+                0
+            }
+        } else {
+            a[i] >> count[i]
+        }
+    })
+}
+
+pub fn psrlw(a: i16x16, count: i16x8) -> i16x16 {
+    let count = (count[3] as u16 as u64) * 281474976710656
+        + (count[2] as u16 as u64) * 4294967296
+        + (count[1] as u16 as u64) * 65536
+        + (count[0] as u16 as u64);
+
+    i16x16::from_fn(|i| {
+        if count > 15 {
+            0
+        } else {
+            ((a[i] as u16) >> count) as i16
+        }
+    })
+}
+
+pub fn psrld(a: i32x8, count: i32x4) -> i32x8 {
+    let count = ((count[1] as u32) as u64) * 4294967296 + ((count[0] as u32) as u64);
+
+    i32x8::from_fn(|i| {
+        if count > 31 {
+            0
+        } else {
+            ((a[i] as u32) >> count) as i32
+        }
+    })
+}
+
+pub fn psrlq(a: i64x4, count: i64x2) -> i64x4 {
+    let count: u64 = count[0] as u64;
+
+    i64x4::from_fn(|i| {
+        if count > 63 {
+            0
+        } else {
+            ((a[i] as u64) >> count) as i64
+        }
+    })
+}
+
+pub fn psrlvd(a: i32x4, count: i32x4) -> i32x4 {
+    i32x4::from_fn(|i| {
+        if count[i] > 31 || count[i] < 0 {
+            0
+        } else {
+            ((a[i] as u32) >> count[i]) as i32
+        }
+    })
+}
+
+pub fn psrlvd256(a: i32x8, count: i32x8) -> i32x8 {
+    i32x8::from_fn(|i| {
+        if count[i] > 31 || count[i] < 0 {
+            0
+        } else {
+            ((a[i] as u32) >> count[i]) as i32
+        }
+    })
+}
+
+pub fn psrlvq(a: i64x2, count: i64x2) -> i64x2 {
+    i64x2::from_fn(|i| {
+        if count[i] > 63 || count[i] < 0 {
+            0
+        } else {
+            ((a[i] as u64) >> count[i]) as i64
+        }
+    })
+}
+pub fn psrlvq256(a: i64x4, count: i64x4) -> i64x4 {
+    i64x4::from_fn(|i| {
+        if count[i] > 63 || count[i] < 0 {
+            0
+        } else {
+            ((a[i] as u64) >> count[i]) as i64
+        }
+    })
+}
+
+pub fn pshufb(a: u8x32, b: u8x32) -> u8x32 {
+    u8x32::from_fn(|i| {
+        if i < 16 {
+            if b[i] > 127 {
+                0
+            } else {
+                let index = (b[i] % 16) as u32;
+                a[index]
+            }
+        } else {
+            if b[i] > 127 {
+                0
+            } else {
+                let index = (b[i] % 16) as u32;
+                a[index + 16]
+            }
+        }
+    })
+}
+
+pub fn permd(a: u32x8, b: u32x8) -> u32x8 {
+    u32x8::from_fn(|i| {
+        let id = b[i] % 8;
+        a[id]
+    })
+}
+
+pub fn mpsadbw(a: u8x32, b: u8x32, imm8: i8) -> u16x16 {
+    u16x16::from_fn(|i| {
+        if i < 8 {
+            let a_offset = (((imm8 & 4) >> 2) * 4) as u32;
+            let b_offset = ((imm8 & 3) * 4) as u32;
+            let k = a_offset + i;
+            let l = b_offset;
+            ((a[k].wrapping_abs_diff(b[l]) as i8) as u8 as u16)
+                + ((a[k + 1].wrapping_abs_diff(b[l + 1]) as i8) as u8 as u16)
+                + ((a[k + 2].wrapping_abs_diff(b[l + 2]) as i8) as u8 as u16)
+                + ((a[k + 3].wrapping_abs_diff(b[l + 3]) as i8) as u8 as u16)
+        } else {
+            let i = i - 8;
+            let imm8 = imm8 >> 3;
+            let a_offset = (((imm8 & 4) >> 2) * 4) as u32;
+            let b_offset = ((imm8 & 3) * 4) as u32;
+            let k = a_offset + i;
+            let l = b_offset;
+            ((a[16 + k].wrapping_abs_diff(b[16 + l]) as i8) as u8 as u16)
+                + ((a[16 + k + 1].wrapping_abs_diff(b[16 + l + 1]) as i8) as u8 as u16)
+                + ((a[16 + k + 2].wrapping_abs_diff(b[16 + l + 2]) as i8) as u8 as u16)
+                + ((a[16 + k + 3].wrapping_abs_diff(b[16 + l + 3]) as i8) as u8 as u16)
+        }
+    })
+}
+
+pub fn vperm2i128(a: i64x4, b: i64x4, imm8: i8) -> i64x4 {
+    let a = i128x2::from_fn(|i| {
+        ((a[2 * i] as u64 as u128) + ((a[2 * i + 1] as u64 as u128) << 64)) as i128
+    });
+    let b = i128x2::from_fn(|i| {
+        ((b[2 * i] as u64 as u128) + ((b[2 * i + 1] as u64 as u128) << 64)) as i128
+    });
+    let imm8 = imm8 as u8 as u32 as i32;
+    let r = i128x2::from_fn(|i| {
+        let control = imm8 >> (i * 4);
+        if (control >> 3) % 2 == 1 {
+            0
+        } else {
+            match control % 4 {
+                0 => a[0],
+                1 => a[1],
+                2 => b[0],
+                3 => b[1],
+                _ => unreachable!(),
+            }
+        }
+    });
+    i64x4::from_fn(|i| {
+        let index = i >> 1;
+        let hilo = i.rem_euclid(2);
+        let val = r[index];
+        if hilo == 0 {
+            i64::cast(val)
+        } else {
+            i64::cast(val >> 64)
+        }
+    })
+}
+pub fn pmulhrsw(a: i16x16, b: i16x16) -> i16x16 {
+    i16x16::from_fn(|i| {
+        let temp = (a[i] as i32) * (b[i] as i32);
+        let temp = (temp >> 14).wrapping_add(1) >> 1;
+        temp as i16
+    })
+}
+
+pub fn psadbw(a: u8x32, b: u8x32) -> u64x4 {
+    let tmp = u8x32::from_fn(|i| a[i].wrapping_abs_diff(b[i]));
+    u64x4::from_fn(|i| {
+        (tmp[i * 8] as u16)
+            .wrapping_add(tmp[i * 8 + 1] as u16)
+            .wrapping_add(tmp[i * 8 + 2] as
u16) + .wrapping_add(tmp[i * 8 + 3] as u16) + .wrapping_add(tmp[i * 8 + 4] as u16) + .wrapping_add(tmp[i * 8 + 5] as u16) + .wrapping_add(tmp[i * 8 + 6] as u16) + .wrapping_add(tmp[i * 8 + 7] as u16) as u64 + }) +} diff --git a/testable-simd-models/src/core_arch/x86/models/avx_handwritten.rs b/testable-simd-models/src/core_arch/x86/models/avx_handwritten.rs new file mode 100644 index 0000000000000..ba61996851392 --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/models/avx_handwritten.rs @@ -0,0 +1,31 @@ +use crate::abstractions::simd::*; + +pub fn vperm2f128si256(a: i32x8, b: i32x8, imm8: i8) -> i32x8 { + let temp = i128x2::from_fn(|i| match (imm8 as u8) >> (i * 4) { + 0 => (a[4 * i] as i128) + 16 * (a[4 * i + 1] as i128), + 1 => (a[4 * i + 2] as i128) + 16 * (a[4 * i + 3] as i128), + 2 => (b[4 * i] as i128) + 16 * (b[4 * i + 1] as i128), + 3 => (b[4 * i + 2] as i128) + 16 * (b[4 * i + 3] as i128), + _ => unreachable!(), + }); + + i32x8::from_fn(|i| (temp[if i < 4 { 0 } else { 1 }] >> (i % 4)) as i32) +} + +pub fn ptestz256(a: i64x4, b: i64x4) -> i32 { + let c = i64x4::from_fn(|i| a[i] & b[i]); + if c == i64x4::ZERO() { + 1 + } else { + 0 + } +} + +pub fn ptestc256(a: i64x4, b: i64x4) -> i32 { + let c = i64x4::from_fn(|i| !a[i] & b[i]); + if c == i64x4::ZERO() { + 1 + } else { + 0 + } +} diff --git a/testable-simd-models/src/core_arch/x86/models/mod.rs b/testable-simd-models/src/core_arch/x86/models/mod.rs new file mode 100644 index 0000000000000..79b660019c07c --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/models/mod.rs @@ -0,0 +1,48 @@ +//! Rust models for x86 intrinsics. +//! +//! This module contains models for the intrinsics as they are defined in the Rust core. +//! Since this is supposed to model the Rust core, the implemented functions must +//! mirror the Rust implementations as closely as they can. +//! +//! For example, calls to simd functions like simd_add and simd_sub are left as is, +//! with their implementations defined in `crate::abstractions::simd`. Some other +//! operations like simd_cast or simd_shuffle might need a little modification +//! for correct compilation. +//! +//! Calls to transmute are replaced with either an explicit call to a `BitVec::from_ function`, +//! or with `.into()`. +//! +//! Sometimes, an intrinsic in Rust is implemented by directly using the corresponding +//! LLVM instruction via an `unsafe extern "C"` module. In those cases, the corresponding +//! function is defined in the `c_extern` module in each file, which contain manually +//! written implementations made by consulting the appropriate Intel documentation. +//! +//! In general, it is best to gain an idea of how an implementation should be written by looking +//! at how other functions are implemented. Also see `core::arch::x86` for [reference](https://github.com/rust-lang/stdarch/tree/master/crates/core_arch). 
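Because the module documentation above states the modeling conventions only in prose, a small before/after sketch may help. The model side is taken verbatim from `sse2.rs` later in this patch; the upstream side is paraphrased from memory and is not claimed to be the exact `core_arch` source.

```rust
// Upstream core_arch (paraphrased): an #[inline], target-feature-gated
// intrinsic whose body transmutes the result of a generic SIMD operation.
//
//     #[inline]
//     #[target_feature(enable = "sse2")]
//     pub fn _mm_add_epi8(a: __m128i, b: __m128i) -> __m128i {
//         unsafe { transmute(simd_add(a.as_i8x16(), b.as_i8x16())) }
//     }
//
// Model in this crate (as it appears in sse2.rs below): the same body, but a
// plain safe function in which `__m128i` is `BitVec<128>` and `transmute`,
// `simd_add` and `as_i8x16` come from `crate::abstractions`.
pub fn _mm_add_epi8(a: __m128i, b: __m128i) -> __m128i {
    transmute(simd_add(a.as_i8x16(), b.as_i8x16()))
}
```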
+ +pub mod avx; +pub mod avx2; +pub mod avx2_handwritten; +pub mod avx_handwritten; +pub mod sse; +pub mod sse2; +pub mod sse2_handwritten; +pub mod ssse3; +pub mod ssse3_handwritten; + +pub(crate) mod types { + use crate::abstractions::bitvec::*; + + #[allow(non_camel_case_types)] + pub type __m256i = BitVec<256>; + #[allow(non_camel_case_types)] + pub type __m256 = BitVec<256>; + #[allow(non_camel_case_types)] + pub type __m256d = BitVec<256>; + #[allow(non_camel_case_types)] + pub type __m128 = BitVec<128>; + #[allow(non_camel_case_types)] + pub type __m128i = BitVec<128>; + #[allow(non_camel_case_types)] + pub type __m128d = BitVec<128>; +} diff --git a/testable-simd-models/src/core_arch/x86/models/sse.rs b/testable-simd-models/src/core_arch/x86/models/sse.rs new file mode 100644 index 0000000000000..f975c2814438a --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/models/sse.rs @@ -0,0 +1,21 @@ +//! Streaming SIMD Extensions (SSE) +use super::types::*; +use crate::abstractions::simd::*; +use crate::abstractions::utilities::*; + +/// Returns vector of type __m128 with indeterminate elements.with indetermination elements. +/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically +/// picks some valid value and is not equivalent to [`mem::MaybeUninit`]. +/// In practice, this is typically equivalent to [`mem::zeroed`]. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_ps) +pub fn _mm_undefined_ps() -> __m128 { + transmute(f32x4::ZERO()) +} + +/// Construct a `__m128` with all elements initialized to zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_ps) +pub fn _mm_setzero_ps() -> __m128 { + transmute(f32x4::ZERO()) +} diff --git a/testable-simd-models/src/core_arch/x86/models/sse2.rs b/testable-simd-models/src/core_arch/x86/models/sse2.rs new file mode 100644 index 0000000000000..c9c90e3e9e267 --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/models/sse2.rs @@ -0,0 +1,1618 @@ +//! Streaming SIMD Extensions 2 (SSE2) +use super::sse2_handwritten::*; +use super::types::*; +use crate::abstractions::simd::*; +use crate::abstractions::utilities::*; + +/// Adds packed 8-bit integers in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi8) +pub fn _mm_add_epi8(a: __m128i, b: __m128i) -> __m128i { + transmute(simd_add(a.as_i8x16(), b.as_i8x16())) +} +/// Adds packed 16-bit integers in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi16) +pub fn _mm_add_epi16(a: __m128i, b: __m128i) -> __m128i { + transmute(simd_add(a.as_i16x8(), b.as_i16x8())) +} +/// Adds packed 32-bit integers in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi32) +pub fn _mm_add_epi32(a: __m128i, b: __m128i) -> __m128i { + transmute(simd_add(a.as_i32x4(), b.as_i32x4())) +} +/// Adds packed 64-bit integers in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi64) +pub fn _mm_add_epi64(a: __m128i, b: __m128i) -> __m128i { + transmute(simd_add(a.as_i64x2(), b.as_i64x2())) +} +/// Adds packed 8-bit integers in `a` and `b` using saturation. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi8) +pub fn _mm_adds_epi8(a: __m128i, b: __m128i) -> __m128i { + transmute(simd_saturating_add(a.as_i8x16(), b.as_i8x16())) +} +/// Adds packed 16-bit integers in `a` and `b` using saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi16) +pub fn _mm_adds_epi16(a: __m128i, b: __m128i) -> __m128i { + transmute(simd_saturating_add(a.as_i16x8(), b.as_i16x8())) +} +/// Adds packed unsigned 8-bit integers in `a` and `b` using saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu8) +pub fn _mm_adds_epu8(a: __m128i, b: __m128i) -> __m128i { + transmute(simd_saturating_add(a.as_u8x16(), b.as_u8x16())) +} +/// Adds packed unsigned 16-bit integers in `a` and `b` using saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu16) +pub fn _mm_adds_epu16(a: __m128i, b: __m128i) -> __m128i { + transmute(simd_saturating_add(a.as_u16x8(), b.as_u16x8())) +} +/// Averages packed unsigned 8-bit integers in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu8) +pub fn _mm_avg_epu8(a: __m128i, b: __m128i) -> __m128i { + { + let a = simd_cast::<16, _, u16>(a.as_u8x16()); + let b = simd_cast::<16, _, u16>(b.as_u8x16()); + let r = simd_shr(simd_add(simd_add(a, b), u16x16::splat(1)), u16x16::splat(1)); + transmute(simd_cast::<16, _, u8>(r)) + } +} +/// Averages packed unsigned 16-bit integers in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu16) +pub fn _mm_avg_epu16(a: __m128i, b: __m128i) -> __m128i { + { + let a = simd_cast::<8, _, u32>(a.as_u16x8()); + let b = simd_cast::<8, _, u32>(b.as_u16x8()); + let r = simd_shr(simd_add(simd_add(a, b), u32x8::splat(1)), u32x8::splat(1)); + transmute(simd_cast::<8, _, u16>(r)) + } +} +/// Multiplies and then horizontally add signed 16 bit integers in `a` and `b`. +/// +/// Multiplies packed signed 16-bit integers in `a` and `b`, producing +/// intermediate signed 32-bit integers. Horizontally add adjacent pairs of +/// intermediate 32-bit integers. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd_epi16) +pub fn _mm_madd_epi16(a: __m128i, b: __m128i) -> __m128i { + transmute(pmaddwd(a.as_i16x8(), b.as_i16x8())) +} +/// Compares packed 16-bit integers in `a` and `b`, and returns the packed +/// maximum values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi16) +pub fn _mm_max_epi16(a: __m128i, b: __m128i) -> __m128i { + { + let a = a.as_i16x8(); + let b = b.as_i16x8(); + transmute(simd_select(simd_gt(a, b), a, b)) + } +} +/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns the +/// packed maximum values. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu8) +pub fn _mm_max_epu8(a: __m128i, b: __m128i) -> __m128i { + { + let a = a.as_u8x16(); + let b = b.as_u8x16(); + transmute(simd_select(simd_gt(a, b), a, b)) + } +} +/// Compares packed 16-bit integers in `a` and `b`, and returns the packed +/// minimum values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi16) +pub fn _mm_min_epi16(a: __m128i, b: __m128i) -> __m128i { + { + let a = a.as_i16x8(); + let b = b.as_i16x8(); + transmute(simd_select(simd_lt(a, b), a, b)) + } +} +/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns the +/// packed minimum values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu8) +pub fn _mm_min_epu8(a: __m128i, b: __m128i) -> __m128i { + { + let a = a.as_u8x16(); + let b = b.as_u8x16(); + transmute(simd_select(simd_lt(a, b), a, b)) + } +} +/// Multiplies the packed 16-bit integers in `a` and `b`. +/// +/// The multiplication produces intermediate 32-bit integers, and returns the +/// high 16 bits of the intermediate integers. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epi16) +pub fn _mm_mulhi_epi16(a: __m128i, b: __m128i) -> __m128i { + { + let a = simd_cast::<8, _, i32>(a.as_i16x8()); + let b = simd_cast::<8, _, i32>(b.as_i16x8()); + let r = simd_shr(simd_mul(a, b), i32x8::splat(16)); + transmute(simd_cast::<8, i32, i16>(r)) + } +} +/// Multiplies the packed unsigned 16-bit integers in `a` and `b`. +/// +/// The multiplication produces intermediate 32-bit integers, and returns the +/// high 16 bits of the intermediate integers. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epu16) +pub fn _mm_mulhi_epu16(a: __m128i, b: __m128i) -> __m128i { + { + let a = simd_cast::<8, _, u32>(a.as_u16x8()); + let b = simd_cast::<8, _, u32>(b.as_u16x8()); + let r = simd_shr(simd_mul(a, b), u32x8::splat(16)); + transmute(simd_cast::<8, u32, u16>(r)) + } +} +/// Multiplies the packed 16-bit integers in `a` and `b`. +/// +/// The multiplication produces intermediate 32-bit integers, and returns the +/// low 16 bits of the intermediate integers. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi16) +pub fn _mm_mullo_epi16(a: __m128i, b: __m128i) -> __m128i { + transmute(simd_mul(a.as_i16x8(), b.as_i16x8())) +} +/// Multiplies the low unsigned 32-bit integers from each packed 64-bit element +/// in `a` and `b`. +/// +/// Returns the unsigned 64-bit results. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epu32) +pub fn _mm_mul_epu32(a: __m128i, b: __m128i) -> __m128i { + { + let a = a.as_u64x2(); + let b = b.as_u64x2(); + let mask = u64x2::splat(u32::MAX.into()); + transmute(simd_mul(simd_and(a, mask), simd_and(b, mask))) + } +} +/// Sum the absolute differences of packed unsigned 8-bit integers. +/// +/// Computes the absolute differences of packed unsigned 8-bit integers in `a` +/// and `b`, then horizontally sum each consecutive 8 differences to produce +/// two unsigned 16-bit integers, and pack these unsigned 16-bit integers in +/// the low 16 bits of 64-bit elements returned. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_epu8) +pub fn _mm_sad_epu8(a: __m128i, b: __m128i) -> __m128i { + transmute(psadbw(a.as_u8x16(), b.as_u8x16())) +} +/// Subtracts packed 8-bit integers in `b` from packed 8-bit integers in `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi8) +pub fn _mm_sub_epi8(a: __m128i, b: __m128i) -> __m128i { + transmute(simd_sub(a.as_i8x16(), b.as_i8x16())) +} +/// Subtracts packed 16-bit integers in `b` from packed 16-bit integers in `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi16) +pub fn _mm_sub_epi16(a: __m128i, b: __m128i) -> __m128i { + transmute(simd_sub(a.as_i16x8(), b.as_i16x8())) +} +/// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi32) +pub fn _mm_sub_epi32(a: __m128i, b: __m128i) -> __m128i { + transmute(simd_sub(a.as_i32x4(), b.as_i32x4())) +} +/// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi64) +pub fn _mm_sub_epi64(a: __m128i, b: __m128i) -> __m128i { + transmute(simd_sub(a.as_i64x2(), b.as_i64x2())) +} +/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a` +/// using saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi8) +pub fn _mm_subs_epi8(a: __m128i, b: __m128i) -> __m128i { + transmute(simd_saturating_sub(a.as_i8x16(), b.as_i8x16())) +} +/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a` +/// using saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi16) +pub fn _mm_subs_epi16(a: __m128i, b: __m128i) -> __m128i { + transmute(simd_saturating_sub(a.as_i16x8(), b.as_i16x8())) +} +/// Subtract packed unsigned 8-bit integers in `b` from packed unsigned 8-bit +/// integers in `a` using saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu8) +pub fn _mm_subs_epu8(a: __m128i, b: __m128i) -> __m128i { + transmute(simd_saturating_sub(a.as_u8x16(), b.as_u8x16())) +} +/// Subtract packed unsigned 16-bit integers in `b` from packed unsigned 16-bit +/// integers in `a` using saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu16) +pub fn _mm_subs_epu16(a: __m128i, b: __m128i) -> __m128i { + transmute(simd_saturating_sub(a.as_u16x8(), b.as_u16x8())) +} +/// Shifts `a` left by `IMM8` bytes while shifting in zeros. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_si128) +pub fn _mm_slli_si128(a: __m128i) -> __m128i { + static_assert_uimm_bits!(IMM8, 8); + _mm_slli_si128_impl::(a) +} + +fn _mm_slli_si128_impl(a: __m128i) -> __m128i { + const fn mask(shift: i32, i: u32) -> u32 { + let shift = shift as u32 & 0xff; + if shift > 15 { + i + } else { + 16 - shift + i + } + } + transmute::(simd_shuffle( + i8x16::ZERO(), + a.as_i8x16(), + [ + mask(IMM8, 0), + mask(IMM8, 1), + mask(IMM8, 2), + mask(IMM8, 3), + mask(IMM8, 4), + mask(IMM8, 5), + mask(IMM8, 6), + mask(IMM8, 7), + mask(IMM8, 8), + mask(IMM8, 9), + mask(IMM8, 10), + mask(IMM8, 11), + mask(IMM8, 12), + mask(IMM8, 13), + mask(IMM8, 14), + mask(IMM8, 15), + ], + )) +} + +/// Shifts `a` left by `IMM8` bytes while shifting in zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bslli_si128) +pub fn _mm_bslli_si128(a: __m128i) -> __m128i { + { + static_assert_uimm_bits!(IMM8, 8); + _mm_slli_si128_impl::(a) + } +} +/// Shifts `a` right by `IMM8` bytes while shifting in zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bsrli_si128) +pub fn _mm_bsrli_si128(a: __m128i) -> __m128i { + { + static_assert_uimm_bits!(IMM8, 8); + _mm_srli_si128_impl::(a) + } +} + +fn _mm_srli_si128_impl(a: __m128i) -> __m128i { + const fn mask(shift: i32, i: u32) -> u32 { + if (shift as u32) > 15 { + i + 16 + } else { + i + (shift as u32) + } + } + let x: i8x16 = simd_shuffle( + a.as_i8x16(), + i8x16::ZERO(), + [ + mask(IMM8, 0), + mask(IMM8, 1), + mask(IMM8, 2), + mask(IMM8, 3), + mask(IMM8, 4), + mask(IMM8, 5), + mask(IMM8, 6), + mask(IMM8, 7), + mask(IMM8, 8), + mask(IMM8, 9), + mask(IMM8, 10), + mask(IMM8, 11), + mask(IMM8, 12), + mask(IMM8, 13), + mask(IMM8, 14), + mask(IMM8, 15), + ], + ); + transmute(x) +} +/// Shifts packed 16-bit integers in `a` left by `IMM8` while shifting in zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi16) +pub fn _mm_slli_epi16(a: __m128i) -> __m128i { + static_assert_uimm_bits!(IMM8, 8); + { + if IMM8 >= 16 { + _mm_setzero_si128() + } else { + transmute(simd_shl(a.as_u16x8(), u16x8::splat(IMM8 as u16))) + } + } +} +/// Shifts packed 16-bit integers in `a` left by `count` while shifting in +/// zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi16) +pub fn _mm_sll_epi16(a: __m128i, count: __m128i) -> __m128i { + transmute(psllw(a.as_i16x8(), count.as_i16x8())) +} +/// Shifts packed 32-bit integers in `a` left by `IMM8` while shifting in zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi32) +pub fn _mm_slli_epi32(a: __m128i) -> __m128i { + static_assert_uimm_bits!(IMM8, 8); + { + if IMM8 >= 32 { + _mm_setzero_si128() + } else { + transmute(simd_shl(a.as_u32x4(), u32x4::splat(IMM8 as u32))) + } + } +} +/// Shifts packed 32-bit integers in `a` left by `count` while shifting in +/// zeros. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi32) +pub fn _mm_sll_epi32(a: __m128i, count: __m128i) -> __m128i { + transmute(pslld(a.as_i32x4(), count.as_i32x4())) +} +/// Shifts packed 64-bit integers in `a` left by `IMM8` while shifting in zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi64) +pub fn _mm_slli_epi64(a: __m128i) -> __m128i { + static_assert_uimm_bits!(IMM8, 8); + { + if IMM8 >= 64 { + _mm_setzero_si128() + } else { + transmute(simd_shl(a.as_u64x2(), u64x2::splat(IMM8 as u64))) + } + } +} +/// Shifts packed 64-bit integers in `a` left by `count` while shifting in +/// zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi64) +pub fn _mm_sll_epi64(a: __m128i, count: __m128i) -> __m128i { + transmute(psllq(a.as_i64x2(), count.as_i64x2())) +} +/// Shifts packed 16-bit integers in `a` right by `IMM8` while shifting in sign +/// bits. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi16) +pub fn _mm_srai_epi16(a: __m128i) -> __m128i { + static_assert_uimm_bits!(IMM8, 8); + transmute(simd_shr(a.as_i16x8(), i16x8::splat(IMM8.min(15) as i16))) +} +/// Shifts packed 16-bit integers in `a` right by `count` while shifting in sign +/// bits. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi16) +pub fn _mm_sra_epi16(a: __m128i, count: __m128i) -> __m128i { + transmute(psraw(a.as_i16x8(), count.as_i16x8())) +} +/// Shifts packed 32-bit integers in `a` right by `IMM8` while shifting in sign +/// bits. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi32) +pub fn _mm_srai_epi32(a: __m128i) -> __m128i { + static_assert_uimm_bits!(IMM8, 8); + transmute(simd_shr(a.as_i32x4(), i32x4::splat(IMM8.min(31)))) +} +/// Shifts packed 32-bit integers in `a` right by `count` while shifting in sign +/// bits. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi32) +pub fn _mm_sra_epi32(a: __m128i, count: __m128i) -> __m128i { + transmute(psrad(a.as_i32x4(), count.as_i32x4())) +} +/// Shifts `a` right by `IMM8` bytes while shifting in zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_si128) +pub fn _mm_srli_si128(a: __m128i) -> __m128i { + static_assert_uimm_bits!(IMM8, 8); + _mm_srli_si128_impl::(a) +} +/// Shifts packed 16-bit integers in `a` right by `IMM8` while shifting in +/// zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi16) +pub fn _mm_srli_epi16(a: __m128i) -> __m128i { + static_assert_uimm_bits!(IMM8, 8); + { + if IMM8 >= 16 { + _mm_setzero_si128() + } else { + transmute(simd_shr(a.as_u16x8(), u16x8::splat(IMM8 as u16))) + } + } +} +/// Shifts packed 16-bit integers in `a` right by `count` while shifting in +/// zeros. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi16) +pub fn _mm_srl_epi16(a: __m128i, count: __m128i) -> __m128i { + transmute(psrlw(a.as_i16x8(), count.as_i16x8())) +} +/// Shifts packed 32-bit integers in `a` right by `IMM8` while shifting in +/// zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi32) +pub fn _mm_srli_epi32(a: __m128i) -> __m128i { + static_assert_uimm_bits!(IMM8, 8); + { + if IMM8 >= 32 { + _mm_setzero_si128() + } else { + transmute(simd_shr(a.as_u32x4(), u32x4::splat(IMM8 as u32))) + } + } +} +/// Shifts packed 32-bit integers in `a` right by `count` while shifting in +/// zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi32) +pub fn _mm_srl_epi32(a: __m128i, count: __m128i) -> __m128i { + transmute(psrld(a.as_i32x4(), count.as_i32x4())) +} +/// Shifts packed 64-bit integers in `a` right by `IMM8` while shifting in +/// zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi64) +pub fn _mm_srli_epi64(a: __m128i) -> __m128i { + static_assert_uimm_bits!(IMM8, 8); + { + if IMM8 >= 64 { + _mm_setzero_si128() + } else { + transmute(simd_shr(a.as_u64x2(), u64x2::splat(IMM8 as u64))) + } + } +} +/// Shifts packed 64-bit integers in `a` right by `count` while shifting in +/// zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi64) +pub fn _mm_srl_epi64(a: __m128i, count: __m128i) -> __m128i { + transmute(psrlq(a.as_i64x2(), count.as_i64x2())) +} +/// Computes the bitwise AND of 128 bits (representing integer data) in `a` and +/// `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_si128) +pub fn _mm_and_si128(a: __m128i, b: __m128i) -> __m128i { + transmute(simd_and(a.as_i32x4(), b.as_i32x4())) +} +/// Computes the bitwise NOT of 128 bits (representing integer data) in `a` and +/// then AND with `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_si128) +pub fn _mm_andnot_si128(a: __m128i, b: __m128i) -> __m128i { + transmute(simd_and( + simd_xor(_mm_set1_epi8(-1).as_i32x4(), a.as_i32x4()), + b.as_i32x4(), + )) +} +/// Computes the bitwise OR of 128 bits (representing integer data) in `a` and +/// `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_si128) +pub fn _mm_or_si128(a: __m128i, b: __m128i) -> __m128i { + transmute(simd_or(a.as_i32x4(), b.as_i32x4())) +} +/// Computes the bitwise XOR of 128 bits (representing integer data) in `a` and +/// `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_si128) +pub fn _mm_xor_si128(a: __m128i, b: __m128i) -> __m128i { + transmute(simd_xor(a.as_i32x4(), b.as_i32x4())) +} +/// Compares packed 8-bit integers in `a` and `b` for equality. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi8) +pub fn _mm_cmpeq_epi8(a: __m128i, b: __m128i) -> __m128i { + transmute::(simd_eq(a.as_i8x16(), b.as_i8x16())) +} +/// Compares packed 16-bit integers in `a` and `b` for equality. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi16) +pub fn _mm_cmpeq_epi16(a: __m128i, b: __m128i) -> __m128i { + transmute::(simd_eq(a.as_i16x8(), b.as_i16x8())) +} +/// Compares packed 32-bit integers in `a` and `b` for equality. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi32) +pub fn _mm_cmpeq_epi32(a: __m128i, b: __m128i) -> __m128i { + transmute::(simd_eq(a.as_i32x4(), b.as_i32x4())) +} +/// Compares packed 8-bit integers in `a` and `b` for greater-than. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi8) +pub fn _mm_cmpgt_epi8(a: __m128i, b: __m128i) -> __m128i { + transmute::(simd_gt(a.as_i8x16(), b.as_i8x16())) +} +/// Compares packed 16-bit integers in `a` and `b` for greater-than. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi16) +pub fn _mm_cmpgt_epi16(a: __m128i, b: __m128i) -> __m128i { + transmute::(simd_gt(a.as_i16x8(), b.as_i16x8())) +} +/// Compares packed 32-bit integers in `a` and `b` for greater-than. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi32) +pub fn _mm_cmpgt_epi32(a: __m128i, b: __m128i) -> __m128i { + transmute::(simd_gt(a.as_i32x4(), b.as_i32x4())) +} +/// Compares packed 8-bit integers in `a` and `b` for less-than. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi8) +pub fn _mm_cmplt_epi8(a: __m128i, b: __m128i) -> __m128i { + transmute::(simd_lt(a.as_i8x16(), b.as_i8x16())) +} +/// Compares packed 16-bit integers in `a` and `b` for less-than. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi16) +pub fn _mm_cmplt_epi16(a: __m128i, b: __m128i) -> __m128i { + transmute::(simd_lt(a.as_i16x8(), b.as_i16x8())) +} +/// Compares packed 32-bit integers in `a` and `b` for less-than. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi32) +pub fn _mm_cmplt_epi32(a: __m128i, b: __m128i) -> __m128i { + transmute::(simd_lt(a.as_i32x4(), b.as_i32x4())) +} +/// Converts the lower two packed 32-bit integers in `a` to packed +/// double-precision (64-bit) floating-point elements. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_pd) +pub fn _mm_cvtepi32_pd(a: __m128i) -> __m128d { + { + let a = a.as_i32x4(); + transmute(simd_cast::<2, i32, f64>(simd_shuffle(a, a, [0, 1]))) + } +} +/// Returns `a` with its lower element replaced by `b` after converting it to +/// an `f64`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_sd) +pub fn _mm_cvtsi32_sd(a: __m128d, b: i32) -> __m128d { + transmute(simd_insert(a.as_f64x2(), 0, b as f64)) +} +/// Converts packed 32-bit integers in `a` to packed single-precision (32-bit) +/// floating-point elements. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_ps) +pub fn _mm_cvtepi32_ps(a: __m128i) -> __m128 { + transmute(simd_cast::<4, _, f32>(a.as_i32x4())) +} +/// Converts packed single-precision (32-bit) floating-point elements in `a` +/// to packed 32-bit integers. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_epi32) +// NOTE: Not modeled yet +// pub fn _mm_cvtps_epi32(a: __m128) -> __m128i { +// { transmute(cvtps2dq(a)) } +// } +/// Returns a vector whose lowest element is `a` and all higher elements are +/// `0`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_si128) +pub fn _mm_cvtsi32_si128(a: i32) -> __m128i { + transmute(i32x4::new(a, 0, 0, 0)) +} +/// Returns the lowest element of `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si32) +pub fn _mm_cvtsi128_si32(a: __m128i) -> i32 { + simd_extract(a.as_i32x4(), 0) +} +/// Sets packed 64-bit integers with the supplied values, from highest to +/// lowest. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64x) +pub fn _mm_set_epi64x(e1: i64, e0: i64) -> __m128i { + transmute(i64x2::new(e0, e1)) +} +/// Sets packed 32-bit integers with the supplied values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi32) +pub fn _mm_set_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i { + transmute(i32x4::new(e0, e1, e2, e3)) +} +/// Sets packed 16-bit integers with the supplied values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi16) +pub fn _mm_set_epi16( + e7: i16, + e6: i16, + e5: i16, + e4: i16, + e3: i16, + e2: i16, + e1: i16, + e0: i16, +) -> __m128i { + transmute(i16x8::new(e0, e1, e2, e3, e4, e5, e6, e7)) +} +/// Sets packed 8-bit integers with the supplied values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi8) +pub fn _mm_set_epi8( + e15: i8, + e14: i8, + e13: i8, + e12: i8, + e11: i8, + e10: i8, + e9: i8, + e8: i8, + e7: i8, + e6: i8, + e5: i8, + e4: i8, + e3: i8, + e2: i8, + e1: i8, + e0: i8, +) -> __m128i { + { + transmute(i8x16::new( + e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, + )) + } +} +/// Broadcasts 64-bit integer `a` to all elements. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64x) +pub fn _mm_set1_epi64x(a: i64) -> __m128i { + _mm_set_epi64x(a, a) +} +/// Broadcasts 32-bit integer `a` to all elements. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi32) +pub fn _mm_set1_epi32(a: i32) -> __m128i { + _mm_set_epi32(a, a, a, a) +} +/// Broadcasts 16-bit integer `a` to all elements. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi16) +pub fn _mm_set1_epi16(a: i16) -> __m128i { + _mm_set_epi16(a, a, a, a, a, a, a, a) +} +/// Broadcasts 8-bit integer `a` to all elements. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi8) +pub fn _mm_set1_epi8(a: i8) -> __m128i { + _mm_set_epi8(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a) +} +/// Sets packed 32-bit integers with the supplied values in reverse order. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi32) +pub fn _mm_setr_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i { + _mm_set_epi32(e0, e1, e2, e3) +} +/// Sets packed 16-bit integers with the supplied values in reverse order. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi16) +pub fn _mm_setr_epi16( + e7: i16, + e6: i16, + e5: i16, + e4: i16, + e3: i16, + e2: i16, + e1: i16, + e0: i16, +) -> __m128i { + _mm_set_epi16(e0, e1, e2, e3, e4, e5, e6, e7) +} +/// Sets packed 8-bit integers with the supplied values in reverse order. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi8) +pub fn _mm_setr_epi8( + e15: i8, + e14: i8, + e13: i8, + e12: i8, + e11: i8, + e10: i8, + e9: i8, + e8: i8, + e7: i8, + e6: i8, + e5: i8, + e4: i8, + e3: i8, + e2: i8, + e1: i8, + e0: i8, +) -> __m128i { + _mm_set_epi8( + e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, + ) +} +/// Returns a vector with all elements set to zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_si128) +pub fn _mm_setzero_si128() -> __m128i { + transmute(i32x4::ZERO()) +} +/// Returns a vector where the low element is extracted from `a` and its upper +/// element is zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_epi64) +pub fn _mm_move_epi64(a: __m128i) -> __m128i { + { + let r: i64x2 = simd_shuffle(a.as_i64x2(), i64x2::ZERO(), [0, 2]); + transmute(r) + } +} +/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers +/// using signed saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi16) +pub fn _mm_packs_epi16(a: __m128i, b: __m128i) -> __m128i { + transmute(packsswb(a.as_i16x8(), b.as_i16x8())) +} +/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers +/// using signed saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32) +pub fn _mm_packs_epi32(a: __m128i, b: __m128i) -> __m128i { + transmute(packssdw(a.as_i32x4(), b.as_i32x4())) +} +/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers +/// using unsigned saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi16) +pub fn _mm_packus_epi16(a: __m128i, b: __m128i) -> __m128i { + transmute(packuswb(a.as_i16x8(), b.as_i16x8())) +} +/// Returns the `imm8` element of `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi16) +pub fn _mm_extract_epi16(a: __m128i) -> i32 { + static_assert_uimm_bits!(IMM8, 3); + simd_extract(a.as_u16x8(), IMM8 as u32) as i32 +} +/// Returns a new vector where the `imm8` element of `a` is replaced with `i`. 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi16)
+pub fn _mm_insert_epi16<const IMM8: i32>(a: __m128i, i: i32) -> __m128i {
+    static_assert_uimm_bits!(IMM8, 3);
+    transmute(simd_insert(a.as_i16x8(), IMM8 as u32, i as i16))
+}
+/// Returns a mask of the most significant bit of each element in `a`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_epi8)
+pub fn _mm_movemask_epi8(a: __m128i) -> i32 {
+    {
+        let z = i8x16::ZERO();
+        let m: i8x16 = simd_lt(a.as_i8x16(), z);
+        simd_bitmask_little!(15, m, u16) as u32 as i32
+    }
+}
+/// Shuffles 32-bit integers in `a` using the control in `IMM8`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi32)
+pub fn _mm_shuffle_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
+    static_assert_uimm_bits!(IMM8, 8);
+    {
+        let a = a.as_i32x4();
+        let x: i32x4 = simd_shuffle(
+            a,
+            a,
+            [
+                IMM8 as u32 & 0b11,
+                (IMM8 as u32 >> 2) & 0b11,
+                (IMM8 as u32 >> 4) & 0b11,
+                (IMM8 as u32 >> 6) & 0b11,
+            ],
+        );
+        transmute(x)
+    }
+}
+/// Shuffles 16-bit integers in the high 64 bits of `a` using the control in
+/// `IMM8`.
+///
+/// Put the results in the high 64 bits of the returned vector, with the low 64
+/// bits being copied from `a`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflehi_epi16)
+pub fn _mm_shufflehi_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
+    static_assert_uimm_bits!(IMM8, 8);
+    {
+        let a = a.as_i16x8();
+        let x: i16x8 = simd_shuffle(
+            a,
+            a,
+            [
+                0,
+                1,
+                2,
+                3,
+                (IMM8 as u32 & 0b11) + 4,
+                ((IMM8 as u32 >> 2) & 0b11) + 4,
+                ((IMM8 as u32 >> 4) & 0b11) + 4,
+                ((IMM8 as u32 >> 6) & 0b11) + 4,
+            ],
+        );
+        transmute(x)
+    }
+}
+/// Shuffles 16-bit integers in the low 64 bits of `a` using the control in
+/// `IMM8`.
+///
+/// Put the results in the low 64 bits of the returned vector, with the high 64
+/// bits being copied from `a`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflelo_epi16)
+pub fn _mm_shufflelo_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
+    static_assert_uimm_bits!(IMM8, 8);
+    {
+        let a = a.as_i16x8();
+        let x: i16x8 = simd_shuffle(
+            a,
+            a,
+            [
+                IMM8 as u32 & 0b11,
+                (IMM8 as u32 >> 2) & 0b11,
+                (IMM8 as u32 >> 4) & 0b11,
+                (IMM8 as u32 >> 6) & 0b11,
+                4,
+                5,
+                6,
+                7,
+            ],
+        );
+        transmute(x)
+    }
+}
+/// Unpacks and interleaves 8-bit integers from the high half of `a` and `b`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi8)
+pub fn _mm_unpackhi_epi8(a: __m128i, b: __m128i) -> __m128i {
+    {
+        transmute::<i8x16, _>(simd_shuffle(
+            a.as_i8x16(),
+            b.as_i8x16(),
+            [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31],
+        ))
+    }
+}
+/// Unpacks and interleaves 16-bit integers from the high half of `a` and `b`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi16)
+pub fn _mm_unpackhi_epi16(a: __m128i, b: __m128i) -> __m128i {
+    {
+        let x = simd_shuffle(a.as_i16x8(), b.as_i16x8(), [4, 12, 5, 13, 6, 14, 7, 15]);
+        transmute::<i16x8, _>(x)
+    }
+}
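The three shuffle models above all decode `IMM8` the same way: four 2-bit fields, one per destination lane, read from the least significant bits upward. As a reading aid, here is a scalar sketch of that selection logic for `_mm_shuffle_epi32`; the function name and plain-array types are illustrative only and are not part of this crate:

```rust
/// Scalar reference for the `_mm_shuffle_epi32` selection logic: destination
/// lane `i` takes source lane `(imm8 >> (2 * i)) & 0b11`.
fn shuffle_epi32_reference(a: [i32; 4], imm8: u8) -> [i32; 4] {
    core::array::from_fn(|i| a[((imm8 >> (2 * i)) & 0b11) as usize])
}

#[test]
fn shuffle_epi32_reference_identity() {
    // 0b11_10_01_00 selects lanes 0, 1, 2, 3 in order, i.e. the identity shuffle.
    assert_eq!(
        shuffle_epi32_reference([10, 20, 30, 40], 0b11_10_01_00),
        [10, 20, 30, 40]
    );
}
```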
+/// Unpacks and interleaves 32-bit integers from the high half of `a` and `b`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi32)
+pub fn _mm_unpackhi_epi32(a: __m128i, b: __m128i) -> __m128i {
+    transmute::<i32x4, _>(simd_shuffle(a.as_i32x4(), b.as_i32x4(), [2, 6, 3, 7]))
+}
+/// Unpacks and interleaves 64-bit integers from the high half of `a` and `b`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi64)
+pub fn _mm_unpackhi_epi64(a: __m128i, b: __m128i) -> __m128i {
+    transmute::<i64x2, _>(simd_shuffle(a.as_i64x2(), b.as_i64x2(), [1, 3]))
+}
+/// Unpacks and interleaves 8-bit integers from the low half of `a` and `b`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi8)
+pub fn _mm_unpacklo_epi8(a: __m128i, b: __m128i) -> __m128i {
+    {
+        transmute::<i8x16, _>(simd_shuffle(
+            a.as_i8x16(),
+            b.as_i8x16(),
+            [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23],
+        ))
+    }
+}
+/// Unpacks and interleaves 16-bit integers from the low half of `a` and `b`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi16)
+pub fn _mm_unpacklo_epi16(a: __m128i, b: __m128i) -> __m128i {
+    {
+        let x = simd_shuffle(a.as_i16x8(), b.as_i16x8(), [0, 8, 1, 9, 2, 10, 3, 11]);
+        transmute::<i16x8, _>(x)
+    }
+}
+/// Unpacks and interleaves 32-bit integers from the low half of `a` and `b`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi32)
+pub fn _mm_unpacklo_epi32(a: __m128i, b: __m128i) -> __m128i {
+    transmute::<i32x4, _>(simd_shuffle(a.as_i32x4(), b.as_i32x4(), [0, 4, 1, 5]))
+}
+/// Unpacks and interleaves 64-bit integers from the low half of `a` and `b`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi64)
+pub fn _mm_unpacklo_epi64(a: __m128i, b: __m128i) -> __m128i {
+    transmute::<i64x2, _>(simd_shuffle(a.as_i64x2(), b.as_i64x2(), [0, 2]))
+}
+/// Returns a new vector with the low element of `a` replaced by the sum of the
+/// low elements of `a` and `b`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_sd)
+// NOTE: Not modeled yet
+// pub fn _mm_add_sd(a: __m128d, b: __m128d) -> __m128d {
+//     { transmute(simd_insert(a.as_f64x2(), 0, _mm_cvtsd_f64(a) + _mm_cvtsd_f64(b))) }
+// }
+/// Adds packed double-precision (64-bit) floating-point elements in `a` and
+/// `b`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_pd)
+// NOTE: Not modeled yet
+// pub fn _mm_add_pd(a: __m128d, b: __m128d) -> __m128d {
+//     { simd_add(a, b) }
+// }
+/// Returns a new vector with the low element of `a` replaced by the result of
+/// dividing the lower element of `a` by the lower element of `b`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_sd)
+// NOTE: Not modeled yet
+// pub fn _mm_div_sd(a: __m128d, b: __m128d) -> __m128d {
+//     { transmute(simd_insert(a.as_f64x2(), 0, _mm_cvtsd_f64(a) / _mm_cvtsd_f64(b))) }
+// }
+/// Divides packed double-precision (64-bit) floating-point elements in `a` by
+/// packed elements in `b`.
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_pd) +// NOTE: Not modeled yet +// pub fn _mm_div_pd(a: __m128d, b: __m128d) -> __m128d { +// { simd_div(a, b) } +// } +/// Returns a new vector with the low element of `a` replaced by the maximum +/// of the lower elements of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_sd) +// NOTE: Not modeled yet +// pub fn _mm_max_sd(a: __m128d, b: __m128d) -> __m128d { +// { maxsd(a, b) } +// } +/// Returns a new vector with the maximum values from corresponding elements in +/// `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pd) +// NOTE: Not modeled yet +// pub fn _mm_max_pd(a: __m128d, b: __m128d) -> __m128d { +// { maxpd(a, b) } +// } +/// Returns a new vector with the low element of `a` replaced by the minimum +/// of the lower elements of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_sd) +// NOTE: Not modeled yet +// pub fn _mm_min_sd(a: __m128d, b: __m128d) -> __m128d { +// { minsd(a, b) } +// } +/// Returns a new vector with the minimum values from corresponding elements in +/// `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pd) +// NOTE: Not modeled yet +// pub fn _mm_min_pd(a: __m128d, b: __m128d) -> __m128d { +// { minpd(a, b) } +// } +/// Returns a new vector with the low element of `a` replaced by multiplying the +/// low elements of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_sd) +// NOTE: Not modeled yet +// pub fn _mm_mul_sd(a: __m128d, b: __m128d) -> __m128d { +// { transmute(simd_insert(a.as_f64x2(), 0, _mm_cvtsd_f64(a) * _mm_cvtsd_f64(b))) } +// } +/// Multiplies packed double-precision (64-bit) floating-point elements in `a` +/// and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_pd) +// NOTE: Not modeled yet +// pub fn _mm_mul_pd(a: __m128d, b: __m128d) -> __m128d { +// { transmute(simd_mul(a.as_f64x2(), b.as_f64x2())) } +// } +/// Returns a new vector with the low element of `a` replaced by the square +/// root of the lower element `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_sd) +// NOTE: Not modeled yet +// pub fn _mm_sqrt_sd(a: __m128d, b: __m128d) -> __m128d { +// { simd_insert(a, 0, sqrtf64(_mm_cvtsd_f64(b))) } +// } +/// Returns a new vector with the square root of each of the values in `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_pd) +// NOTE: Not modeled yet +// pub fn _mm_sqrt_pd(a: __m128d) -> __m128d { +// { simd_fsqrt(a) } +// } +/// Returns a new vector with the low element of `a` replaced by subtracting the +/// low element by `b` from the low element of `a`. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_sd) +// NOTE: Not modeled yet +// pub fn _mm_sub_sd(a: __m128d, b: __m128d) -> __m128d { +// { transmute(simd_insert(a.as_f64x2(), 0, _mm_cvtsd_f64(a) - _mm_cvtsd_f64(b))) } +// } +/// Subtract packed double-precision (64-bit) floating-point elements in `b` +/// from `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_pd) +// NOTE: Not modeled yet +// pub fn _mm_sub_pd(a: __m128d, b: __m128d) -> __m128d { +// { simd_sub(a, b) } +// } +/// Computes the bitwise AND of packed double-precision (64-bit) floating-point +/// elements in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_pd) +pub fn _mm_and_pd(a: __m128d, b: __m128d) -> __m128d { + { + let a: __m128i = transmute(a); + let b: __m128i = transmute(b); + transmute(_mm_and_si128(a, b)) + } +} +/// Computes the bitwise NOT of `a` and then AND with `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_pd) +pub fn _mm_andnot_pd(a: __m128d, b: __m128d) -> __m128d { + { + let a: __m128i = transmute(a); + let b: __m128i = transmute(b); + transmute(_mm_andnot_si128(a, b)) + } +} +/// Computes the bitwise OR of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_pd) +pub fn _mm_or_pd(a: __m128d, b: __m128d) -> __m128d { + { + let a: __m128i = transmute(a); + let b: __m128i = transmute(b); + transmute(_mm_or_si128(a, b)) + } +} +/// Computes the bitwise XOR of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_pd) +pub fn _mm_xor_pd(a: __m128d, b: __m128d) -> __m128d { + { + let a: __m128i = transmute(a); + let b: __m128i = transmute(b); + transmute(_mm_xor_si128(a, b)) + } +} +/// Returns a new vector with the low element of `a` replaced by the equality +/// comparison of the lower elements of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_sd) +// NOTE: Not modeled yet +// pub fn _mm_cmpeq_sd(a: __m128d, b: __m128d) -> __m128d { +// { cmpsd(a, b, 0) } +// } +/// Returns a new vector with the low element of `a` replaced by the less-than +/// comparison of the lower elements of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_sd) +// NOTE: Not modeled yet +// pub fn _mm_cmplt_sd(a: __m128d, b: __m128d) -> __m128d { +// { cmpsd(a, b, 1) } +// } +/// Returns a new vector with the low element of `a` replaced by the +/// less-than-or-equal comparison of the lower elements of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_sd) +// NOTE: Not modeled yet +// pub fn _mm_cmple_sd(a: __m128d, b: __m128d) -> __m128d { +// { cmpsd(a, b, 2) } +// } +/// Returns a new vector with the low element of `a` replaced by the +/// greater-than comparison of the lower elements of `a` and `b`. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_sd) +// NOTE: Not modeled yet +// pub fn _mm_cmpgt_sd(a: __m128d, b: __m128d) -> __m128d { +// { transmute(simd_insert(_mm_cmplt_sd(b, a), 1, simd_extract(a, 1))) } +// } +/// Returns a new vector with the low element of `a` replaced by the +/// greater-than-or-equal comparison of the lower elements of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_sd) +// NOTE: Not modeled yet +// pub fn _mm_cmpge_sd(a: __m128d, b: __m128d) -> __m128d { +// { simd_insert(_mm_cmple_sd(b, a), 1, simd_extract(a, 1)) } +// } +/// Returns a new vector with the low element of `a` replaced by the result +/// of comparing both of the lower elements of `a` and `b` to `NaN`. If +/// neither are equal to `NaN` then `0xFFFFFFFFFFFFFFFF` is used and `0` +/// otherwise. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_sd) +// NOTE: Not modeled yet +// pub fn _mm_cmpord_sd(a: __m128d, b: __m128d) -> __m128d { +// { cmpsd(a, b, 7) } +// } +/// Returns a new vector with the low element of `a` replaced by the result of +/// comparing both of the lower elements of `a` and `b` to `NaN`. If either is +/// equal to `NaN` then `0xFFFFFFFFFFFFFFFF` is used and `0` otherwise. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_sd) +// NOTE: Not modeled yet +// pub fn _mm_cmpunord_sd(a: __m128d, b: __m128d) -> __m128d { +// { cmpsd(a, b, 3) } +// } +/// Returns a new vector with the low element of `a` replaced by the not-equal +/// comparison of the lower elements of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_sd) +// NOTE: Not modeled yet +// pub fn _mm_cmpneq_sd(a: __m128d, b: __m128d) -> __m128d { +// { cmpsd(a, b, 4) } +// } +/// Returns a new vector with the low element of `a` replaced by the +/// not-less-than comparison of the lower elements of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_sd) +// NOTE: Not modeled yet +// pub fn _mm_cmpnlt_sd(a: __m128d, b: __m128d) -> __m128d { +// { cmpsd(a, b, 5) } +// } +/// Returns a new vector with the low element of `a` replaced by the +/// not-less-than-or-equal comparison of the lower elements of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_sd) +// NOTE: Not modeled yet +// pub fn _mm_cmpnle_sd(a: __m128d, b: __m128d) -> __m128d { +// { cmpsd(a, b, 6) } +// } +/// Returns a new vector with the low element of `a` replaced by the +/// not-greater-than comparison of the lower elements of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_sd) +// NOTE: Not modeled yet +// pub fn _mm_cmpngt_sd(a: __m128d, b: __m128d) -> __m128d { +// { simd_insert(_mm_cmpnlt_sd(b, a), 1, simd_extract(a, 1)) } +// } +/// Returns a new vector with the low element of `a` replaced by the +/// not-greater-than-or-equal comparison of the lower elements of `a` and `b`. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_sd) +// NOTE: Not modeled yet +// pub fn _mm_cmpnge_sd(a: __m128d, b: __m128d) -> __m128d { +// { simd_insert(_mm_cmpnle_sd(b, a), 1, simd_extract(a, 1)) } +// } +/// Compares corresponding elements in `a` and `b` for equality. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_pd) +// NOTE: Not modeled yet +// pub fn _mm_cmpeq_pd(a: __m128d, b: __m128d) -> __m128d { +// { cmppd(a, b, 0) } +// } +/// Compares corresponding elements in `a` and `b` for less-than. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_pd) +// NOTE: Not modeled yet +// pub fn _mm_cmplt_pd(a: __m128d, b: __m128d) -> __m128d { +// { cmppd(a, b, 1) } +// } +/// Compares corresponding elements in `a` and `b` for less-than-or-equal +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_pd) +// NOTE: Not modeled yet +// pub fn _mm_cmple_pd(a: __m128d, b: __m128d) -> __m128d { +// { cmppd(a, b, 2) } +// } +/// Compares corresponding elements in `a` and `b` for greater-than. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_pd) +// NOTE: Not modeled yet +// pub fn _mm_cmpgt_pd(a: __m128d, b: __m128d) -> __m128d { +// _mm_cmplt_pd(b, a) +// } +/// Compares corresponding elements in `a` and `b` for greater-than-or-equal. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_pd) +// NOTE: Not modeled yet +// pub fn _mm_cmpge_pd(a: __m128d, b: __m128d) -> __m128d { +// _mm_cmple_pd(b, a) +// } +/// Compares corresponding elements in `a` and `b` to see if neither is `NaN`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_pd) +// NOTE: Not modeled yet +// pub fn _mm_cmpord_pd(a: __m128d, b: __m128d) -> __m128d { +// { cmppd(a, b, 7) } +// } +/// Compares corresponding elements in `a` and `b` to see if either is `NaN`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_pd) +// NOTE: Not modeled yet +// pub fn _mm_cmpunord_pd(a: __m128d, b: __m128d) -> __m128d { +// { cmppd(a, b, 3) } +// } +/// Compares corresponding elements in `a` and `b` for not-equal. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_pd) +// NOTE: Not modeled yet +// pub fn _mm_cmpneq_pd(a: __m128d, b: __m128d) -> __m128d { +// { cmppd(a, b, 4) } +// } +/// Compares corresponding elements in `a` and `b` for not-less-than. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_pd) +// NOTE: Not modeled yet +// pub fn _mm_cmpnlt_pd(a: __m128d, b: __m128d) -> __m128d { +// { cmppd(a, b, 5) } +// } +/// Compares corresponding elements in `a` and `b` for not-less-than-or-equal. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_pd) +// NOTE: Not modeled yet +// pub fn _mm_cmpnle_pd(a: __m128d, b: __m128d) -> __m128d { +// { cmppd(a, b, 6) } +// } +/// Compares corresponding elements in `a` and `b` for not-greater-than. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_pd) +// NOTE: Not modeled yet +// pub fn _mm_cmpngt_pd(a: __m128d, b: __m128d) -> __m128d { +// _mm_cmpnlt_pd(b, a) +// } +/// Compares corresponding elements in `a` and `b` for +/// not-greater-than-or-equal. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_pd) +// NOTE: Not modeled yet +// pub fn _mm_cmpnge_pd(a: __m128d, b: __m128d) -> __m128d { +// _mm_cmpnle_pd(b, a) +// } +/// Compares the lower element of `a` and `b` for equality. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_sd) +// NOTE: Not modeled yet +// pub fn _mm_comieq_sd(a: __m128d, b: __m128d) -> i32 { +// { comieqsd(a, b) } +// } +/// Compares the lower element of `a` and `b` for less-than. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_sd) +// NOTE: Not modeled yet +// pub fn _mm_comilt_sd(a: __m128d, b: __m128d) -> i32 { +// { comiltsd(a, b) } +// } +/// Compares the lower element of `a` and `b` for less-than-or-equal. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_sd) +// NOTE: Not modeled yet +// pub fn _mm_comile_sd(a: __m128d, b: __m128d) -> i32 { +// { comilesd(a, b) } +// } +/// Compares the lower element of `a` and `b` for greater-than. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_sd) +// NOTE: Not modeled yet +// pub fn _mm_comigt_sd(a: __m128d, b: __m128d) -> i32 { +// { comigtsd(a, b) } +// } +/// Compares the lower element of `a` and `b` for greater-than-or-equal. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_sd) +// NOTE: Not modeled yet +// pub fn _mm_comige_sd(a: __m128d, b: __m128d) -> i32 { +// { comigesd(a, b) } +// } +/// Compares the lower element of `a` and `b` for not-equal. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_sd) +// NOTE: Not modeled yet +// pub fn _mm_comineq_sd(a: __m128d, b: __m128d) -> i32 { +// { comineqsd(a, b) } +// } +/// Compares the lower element of `a` and `b` for equality. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomieq_sd) +// NOTE: Not modeled yet +// pub fn _mm_ucomieq_sd(a: __m128d, b: __m128d) -> i32 { +// { ucomieqsd(a, b) } +// } +/// Compares the lower element of `a` and `b` for less-than. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomilt_sd) +// NOTE: Not modeled yet +// pub fn _mm_ucomilt_sd(a: __m128d, b: __m128d) -> i32 { +// { ucomiltsd(a, b) } +// } +/// Compares the lower element of `a` and `b` for less-than-or-equal. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomile_sd) +// NOTE: Not modeled yet +// pub fn _mm_ucomile_sd(a: __m128d, b: __m128d) -> i32 { +// { ucomilesd(a, b) } +// } +/// Compares the lower element of `a` and `b` for greater-than. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomigt_sd) +// NOTE: Not modeled yet +// pub fn _mm_ucomigt_sd(a: __m128d, b: __m128d) -> i32 { +// { ucomigtsd(a, b) } +// } +/// Compares the lower element of `a` and `b` for greater-than-or-equal. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomige_sd) +// NOTE: Not modeled yet +// pub fn _mm_ucomige_sd(a: __m128d, b: __m128d) -> i32 { +// { ucomigesd(a, b) } +// } +/// Compares the lower element of `a` and `b` for not-equal. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomineq_sd) +// NOTE: Not modeled yet +// pub fn _mm_ucomineq_sd(a: __m128d, b: __m128d) -> i32 { +// { ucomineqsd(a, b) } +// } +/// Converts packed double-precision (64-bit) floating-point elements in `a` to +/// packed single-precision (32-bit) floating-point elements +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_ps) +pub fn _mm_cvtpd_ps(a: __m128d) -> __m128 { + { + let r = simd_cast::<2, _, f32>(a.as_f64x2()); + let zero = f32x2::ZERO(); + transmute::(simd_shuffle(r, zero, [0, 1, 2, 3])) + } +} +/// Converts packed single-precision (32-bit) floating-point elements in `a` to +/// packed +/// double-precision (64-bit) floating-point elements. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pd) +pub fn _mm_cvtps_pd(a: __m128) -> __m128d { + { + let a = a.as_f32x4(); + transmute(simd_cast::<2, f32, f64>(simd_shuffle(a, a, [0, 1]))) + } +} +/// Converts packed double-precision (64-bit) floating-point elements in `a` to +/// packed 32-bit integers. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_epi32) +// NOTE: Not modeled yet +// pub fn _mm_cvtpd_epi32(a: __m128d) -> __m128i { +// { transmute(cvtpd2dq(a)) } +// } +/// Converts the lower double-precision (64-bit) floating-point element in a to +/// a 32-bit integer. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si32) +// NOTE: Not modeled yet +// pub fn _mm_cvtsd_si32(a: __m128d) -> i32 { +// { cvtsd2si(a) } +// } +/// Converts the lower double-precision (64-bit) floating-point element in `b` +/// to a single-precision (32-bit) floating-point element, store the result in +/// the lower element of the return value, and copies the upper element from `a` +/// to the upper element the return value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_ss) +// NOTE: Not modeled yet +// pub fn _mm_cvtsd_ss(a: __m128, b: __m128d) -> __m128 { +// { cvtsd2ss(a, b) } +// } +/// Returns the lower double-precision (64-bit) floating-point element of `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_f64) +// NOTE: Not modeled yet +// pub fn _mm_cvtsd_f64(a: __m128d) -> f64 { +// { simd_extract(a, 0) } +// } +/// Converts the lower single-precision (32-bit) floating-point element in `b` +/// to a double-precision (64-bit) floating-point element, store the result in +/// the lower element of the return value, and copies the upper element from `a` +/// to the upper element the return value. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_sd) +// NOTE: Not modeled yet +// pub fn _mm_cvtss_sd(a: __m128d, b: __m128) -> __m128d { +// { cvtss2sd(a, b) } +// } +/// Converts packed double-precision (64-bit) floating-point elements in `a` to +/// packed 32-bit integers with truncation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_epi32) +// NOTE: Not modeled yet +// pub fn _mm_cvttpd_epi32(a: __m128d) -> __m128i { +// { transmute(cvttpd2dq(a)) } +// } +/// Converts the lower double-precision (64-bit) floating-point element in `a` +/// to a 32-bit integer with truncation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si32) +// NOTE: Not modeled yet +// pub fn _mm_cvttsd_si32(a: __m128d) -> i32 { +// { cvttsd2si(a) } +// } +/// Converts packed single-precision (32-bit) floating-point elements in `a` to +/// packed 32-bit integers with truncation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_epi32) +// NOTE: Not modeled yet +// pub fn _mm_cvttps_epi32(a: __m128) -> __m128i { +// { transmute(cvttps2dq(a)) } +// } +/// Copies double-precision (64-bit) floating-point element `a` to the lower +/// element of the packed 64-bit return value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_sd) +pub fn _mm_set_sd(a: f64) -> __m128d { + _mm_set_pd(0.0, a) +} +/// Broadcasts double-precision (64-bit) floating-point value a to all elements +/// of the return value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_pd) +pub fn _mm_set1_pd(a: f64) -> __m128d { + _mm_set_pd(a, a) +} +/// Broadcasts double-precision (64-bit) floating-point value a to all elements +/// of the return value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd1) +pub fn _mm_set_pd1(a: f64) -> __m128d { + _mm_set_pd(a, a) +} +/// Sets packed double-precision (64-bit) floating-point elements in the return +/// value with the supplied values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd) +pub fn _mm_set_pd(a: f64, b: f64) -> __m128d { + transmute(f64x2::new(b, a)) +} +/// Sets packed double-precision (64-bit) floating-point elements in the return +/// value with the supplied values in reverse order. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_pd) +pub fn _mm_setr_pd(a: f64, b: f64) -> __m128d { + _mm_set_pd(b, a) +} +/// Returns packed double-precision (64-bit) floating-point elements with all +/// zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_pd) +pub fn _mm_setzero_pd() -> __m128d { + transmute(f64x2::ZERO()) +} +/// Returns a mask of the most significant bit of each element in `a`. +/// +/// The mask is stored in the 2 least significant bits of the return value. +/// All other bits are set to `0`. 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pd)
+pub fn _mm_movemask_pd(a: __m128d) -> i32 {
+    {
+        let mask: i64x2 = simd_lt(transmute(a), i64x2::ZERO());
+        simd_bitmask_little!(1, mask, u8) as i32
+    }
+}
+/// Constructs a 128-bit floating-point vector of `[2 x double]` from two
+/// 128-bit vector parameters of `[2 x double]`, using the immediate-value
+/// parameter as a specifier.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pd)
+pub fn _mm_shuffle_pd<const MASK: i32>(a: __m128d, b: __m128d) -> __m128d {
+    static_assert_uimm_bits!(MASK, 8);
+    transmute(simd_shuffle(
+        a.as_f64x2(),
+        b.as_f64x2(),
+        [MASK as u32 & 0b1, ((MASK as u32 >> 1) & 0b1) + 2],
+    ))
+}
+/// Constructs a 128-bit floating-point vector of `[2 x double]`. The lower
+/// 64 bits are set to the lower 64 bits of the second parameter. The upper
+/// 64 bits are set to the upper 64 bits of the first parameter.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_sd)
+pub fn _mm_move_sd(a: __m128d, b: __m128d) -> __m128d {
+    _mm_setr_pd(simd_extract(b.as_f64x2(), 0), simd_extract(a.as_f64x2(), 1))
+}
+/// Casts a 128-bit floating-point vector of `[2 x double]` into a 128-bit
+/// floating-point vector of `[4 x float]`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_ps)
+pub fn _mm_castpd_ps(a: __m128d) -> __m128 {
+    transmute(a)
+}
+/// Casts a 128-bit floating-point vector of `[2 x double]` into a 128-bit
+/// integer vector.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_si128)
+pub fn _mm_castpd_si128(a: __m128d) -> __m128i {
+    transmute(a)
+}
+/// Casts a 128-bit floating-point vector of `[4 x float]` into a 128-bit
+/// floating-point vector of `[2 x double]`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_pd)
+pub fn _mm_castps_pd(a: __m128) -> __m128d {
+    transmute(a)
+}
+/// Casts a 128-bit floating-point vector of `[4 x float]` into a 128-bit
+/// integer vector.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_si128)
+pub fn _mm_castps_si128(a: __m128) -> __m128i {
+    transmute(a)
+}
+/// Casts a 128-bit integer vector into a 128-bit floating-point vector
+/// of `[2 x double]`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_pd)
+pub fn _mm_castsi128_pd(a: __m128i) -> __m128d {
+    transmute(a)
+}
+/// Casts a 128-bit integer vector into a 128-bit floating-point vector
+/// of `[4 x float]`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_ps)
+pub fn _mm_castsi128_ps(a: __m128i) -> __m128 {
+    transmute(a)
+}
+/// Returns a vector of type `__m128d` with indeterminate elements.
+/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
+/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
+/// In practice, this is typically equivalent to [`mem::zeroed`].
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_pd) +pub fn _mm_undefined_pd() -> __m128d { + transmute(f32x4::ZERO()) +} +/// Returns vector of type __m128i with indeterminate elements.with indetermination elements. +/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically +/// picks some valid value and is not equivalent to [`mem::MaybeUninit`]. +/// In practice, this is typically equivalent to [`mem::zeroed`]. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_si128) +pub fn _mm_undefined_si128() -> __m128i { + transmute(u32x4::ZERO()) +} +/// The resulting `__m128d` element is composed by the low-order values of +/// the two `__m128d` interleaved input elements, i.e.: +/// +/// * The `[127:64]` bits are copied from the `[127:64]` bits of the second input +/// * The `[63:0]` bits are copied from the `[127:64]` bits of the first input +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_pd) +pub fn _mm_unpackhi_pd(a: __m128d, b: __m128d) -> __m128d { + transmute(simd_shuffle(a.as_f64x2(), b.as_f64x2(), [1, 3])) +} +/// The resulting `__m128d` element is composed by the high-order values of +/// the two `__m128d` interleaved input elements, i.e.: +/// +/// * The `[127:64]` bits are copied from the `[63:0]` bits of the second input +/// * The `[63:0]` bits are copied from the `[63:0]` bits of the first input +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_pd) +pub fn _mm_unpacklo_pd(a: __m128d, b: __m128d) -> __m128d { + transmute(simd_shuffle(a.as_f64x2(), b.as_f64x2(), [0, 2])) +} diff --git a/testable-simd-models/src/core_arch/x86/models/sse2_handwritten.rs b/testable-simd-models/src/core_arch/x86/models/sse2_handwritten.rs new file mode 100644 index 0000000000000..217298286968c --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/models/sse2_handwritten.rs @@ -0,0 +1,196 @@ +use crate::abstractions::{bit::MachineInteger, simd::*}; +pub fn packsswb(a: i16x8, b: i16x8) -> i8x16 { + i8x16::from_fn(|i| { + if i < 8 { + if a[i] > (i8::MAX as i16) { + i8::MAX + } else if a[i] < (i8::MIN as i16) { + i8::MIN + } else { + a[i] as i8 + } + } else { + if b[i - 8] > (i8::MAX as i16) { + i8::MAX + } else if b[i - 8] < (i8::MIN as i16) { + i8::MIN + } else { + b[i - 8] as i8 + } + } + }) +} +pub fn pmaddwd(a: i16x8, b: i16x8) -> i32x4 { + i32x4::from_fn(|i| { + (a[2 * i] as i32) * (b[2 * i] as i32) + (a[2 * i + 1] as i32) * (b[2 * i + 1] as i32) + }) +} +pub fn psadbw(a: u8x16, b: u8x16) -> u64x2 { + let tmp = u8x16::from_fn(|i| a[i].wrapping_abs_diff(b[i])); + u64x2::from_fn(|i| { + (tmp[i * 8] as u16) + .wrapping_add(tmp[i * 8 + 1] as u16) + .wrapping_add(tmp[i * 8 + 2] as u16) + .wrapping_add(tmp[i * 8 + 3] as u16) + .wrapping_add(tmp[i * 8 + 4] as u16) + .wrapping_add(tmp[i * 8 + 5] as u16) + .wrapping_add(tmp[i * 8 + 6] as u16) + .wrapping_add(tmp[i * 8 + 7] as u16) as u64 + }) +} +pub fn psllw(a: i16x8, count: i16x8) -> i16x8 { + let count4: u64 = (count[0] as u16) as u64; + let count3: u64 = ((count[1] as u16) as u64) * 65536; + let count2: u64 = ((count[2] as u16) as u64) * 4294967296; + let count1: u64 = ((count[3] as u16) as u64) * 281474976710656; + let count = count1 + count2 + count3 + count4; + i16x8::from_fn(|i| { + if count > 15 { + 0 + } 
else {
+            ((a[i] as u16) << count) as i16
+        }
+    })
+}
+
+pub fn pslld(a: i32x4, count: i32x4) -> i32x4 {
+    let count: u64 = ((count[1] as u32) as u64) * 4294967296 + ((count[0] as u32) as u64);
+
+    i32x4::from_fn(|i| {
+        if count > 31 {
+            0
+        } else {
+            ((a[i] as u32) << count) as i32
+        }
+    })
+}
+
+pub fn psllq(a: i64x2, count: i64x2) -> i64x2 {
+    let count: u64 = count[0] as u64;
+
+    i64x2::from_fn(|i| {
+        if count > 63 {
+            0
+        } else {
+            ((a[i] as u64) << count) as i64
+        }
+    })
+}
+
+pub fn psraw(a: i16x8, count: i16x8) -> i16x8 {
+    let count: u64 = ((count[3] as u16) as u64) * 281474976710656
+        + ((count[2] as u16) as u64) * 4294967296
+        + ((count[1] as u16) as u64) * 65536
+        + ((count[0] as u16) as u64);
+
+    i16x8::from_fn(|i| {
+        if count > 15 {
+            if a[i] < 0 {
+                -1
+            } else {
+                0
+            }
+        } else {
+            a[i] >> count
+        }
+    })
+}
+
+pub fn psrad(a: i32x4, count: i32x4) -> i32x4 {
+    let count: u64 = ((count[1] as u32) as u64) * 4294967296 + ((count[0] as u32) as u64);
+
+    i32x4::from_fn(|i| {
+        if count > 31 {
+            if a[i] < 0 {
+                -1
+            } else {
+                0
+            }
+        } else {
+            a[i] >> count
+        }
+    })
+}
+
+pub fn psrlw(a: i16x8, count: i16x8) -> i16x8 {
+    let count: u64 = (count[3] as u16 as u64) * 281474976710656
+        + (count[2] as u16 as u64) * 4294967296
+        + (count[1] as u16 as u64) * 65536
+        + (count[0] as u16 as u64);
+
+    i16x8::from_fn(|i| {
+        if count > 15 {
+            0
+        } else {
+            ((a[i] as u16) >> count) as i16
+        }
+    })
+}
+
+pub fn psrld(a: i32x4, count: i32x4) -> i32x4 {
+    let count: u64 = (count[1] as u32 as u64) * 4294967296 + (count[0] as u32 as u64);
+
+    i32x4::from_fn(|i| {
+        if count > 31 {
+            0
+        } else {
+            ((a[i] as u32) >> count) as i32
+        }
+    })
+}
+
+pub fn psrlq(a: i64x2, count: i64x2) -> i64x2 {
+    let count: u64 = count[0] as u64;
+
+    i64x2::from_fn(|i| {
+        if count > 63 {
+            0
+        } else {
+            ((a[i] as u64) >> count) as i64
+        }
+    })
+}
+
+pub fn packssdw(a: i32x4, b: i32x4) -> i16x8 {
+    i16x8::from_fn(|i| {
+        if i < 4 {
+            if a[i] > (i16::MAX as i32) {
+                i16::MAX
+            } else if a[i] < (i16::MIN as i32) {
+                i16::MIN
+            } else {
+                a[i] as i16
+            }
+        } else {
+            if b[i - 4] > (i16::MAX as i32) {
+                i16::MAX
+            } else if b[i - 4] < (i16::MIN as i32) {
+                i16::MIN
+            } else {
+                b[i - 4] as i16
+            }
+        }
+    })
+}
+
+pub fn packuswb(a: i16x8, b: i16x8) -> u8x16 {
+    u8x16::from_fn(|i| {
+        if i < 8 {
+            if a[i] > (u8::MAX as i16) {
+                u8::MAX
+            } else if a[i] < (u8::MIN as i16) {
+                u8::MIN
+            } else {
+                a[i] as u8
+            }
+        } else {
+            if b[i - 8] > (u8::MAX as i16) {
+                u8::MAX
+            } else if b[i - 8] < (u8::MIN as i16) {
+                u8::MIN
+            } else {
+                b[i - 8] as u8
+            }
+        }
+    })
+}
diff --git a/testable-simd-models/src/core_arch/x86/models/ssse3.rs b/testable-simd-models/src/core_arch/x86/models/ssse3.rs
new file mode 100644
index 0000000000000..665e83460fca6
--- /dev/null
+++ b/testable-simd-models/src/core_arch/x86/models/ssse3.rs
@@ -0,0 +1,238 @@
+//! Supplemental Streaming SIMD Extensions 3 (SSSE3)
+use crate::abstractions::simd::*;
+use crate::abstractions::utilities::*;
+
+use super::sse2::*;
+use super::ssse3_handwritten::*;
+use super::types::*;
+
+/// Computes the absolute value of packed 8-bit signed integers in `a` and
+/// return the unsigned results.
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi8) +pub fn _mm_abs_epi8(a: __m128i) -> __m128i { + { + let a = a.as_i8x16(); + let zero = i8x16::ZERO(); + let r = simd_select(simd_lt(a, zero), simd_neg(a), a); + transmute(r) + } +} +/// Computes the absolute value of each of the packed 16-bit signed integers in +/// `a` and +/// return the 16-bit unsigned integer +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi16) +pub fn _mm_abs_epi16(a: __m128i) -> __m128i { + { + let a = a.as_i16x8(); + let zero = i16x8::ZERO(); + let r = simd_select(simd_lt(a, zero), simd_neg(a), a); + transmute(r) + } +} +/// Computes the absolute value of each of the packed 32-bit signed integers in +/// `a` and +/// return the 32-bit unsigned integer +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi32) +pub fn _mm_abs_epi32(a: __m128i) -> __m128i { + { + let a = a.as_i32x4(); + let zero = i32x4::ZERO(); + let r = simd_select(simd_lt(a, zero), simd_neg(a), a); + transmute(r) + } +} +/// Shuffles bytes from `a` according to the content of `b`. +/// +/// The last 4 bits of each byte of `b` are used as addresses +/// into the 16 bytes of `a`. +/// +/// In addition, if the highest significant bit of a byte of `b` +/// is set, the respective destination byte is set to 0. +/// +/// Picturing `a` and `b` as `[u8; 16]`, `_mm_shuffle_epi8` is +/// logically equivalent to: +/// +/// ``` +/// fn mm_shuffle_epi8(a: [u8; 16], b: [u8; 16]) -> [u8; 16] { +/// let mut r = [0u8; 16]; +/// for i in 0..16 { +/// // if the most significant bit of b is set, +/// // then the destination byte is set to 0. +/// if b[i] & 0x80 == 0u8 { +/// r[i] = a[(b[i] % 16) as usize]; +/// } +/// } +/// r +/// } +/// ``` +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi8) +pub fn _mm_shuffle_epi8(a: __m128i, b: __m128i) -> __m128i { + { + transmute(pshufb128(a.as_u8x16(), b.as_u8x16())) + } +} +/// Concatenate 16-byte blocks in `a` and `b` into a 32-byte temporary result, +/// shift the result right by `n` bytes, and returns the low 16 bytes. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_epi8) +pub fn _mm_alignr_epi8(a: __m128i, b: __m128i) -> __m128i { + static_assert_uimm_bits!(IMM8, 8); + if IMM8 > 32 { + return _mm_setzero_si128(); + } + let (a, b) = if IMM8 > 16 { + (_mm_setzero_si128(), a) + } else { + (a, b) + }; + const fn mask(shift: u32, i: u32) -> u32 { + if shift > 32 { + i + } else if shift > 16 { + shift - 16 + i + } else { + shift + i + } + } + { + let r: i8x16 = simd_shuffle( + b.as_i8x16(), + a.as_i8x16(), + [ + mask(IMM8 as u32, 0), + mask(IMM8 as u32, 1), + mask(IMM8 as u32, 2), + mask(IMM8 as u32, 3), + mask(IMM8 as u32, 4), + mask(IMM8 as u32, 5), + mask(IMM8 as u32, 6), + mask(IMM8 as u32, 7), + mask(IMM8 as u32, 8), + mask(IMM8 as u32, 9), + mask(IMM8 as u32, 10), + mask(IMM8 as u32, 11), + mask(IMM8 as u32, 12), + mask(IMM8 as u32, 13), + mask(IMM8 as u32, 14), + mask(IMM8 as u32, 15), + ], + ); + transmute(r) + } +} +/// Horizontally adds the adjacent pairs of values contained in 2 packed +/// 128-bit vectors of `[8 x i16]`. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi16) +pub fn _mm_hadd_epi16(a: __m128i, b: __m128i) -> __m128i { + { + transmute(phaddw128(a.as_i16x8(), b.as_i16x8())) + } +} +/// Horizontally adds the adjacent pairs of values contained in 2 packed +/// 128-bit vectors of `[8 x i16]`. Positive sums greater than 7FFFh are +/// saturated to 7FFFh. Negative sums less than 8000h are saturated to 8000h. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_epi16) +pub fn _mm_hadds_epi16(a: __m128i, b: __m128i) -> __m128i { + { + transmute(phaddsw128(a.as_i16x8(), b.as_i16x8())) + } +} +/// Horizontally adds the adjacent pairs of values contained in 2 packed +/// 128-bit vectors of `[4 x i32]`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi32) +pub fn _mm_hadd_epi32(a: __m128i, b: __m128i) -> __m128i { + { + transmute(phaddd128(a.as_i32x4(), b.as_i32x4())) + } +} +/// Horizontally subtract the adjacent pairs of values contained in 2 +/// packed 128-bit vectors of `[8 x i16]`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi16) +pub fn _mm_hsub_epi16(a: __m128i, b: __m128i) -> __m128i { + { + transmute(phsubw128(a.as_i16x8(), b.as_i16x8())) + } +} +/// Horizontally subtract the adjacent pairs of values contained in 2 +/// packed 128-bit vectors of `[8 x i16]`. Positive differences greater than +/// 7FFFh are saturated to 7FFFh. Negative differences less than 8000h are +/// saturated to 8000h. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_epi16) +pub fn _mm_hsubs_epi16(a: __m128i, b: __m128i) -> __m128i { + { + transmute(phsubsw128(a.as_i16x8(), b.as_i16x8())) + } +} +/// Horizontally subtract the adjacent pairs of values contained in 2 +/// packed 128-bit vectors of `[4 x i32]`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi32) +pub fn _mm_hsub_epi32(a: __m128i, b: __m128i) -> __m128i { + { + transmute(phsubd128(a.as_i32x4(), b.as_i32x4())) + } +} +/// Multiplies corresponding pairs of packed 8-bit unsigned integer +/// values contained in the first source operand and packed 8-bit signed +/// integer values contained in the second source operand, add pairs of +/// contiguous products with signed saturation, and writes the 16-bit sums to +/// the corresponding bits in the destination. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_epi16) +pub fn _mm_maddubs_epi16(a: __m128i, b: __m128i) -> __m128i { + { + transmute(pmaddubsw128(a.as_u8x16(), b.as_i8x16())) + } +} +/// Multiplies packed 16-bit signed integer values, truncate the 32-bit +/// product to the 18 most significant bits by right-shifting, round the +/// truncated value by adding 1, and write bits `[16:1]` to the destination. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_epi16) +pub fn _mm_mulhrs_epi16(a: __m128i, b: __m128i) -> __m128i { + { + transmute(pmulhrsw128(a.as_i16x8(), b.as_i16x8())) + } +} +/// Negates packed 8-bit integers in `a` when the corresponding signed 8-bit +/// integer in `b` is negative, and returns the result. 
+/// Elements in result are zeroed out when the corresponding element in `b` +/// is zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi8) +pub fn _mm_sign_epi8(a: __m128i, b: __m128i) -> __m128i { + { + transmute(psignb128(a.as_i8x16(), b.as_i8x16())) + } +} +/// Negates packed 16-bit integers in `a` when the corresponding signed 16-bit +/// integer in `b` is negative, and returns the results. +/// Elements in result are zeroed out when the corresponding element in `b` +/// is zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi16) +pub fn _mm_sign_epi16(a: __m128i, b: __m128i) -> __m128i { + { + transmute(psignw128(a.as_i16x8(), b.as_i16x8())) + } +} +/// Negates packed 32-bit integers in `a` when the corresponding signed 32-bit +/// integer in `b` is negative, and returns the results. +/// Element in result are zeroed out when the corresponding element in `b` +/// is zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi32) +pub fn _mm_sign_epi32(a: __m128i, b: __m128i) -> __m128i { + { + transmute(psignd128(a.as_i32x4(), b.as_i32x4())) + } +} diff --git a/testable-simd-models/src/core_arch/x86/models/ssse3_handwritten.rs b/testable-simd-models/src/core_arch/x86/models/ssse3_handwritten.rs new file mode 100644 index 0000000000000..4e911a83fb457 --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/models/ssse3_handwritten.rs @@ -0,0 +1,127 @@ +use crate::abstractions::simd::*; +pub fn pshufb128(a: u8x16, b: u8x16) -> u8x16 { + u8x16::from_fn(|i| if b[i] > 127 { 0 } else { a[(b[i] % 16) as u32] }) +} + +pub fn phaddw128(a: i16x8, b: i16x8) -> i16x8 { + i16x8::from_fn(|i| { + if i < 4 { + a[2 * i].wrapping_add(a[2 * i + 1]) + } else { + b[2 * (i - 4)].wrapping_add(b[2 * (i - 4) + 1]) + } + }) +} + +pub fn phaddsw128(a: i16x8, b: i16x8) -> i16x8 { + i16x8::from_fn(|i| { + if i < 4 { + a[2 * i].saturating_add(a[2 * i + 1]) + } else { + b[2 * (i - 4)].saturating_add(b[2 * (i - 4) + 1]) + } + }) +} + +pub fn phaddd128(a: i32x4, b: i32x4) -> i32x4 { + i32x4::from_fn(|i| { + if i < 2 { + a[2 * i].wrapping_add(a[2 * i + 1]) + } else { + b[2 * (i - 2)].wrapping_add(b[2 * (i - 2) + 1]) + } + }) +} + +pub fn phsubw128(a: i16x8, b: i16x8) -> i16x8 { + i16x8::from_fn(|i| { + if i < 4 { + a[2 * i].wrapping_sub(a[2 * i + 1]) + } else { + b[2 * (i - 4)].wrapping_sub(b[2 * (i - 4) + 1]) + } + }) +} + +pub fn phsubsw128(a: i16x8, b: i16x8) -> i16x8 { + i16x8::from_fn(|i| { + if i < 4 { + a[2 * i].saturating_sub(a[2 * i + 1]) + } else { + b[2 * (i - 4)].saturating_sub(b[2 * (i - 4) + 1]) + } + }) +} + +pub fn phsubd128(a: i32x4, b: i32x4) -> i32x4 { + i32x4::from_fn(|i| { + if i < 2 { + a[2 * i].wrapping_sub(a[2 * i + 1]) + } else { + b[2 * (i - 2)].wrapping_sub(b[2 * (i - 2) + 1]) + } + }) +} + +pub fn pmaddubsw128(a: u8x16, b: i8x16) -> i16x8 { + i16x8::from_fn(|i| { + ((a[2 * i] as u8 as u16 as i16) * (b[2 * i] as i8 as i16)) + .saturating_add((a[2 * i + 1] as u8 as u16 as i16) * (b[2 * i + 1] as i8 as i16)) + }) +} + +pub fn pmulhrsw128(a: i16x8, b: i16x8) -> i16x8 { + i16x8::from_fn(|i| { + let temp = (a[i] as i32) * (b[i] as i32); + let temp = (temp >> 14).wrapping_add(1) >> 1; + temp as i16 + }) +} + +pub fn psignb128(a: i8x16, b: i8x16) -> i8x16 { + i8x16::from_fn(|i| { + if b[i] < 0 { + if a[i] == i8::MIN { + a[i] + } else { + -a[i] + } + } else if b[i] > 0 { + 
a[i] + } else { + 0 + } + }) +} + +pub fn psignw128(a: i16x8, b: i16x8) -> i16x8 { + i16x8::from_fn(|i| { + if b[i] < 0 { + if a[i] == i16::MIN { + a[i] + } else { + -a[i] + } + } else if b[i] > 0 { + a[i] + } else { + 0 + } + }) +} + +pub fn psignd128(a: i32x4, b: i32x4) -> i32x4 { + i32x4::from_fn(|i| { + if b[i] < 0 { + if a[i] == i32::MIN { + a[i] + } else { + -a[i] + } + } else if b[i] > 0 { + a[i] + } else { + 0 + } + }) +} diff --git a/testable-simd-models/src/core_arch/x86/tests/avx.rs b/testable-simd-models/src/core_arch/x86/tests/avx.rs new file mode 100644 index 0000000000000..02b1d81173ad0 --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/tests/avx.rs @@ -0,0 +1,258 @@ +use super::types::*; +use super::upstream; +use crate::abstractions::bitvec::BitVec; +use crate::helpers::test::HasRandom; + +macro_rules! assert_feq { + ($lhs:expr, $rhs:expr) => { + assert!(($lhs.is_nan() && $rhs.is_nan()) || $lhs == $rhs) + }; +} + +/// Derives tests for a given intrinsics. Test that a given intrinsics and its model compute the same thing over random values (1000 by default). +macro_rules! mk { + ($([$N:literal])?$name:ident$({$(<$($c:literal),*>),*})?($($x:ident : $ty:ident),*)) => { + #[test] + fn $name() { + #[allow(unused)] + const N: usize = { + let n: usize = 1000; + $(let n: usize = $N;)? + n + }; + mk!(@[N]$name$($(<$($c),*>)*)?($($x : $ty),*)); + } + }; + (@[$N:ident]$name:ident$(<$($c:literal),*>)?($($x:ident : $ty:ident),*)) => { + for _ in 0..$N { + $(let $x = $ty::random();)* + assert_eq!(super::super::models::avx::$name$(::<$($c,)*>)?($($x.into(),)*), unsafe { + BitVec::from(upstream::$name$(::<$($c,)*>)?($($x.into(),)*)).into() + }); + } + }; + (@[$N:ident]$name:ident<$($c1:literal),*>$(<$($c:literal),*>)*($($x:ident : $ty:ident),*)) => { + let one = || { + mk!(@[$N]$name<$($c1),*>($($x : $ty),*)); + }; + one(); + mk!(@[$N]$name$(<$($c),*>)*($($x : $ty),*)); + } +} +mk!(_mm256_blendv_ps(a: __m256, b: __m256, c: __m256)); + +#[test] +fn _mm256_movemask_ps() { + let n = 1000; + + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx::_mm256_movemask_ps(a.into()), + unsafe { upstream::_mm256_movemask_ps(a.into()) } + ); + } +} + +#[test] +fn _mm256_movemask_pd() { + let n = 1000; + + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx::_mm256_movemask_pd(a.into()), + unsafe { upstream::_mm256_movemask_pd(a.into()) } + ); + } +} + +#[test] +fn _mm256_testz_si256() { + let n = 1000; + + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + let b: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx::_mm256_testz_si256(a.into(), b.into()), + unsafe { upstream::_mm256_testz_si256(a.into(), b.into()) } + ); + } +} + +#[test] +fn _mm256_testc_si256() { + let n = 1000; + + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + let b: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx::_mm256_testc_si256(a.into(), b.into()), + unsafe { upstream::_mm256_testc_si256(a.into(), b.into()) } + ); + } +} + +#[test] +fn _mm256_cvtsd_f64() { + let n = 1000; + + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_feq!( + super::super::models::avx::_mm256_cvtsd_f64(a.into()), + unsafe { upstream::_mm256_cvtsd_f64(a.into()) } + ); + } +} + +#[test] +fn _mm256_cvtsi256_si32() { + let n = 1000; + + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx::_mm256_cvtsi256_si32(a.into()), + unsafe 
{ upstream::_mm256_cvtsi256_si32(a.into()) } + ); + } +} + +#[test] +fn _mm256_cvtss_f32() { + let n = 1000; + + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_feq!( + super::super::models::avx::_mm256_cvtss_f32(a.into()), + unsafe { upstream::_mm256_cvtss_f32(a.into()) } + ); + } +} + +mk!(_mm256_setzero_ps()); +mk!(_mm256_setzero_si256()); +mk!(_mm256_set_epi8( + e00: i8, + e01: i8, + e02: i8, + e03: i8, + e04: i8, + e05: i8, + e06: i8, + e07: i8, + e08: i8, + e09: i8, + e10: i8, + e11: i8, + e12: i8, + e13: i8, + e14: i8, + e15: i8, + e16: i8, + e17: i8, + e18: i8, + e19: i8, + e20: i8, + e21: i8, + e22: i8, + e23: i8, + e24: i8, + e25: i8, + e26: i8, + e27: i8, + e28: i8, + e29: i8, + e30: i8, + e31: i8 +)); +mk!(_mm256_set_epi16( + e00: i16, + e01: i16, + e02: i16, + e03: i16, + e04: i16, + e05: i16, + e06: i16, + e07: i16, + e08: i16, + e09: i16, + e10: i16, + e11: i16, + e12: i16, + e13: i16, + e14: i16, + e15: i16 +)); +mk!(_mm256_set_epi32( + e0: i32, + e1: i32, + e2: i32, + e3: i32, + e4: i32, + e5: i32, + e6: i32, + e7: i32 +)); +mk!(_mm256_set_epi64x(a: i64, b: i64, c: i64, d: i64)); +mk!(_mm256_set1_epi8(a: i8)); +mk!(_mm256_set1_epi16(a: i16)); +mk!(_mm256_set1_epi32(a: i32)); +mk!(_mm256_set1_epi64x(a: i64)); +mk!(_mm256_set_pd(a: f64, b: f64, c: f64, d: f64)); +mk!(_mm256_set_ps(a: f32, b: f32, c: f32, d: f32, e: f32, f: f32, g: f32, h: f32)); +mk!(_mm256_setr_pd(a: f64, b: f64, c: f64, d: f64)); +mk!(_mm256_setr_ps(a: f32, b: f32, c: f32, d: f32, e: f32, f: f32, g: f32, h: f32)); +mk!(_mm256_setr_epi64x(a: i64, b: i64, c: i64, d: i64)); +mk!(_mm256_set1_pd(a: f64)); +mk!(_mm256_set1_ps(a: f32)); + +mk!(_mm256_and_pd(a: __m256d, b: __m256d)); +mk!(_mm256_and_ps(a: __m256, b: __m256)); +mk!(_mm256_or_pd(a: __m256d, b: __m256d)); +mk!(_mm256_or_ps(a: __m256, b: __m256)); +mk!(_mm256_andnot_pd(a: __m256d, b: __m256d)); +mk!(_mm256_andnot_ps(a: __m256, b: __m256)); +mk!(_mm256_blendv_pd(a: __m256d, b: __m256d, c: __m256d)); +mk!(_mm256_xor_pd(a: __m256d, b: __m256d)); +mk!(_mm256_xor_ps(a: __m256, b: __m256)); +mk!(_mm256_cvtepi32_pd(a: __m128i)); +mk!(_mm256_cvtepi32_ps(a: __m256i)); +mk!(_mm256_cvtpd_ps(a: __m256d)); +mk!(_mm256_cvtps_pd(a: __m128)); +mk!(_mm256_movehdup_ps(a: __m256)); +mk!(_mm256_moveldup_ps(a: __m256)); +mk!(_mm256_movedup_pd(a: __m256d)); +mk!(_mm256_unpackhi_pd(a: __m256d, b: __m256d)); +mk!(_mm256_unpackhi_ps(a: __m256, b: __m256)); +mk!(_mm256_unpacklo_pd(a: __m256d, b: __m256d)); +mk!(_mm256_unpacklo_ps(a: __m256, b: __m256)); +mk!(_mm256_setzero_pd()); +mk!(_mm256_castpd_ps(a: __m256d)); +mk!(_mm256_castps_pd(a: __m256)); +mk!(_mm256_castps_si256(a: __m256)); +mk!(_mm256_castsi256_ps(a: __m256i)); +mk!(_mm256_castpd_si256(a: __m256d)); +mk!(_mm256_castsi256_pd(a: __m256i)); +mk!(_mm256_castps256_ps128(a: __m256)); +mk!(_mm256_castpd256_pd128(a: __m256d)); +mk!(_mm256_castsi256_si128(a: __m256i)); +mk!(_mm256_castps128_ps256(a: __m128)); +mk!(_mm256_castpd128_pd256(a: __m128d)); +mk!(_mm256_castsi128_si256(a: __m128i)); +mk!(_mm256_zextps128_ps256(a: __m128)); +mk!(_mm256_zextsi128_si256(a: __m128i)); +mk!(_mm256_zextpd128_pd256(a: __m128d)); +mk!(_mm256_undefined_ps()); +mk!(_mm256_undefined_pd()); +mk!(_mm256_undefined_si256()); +mk!(_mm256_set_m128(hi: __m128, lo: __m128)); +mk!(_mm256_set_m128d(hi: __m128d, lo: __m128d)); +mk!(_mm256_set_m128i(hi: __m128i, lo: __m128i)); +mk!(_mm256_setr_m128(lo: __m128, hi: __m128)); +mk!(_mm256_setr_m128d(lo: __m128d, hi: __m128d)); +mk!(_mm256_setr_m128i(lo: __m128i, hi: __m128i)); diff --git 
a/testable-simd-models/src/core_arch/x86/tests/avx2.rs b/testable-simd-models/src/core_arch/x86/tests/avx2.rs new file mode 100644 index 0000000000000..dcabcbb58b1e0 --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/tests/avx2.rs @@ -0,0 +1,541 @@ +use super::types::*; +use super::upstream; +use crate::abstractions::bitvec::BitVec; +use crate::helpers::test::HasRandom; + +/// Derives tests for a given intrinsics. Test that a given intrinsics and its model compute the same thing over random values (1000 by default). +macro_rules! mk { + ($([$N:literal])?$name:ident$({$(<$($c:literal),*>),*})?($($x:ident : $ty:ident),*)) => { + #[test] + fn $name() { + #[allow(unused)] + const N: usize = { + let n: usize = 1000; + $(let n: usize = $N;)? + n + }; + mk!(@[N]$name$($(<$($c),*>)*)?($($x : $ty),*)); + } + }; + (@[$N:ident]$name:ident$(<$($c:literal),*>)?($($x:ident : $ty:ident),*)) => { + for _ in 0..$N { + $(let $x = $ty::random();)* + assert_eq!(super::super::models::avx2::$name$(::<$($c,)*>)?($($x.into(),)*), unsafe { + BitVec::from(upstream::$name$(::<$($c,)*>)?($($x.into(),)*)).into() + }); + } + }; + (@[$N:ident]$name:ident<$($c1:literal),*>$(<$($c:literal),*>)*($($x:ident : $ty:ident),*)) => { + let one = || { + mk!(@[$N]$name<$($c1),*>($($x : $ty),*)); + }; + one(); + mk!(@[$N]$name$(<$($c),*>)*($($x : $ty),*)); + } +} + +mk!(_mm256_abs_epi32(a: BitVec)); +mk!(_mm256_abs_epi16(a: BitVec)); +mk!(_mm256_abs_epi8(a: BitVec)); +mk!(_mm256_add_epi64(a: BitVec, b: BitVec)); +mk!(_mm256_add_epi32(a: BitVec, b: BitVec)); +mk!(_mm256_add_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_add_epi8(a: BitVec, b: BitVec)); +mk!(_mm256_adds_epi8(a: BitVec, b: BitVec)); +mk!(_mm256_adds_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_adds_epu8(a: BitVec, b: BitVec)); +mk!(_mm256_adds_epu16(a: BitVec, b: BitVec)); +mk!([100]_mm256_alignr_epi8{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec, b: BitVec)); 
+mk!([100]_mm256_permute2x128_si256{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec, b: BitVec)); +mk!(_mm256_blendv_epi8(a: BitVec, b: BitVec, mask: BitVec)); +mk!(_mm_broadcastb_epi8(a: BitVec)); +mk!(_mm256_broadcastb_epi8(a: BitVec)); +mk!(_mm_broadcastd_epi32(a: BitVec)); +mk!(_mm256_broadcastd_epi32(a: BitVec)); +mk!(_mm_broadcastq_epi64(a: BitVec)); +mk!(_mm256_broadcastq_epi64(a: BitVec)); +mk!(_mm_broadcastsi128_si256(a: BitVec)); +mk!(_mm256_broadcastsi128_si256(a: BitVec)); +mk!(_mm_broadcastw_epi16(a: BitVec)); +mk!(_mm256_broadcastw_epi16(a: BitVec)); +mk!(_mm256_cmpeq_epi64(a: BitVec, b: BitVec)); +mk!(_mm256_cmpeq_epi32(a: BitVec, b: BitVec)); +mk!(_mm256_cmpeq_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_cmpeq_epi8(a: BitVec, b: BitVec)); +mk!(_mm256_cmpgt_epi64(a: BitVec, b: BitVec)); +mk!(_mm256_cmpgt_epi32(a: BitVec, b: BitVec)); +mk!(_mm256_cmpgt_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_cmpgt_epi8(a: BitVec, b: BitVec)); +mk!(_mm256_cvtepi16_epi32(a: BitVec)); +mk!(_mm256_cvtepi16_epi64(a: BitVec)); +mk!(_mm256_cvtepi32_epi64(a: BitVec)); +mk!(_mm256_cvtepi8_epi16(a: BitVec)); +mk!(_mm256_cvtepi8_epi32(a: BitVec)); +mk!(_mm256_cvtepi8_epi64(a: BitVec)); +mk!(_mm256_cvtepu16_epi32(a: BitVec)); +mk!(_mm256_cvtepu16_epi64(a: BitVec)); +mk!(_mm256_cvtepu32_epi64(a: BitVec)); +mk!(_mm256_cvtepu8_epi16(a: BitVec)); +mk!(_mm256_cvtepu8_epi32(a: BitVec)); +mk!(_mm256_cvtepu8_epi64(a: BitVec)); +mk!(_mm256_extracti128_si256{<0>,<1>}(a: BitVec)); +mk!(_mm256_hadd_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_hadd_epi32(a: BitVec, b: BitVec)); +mk!(_mm256_hsub_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_hsub_epi32(a: BitVec, b: BitVec)); +mk!(_mm256_hsubs_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_inserti128_si256{<0>,<1>}(a: BitVec, b: BitVec)); +mk!(_mm256_madd_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_maddubs_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_max_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_max_epi32(a: BitVec, b: BitVec)); +mk!(_mm256_max_epi8(a: BitVec, b: BitVec)); +mk!(_mm256_max_epu16(a: BitVec, b: BitVec)); +mk!(_mm256_max_epu32(a: BitVec, b: BitVec)); +mk!(_mm256_max_epu8(a: BitVec, b: BitVec)); 
+mk!(_mm256_min_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_min_epi32(a: BitVec, b: BitVec)); +mk!(_mm256_min_epi8(a: BitVec, b: BitVec)); +mk!(_mm256_min_epu16(a: BitVec, b: BitVec)); +mk!(_mm256_min_epu32(a: BitVec, b: BitVec)); +mk!(_mm256_min_epu8(a: BitVec, b: BitVec)); +mk!(_mm256_mul_epi32(a: BitVec, b: BitVec)); +mk!(_mm256_mul_epu32(a: BitVec, b: BitVec)); +mk!(_mm256_mulhi_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_mulhi_epu16(a: BitVec, b: BitVec)); +mk!(_mm256_mullo_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_mullo_epi32(a: BitVec, b: BitVec)); +mk!(_mm256_mulhrs_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_or_si256(a: BitVec, b: BitVec)); +mk!(_mm256_packs_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_packs_epi32(a: BitVec, b: BitVec)); +mk!(_mm256_packus_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_packus_epi32(a: BitVec, b: BitVec)); +mk!(_mm256_permutevar8x32_epi32(a: BitVec, b: BitVec)); +#[test] +fn _mm256_movemask_epi8() { + let n = 1000; + + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_movemask_epi8(a.into()), + unsafe { upstream::_mm256_movemask_epi8(a.into()) } + ); + } +} +mk!([100]_mm256_mpsadbw_epu8{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec, b: BitVec)); + 
+mk!([100]_mm256_permute4x64_epi64{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec)); +mk!([100]_mm256_shuffle_epi32{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec)); 
+mk!([100]_mm256_shufflehi_epi16{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec)); +mk!([100]_mm256_shufflelo_epi16{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec)); +mk!(_mm256_sad_epu8(a: BitVec, b: BitVec)); +mk!(_mm256_shuffle_epi8(a: BitVec, b: BitVec)); +mk!(_mm256_sign_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_sign_epi32(a: BitVec, b: BitVec)); +mk!(_mm256_sign_epi8(a: BitVec, b: BitVec)); +mk!(_mm256_sll_epi16(a: BitVec, count: BitVec)); +mk!(_mm256_sll_epi32(a: BitVec, count: BitVec)); +mk!(_mm256_sll_epi64(a: BitVec, count: BitVec)); 
+mk!([100]_mm256_slli_epi16{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec)); +mk!([100]_mm256_slli_epi32{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec)); 
+mk!([100]_mm256_slli_epi64{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec)); +mk!([100]_mm256_slli_si256{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec)); 
+mk!([100]_mm256_bslli_epi128{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec)); +mk!(_mm_sllv_epi32(a: BitVec, count: BitVec)); +mk!(_mm256_sllv_epi32(a: BitVec, count: BitVec)); +mk!(_mm_sllv_epi64(a: BitVec, count: BitVec)); +mk!(_mm256_sllv_epi64(a: BitVec, count: BitVec)); +mk!(_mm256_sra_epi16(a: BitVec, count: BitVec)); +mk!(_mm256_sra_epi32(a: BitVec, count: BitVec)); +mk!([100]_mm256_srai_epi16{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec)); 
+mk!(_mm256_srai_epi32{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec)); +mk!(_mm_srav_epi32(a: BitVec, count: BitVec)); +mk!(_mm256_srav_epi32(a: BitVec, count: BitVec)); +mk!([100]_mm256_srli_si256{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec)); 
+mk!([100]_mm256_bsrli_epi128{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec)); +mk!(_mm256_srl_epi16(a: BitVec, count: BitVec)); +mk!(_mm256_srl_epi32(a: BitVec, count: BitVec)); +mk!(_mm256_srl_epi64(a: BitVec, count: BitVec)); +mk!([100]_mm256_srli_epi16{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec)); 
+mk!([100]_mm256_srli_epi32{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec)); +mk!([100]_mm256_srli_epi64{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec)); +mk!(_mm_srlv_epi32(a: BitVec, count: BitVec)); +mk!(_mm256_srlv_epi32(a: BitVec, count: BitVec)); +mk!(_mm_srlv_epi64(a: BitVec, count: BitVec)); +mk!(_mm256_srlv_epi64(a: BitVec, count: BitVec)); +mk!(_mm256_sub_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_sub_epi32(a: BitVec, b: BitVec)); +mk!(_mm256_sub_epi64(a: BitVec, b: BitVec)); +mk!(_mm256_sub_epi8(a: BitVec, b: BitVec)); +mk!(_mm256_subs_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_subs_epi8(a: BitVec, b: BitVec)); +mk!(_mm256_subs_epu16(a: BitVec, b: BitVec)); +mk!(_mm256_subs_epu8(a: BitVec, b: BitVec)); +mk!(_mm256_unpackhi_epi8(a: BitVec, b: BitVec)); 
+mk!(_mm256_unpacklo_epi8(a: BitVec, b: BitVec)); +mk!(_mm256_unpackhi_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_unpacklo_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_unpackhi_epi32(a: BitVec, b: BitVec)); +mk!(_mm256_unpacklo_epi32(a: BitVec, b: BitVec)); +mk!(_mm256_unpackhi_epi64(a: BitVec, b: BitVec)); +mk!(_mm256_unpacklo_epi64(a: BitVec, b: BitVec)); +mk!(_mm256_xor_si256(a: BitVec, b: BitVec)); + +#[test] +fn _mm256_extract_epi8() { + let n = 100; + + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<0>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<0>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<1>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<1>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<2>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<2>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<3>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<3>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<4>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<4>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<5>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<5>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<6>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<6>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<7>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<7>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<8>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<8>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<9>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<9>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<10>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<10>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<11>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<11>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<12>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<12>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<13>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<13>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<14>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<14>(a.into()) } + ); 
+ } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<15>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<15>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<16>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<16>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<17>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<17>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<18>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<18>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<19>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<19>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<20>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<20>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<21>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<21>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<22>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<22>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<23>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<23>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<24>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<24>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<25>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<25>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<26>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<26>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<27>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<27>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<28>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<28>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<29>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<29>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<30>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<30>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<31>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<31>(a.into()) } + ); + } +} + +#[test] 
+fn _mm256_extract_epi16() { + let n = 100; + + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi16::<0>(a.into()), + unsafe { upstream::_mm256_extract_epi16::<0>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi16::<1>(a.into()), + unsafe { upstream::_mm256_extract_epi16::<1>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi16::<2>(a.into()), + unsafe { upstream::_mm256_extract_epi16::<2>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi16::<3>(a.into()), + unsafe { upstream::_mm256_extract_epi16::<3>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi16::<4>(a.into()), + unsafe { upstream::_mm256_extract_epi16::<4>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi16::<5>(a.into()), + unsafe { upstream::_mm256_extract_epi16::<5>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi16::<6>(a.into()), + unsafe { upstream::_mm256_extract_epi16::<6>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi16::<7>(a.into()), + unsafe { upstream::_mm256_extract_epi16::<7>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi16::<8>(a.into()), + unsafe { upstream::_mm256_extract_epi16::<8>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi16::<9>(a.into()), + unsafe { upstream::_mm256_extract_epi16::<9>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi16::<10>(a.into()), + unsafe { upstream::_mm256_extract_epi16::<10>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi16::<11>(a.into()), + unsafe { upstream::_mm256_extract_epi16::<11>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi16::<12>(a.into()), + unsafe { upstream::_mm256_extract_epi16::<12>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi16::<13>(a.into()), + unsafe { upstream::_mm256_extract_epi16::<13>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi16::<14>(a.into()), + unsafe { upstream::_mm256_extract_epi16::<14>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi16::<15>(a.into()), + unsafe { upstream::_mm256_extract_epi16::<15>(a.into()) } + ); + } +} + +mk!(_mm256_and_si256(a: __m256i, b: __m256i)); +mk!(_mm256_andnot_si256(a: __m256i, b: __m256i)); +mk!(_mm256_avg_epu16(a: __m256i, b: __m256i)); 
+mk!(_mm256_avg_epu8(a: __m256i, b: __m256i));
+mk!(_mm_broadcastsd_pd(a: __m128d));
+mk!(_mm256_broadcastsd_pd(a: __m128d));
+mk!(_mm_broadcastss_ps(a: __m128));
+mk!(_mm256_broadcastss_ps(a: __m128));
diff --git a/testable-simd-models/src/core_arch/x86/tests/mod.rs b/testable-simd-models/src/core_arch/x86/tests/mod.rs
new file mode 100644
index 0000000000000..217ff55623dbf
--- /dev/null
+++ b/testable-simd-models/src/core_arch/x86/tests/mod.rs
@@ -0,0 +1,172 @@
+//! Tests for intrinsics defined in `crate::core_arch::x86::models`
+//!
+//! Each and every modelled intrinsic is tested against the Rust
+//! implementation here. For the most part, the tests work by
+//! generating random inputs, passing them as arguments to both the
+//! models in this crate and the corresponding intrinsics in the Rust
+//! core, and then comparing their outputs.
+//!
+//! To add a test for a modelled intrinsic, go to the appropriate file and
+//! use the `mk!` macro to define it.
+//!
+//! A `mk!` macro invocation looks like the following:
+//!
+//! `mk!([N]intrinsic_name{<C1>,<C2>,...}(arg1: type1, arg2: type2, ...));`
+//!
+//! For example, some valid invocations are
+//!
+//! `mk!([100]_mm256_extracti128_si256{<0>,<1>}(a: __m256i));`
+//! `mk!(_mm256_extracti128_si256{<0>,<1>}(a: __m256i));`
+//! `mk!(_mm256_abs_epi16(a: __m256i));`
+//!
+//! The number of random tests `[N]` is optional; if not provided, it defaults to 1000.
+//! The const values `{<C1>,<C2>,...}` are required if the intrinsic has const generic
+//! arguments and must be omitted otherwise. The intrinsic's name and its arguments are
+//! required in all cases.
+//!
+//! Note: this only works if the intrinsic returns a bit-vector or funarray. If it returns an
+//! integer, the test has to be written manually. It is recommended that a manually defined
+//! test follow the pattern of the tests generated by `mk!`. It is also recommended that, when
+//! the intrinsic takes constant arguments, every possible constant value (up to a maximum of
+//! 255) that can be passed to the intrinsic be used for testing. How many constant values to
+//! pass depends on whether the Rust intrinsic statically asserts that the constant argument
+//! fits within a certain number of bits.
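+//!
+//! As a concrete illustration of the manual pattern, this is the shape of the existing
+//! `_mm256_movemask_epi8` test in `avx2.rs` (an intrinsic that returns an `i32`); the
+//! `BitVec`, `HasRandom::random` and `upstream` helpers are the ones already used
+//! throughout these test modules:
+//!
+//! ```ignore
+//! #[test]
+//! fn _mm256_movemask_epi8() {
+//!     for _ in 0..1000 {
+//!         // Draw a random 256-bit input and hand it to both the model and the
+//!         // hardware intrinsic, then compare the two integer results.
+//!         let a: BitVec<256> = BitVec::random();
+//!         assert_eq!(
+//!             super::super::models::avx2::_mm256_movemask_epi8(a.into()),
+//!             unsafe { upstream::_mm256_movemask_epi8(a.into()) }
+//!         );
+//!     }
+//! }
+//! ```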
+
+mod avx;
+mod avx2;
+mod sse2;
+mod ssse3;
+use crate::abstractions::bitvec::*;
+
+pub(crate) mod types {
+    use crate::abstractions::bitvec::*;
+
+    #[allow(non_camel_case_types)]
+    pub type __m256i = BitVec<256>;
+    #[allow(non_camel_case_types)]
+    pub type __m256 = BitVec<256>;
+    #[allow(non_camel_case_types)]
+    pub type __m128i = BitVec<128>;
+    #[allow(non_camel_case_types)]
+    pub type __m256d = BitVec<256>;
+    #[allow(non_camel_case_types)]
+    pub type __m128 = BitVec<128>;
+    #[allow(non_camel_case_types)]
+    pub type __m128d = BitVec<128>;
+}
+
+pub(crate) mod upstream {
+    #[cfg(target_arch = "x86")]
+    pub use core::arch::x86::*;
+    #[cfg(target_arch = "x86_64")]
+    pub use core::arch::x86_64::*;
+}
+
+/// Conversions between the model's `BitVec` values and the hardware vector
+/// types, used to feed random test inputs to the upstream intrinsics and to
+/// read their results back for comparison.
+mod conversions {
+    use super::upstream::{
+        __m128, __m128d, __m128i, __m256, __m256d, __m256i, _mm256_castpd_si256,
+        _mm256_castps_si256, _mm256_castsi256_pd, _mm256_castsi256_ps, _mm256_loadu_si256,
+        _mm256_storeu_si256, _mm_castpd_si128, _mm_castps_si128, _mm_castsi128_pd,
+        _mm_castsi128_ps, _mm_loadu_si128, _mm_storeu_si128,
+    };
+    use super::BitVec;
+
+    impl From<BitVec<256>> for __m256i {
+        fn from(bv: BitVec<256>) -> __m256i {
+            let bv: &[u8] = &bv.to_vec()[..];
+            unsafe { _mm256_loadu_si256(bv.as_ptr() as *const _) }
+        }
+    }
+    impl From<BitVec<256>> for __m256 {
+        fn from(bv: BitVec<256>) -> __m256 {
+            let bv: &[u8] = &bv.to_vec()[..];
+            unsafe { _mm256_castsi256_ps(_mm256_loadu_si256(bv.as_ptr() as *const _)) }
+        }
+    }
+
+    impl From<BitVec<128>> for __m128i {
+        fn from(bv: BitVec<128>) -> __m128i {
+            let slice: &[u8] = &bv.to_vec()[..];
+            unsafe { _mm_loadu_si128(slice.as_ptr() as *const __m128i) }
+        }
+    }
+
+    impl From<BitVec<128>> for __m128 {
+        fn from(bv: BitVec<128>) -> __m128 {
+            let slice: &[u8] = &bv.to_vec()[..];
+            unsafe { _mm_castsi128_ps(_mm_loadu_si128(slice.as_ptr() as *const __m128i)) }
+        }
+    }
+
+    impl From<BitVec<128>> for __m128d {
+        fn from(bv: BitVec<128>) -> __m128d {
+            let slice: &[u8] = &bv.to_vec()[..];
+            unsafe { _mm_castsi128_pd(_mm_loadu_si128(slice.as_ptr() as *const __m128i)) }
+        }
+    }
+
+    impl From<BitVec<256>> for __m256d {
+        fn from(bv: BitVec<256>) -> __m256d {
+            let bv: &[u8] = &bv.to_vec()[..];
+            unsafe { _mm256_castsi256_pd(_mm256_loadu_si256(bv.as_ptr() as *const _)) }
+        }
+    }
+
+    impl From<__m256i> for BitVec<256> {
+        fn from(vec: __m256i) -> BitVec<256> {
+            let mut v = [0u8; 32];
+            unsafe {
+                _mm256_storeu_si256(v.as_mut_ptr() as *mut _, vec);
+            }
+            BitVec::from_slice(&v[..], 8)
+        }
+    }
+
+    impl From<__m256> for BitVec<256> {
+        fn from(vec: __m256) -> BitVec<256> {
+            let mut v = [0u8; 32];
+            unsafe {
+                _mm256_storeu_si256(v.as_mut_ptr() as *mut _, _mm256_castps_si256(vec));
+            }
+            BitVec::from_slice(&v[..], 8)
+        }
+    }
+
+    impl From<__m256d> for BitVec<256> {
+        fn from(vec: __m256d) -> BitVec<256> {
+            let mut v = [0u8; 32];
+            unsafe {
+                _mm256_storeu_si256(v.as_mut_ptr() as *mut _, _mm256_castpd_si256(vec));
+            }
+            BitVec::from_slice(&v[..], 8)
+        }
+    }
+
+    impl From<__m128i> for BitVec<128> {
+        fn from(vec: __m128i) -> BitVec<128> {
+            let mut v = [0u8; 16];
+            unsafe {
+                _mm_storeu_si128(v.as_mut_ptr() as *mut _, vec);
+            }
+            BitVec::from_slice(&v[..], 8)
+        }
+    }
+
+    impl From<__m128> for BitVec<128> {
+        fn from(vec: __m128) -> BitVec<128> {
+            let mut v = [0u8; 16];
+            unsafe {
+                _mm_storeu_si128(v.as_mut_ptr() as *mut _, _mm_castps_si128(vec));
+            }
+            BitVec::from_slice(&v[..], 8)
+        }
+    }
+
+    impl From<__m128d> for BitVec<128> {
+        fn from(vec: __m128d) -> BitVec<128> {
+            let mut v = [0u8; 16];
+            unsafe {
+                _mm_storeu_si128(v.as_mut_ptr() as *mut _, _mm_castpd_si128(vec));
+            }
+            BitVec::from_slice(&v[..], 8)
+        }
+    }
+}
diff --git a/testable-simd-models/src/core_arch/x86/tests/sse2.rs b/testable-simd-models/src/core_arch/x86/tests/sse2.rs
new file mode 100644
index 0000000000000..ed387f5938524
--- /dev/null
+++ b/testable-simd-models/src/core_arch/x86/tests/sse2.rs
@@ -0,0 +1,201 @@
+use super::types::*;
+use super::upstream;
+use crate::abstractions::bitvec::BitVec;
+use crate::helpers::test::HasRandom;
+
+/// Derives a test for a given intrinsic: checks that the intrinsic and its model compute the same result on random inputs (1000 by default).
+macro_rules! mk {
+    ($([$N:literal])?$name:ident$({$(<$($c:literal),*>),*})?($($x:ident : $ty:ident),*)) => {
+        #[test]
+        fn $name() {
+            #[allow(unused)]
+            const N: usize = {
+                let n: usize = 1000;
+                $(let n: usize = $N;)?
+                n
+            };
+            mk!(@[N]$name$($(<$($c),*>)*)?($($x : $ty),*));
+        }
+    };
+    (@[$N:ident]$name:ident$(<$($c:literal),*>)?($($x:ident : $ty:ident),*)) => {
+        for _ in 0..$N {
+            $(let $x = $ty::random();)*
+            assert_eq!(super::super::models::sse2::$name$(::<$($c,)*>)?($($x.into(),)*), unsafe {
+                BitVec::from(upstream::$name$(::<$($c,)*>)?($($x.into(),)*)).into()
+            });
+        }
+    };
+    (@[$N:ident]$name:ident<$($c1:literal),*>$(<$($c:literal),*>)*($($x:ident : $ty:ident),*)) => {
+        let one = || {
+            mk!(@[$N]$name<$($c1),*>($($x : $ty),*));
+        };
+        one();
+        mk!(@[$N]$name$(<$($c),*>)*($($x : $ty),*));
+    }
+}
+mk!(_mm_add_epi8(a: __m128i, b: __m128i));
+mk!(_mm_add_epi16(a: __m128i, b: __m128i));
+mk!(_mm_add_epi32(a: __m128i, b: __m128i));
+mk!(_mm_add_epi64(a: __m128i, b: __m128i));
+mk!(_mm_adds_epi8(a: __m128i, b: __m128i));
+mk!(_mm_adds_epi16(a: __m128i, b: __m128i));
+mk!(_mm_adds_epu8(a: __m128i, b: __m128i));
+mk!(_mm_adds_epu16(a: __m128i, b: __m128i));
+mk!(_mm_avg_epu8(a: __m128i, b: __m128i));
+mk!(_mm_avg_epu16(a: __m128i, b: __m128i));
+mk!(_mm_madd_epi16(a: __m128i, b: __m128i));
+mk!(_mm_max_epi16(a: __m128i, b: __m128i));
+mk!(_mm_max_epu8(a: __m128i, b: __m128i));
+mk!(_mm_min_epi16(a: __m128i, b: __m128i));
+mk!(_mm_min_epu8(a: __m128i, b: __m128i));
+mk!(_mm_mulhi_epi16(a: __m128i, b: __m128i));
+mk!(_mm_mulhi_epu16(a: __m128i, b: __m128i));
+mk!(_mm_mullo_epi16(a: __m128i, b: __m128i));
+mk!(_mm_mul_epu32(a: __m128i, b: __m128i));
+mk!(_mm_sad_epu8(a: __m128i, b: __m128i));
+mk!(_mm_sub_epi8(a: __m128i, b: __m128i));
+mk!(_mm_sub_epi16(a: __m128i, b: __m128i));
+mk!(_mm_sub_epi32(a: __m128i, b: __m128i));
+mk!(_mm_sub_epi64(a: __m128i, b: __m128i));
+mk!(_mm_subs_epi8(a: __m128i, b: __m128i));
+mk!(_mm_subs_epi16(a: __m128i, b: __m128i));
+mk!(_mm_subs_epu8(a: __m128i, b: __m128i));
+mk!(_mm_subs_epu16(a: __m128i, b: __m128i));
+
+mk!([100]_mm_slli_si128{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: __m128i)); +mk!([100]_mm_bslli_si128{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: __m128i)); 
+mk!([100]_mm_bsrli_si128{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: __m128i)); +mk!([100]_mm_slli_epi16{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: __m128i)); + +mk!([100]_mm_sll_epi16(a: __m128i, count: __m128i)); + 
+mk!([100]_mm_slli_epi32{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: __m128i)); + +mk!([100]_mm_sll_epi32(a: __m128i, count: __m128i)); + +mk!([100]_mm_slli_epi64{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: __m128i)); + +mk!([100]_mm_sll_epi64(a: __m128i, count: __m128i)); + 
+mk!([100]_mm_srai_epi16{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: __m128i)); + +mk!([100]_mm_sra_epi16(a: __m128i, count: __m128i)); + +mk!([100]_mm_srai_epi32{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: __m128i)); + +mk!([100]_mm_sra_epi32(a: __m128i, count: __m128i)); 
+mk!([100]_mm_srli_si128{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: __m128i)); +mk!([100]_mm_srli_epi16{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: __m128i)); + +mk!([100]_mm_srl_epi16(a: __m128i, count: __m128i)); + 
+mk!([100]_mm_srli_epi32{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: __m128i)); + +mk!([100]_mm_srl_epi32(a: __m128i, count: __m128i)); + +mk!([100]_mm_srli_epi64{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: __m128i)); + +mk!(_mm_srl_epi64(a: __m128i, count: __m128i)); +mk!(_mm_and_si128(a: __m128i, b: __m128i)); +mk!(_mm_andnot_si128(a: __m128i, b: __m128i)); +mk!(_mm_or_si128(a: __m128i, b: __m128i)); +mk!(_mm_xor_si128(a: __m128i, b: __m128i)); +mk!(_mm_cmpeq_epi8(a: __m128i, b: __m128i)); +mk!(_mm_cmpeq_epi16(a: __m128i, b: __m128i)); +mk!(_mm_cmpeq_epi32(a: __m128i, b: __m128i)); +mk!(_mm_cmpgt_epi8(a: __m128i, b: __m128i)); +mk!(_mm_cmpgt_epi16(a: __m128i, b: __m128i)); +mk!(_mm_cmpgt_epi32(a: __m128i, b: __m128i)); +mk!(_mm_cmplt_epi8(a: __m128i, b: __m128i)); 
+mk!(_mm_cmplt_epi16(a: __m128i, b: __m128i)); +mk!(_mm_cmplt_epi32(a: __m128i, b: __m128i)); +mk!(_mm_cvtsi32_si128(a: i32)); + +// mk!(_mm_cvtsi128_si32(a: __m128i)); + +mk!(_mm_set_epi64x(e1: i64, e0: i64)); +mk!(_mm_set_epi32(e3: i32, e2: i32, e1: i32, e0: i32)); +mk!(_mm_set_epi16( + e7: i16, + e6: i16, + e5: i16, + e4: i16, + e3: i16, + e2: i16, + e1: i16, + e0: i16 +)); +mk!(_mm_set_epi8( + e15: i8, + e14: i8, + e13: i8, + e12: i8, + e11: i8, + e10: i8, + e9: i8, + e8: i8, + e7: i8, + e6: i8, + e5: i8, + e4: i8, + e3: i8, + e2: i8, + e1: i8, + e0: i8 +)); +mk!(_mm_set1_epi64x(a: i64)); +mk!(_mm_set1_epi32(a: i32)); +mk!(_mm_set1_epi16(a: i16)); +mk!(_mm_set1_epi8(a: i8)); +mk!(_mm_setr_epi32(e3: i32, e2: i32, e1: i32, e0: i32)); +mk!(_mm_setr_epi16( + e7: i16, + e6: i16, + e5: i16, + e4: i16, + e3: i16, + e2: i16, + e1: i16, + e0: i16 +)); +mk!(_mm_setr_epi8( + e15: i8, + e14: i8, + e13: i8, + e12: i8, + e11: i8, + e10: i8, + e9: i8, + e8: i8, + e7: i8, + e6: i8, + e5: i8, + e4: i8, + e3: i8, + e2: i8, + e1: i8, + e0: i8 +)); +mk!(_mm_setzero_si128()); +mk!(_mm_move_epi64(a: __m128i)); +mk!(_mm_packs_epi16(a: __m128i, b: __m128i)); +mk!(_mm_packs_epi32(a: __m128i, b: __m128i)); +mk!(_mm_packus_epi16(a: __m128i, b: __m128i)); + +// mk!([100]_mm_extract_epi16(a: __m128i)); +mk!([100]_mm_insert_epi16{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>}(a: __m128i, i: i32)); + +// mk!([100]_mm_movemask_epi8(a: __m128i)); + +mk!([100]_mm_shuffle_epi32{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: __m128i)); 
+mk!([100]_mm_shufflehi_epi16{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: __m128i)); +mk!([100]_mm_shufflelo_epi16{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: __m128i)); +mk!(_mm_unpackhi_epi8(a: __m128i, b: __m128i)); +mk!(_mm_unpackhi_epi16(a: __m128i, b: __m128i)); +mk!(_mm_unpackhi_epi32(a: __m128i, b: __m128i)); +mk!(_mm_unpackhi_epi64(a: __m128i, b: __m128i)); +mk!(_mm_unpacklo_epi8(a: __m128i, b: __m128i)); +mk!(_mm_unpacklo_epi16(a: __m128i, b: __m128i)); +mk!(_mm_unpacklo_epi32(a: __m128i, b: __m128i)); +mk!(_mm_unpacklo_epi64(a: __m128i, b: __m128i)); +mk!(_mm_undefined_si128()); diff --git a/testable-simd-models/src/core_arch/x86/tests/ssse3.rs b/testable-simd-models/src/core_arch/x86/tests/ssse3.rs new file mode 100644 index 0000000000000..6382f953f2063 --- 
/dev/null
+++ b/testable-simd-models/src/core_arch/x86/tests/ssse3.rs
@@ -0,0 +1,51 @@
+use super::types::*;
+use super::upstream;
+use crate::abstractions::bitvec::BitVec;
+use crate::helpers::test::HasRandom;
+
+/// Derives tests for a given intrinsic. Tests that the intrinsic and its model compute the same thing over random values (1000 by default).
+macro_rules! mk {
+    ($([$N:literal])?$name:ident$({$(<$($c:literal),*>),*})?($($x:ident : $ty:ident),*)) => {
+        #[test]
+        fn $name() {
+            #[allow(unused)]
+            const N: usize = {
+                let n: usize = 1000;
+                $(let n: usize = $N;)?
+                n
+            };
+            mk!(@[N]$name$($(<$($c),*>)*)?($($x : $ty),*));
+        }
+    };
+    (@[$N:ident]$name:ident$(<$($c:literal),*>)?($($x:ident : $ty:ident),*)) => {
+        for _ in 0..$N {
+            $(let $x = $ty::random();)*
+            assert_eq!(super::super::models::ssse3::$name$(::<$($c,)*>)?($($x.into(),)*), unsafe {
+                BitVec::from(upstream::$name$(::<$($c,)*>)?($($x.into(),)*)).into()
+            });
+        }
+    };
+    (@[$N:ident]$name:ident<$($c1:literal),*>$(<$($c:literal),*>)*($($x:ident : $ty:ident),*)) => {
+        let one = || {
+            mk!(@[$N]$name<$($c1),*>($($x : $ty),*));
+        };
+        one();
+        mk!(@[$N]$name$(<$($c),*>)*($($x : $ty),*));
+    }
+}
+mk!(_mm_abs_epi8(a: __m128i));
+mk!(_mm_abs_epi16(a: __m128i));
+mk!(_mm_abs_epi32(a: __m128i));
+mk!(_mm_shuffle_epi8(a: __m128i, b: __m128i));
+mk!([100]_mm_alignr_epi8{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: __m128i, b: __m128i));
+mk!(_mm_hadd_epi16(a: __m128i, b: __m128i));
+mk!(_mm_hadds_epi16(a: __m128i, b: __m128i));
+mk!(_mm_hadd_epi32(a: __m128i, b: __m128i));
+mk!(_mm_hsub_epi16(a: __m128i, b: __m128i));
+mk!(_mm_hsubs_epi16(a: __m128i, b: __m128i));
+mk!(_mm_hsub_epi32(a: __m128i, b: __m128i));
+mk!(_mm_maddubs_epi16(a: __m128i, b: __m128i));
+mk!(_mm_mulhrs_epi16(a: __m128i, b: __m128i));
+mk!(_mm_sign_epi8(a: __m128i, b: __m128i));
+mk!(_mm_sign_epi16(a: __m128i, b: __m128i));
+mk!(_mm_sign_epi32(a: __m128i, b: __m128i));
diff --git a/testable-simd-models/src/helpers.rs b/testable-simd-models/src/helpers.rs
new file mode 100644
index 0000000000000..1a30bf251a877
--- /dev/null
+++ b/testable-simd-models/src/helpers.rs
@@ -0,0 +1,67 @@
+#[cfg(test)]
+pub mod test {
+    use crate::abstractions::{bit::Bit, bitvec::BitVec, funarr::FunArray};
+    use rand::prelude::*;
+
+    /// Helper trait to generate random values
+    pub trait HasRandom {
+        fn random() -> Self;
+    }
+    macro_rules! mk_has_random {
+        ($($ty:ty),*) => {
+            $(impl HasRandom for $ty {
+                fn random() -> Self {
+                    let mut rng = rand::rng();
+                    rng.random()
+                }
+            })*
+        };
+    }
+
+    mk_has_random!(bool);
+    mk_has_random!(i8, i16, i32, i64, i128);
+    mk_has_random!(u8, u16, u32, u64, u128);
+
+    impl HasRandom for isize {
+        fn random() -> Self {
+            i128::random() as isize
+        }
+    }
+    impl HasRandom for usize {
+        fn random() -> Self {
+            i128::random() as usize
+        }
+    }
+
+    impl HasRandom for f32 {
+        fn random() -> Self {
+            u32::random() as f32
+        }
+    }
+
+    impl HasRandom for f64 {
+        fn random() -> Self {
+            u64::random() as f64
+        }
+    }
+
+    impl HasRandom for Bit {
+        fn random() -> Self {
+            crate::abstractions::bit::Bit::from(bool::random())
+        }
+    }
+    impl<const N: u64> HasRandom for BitVec<N> {
+        fn random() -> Self {
+            Self::from_fn(|_| Bit::random())
+        }
+    }
+
+    impl<const N: u64, T: HasRandom> HasRandom for FunArray<N, T> {
+        fn random() -> Self {
+            FunArray::from_fn(|_| T::random())
+        }
+    }
+}
+
+#[cfg(test)]
+pub use test::*;
diff --git a/testable-simd-models/src/lib.rs b/testable-simd-models/src/lib.rs
new file mode 100644
index 0000000000000..13d6ba2e6e7cd
--- /dev/null
+++ b/testable-simd-models/src/lib.rs
@@ -0,0 +1,35 @@
+//! `testable-simd-models`: A Testable Rust Model of the SIMD Intrinsics in `core::arch`
+//!
+//! `testable-simd-models` is a simplified, self-contained model of the SIMD intrinsics in Rust’s `core::arch`
+//! module. It aims to provide a purely Rust-based specification of these operations, making them easier to
+//! understand, analyze, and formally verify. Unlike `core`, which may rely on platform-specific
+//! intrinsics and compiler magic, `testable-simd-models` expresses everything in plain Rust, prioritizing
+//! clarity and explicitness over efficiency.
+//!
+//! ## Key Features
+//!
+//! - **Partial Modeling**: `testable-simd-models` includes only a subset of `core::arch`, focusing on modeling
+//!   fundamental operations rather than providing a complete replacement.
+//! - **Exact Signatures**: Any item that exists in both `testable-simd-models` and `core` has the same type signature,
+//!   ensuring compatibility with formal verification efforts.
+//! - **Purely Functional Approach**: Where possible, `testable-simd-models` favors functional programming principles,
+//!   avoiding unnecessary mutation and side effects to facilitate formal reasoning.
+//! - **Explicit Implementations**: Even low-level operations, such as SIMD, are modeled explicitly using
+//!   Rust constructs like bit arrays and partial maps.
+//! - **Extra Abstractions**: `testable-simd-models` includes additional helper types and functions to support
+//!   modeling. These extra items are marked appropriately to distinguish them from `core` definitions.
+//!
+//! ## Intended Use
+//!
+//! `testable-simd-models` is designed as a reference model for formal verification and reasoning about Rust programs.
+//! By providing a readable, testable, well-specified version of `core::arch`'s behavior, it serves as a foundation for
+//! proof assistants and other verification tools.
+
+// This recursion limit is necessary for the `mk!` macro used in tests.
+// We test functions with const generics; the macro generates a test per possible (const-generic) control value.
+#![recursion_limit = "4096"]
+pub mod abstractions;
+pub mod core_arch;
+
+pub use core_arch as arch;
+pub mod helpers;
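
For orientation, here is roughly what `mk!` derives for a simple entry such as `mk!(_mm_abs_epi8(a: __m128i));`. This is an illustrative paraphrase of the macro expansion, not part of the patch itself (the real expansion inlines the assertion rather than naming intermediate locals), and it relies on the `upstream`, `models`, `BitVec`, and `HasRandom` items introduced above:

```rust
#[test]
fn _mm_abs_epi8() {
    // No `[N]` override was given in the `mk!` invocation, so the macro
    // defaults to 1000 random trials.
    const N: usize = 1000;
    for _ in 0..N {
        // Draw a random 128-bit input via the `HasRandom` helper trait.
        let a = __m128i::random();
        // Run the executable model and the real intrinsic on the same input...
        let model = super::super::models::ssse3::_mm_abs_epi8(a.into());
        let real = unsafe { upstream::_mm_abs_epi8(a.into()) };
        // ...and compare the results bit-for-bit via the `BitVec` representation.
        assert_eq!(model, BitVec::from(real).into());
    }
}
```

For const-generic intrinsics such as `_mm_alignr_epi8` or the `_mm_slli_*` family, the `{<0>,<1>,…,<255>}` list makes the macro emit one such loop per listed immediate value, and the leading `[100]` lowers the per-value iteration count to 100 to keep the total number of comparisons manageable.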