From d418b8a1d19c9570883e5d52edfa6f683afd592f Mon Sep 17 00:00:00 2001 From: karthikbhargavan Date: Fri, 25 Jul 2025 13:19:30 -0400 Subject: [PATCH 01/47] u64 -> u32 --- testable-simd-models/README.md | 2 + .../src/abstractions/bitvec.rs | 24 +-- .../src/abstractions/funarr.rs | 22 +- testable-simd-models/src/abstractions/simd.rs | 70 +++---- .../src/core_arch/x86/models/avx.rs | 4 +- .../src/core_arch/x86/models/avx2.rs | 198 +++++++++--------- .../src/core_arch/x86/models/sse2.rs | 40 ++-- .../src/core_arch/x86/models/ssse3.rs | 40 ++-- testable-simd-models/src/helpers.rs | 4 +- testable-simd-models/test.sh | 2 - 10 files changed, 205 insertions(+), 201 deletions(-) delete mode 100755 testable-simd-models/test.sh diff --git a/testable-simd-models/README.md b/testable-simd-models/README.md index d051de6145f4a..f2f6ec3b9b629 100644 --- a/testable-simd-models/README.md +++ b/testable-simd-models/README.md @@ -23,6 +23,8 @@ tests work by testing the models against the intrinsics in the Rust core, trying out random inputs (generally 1000), and comparing their outputs. +The tests can run by executing `cargo test`. + ## Modeling Process The process of adding a specific intrinsic's model goes as follows. For this example, let us say the intrinsic we are adding is diff --git a/testable-simd-models/src/abstractions/bitvec.rs b/testable-simd-models/src/abstractions/bitvec.rs index 0f3003f4beadc..02a32df4e152e 100644 --- a/testable-simd-models/src/abstractions/bitvec.rs +++ b/testable-simd-models/src/abstractions/bitvec.rs @@ -15,7 +15,7 @@ use std::fmt::Formatter; /// making the bit pattern more human-readable. The type also implements indexing, /// allowing for easy access to individual bits. #[derive(Copy, Clone, Eq, PartialEq)] -pub struct BitVec(FunArray); +pub struct BitVec(FunArray); /// Pretty prints a bit slice by group of 8 fn bit_slice_to_string(bits: &[Bit]) -> String { @@ -33,15 +33,15 @@ fn bit_slice_to_string(bits: &[Bit]) -> String { .into() } -impl core::fmt::Debug for BitVec { +impl core::fmt::Debug for BitVec { fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), std::fmt::Error> { write!(f, "{}", bit_slice_to_string(&self.0.as_vec())) } } -impl core::ops::Index for BitVec { +impl core::ops::Index for BitVec { type Output = Bit; - fn index(&self, index: u64) -> &Self::Output { + fn index(&self, index: u32) -> &Self::Output { self.0.get(index) } } @@ -75,19 +75,19 @@ fn int_from_bit_slice + MachineInteger + Copy>(bits: &[Bit]) -> }; n } -impl BitVec { +impl BitVec { /// Constructor for BitVec. `BitVec::::from_fn` constructs a bitvector out of a function that takes usizes smaller than `N` and produces bits. - pub fn from_fn Bit>(f: F) -> Self { + pub fn from_fn Bit>(f: F) -> Self { Self(FunArray::from_fn(f)) } /// Convert a slice of machine integers where only the `d` least significant bits are relevant. - pub fn from_slice + MachineInteger + Copy>(x: &[T], d: u64) -> Self { + pub fn from_slice + MachineInteger + Copy>(x: &[T], d: u32) -> Self { Self::from_fn(|i| Bit::of_int::(x[(i / d) as usize], (i % d) as u32)) } /// Construct a BitVec out of a machine integer. pub fn from_int + MachineInteger + Copy>(n: T) -> Self { - Self::from_slice::(&[n], T::bits() as u64) + Self::from_slice::(&[n], T::bits() as u32) } /// Convert a BitVec into a machine integer of type `T`. 
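To make the index-type change in this hunk concrete, here is a minimal usage sketch of the `BitVec` constructors and the new `u32` indexing. It is illustrative only and not part of the patch; the `crate::` paths, the function wrapper, and the asserts assume `Bit` implements `PartialEq`/`Debug` in addition to the traits shown above.

```rust
use crate::abstractions::bit::Bit;
use crate::abstractions::bitvec::BitVec;

fn bitvec_index_example() {
    // Build a 32-bit vector from a machine integer; bit 0 is the least significant bit.
    let bv: BitVec<32> = BitVec::from_int(0b1010_u32);

    // Indexing now takes a u32 index (changed from u64 by this patch).
    assert_eq!(bv[0], Bit::Zero);

    // The same vector, built positionally with the bit-extraction helper.
    let bv2: BitVec<32> = BitVec::from_fn(|i| Bit::of_int::<u32>(0b1010_u32, i));
    assert_eq!(bv, bv2);
}
```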
@@ -115,12 +115,12 @@ impl BitVec { } } -impl BitVec { - pub fn chunked_shift( +impl BitVec { + pub fn chunked_shift( self, shl: FunArray, ) -> BitVec { - fn chunked_shift( + fn chunked_shift( bitvec: BitVec, shl: FunArray, ) -> BitVec { @@ -134,7 +134,7 @@ impl BitVec { }; let local_index = (nth_bit as i128).wrapping_sub(shift); if local_index < CHUNK as i128 && local_index >= 0 { - let local_index = local_index as u64; + let local_index = local_index as u32; bitvec[nth_chunk * CHUNK + local_index] } else { Bit::Zero diff --git a/testable-simd-models/src/abstractions/funarr.rs b/testable-simd-models/src/abstractions/funarr.rs index 4c120addcb0c5..429db9d1123f1 100644 --- a/testable-simd-models/src/abstractions/funarr.rs +++ b/testable-simd-models/src/abstractions/funarr.rs @@ -5,19 +5,19 @@ /// Internally, it uses a fixed-length array of `Option` with a maximum capacity of 512 elements. /// Unused elements beyond `N` are filled with `None`. #[derive(Copy, Clone, Eq, PartialEq)] -pub struct FunArray([Option; 512]); +pub struct FunArray([Option; 512]); -impl FunArray { +impl FunArray { /// Gets a reference to the element at index `i`. - pub fn get(&self, i: u64) -> &T { + pub fn get(&self, i: u32) -> &T { self.0[i as usize].as_ref().unwrap() } /// Constructor for FunArray. `FunArray::from_fn` constructs a funarray out of a function that takes usizes smaller than `N` and produces an element of type T. - pub fn from_fn T>(f: F) -> Self { + pub fn from_fn T>(f: F) -> Self { // let vec = (0..N).map(f).collect(); let arr = core::array::from_fn(|i| { - if (i as u64) < N { - Some(f(i as u64)) + if (i as u32) < N { + Some(f(i as u32)) } else { None } @@ -53,10 +53,10 @@ impl FunArray { } } -impl TryFrom> for FunArray { +impl TryFrom> for FunArray { type Error = (); fn try_from(v: Vec) -> Result { - if (v.len() as u64) < N { + if (v.len() as u32) < N { Err(()) } else { Ok(Self::from_fn(|i| v[i as usize].clone())) @@ -64,16 +64,16 @@ impl TryFrom> for FunArray { } } -impl core::fmt::Debug for FunArray { +impl core::fmt::Debug for FunArray { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { write!(f, "{:?}", self.as_vec()) } } -impl core::ops::Index for FunArray { +impl core::ops::Index for FunArray { type Output = T; - fn index(&self, index: u64) -> &Self::Output { + fn index(&self, index: u32) -> &Self::Output { self.get(index) } } diff --git a/testable-simd-models/src/abstractions/simd.rs b/testable-simd-models/src/abstractions/simd.rs index 08b1b21bce34d..9ccd1ec86e8e9 100644 --- a/testable-simd-models/src/abstractions/simd.rs +++ b/testable-simd-models/src/abstractions/simd.rs @@ -20,7 +20,7 @@ macro_rules! interpretations { #[doc = concat!("Conversion from ", stringify!($ty), " vectors of size ", stringify!($m), "to bit vectors of size ", stringify!($n))] pub fn [< from_ $name >](iv: $name) -> BitVec<$n> { let vec: Vec<$ty> = iv.as_vec(); - Self::from_slice(&vec[..], <$ty>::bits() as u64) + Self::from_slice(&vec[..], <$ty>::bits() as u32) } #[doc = concat!("Conversion from bit vectors of size ", stringify!($n), " to ", stringify!($ty), " vectors of size ", stringify!($m))] pub fn [< to_ $name >](bv: BitVec<$n>) -> $name { @@ -72,7 +72,7 @@ interpretations!(32; i8x4 [i8; 4], u8x4 [u8; 4]); /// /// `idx` must be in-bounds of the vector, ie. 
idx < N -pub fn simd_insert(x: FunArray, idx: u64, val: T) -> FunArray { +pub fn simd_insert(x: FunArray, idx: u32, val: T) -> FunArray { FunArray::from_fn(|i| if i == idx { val } else { x[i] }) } @@ -81,49 +81,49 @@ pub fn simd_insert(x: FunArray, idx: u64, val: T) - /// # Safety /// /// `idx` must be in-bounds of the vector, ie. idx < N -pub fn simd_extract(x: FunArray, idx: u64) -> T { +pub fn simd_extract(x: FunArray, idx: u32) -> T { x.get(idx).clone() } /// Adds two vectors elementwise with wrapping on overflow/underflow. -pub fn simd_add( +pub fn simd_add( x: FunArray, y: FunArray, ) -> FunArray { - FunArray::from_fn(|i| (x[i].wrapping_add(y[i]))) + FunArray::from_fn(|i| x[i].wrapping_add(y[i])) } /// Subtracts `rhs` from `lhs` elementwise with wrapping on overflow/underflow. -pub fn simd_sub( +pub fn simd_sub( x: FunArray, y: FunArray, ) -> FunArray { - FunArray::from_fn(|i| (x[i].wrapping_sub(y[i]))) + FunArray::from_fn(|i| x[i].wrapping_sub(y[i])) } /// Multiplies two vectors elementwise with wrapping on overflow/underflow. -pub fn simd_mul( +pub fn simd_mul( x: FunArray, y: FunArray, ) -> FunArray { - FunArray::from_fn(|i| (x[i].overflowing_mul(y[i]))) + FunArray::from_fn(|i| x[i].overflowing_mul(y[i])) } /// Produces the elementwise absolute values. /// For vectors of unsigned integers it returns the vector untouched. /// If the element is the minimum value of a signed integer, it returns the element as is. -pub fn simd_abs(x: FunArray) -> FunArray { +pub fn simd_abs(x: FunArray) -> FunArray { FunArray::from_fn(|i| x[i].absolute_val()) } /// Produces the elementwise absolute difference of two vectors. /// Note: Absolute difference in this case is simply the element with the smaller value subtracted from the element with the larger value, with overflow/underflow. /// For example, if the elements are i8, the absolute difference of 255 and -2 is -255. -pub fn simd_abs_diff( +pub fn simd_abs_diff( x: FunArray, y: FunArray, ) -> FunArray { - FunArray::from_fn(|i| (x[i].absolute_diff(y[i]))) + FunArray::from_fn(|i| x[i].absolute_diff(y[i])) } /// Shifts vector left elementwise, with UB on overflow. @@ -131,11 +131,11 @@ pub fn simd_abs_diff( /// # Safety /// /// Each element of `rhs` must be less than `::BITS`. -pub fn simd_shl( +pub fn simd_shl( x: FunArray, y: FunArray, ) -> FunArray::Output> { - FunArray::from_fn(|i| (x[i] << y[i])) + FunArray::from_fn(|i| x[i] << y[i]) } /// Shifts vector right elementwise, with UB on overflow. @@ -146,38 +146,38 @@ pub fn simd_shl( /// /// Each element of `rhs` must be less than `::BITS`. -pub fn simd_shr( +pub fn simd_shr( x: FunArray, y: FunArray, ) -> FunArray::Output> { - FunArray::from_fn(|i| (x[i] >> y[i])) + FunArray::from_fn(|i| x[i] >> y[i]) } /// "Ands" vectors elementwise. -pub fn simd_and( +pub fn simd_and( x: FunArray, y: FunArray, ) -> FunArray::Output> { - FunArray::from_fn(|i| (x[i] & y[i])) + FunArray::from_fn(|i| x[i] & y[i]) } /// "Ors" vectors elementwise. -pub fn simd_or( +pub fn simd_or( x: FunArray, y: FunArray, ) -> FunArray::Output> { - FunArray::from_fn(|i| (x[i] | y[i])) + FunArray::from_fn(|i| x[i] | y[i]) } /// "Exclusive ors" vectors elementwise. -pub fn simd_xor( +pub fn simd_xor( x: FunArray, y: FunArray, ) -> FunArray::Output> { - FunArray::from_fn(|i| (x[i] ^ y[i])) + FunArray::from_fn(|i| x[i] ^ y[i]) } pub trait CastsFrom { @@ -327,7 +327,7 @@ self_impls!(u8, u16, u32, u64, u128, i8, i16, i32, i64, i128); /// /// When casting from a wider number to a smaller number, the higher bits are removed. 
/// Otherwise, it extends the number, following signedness. -pub fn simd_cast>(x: FunArray) -> FunArray { +pub fn simd_cast>(x: FunArray) -> FunArray { FunArray::from_fn(|i| T2::cast(x[i])) } @@ -335,7 +335,7 @@ pub fn simd_cast>(x: FunArray) /// /// Rust panics for `-::Min` due to overflow, but here, it just returns the element as is. -pub fn simd_neg::Output> + MachineInteger + Eq + Neg + Copy>( +pub fn simd_neg::Output> + MachineInteger + Eq + Neg + Copy>( x: FunArray, ) -> FunArray { FunArray::from_fn(|i| { @@ -350,7 +350,7 @@ pub fn simd_neg::Output> + MachineInteger + Eq /// /// Returns `0` (all zeros) for false and `!0` (all ones) for true. -pub fn simd_eq( +pub fn simd_eq( x: FunArray, y: FunArray, ) -> FunArray { @@ -361,7 +361,7 @@ pub fn simd_eq( /// /// Returns `0` (all zeros) for false and `!0` (all ones) for true. -pub fn simd_ne( +pub fn simd_ne( x: FunArray, y: FunArray, ) -> FunArray { @@ -372,7 +372,7 @@ pub fn simd_ne( /// /// Returns `0` (all zeros) for false and `!0` (all ones) for true. -pub fn simd_lt( +pub fn simd_lt( x: FunArray, y: FunArray, ) -> FunArray { @@ -383,7 +383,7 @@ pub fn simd_lt( /// /// Returns `0` (all zeros) for false and `!0` (all ones) for true. -pub fn simd_le( +pub fn simd_le( x: FunArray, y: FunArray, ) -> FunArray { @@ -394,7 +394,7 @@ pub fn simd_le( /// /// Returns `0` (all zeros) for false and `!0` (all ones) for true. -pub fn simd_gt( +pub fn simd_gt( x: FunArray, y: FunArray, ) -> FunArray { @@ -405,7 +405,7 @@ pub fn simd_gt( /// /// Returns `0` (all zeros) for false and `!0` (all ones) for true. -pub fn simd_ge( +pub fn simd_ge( x: FunArray, y: FunArray, ) -> FunArray { @@ -415,10 +415,10 @@ pub fn simd_ge( /// Shuffles two vectors by the indices in idx. /// /// For safety, `N2 <= N1 + N3` must hold. -pub fn simd_shuffle( +pub fn simd_shuffle( x: FunArray, y: FunArray, - idx: [u64; N2], + idx: [u32; N2], ) -> FunArray { FunArray::from_fn(|i| { let i = idx[i as usize]; @@ -432,7 +432,7 @@ pub fn simd_shuffle( /// Adds two vectors elementwise, with saturation. -pub fn simd_saturating_add( +pub fn simd_saturating_add( x: FunArray, y: FunArray, ) -> FunArray { @@ -441,7 +441,7 @@ pub fn simd_saturating_add( /// Subtracts `y` from `x` elementwise, with saturation. -pub fn simd_saturating_sub( +pub fn simd_saturating_sub( x: FunArray, y: FunArray, ) -> FunArray { @@ -923,7 +923,7 @@ pub(crate) use simd_bitmask_big; /// # Safety /// `mask` must only contain `0` and `!0`. 
-pub fn simd_select( +pub fn simd_select( mask: FunArray, if_true: FunArray, if_false: FunArray, diff --git a/testable-simd-models/src/core_arch/x86/models/avx.rs b/testable-simd-models/src/core_arch/x86/models/avx.rs index f392a7abf05b0..aaf7a0d2649c5 100644 --- a/testable-simd-models/src/core_arch/x86/models/avx.rs +++ b/testable-simd-models/src/core_arch/x86/models/avx.rs @@ -169,7 +169,7 @@ pub fn _mm256_insertf128_si256(a: __m256i, b: __m128i) -> __m25 pub fn _mm256_insert_epi8(a: __m256i, i: i8) -> __m256i { // // static_assert_uimm_bits!(INDEX, 5); - simd_insert(BitVec::to_i8x32(a), INDEX as u64, i).into() + simd_insert(BitVec::to_i8x32(a), INDEX as u32, i).into() } /// Copies `a` to result, and inserts the 16-bit integer `i` into result @@ -181,7 +181,7 @@ pub fn _mm256_insert_epi8(a: __m256i, i: i8) -> __m256i { pub fn _mm256_insert_epi16(a: __m256i, i: i16) -> __m256i { // // static_assert_uimm_bits!(INDEX, 4); - simd_insert(BitVec::to_i16x16(a), INDEX as u64, i).into() + simd_insert(BitVec::to_i16x16(a), INDEX as u32, i).into() } /// Computes the bitwise AND of 256 bits (representing integer data) in `a` and diff --git a/testable-simd-models/src/core_arch/x86/models/avx2.rs b/testable-simd-models/src/core_arch/x86/models/avx2.rs index 05173b19a8c58..c6ada5a406766 100644 --- a/testable-simd-models/src/core_arch/x86/models/avx2.rs +++ b/testable-simd-models/src/core_arch/x86/models/avx2.rs @@ -318,10 +318,10 @@ mod c_extern { } pub fn psllw(a: i16x16, count: i16x8) -> i16x16 { - let count4: u64 = (count[0] as u16) as u64; - let count3: u64 = ((count[1] as u16) as u64) * 65536; - let count2: u64 = ((count[2] as u16) as u64) * 4294967296; - let count1: u64 = ((count[3] as u16) as u64) * 281474976710656; + let count4 = (count[0] as u16) as u64; + let count3 = ((count[1] as u16) as u64) * 65536; + let count2 = ((count[2] as u16) as u64) * 4294967296; + let count1 = ((count[3] as u16) as u64) * 281474976710656; let count = count1 + count2 + count3 + count4; i16x16::from_fn(|i| { if count > 15 { @@ -333,7 +333,7 @@ mod c_extern { } pub fn pslld(a: i32x8, count: i32x4) -> i32x8 { - let count: u64 = ((count[1] as u32) as u64) * 4294967296 + ((count[0] as u32) as u64); + let count = (count[1] as u64) * 4294967296 + (count[0] as u64); i32x8::from_fn(|i| { if count > 31 { @@ -344,13 +344,13 @@ mod c_extern { }) } pub fn psllq(a: i64x4, count: i64x2) -> i64x4 { - let count: u64 = count[0] as u64; + let count = count[0] as u32; i64x4::from_fn(|i| { if count > 63 { 0 } else { - ((a[i] as u64) << count) as i64 + ((a[i] as u32) << count) as i64 } }) } @@ -379,7 +379,7 @@ mod c_extern { if count[i] > 63 || count[i] < 0 { 0 } else { - ((a[i] as u64) << count[i]) as i64 + ((a[i] as u32) << count[i]) as i64 } }) } @@ -388,13 +388,13 @@ mod c_extern { if count[i] > 63 || count[i] < 0 { 0 } else { - ((a[i] as u64) << count[i]) as i64 + ((a[i] as u32) << count[i]) as i64 } }) } pub fn psraw(a: i16x16, count: i16x8) -> i16x16 { - let count: u64 = ((count[3] as u16) as u64) * 281474976710656 + let count = ((count[3] as u16) as u64) * 281474976710656 + ((count[2] as u16) as u64) * 4294967296 + ((count[1] as u16) as u64) * 65536 + ((count[0] as u16) as u64); @@ -413,7 +413,7 @@ mod c_extern { } pub fn psrad(a: i32x8, count: i32x4) -> i32x8 { - let count: u64 = ((count[1] as u32) as u64) * 4294967296 + ((count[0] as u32) as u64); + let count = ((count[1] as u32) as u64) * 4294967296 + ((count[0] as u32) as u64); i32x8::from_fn(|i| { if count > 31 { @@ -458,7 +458,7 @@ mod c_extern { } pub fn psrlw(a: i16x16, 
count: i16x8) -> i16x16 { - let count: u64 = (count[3] as u16 as u64) * 281474976710656 + let count = (count[3] as u16 as u64) * 281474976710656 + (count[2] as u16 as u64) * 4294967296 + (count[1] as u16 as u64) * 65536 + (count[0] as u16 as u64); @@ -473,7 +473,7 @@ mod c_extern { } pub fn psrld(a: i32x8, count: i32x4) -> i32x8 { - let count: u64 = (count[1] as u32 as u64) * 4294967296 + (count[0] as u32 as u64); + let count = (count[1] as u64) * 4294967296 + (count[0] as u64); i32x8::from_fn(|i| { if count > 31 { @@ -491,7 +491,7 @@ mod c_extern { if count > 63 { 0 } else { - ((a[i] as u64) >> count) as i64 + ((a[i] as u32) >> count) as i64 } }) } @@ -520,7 +520,7 @@ mod c_extern { if count[i] > 63 || count[i] < 0 { 0 } else { - ((a[i] as u64) >> count[i]) as i64 + ((a[i] as u32) >> count[i]) as i64 } }) } @@ -529,7 +529,7 @@ mod c_extern { if count[i] > 63 || count[i] < 0 { 0 } else { - ((a[i] as u64) >> count[i]) as i64 + ((a[i] as u32) >> count[i]) as i64 } }) } @@ -540,14 +540,14 @@ mod c_extern { if b[i] > 127 { 0 } else { - let index: u64 = (b[i] % 16) as u64; + let index = (b[i] % 16) as u32; a[index] } } else { if b[i] > 127 { 0 } else { - let index: u64 = (b[i] % 16) as u64; + let index = (b[i] % 16) as u32; a[index + 16] } } @@ -557,15 +557,15 @@ mod c_extern { pub fn permd(a: u32x8, b: u32x8) -> u32x8 { u32x8::from_fn(|i| { let id = b[i] % 8; - a[id as u64] + a[id] }) } pub fn mpsadbw(a: u8x32, b: u8x32, imm8: i32) -> u16x16 { u16x16::from_fn(|i| { if i < 8 { - let a_offset = (((imm8 & 4) >> 2) * 4) as u32 as u64; - let b_offset = ((imm8 & 3) * 4) as u32 as u64; + let a_offset = (((imm8 & 4) >> 2) * 4) as u32; + let b_offset = ((imm8 & 3) * 4) as u32; let k = a_offset + i; let l = b_offset; ((a[k].absolute_diff(b[l]) as i8) as u8 as u16) @@ -575,8 +575,8 @@ mod c_extern { } else { let i = i - 8; let imm8 = imm8 >> 3; - let a_offset = (((imm8 & 4) >> 2) * 4) as u32 as u64; - let b_offset = ((imm8 & 3) * 4) as u32 as u64; + let a_offset = (((imm8 & 4) >> 2) * 4) as u32; + let b_offset = ((imm8 & 3) * 4) as u32; let k = a_offset + i; let l = b_offset; ((a[16 + k].absolute_diff(b[16 + l]) as i8) as u8 as u16) @@ -589,10 +589,10 @@ mod c_extern { pub fn vperm2i128(a: i64x4, b: i64x4, imm8: i8) -> i64x4 { let a = i128x2::from_fn(|i| { - ((a[2 * i] as u64 as u128) + ((a[2 * i + 1] as u64 as u128) << 64)) as i128 + ((a[2 * i] as u32 as u128) + ((a[2 * i + 1] as u32 as u128) << 64)) as i128 }); let b = i128x2::from_fn(|i| { - ((b[2 * i] as u64 as u128) + ((b[2 * i + 1] as u64 as u128) << 64)) as i128 + ((b[2 * i] as u32 as u128) + ((b[2 * i + 1] as u32 as u128) << 64)) as i128 }); let imm8 = imm8 as u8 as u32 as i32; let r = i128x2::from_fn(|i| { @@ -646,7 +646,7 @@ use c_extern::*; use super::avx::*; use super::types::*; -use crate::abstractions::simd::*; + /// Computes the absolute values of packed 32-bit integers in `a`. 
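As an aside on the magic constants in the shift helpers above (65536, 4294967296, and 281474976710656 are 2^16, 2^32, and 2^48), the following standalone sketch restates how `psllw`/`psraw`/`psrlw` reassemble the low four 16-bit lanes of `count` into one 64-bit shift count. The helper name is illustrative and is not part of the model.

```rust
// Illustrative helper (not in the patch): rebuild the 64-bit shift count from
// the four low 16-bit lanes, with count[0] as the least significant 16 bits.
fn reassemble_count(count: [u16; 4]) -> u64 {
    (count[0] as u64)
        | ((count[1] as u64) << 16) // * 65536
        | ((count[2] as u64) << 32) // * 4294967296
        | ((count[3] as u64) << 48) // * 281474976710656
}

fn main() {
    assert_eq!(reassemble_count([3, 0, 0, 0]), 3);
    assert_eq!(reassemble_count([0, 1, 0, 0]), 1 << 16);
}
```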
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi32) @@ -1041,7 +1041,7 @@ pub fn _mm256_blendv_epi8(a: __m256i, b: __m256i, mask: __m256i) -> __m256i { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastb_epi8) pub fn _mm_broadcastb_epi8(a: __m128i) -> __m128i { - let ret = simd_shuffle(BitVec::to_i8x16(a), i8x16::from_fn(|_| 0), [0_u64; 16]); + let ret = simd_shuffle(BitVec::to_i8x16(a), i8x16::from_fn(|_| 0), [0_u32; 16]); ret.into() } @@ -1050,7 +1050,7 @@ pub fn _mm_broadcastb_epi8(a: __m128i) -> __m128i { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastb_epi8) pub fn _mm256_broadcastb_epi8(a: __m128i) -> __m256i { - let ret = simd_shuffle(BitVec::to_i8x16(a), i8x16::from_fn(|_| 0), [0_u64; 32]); + let ret = simd_shuffle(BitVec::to_i8x16(a), i8x16::from_fn(|_| 0), [0_u32; 32]); ret.into() } @@ -1062,7 +1062,7 @@ pub fn _mm256_broadcastb_epi8(a: __m128i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastd_epi32) pub fn _mm_broadcastd_epi32(a: __m128i) -> __m128i { - let ret = simd_shuffle(BitVec::to_i32x4(a), i32x4::from_fn(|_| 0), [0_u64; 4]); + let ret = simd_shuffle(BitVec::to_i32x4(a), i32x4::from_fn(|_| 0), [0_u32; 4]); ret.into() } @@ -1074,7 +1074,7 @@ pub fn _mm_broadcastd_epi32(a: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastd_epi32) pub fn _mm256_broadcastd_epi32(a: __m128i) -> __m256i { - let ret = simd_shuffle(BitVec::to_i32x4(a), i32x4::from_fn(|_| 0), [0_u64; 8]); + let ret = simd_shuffle(BitVec::to_i32x4(a), i32x4::from_fn(|_| 0), [0_u32; 8]); ret.into() } @@ -1087,7 +1087,7 @@ pub fn _mm256_broadcastd_epi32(a: __m128i) -> __m256i { // See https://github.com/rust-lang/stdarch/issues/791 pub fn _mm_broadcastq_epi64(a: __m128i) -> __m128i { - let ret = simd_shuffle(BitVec::to_i64x2(a), BitVec::to_i64x2(a), [0_u64; 2]); + let ret = simd_shuffle(BitVec::to_i64x2(a), BitVec::to_i64x2(a), [0_u32; 2]); ret.into() } @@ -1097,7 +1097,7 @@ pub fn _mm_broadcastq_epi64(a: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastq_epi64) pub fn _mm256_broadcastq_epi64(a: __m128i) -> __m256i { - let ret = simd_shuffle(BitVec::to_i64x2(a), BitVec::to_i64x2(a), [0_u64; 4]); + let ret = simd_shuffle(BitVec::to_i64x2(a), BitVec::to_i64x2(a), [0_u32; 4]); ret.into() } @@ -1129,7 +1129,7 @@ pub fn _mm256_broadcastsi128_si256(a: __m128i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastw_epi16) pub fn _mm_broadcastw_epi16(a: __m128i) -> __m128i { - let ret = simd_shuffle(BitVec::to_i16x8(a), i16x8::from_fn(|_| 0), [0_u64; 8]); + let ret = simd_shuffle(BitVec::to_i16x8(a), i16x8::from_fn(|_| 0), [0_u32; 8]); ret.into() } @@ -1139,7 +1139,7 @@ pub fn _mm_broadcastw_epi16(a: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastw_epi16) pub fn _mm256_broadcastw_epi16(a: __m128i) -> __m256i { - let ret = simd_shuffle(BitVec::to_i16x8(a), i16x8::from_fn(|_| 0), [0_u64; 16]); + let ret = simd_shuffle(BitVec::to_i16x8(a), i16x8::from_fn(|_| 0), 
[0_u32; 16]); ret.into() } @@ -1720,10 +1720,10 @@ pub fn _mm256_permute4x64_epi64(a: __m256i) -> __m256i { BitVec::to_i64x4(a), zero, [ - IMM8 as u64 & 0b11, - (IMM8 as u64 >> 2) & 0b11, - (IMM8 as u64 >> 4) & 0b11, - (IMM8 as u64 >> 6) & 0b11, + IMM8 as u32 & 0b11, + (IMM8 as u32 >> 2) & 0b11, + (IMM8 as u32 >> 4) & 0b11, + (IMM8 as u32 >> 6) & 0b11, ], ); r.into() @@ -1790,14 +1790,14 @@ pub fn _mm256_shuffle_epi32(a: __m256i) -> __m256i { BitVec::to_i32x8(a), BitVec::to_i32x8(a), [ - MASK as u64 & 0b11, - (MASK as u64 >> 2) & 0b11, - (MASK as u64 >> 4) & 0b11, - (MASK as u64 >> 6) & 0b11, - (MASK as u64 & 0b11) + 4, - ((MASK as u64 >> 2) & 0b11) + 4, - ((MASK as u64 >> 4) & 0b11) + 4, - ((MASK as u64 >> 6) & 0b11) + 4, + MASK as u32 & 0b11, + (MASK as u32 >> 2) & 0b11, + (MASK as u32 >> 4) & 0b11, + (MASK as u32 >> 6) & 0b11, + (MASK as u32 & 0b11) + 4, + ((MASK as u32 >> 2) & 0b11) + 4, + ((MASK as u32 >> 4) & 0b11) + 4, + ((MASK as u32 >> 6) & 0b11) + 4, ], ); r.into() @@ -1819,18 +1819,18 @@ pub fn _mm256_shufflehi_epi16(a: __m256i) -> __m256i { 1, 2, 3, - 4 + (IMM8 as u64 & 0b11), - 4 + ((IMM8 as u64 >> 2) & 0b11), - 4 + ((IMM8 as u64 >> 4) & 0b11), - 4 + ((IMM8 as u64 >> 6) & 0b11), + 4 + (IMM8 as u32 & 0b11), + 4 + ((IMM8 as u32 >> 2) & 0b11), + 4 + ((IMM8 as u32 >> 4) & 0b11), + 4 + ((IMM8 as u32 >> 6) & 0b11), 8, 9, 10, 11, - 12 + (IMM8 as u64 & 0b11), - 12 + ((IMM8 as u64 >> 2) & 0b11), - 12 + ((IMM8 as u64 >> 4) & 0b11), - 12 + ((IMM8 as u64 >> 6) & 0b11), + 12 + (IMM8 as u32 & 0b11), + 12 + ((IMM8 as u32 >> 2) & 0b11), + 12 + ((IMM8 as u32 >> 4) & 0b11), + 12 + ((IMM8 as u32 >> 6) & 0b11), ], ); r.into() @@ -1848,18 +1848,18 @@ pub fn _mm256_shufflelo_epi16(a: __m256i) -> __m256i { a, a, [ - 0 + (IMM8 as u64 & 0b11), - 0 + ((IMM8 as u64 >> 2) & 0b11), - 0 + ((IMM8 as u64 >> 4) & 0b11), - 0 + ((IMM8 as u64 >> 6) & 0b11), + 0 + (IMM8 as u32 & 0b11), + 0 + ((IMM8 as u32 >> 2) & 0b11), + 0 + ((IMM8 as u32 >> 4) & 0b11), + 0 + ((IMM8 as u32 >> 6) & 0b11), 4, 5, 6, 7, - 8 + (IMM8 as u64 & 0b11), - 8 + ((IMM8 as u64 >> 2) & 0b11), - 8 + ((IMM8 as u64 >> 4) & 0b11), - 8 + ((IMM8 as u64 >> 6) & 0b11), + 8 + (IMM8 as u32 & 0b11), + 8 + ((IMM8 as u32 >> 2) & 0b11), + 8 + ((IMM8 as u32 >> 4) & 0b11), + 8 + ((IMM8 as u32 >> 6) & 0b11), 12, 13, 14, @@ -1991,38 +1991,38 @@ pub fn _mm256_bslli_epi128(a: __m256i) -> __m256i { i8x32::from_fn(|_| 0), a, [ - mask(IMM8, 0) as u64, - mask(IMM8, 1) as u64, - mask(IMM8, 2) as u64, - mask(IMM8, 3) as u64, - mask(IMM8, 4) as u64, - mask(IMM8, 5) as u64, - mask(IMM8, 6) as u64, - mask(IMM8, 7) as u64, - mask(IMM8, 8) as u64, - mask(IMM8, 9) as u64, - mask(IMM8, 10) as u64, - mask(IMM8, 11) as u64, - mask(IMM8, 12) as u64, - mask(IMM8, 13) as u64, - mask(IMM8, 14) as u64, - mask(IMM8, 15) as u64, - mask(IMM8, 16) as u64, - mask(IMM8, 17) as u64, - mask(IMM8, 18) as u64, - mask(IMM8, 19) as u64, - mask(IMM8, 20) as u64, - mask(IMM8, 21) as u64, - mask(IMM8, 22) as u64, - mask(IMM8, 23) as u64, - mask(IMM8, 24) as u64, - mask(IMM8, 25) as u64, - mask(IMM8, 26) as u64, - mask(IMM8, 27) as u64, - mask(IMM8, 28) as u64, - mask(IMM8, 29) as u64, - mask(IMM8, 30) as u64, - mask(IMM8, 31) as u64, + mask(IMM8, 0) as u32, + mask(IMM8, 1) as u32, + mask(IMM8, 2) as u32, + mask(IMM8, 3) as u32, + mask(IMM8, 4) as u32, + mask(IMM8, 5) as u32, + mask(IMM8, 6) as u32, + mask(IMM8, 7) as u32, + mask(IMM8, 8) as u32, + mask(IMM8, 9) as u32, + mask(IMM8, 10) as u32, + mask(IMM8, 11) as u32, + mask(IMM8, 12) as u32, + mask(IMM8, 13) as u32, + mask(IMM8, 14) as u32, + 
mask(IMM8, 15) as u32, + mask(IMM8, 16) as u32, + mask(IMM8, 17) as u32, + mask(IMM8, 18) as u32, + mask(IMM8, 19) as u32, + mask(IMM8, 20) as u32, + mask(IMM8, 21) as u32, + mask(IMM8, 22) as u32, + mask(IMM8, 23) as u32, + mask(IMM8, 24) as u32, + mask(IMM8, 25) as u32, + mask(IMM8, 26) as u32, + mask(IMM8, 27) as u32, + mask(IMM8, 28) as u32, + mask(IMM8, 29) as u32, + mask(IMM8, 30) as u32, + mask(IMM8, 31) as u32, ], ); r.into() @@ -2135,12 +2135,12 @@ pub fn _mm256_srli_si256(a: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_bsrli_epi128) pub fn _mm256_bsrli_epi128(a: __m256i) -> __m256i { - const fn mask(shift: i32, i: u32) -> u64 { + const fn mask(shift: i32, i: u32) -> u32 { let shift = shift as u32 & 0xff; if shift > 15 || (15 - (i % 16)) < shift { - 0 as u64 + 0 as u32 } else { - (32 + (i + shift)) as u64 + (32 + (i + shift)) as u32 } } @@ -2476,7 +2476,7 @@ pub fn _mm256_xor_si256(a: __m256i, b: __m256i) -> __m256i { // This intrinsic has no corresponding instruction. pub fn _mm256_extract_epi8(a: __m256i) -> i32 { - simd_extract(BitVec::to_u8x32(a), INDEX as u64) as u32 as i32 + simd_extract(BitVec::to_u8x32(a), INDEX as u32) as u32 as i32 } /// Extracts a 16-bit integer from `a`, selected with `INDEX`. Returns a 32-bit @@ -2489,5 +2489,5 @@ pub fn _mm256_extract_epi8(a: __m256i) -> i32 { // This intrinsic has no corresponding instruction. pub fn _mm256_extract_epi16(a: __m256i) -> i32 { - simd_extract(BitVec::to_u16x16(a), INDEX as u64) as u32 as i32 + simd_extract(BitVec::to_u16x16(a), INDEX as u32) as u32 as i32 } diff --git a/testable-simd-models/src/core_arch/x86/models/sse2.rs b/testable-simd-models/src/core_arch/x86/models/sse2.rs index ed57f03cfd5d8..ae6a1b0b7f6d5 100644 --- a/testable-simd-models/src/core_arch/x86/models/sse2.rs +++ b/testable-simd-models/src/core_arch/x86/models/sse2.rs @@ -491,12 +491,12 @@ pub fn _mm_slli_si128(a: __m128i) -> __m128i { /// `_mm_slli_si128` intrinsic into a compile-time constant. fn _mm_slli_si128_impl(a: __m128i) -> __m128i { - const fn mask(shift: i32, i: u32) -> u64 { + const fn mask(shift: i32, i: u32) -> u32 { let shift = shift as u32 & 0xff; if shift > 15 { - i as u64 + i as u32 } else { - (16 - shift + i) as u64 + (16 - shift + i) as u32 } } (simd_shuffle( @@ -662,11 +662,11 @@ pub fn _mm_srli_si128(a: __m128i) -> __m128i { /// `_mm_srli_si128` intrinsic into a compile-time constant. fn _mm_srli_si128_impl(a: __m128i) -> __m128i { - const fn mask(shift: i32, i: u32) -> u64 { + const fn mask(shift: i32, i: u32) -> u32 { if (shift as u32) > 15 { - (i + 16) as u64 + (i + 16) as u32 } else { - (i + (shift as u32)) as u64 + (i + (shift as u32)) as u32 } } let x: i8x16 = simd_shuffle( @@ -1105,7 +1105,7 @@ pub fn _mm_packus_epi16(a: __m128i, b: __m128i) -> __m128i { pub fn _mm_extract_epi16(a: __m128i) -> i32 { // static_assert_uimm_bits!(IMM8, 3); - simd_extract(BitVec::to_u16x8(a), IMM8 as u64) as i32 + simd_extract(BitVec::to_u16x8(a), IMM8 as u32) as i32 } /// Returns a new vector where the `imm8` element of `a` is replaced with `i`. @@ -1114,7 +1114,7 @@ pub fn _mm_extract_epi16(a: __m128i) -> i32 { pub fn _mm_insert_epi16(a: __m128i, i: i32) -> __m128i { // static_assert_uimm_bits!(IMM8, 3); - simd_insert(BitVec::to_i16x8(a), IMM8 as u64, i as i16).into() + simd_insert(BitVec::to_i16x8(a), IMM8 as u32, i as i16).into() } /// Returns a mask of the most significant bit of each element in `a`. 
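For readers following the shuffle-mask const fns above (`_mm_slli_si128_impl` / `_mm_srli_si128_impl`), here is a standalone sketch of the index convention: indices 0..15 select bytes of the first shuffle operand and indices 16..31 select bytes of the second, all-zero operand. The helper name is illustrative and the sketch assumes that operand ordering.

```rust
// Illustrative restatement (not in the patch) of the byte-shift index logic:
// result lane i reads shuffle index i + shift, and any index >= 16 lands in
// the all-zero second operand, producing the shifted-in zero bytes.
const fn srli_index(shift: u32, i: u32) -> u32 {
    if shift > 15 { i + 16 } else { i + shift }
}

fn main() {
    let idx: [u32; 16] = core::array::from_fn(|i| srli_index(4, i as u32));
    assert_eq!(idx[0], 4);   // lane 0 reads byte 4 of `a`
    assert_eq!(idx[12], 16); // lanes 12..=15 read the zero operand
    assert_eq!(idx[15], 19);
}
```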
@@ -1140,10 +1140,10 @@ pub fn _mm_shuffle_epi32(a: __m128i) -> __m128i { a, a, [ - IMM8 as u64 & 0b11, - (IMM8 as u64 >> 2) & 0b11, - (IMM8 as u64 >> 4) & 0b11, - (IMM8 as u64 >> 6) & 0b11, + IMM8 as u32 & 0b11, + (IMM8 as u32 >> 2) & 0b11, + (IMM8 as u32 >> 4) & 0b11, + (IMM8 as u32 >> 6) & 0b11, ], ); x.into() @@ -1169,10 +1169,10 @@ pub fn _mm_shufflehi_epi16(a: __m128i) -> __m128i { 1, 2, 3, - (IMM8 as u64 & 0b11) + 4, - ((IMM8 as u64 >> 2) & 0b11) + 4, - ((IMM8 as u64 >> 4) & 0b11) + 4, - ((IMM8 as u64 >> 6) & 0b11) + 4, + (IMM8 as u32 & 0b11) + 4, + ((IMM8 as u32 >> 2) & 0b11) + 4, + ((IMM8 as u32 >> 4) & 0b11) + 4, + ((IMM8 as u32 >> 6) & 0b11) + 4, ], ); x.into() @@ -1194,10 +1194,10 @@ pub fn _mm_shufflelo_epi16(a: __m128i) -> __m128i { a, a, [ - IMM8 as u64 & 0b11, - (IMM8 as u64 >> 2) & 0b11, - (IMM8 as u64 >> 4) & 0b11, - (IMM8 as u64 >> 6) & 0b11, + IMM8 as u32 & 0b11, + (IMM8 as u32 >> 2) & 0b11, + (IMM8 as u32 >> 4) & 0b11, + (IMM8 as u32 >> 6) & 0b11, 4, 5, 6, diff --git a/testable-simd-models/src/core_arch/x86/models/ssse3.rs b/testable-simd-models/src/core_arch/x86/models/ssse3.rs index 8d0488430756c..a4375f56a0672 100644 --- a/testable-simd-models/src/core_arch/x86/models/ssse3.rs +++ b/testable-simd-models/src/core_arch/x86/models/ssse3.rs @@ -7,7 +7,7 @@ use super::types::*; mod c_extern { use crate::abstractions::simd::*; pub fn pshufb128(a: u8x16, b: u8x16) -> u8x16 { - u8x16::from_fn(|i| if b[i] > 127 { 0 } else { a[(b[i] % 16) as u64] }) + u8x16::from_fn(|i| if b[i] > 127 { 0 } else { a[(b[i] % 16) as u32] }) } pub fn phaddw128(a: i16x8, b: i16x8) -> i16x8 { @@ -220,7 +220,7 @@ pub fn _mm_alignr_epi8(a: __m128i, b: __m128i) -> __m128i { } else { (a, b) }; - const fn mask(shift: u64, i: u64) -> u64 { + const fn mask(shift: u32, i: u32) -> u32 { if shift > 32 { // Unused, but needs to be a valid index. 
i @@ -235,22 +235,26 @@ pub fn _mm_alignr_epi8(a: __m128i, b: __m128i) -> __m128i { BitVec::to_i8x16(b), BitVec::to_i8x16(a), [ - mask(IMM8 as u64, 0), - mask(IMM8 as u64, 1), - mask(IMM8 as u64, 2), - mask(IMM8 as u64, 3), - mask(IMM8 as u64, 4), - mask(IMM8 as u64, 5), - mask(IMM8 as u64, 6), - mask(IMM8 as u64, 7), - mask(IMM8 as u64, 8), - mask(IMM8 as u64, 9), - mask(IMM8 as u64, 10), - mask(IMM8 as u64, 11), - mask(IMM8 as u64, 12), - mask(IMM8 as u64, 13), - mask(IMM8 as u64, 14), - mask(IMM8 as u64, 15), + mask(IMM8 as u32, 0), + mask(IMM8 as u32, 0), + mask(IMM8 as u32, 0), + mask(IMM8 as u32, 0), + mask(IMM8 as u32, 0), + mask(IMM8 as u32, 1), + mask(IMM8 as u32, 2), + mask(IMM8 as u32, 3), + mask(IMM8 as u32, 4), + mask(IMM8 as u32, 5), + mask(IMM8 as u32, 6), + mask(IMM8 as u32, 7), + mask(IMM8 as u32, 8), + mask(IMM8 as u32, 9), + mask(IMM8 as u32, 10), + mask(IMM8 as u32, 11), + mask(IMM8 as u32, 12), + mask(IMM8 as u32, 13), + mask(IMM8 as u32, 14), + mask(IMM8 as u32, 15), ], ); r.into() diff --git a/testable-simd-models/src/helpers.rs b/testable-simd-models/src/helpers.rs index 6c5e84e2a8dbd..c2ea42b4b7ed6 100644 --- a/testable-simd-models/src/helpers.rs +++ b/testable-simd-models/src/helpers.rs @@ -38,13 +38,13 @@ pub mod test { crate::abstractions::bit::Bit::from(bool::random()) } } - impl HasRandom for BitVec { + impl HasRandom for BitVec { fn random() -> Self { Self::from_fn(|_| Bit::random()) } } - impl HasRandom for FunArray { + impl HasRandom for FunArray { fn random() -> Self { FunArray::from_fn(|_| T::random()) } diff --git a/testable-simd-models/test.sh b/testable-simd-models/test.sh deleted file mode 100755 index 8f521735122c3..0000000000000 --- a/testable-simd-models/test.sh +++ /dev/null @@ -1,2 +0,0 @@ -cross test --target aarch64-unknown-linux-gnu -cross test --target x86_64-unknown-linux-gnu From 370303f8d1649effb99dd4b501a5d29ea2f2a7f5 Mon Sep 17 00:00:00 2001 From: karthikbhargavan Date: Sat, 26 Jul 2025 06:19:41 -0400 Subject: [PATCH 02/47] edited avx2 --- library/Cargo.lock | 90 +++++ .../src/abstractions/bitvec.rs | 7 + .../src/abstractions/funarr.rs | 9 + testable-simd-models/src/abstractions/simd.rs | 11 +- .../src/core_arch/x86/models/avx.rs | 78 ++-- .../src/core_arch/x86/models/avx2.rs | 368 +++++++++--------- 6 files changed, 325 insertions(+), 238 deletions(-) diff --git a/library/Cargo.lock b/library/Cargo.lock index c681c5935df5f..aa756e9e6aa94 100644 --- a/library/Cargo.lock +++ b/library/Cargo.lock @@ -28,6 +28,7 @@ version = "0.0.0" dependencies = [ "compiler_builtins", "core", + "safety", ] [[package]] @@ -67,6 +68,9 @@ dependencies = [ [[package]] name = "core" version = "0.0.0" +dependencies = [ + "safety", +] [[package]] name = "coretests" @@ -201,6 +205,39 @@ dependencies = [ "unwind", ] +[[package]] +name = "proc-macro-error" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" +dependencies = [ + "proc-macro-error-attr", + "proc-macro2", + "quote", + "syn 1.0.109", + "version_check", +] + +[[package]] +name = "proc-macro-error-attr" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" +dependencies = [ + "proc-macro2", + "quote", + "version_check", +] + +[[package]] +name = "proc-macro2" +version = "1.0.95" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778" +dependencies = [ + "unicode-ident", +] + [[package]] name = "proc_macro" version = "0.0.0" @@ -217,6 +254,15 @@ dependencies = [ "cc", ] +[[package]] +name = "quote" +version = "1.0.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" +dependencies = [ + "proc-macro2", +] + [[package]] name = "r-efi" version = "5.3.0" @@ -301,6 +347,16 @@ dependencies = [ "std", ] +[[package]] +name = "safety" +version = "0.1.0" +dependencies = [ + "proc-macro-error", + "proc-macro2", + "quote", + "syn 2.0.104", +] + [[package]] name = "shlex" version = "1.3.0" @@ -330,6 +386,7 @@ dependencies = [ "rand", "rand_xorshift", "rustc-demangle", + "safety", "std_detect", "unwind", "wasi", @@ -346,6 +403,27 @@ dependencies = [ "rustc-std-workspace-core", ] +[[package]] +name = "syn" +version = "1.0.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "unicode-ident", +] + +[[package]] +name = "syn" +version = "2.0.104" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "17b6f705963418cdb9927482fa304bc562ece2fdd4f616084c50b7023b435a40" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + [[package]] name = "sysroot" version = "0.0.0" @@ -366,6 +444,12 @@ dependencies = [ "std", ] +[[package]] +name = "unicode-ident" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" + [[package]] name = "unicode-width" version = "0.2.1" @@ -398,6 +482,12 @@ dependencies = [ "rustc-std-workspace-core", ] +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + [[package]] name = "wasi" version = "0.11.1+wasi-snapshot-preview1" diff --git a/testable-simd-models/src/abstractions/bitvec.rs b/testable-simd-models/src/abstractions/bitvec.rs index 02a32df4e152e..952c5a21cb1a6 100644 --- a/testable-simd-models/src/abstractions/bitvec.rs +++ b/testable-simd-models/src/abstractions/bitvec.rs @@ -17,6 +17,13 @@ use std::fmt::Formatter; #[derive(Copy, Clone, Eq, PartialEq)] pub struct BitVec(FunArray); +impl BitVec { + #[allow(non_snake_case)] + pub fn ZERO() -> Self { + Self::from_fn(|_| Bit::Zero) + } +} + /// Pretty prints a bit slice by group of 8 fn bit_slice_to_string(bits: &[Bit]) -> String { bits.iter() diff --git a/testable-simd-models/src/abstractions/funarr.rs b/testable-simd-models/src/abstractions/funarr.rs index 429db9d1123f1..a57d6d2fd58d2 100644 --- a/testable-simd-models/src/abstractions/funarr.rs +++ b/testable-simd-models/src/abstractions/funarr.rs @@ -1,6 +1,8 @@ //! This module implements a fixed-size array wrapper with functional semantics //! which are used in formulating abstractions. +use crate::abstractions::bit::MachineInteger; + /// `FunArray` represents an array of `T` values of length `N`, where `N` is a compile-time constant. /// Internally, it uses a fixed-length array of `Option` with a maximum capacity of 512 elements. /// Unused elements beyond `N` are filled with `None`. 
@@ -53,6 +55,13 @@ impl FunArray { } } +impl FunArray { + #[allow(non_snake_case)] + pub fn ZERO() -> Self { + Self::from_fn(|_| T::ZEROS) + } +} + impl TryFrom> for FunArray { type Error = (); fn try_from(v: Vec) -> Result { diff --git a/testable-simd-models/src/abstractions/simd.rs b/testable-simd-models/src/abstractions/simd.rs index 9ccd1ec86e8e9..b0193748b554d 100644 --- a/testable-simd-models/src/abstractions/simd.rs +++ b/testable-simd-models/src/abstractions/simd.rs @@ -25,6 +25,11 @@ macro_rules! interpretations { #[doc = concat!("Conversion from bit vectors of size ", stringify!($n), " to ", stringify!($ty), " vectors of size ", stringify!($m))] pub fn [< to_ $name >](bv: BitVec<$n>) -> $name { let vec: Vec<$ty> = bv.to_vec(); + $name::from_fn(|i| vec[i as usize]) + } + #[doc = concat!("Conversion from bit vectors of size ", stringify!($n), " to ", stringify!($ty), " vectors of size ", stringify!($m))] + pub fn [< as_ $name >](self) -> $name { + let vec: Vec<$ty> = self.to_vec(); $name::from_fn(|i| vec[i as usize]) } @@ -71,7 +76,6 @@ interpretations!(32; i8x4 [i8; 4], u8x4 [u8; 4]); /// # Safety /// /// `idx` must be in-bounds of the vector, ie. idx < N - pub fn simd_insert(x: FunArray, idx: u32, val: T) -> FunArray { FunArray::from_fn(|i| if i == idx { val } else { x[i] }) } @@ -936,3 +940,8 @@ pub fn simd_select>(a:T) -> U { + a.into() +} \ No newline at end of file diff --git a/testable-simd-models/src/core_arch/x86/models/avx.rs b/testable-simd-models/src/core_arch/x86/models/avx.rs index aaf7a0d2649c5..50886b5def283 100644 --- a/testable-simd-models/src/core_arch/x86/models/avx.rs +++ b/testable-simd-models/src/core_arch/x86/models/avx.rs @@ -38,110 +38,82 @@ use c_extern::*; /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blendv_ps) pub fn _mm256_blendv_ps(a: __m256, b: __m256, c: __m256) -> __m256 { - let mask: i32x8 = simd_lt(BitVec::to_i32x8(c), i32x8::from_fn(|_| 0)); - BitVec::from_i32x8(simd_select(mask, BitVec::to_i32x8(b), BitVec::to_i32x8(a))) + let mask: i32x8 = simd_lt(c.as_i32x8(), i32x8::ZERO()); + transmute(simd_select(mask, b.as_i32x8(), a.as_i32x8())) } /// Equal (ordered, non-signaling) - pub const _CMP_EQ_OQ: i32 = 0x00; /// Less-than (ordered, signaling) - pub const _CMP_LT_OS: i32 = 0x01; /// Less-than-or-equal (ordered, signaling) - pub const _CMP_LE_OS: i32 = 0x02; /// Unordered (non-signaling) - pub const _CMP_UNORD_Q: i32 = 0x03; /// Not-equal (unordered, non-signaling) - pub const _CMP_NEQ_UQ: i32 = 0x04; /// Not-less-than (unordered, signaling) - pub const _CMP_NLT_US: i32 = 0x05; /// Not-less-than-or-equal (unordered, signaling) - pub const _CMP_NLE_US: i32 = 0x06; /// Ordered (non-signaling) - pub const _CMP_ORD_Q: i32 = 0x07; /// Equal (unordered, non-signaling) - pub const _CMP_EQ_UQ: i32 = 0x08; /// Not-greater-than-or-equal (unordered, signaling) - pub const _CMP_NGE_US: i32 = 0x09; /// Not-greater-than (unordered, signaling) - pub const _CMP_NGT_US: i32 = 0x0a; /// False (ordered, non-signaling) - pub const _CMP_FALSE_OQ: i32 = 0x0b; /// Not-equal (ordered, non-signaling) - pub const _CMP_NEQ_OQ: i32 = 0x0c; /// Greater-than-or-equal (ordered, signaling) - pub const _CMP_GE_OS: i32 = 0x0d; /// Greater-than (ordered, signaling) - pub const _CMP_GT_OS: i32 = 0x0e; /// True (unordered, non-signaling) - pub const _CMP_TRUE_UQ: i32 = 0x0f; /// Equal (ordered, signaling) - pub const _CMP_EQ_OS: i32 = 0x10; /// Less-than (ordered, non-signaling) - pub const _CMP_LT_OQ: i32 = 
0x11; /// Less-than-or-equal (ordered, non-signaling) - pub const _CMP_LE_OQ: i32 = 0x12; /// Unordered (signaling) - pub const _CMP_UNORD_S: i32 = 0x13; /// Not-equal (unordered, signaling) - pub const _CMP_NEQ_US: i32 = 0x14; /// Not-less-than (unordered, non-signaling) - pub const _CMP_NLT_UQ: i32 = 0x15; /// Not-less-than-or-equal (unordered, non-signaling) - pub const _CMP_NLE_UQ: i32 = 0x16; /// Ordered (signaling) - pub const _CMP_ORD_S: i32 = 0x17; /// Equal (unordered, signaling) - pub const _CMP_EQ_US: i32 = 0x18; /// Not-greater-than-or-equal (unordered, non-signaling) - pub const _CMP_NGE_UQ: i32 = 0x19; /// Not-greater-than (unordered, non-signaling) - pub const _CMP_NGT_UQ: i32 = 0x1a; /// False (ordered, signaling) - pub const _CMP_FALSE_OS: i32 = 0x1b; /// Not-equal (ordered, signaling) - pub const _CMP_NEQ_OS: i32 = 0x1c; /// Greater-than-or-equal (ordered, non-signaling) - pub const _CMP_GE_OQ: i32 = 0x1d; /// Greater-than (ordered, non-signaling) - pub const _CMP_GT_OQ: i32 = 0x1e; /// True (unordered, signaling) - pub const _CMP_TRUE_US: i32 = 0x1f; +/// Shuffles 128-bits (composed of integer data) selected by `imm8` +/// from `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_si256) pub fn _mm256_permute2f128_si256(a: __m256i, b: __m256i) -> __m256i { - // // static_assert_uimm_bits!(IMM8, 8); - vperm2f128si256(BitVec::to_i32x8(a), BitVec::to_i32x8(b), IMM8 as i8).into() + // static_assert_uimm_bits!(IMM8, 8); + vperm2f128si256(a.as_i32x8(), b.as_i32x8(), IMM8 as i8).into() } /// Copies `a` to result, then inserts 128 bits from `b` into result @@ -153,8 +125,8 @@ pub fn _mm256_insertf128_si256(a: __m256i, b: __m128i) -> __m25 // // static_assert_uimm_bits!(IMM1, 1); let dst: i64x4 = simd_shuffle( - BitVec::to_i64x4(a), - BitVec::to_i64x4(_mm256_castsi128_si256(b)), + a.as_i64x4(), + _mm256_castsi128_si256(b).as_i64x4(), [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize], ); dst.into() @@ -169,7 +141,7 @@ pub fn _mm256_insertf128_si256(a: __m256i, b: __m128i) -> __m25 pub fn _mm256_insert_epi8(a: __m256i, i: i8) -> __m256i { // // static_assert_uimm_bits!(INDEX, 5); - simd_insert(BitVec::to_i8x32(a), INDEX as u32, i).into() + simd_insert(a.as_i8x32(), INDEX as u32, i).into() } /// Copies `a` to result, and inserts the 16-bit integer `i` into result @@ -181,7 +153,7 @@ pub fn _mm256_insert_epi8(a: __m256i, i: i8) -> __m256i { pub fn _mm256_insert_epi16(a: __m256i, i: i16) -> __m256i { // // static_assert_uimm_bits!(INDEX, 4); - simd_insert(BitVec::to_i16x16(a), INDEX as u32, i).into() + simd_insert(a.as_i16x16(), INDEX as u32, i).into() } /// Computes the bitwise AND of 256 bits (representing integer data) in `a` and @@ -211,7 +183,7 @@ pub fn _mm256_testz_si256(a: __m256i, b: __m256i) -> i32 { pub fn _mm256_movemask_ps(a: __m256) -> i32 { // Propagate the highest bit to the rest, because simd_bitmask // requires all-1 or all-0. - let mask: i32x8 = simd_lt(BitVec::to_i32x8(a), i32x8::from_fn(|_| 0)); + let mask: i32x8 = simd_lt(a.as_i32x8(), i32x8::ZERO()); let r = simd_bitmask_little!(7, mask, u8); r as u32 as i32 } @@ -221,7 +193,7 @@ pub fn _mm256_movemask_ps(a: __m256) -> i32 { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setzero_ps) pub fn _mm256_setzero_ps() -> __m256 { - BitVec::from_fn(|_| Bit::Zero) + __m256::ZERO() } /// Returns vector of type __m256i with all elements set to zero. 
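As a cross-check for the `_mm256_movemask_ps` model above (sign-compare against zero, then `simd_bitmask_little!`), here is a scalar reference sketch of the intended semantics. It is an independent restatement for illustration, not the model itself.

```rust
// Scalar reference (sketch): bit i of the result is the sign bit of 32-bit lane i.
fn movemask_ps_ref(lanes: [i32; 8]) -> i32 {
    let mut r = 0u32;
    for (i, lane) in lanes.iter().enumerate() {
        if *lane < 0 {
            r |= 1 << i;
        }
    }
    r as i32
}

fn main() {
    assert_eq!(movemask_ps_ref([-1, 0, 0, -1, 0, 0, 0, 0]), 0b1001);
}
```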
@@ -229,7 +201,7 @@ pub fn _mm256_setzero_ps() -> __m256 { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setzero_si256) pub fn _mm256_setzero_si256() -> __m256i { - BitVec::from_fn(|_| Bit::Zero) + __m256i::ZERO() } /// Sets packed 8-bit integers in returned vector with the supplied values. @@ -276,7 +248,7 @@ pub fn _mm256_set_epi8( e00, e01, e02, e03, e04, e05, e06, e07, e08, e09, e10, e11, e12, e13, e14, e15, e16, e17, e18, e19, e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31, ]; - BitVec::from_i8x32(i8x32::from_fn(|i| vec[(31 - i) as usize])) + transmute(i8x32::from_fn(|i| vec[(31 - i) as usize])) } /// Sets packed 16-bit integers in returned vector with the supplied values. @@ -306,7 +278,7 @@ pub fn _mm256_set_epi16( let vec = [ e00, e01, e02, e03, e04, e05, e06, e07, e08, e09, e10, e11, e12, e13, e14, e15, ]; - BitVec::from_i16x16(i16x16::from_fn(|i| vec[(15 - i) as usize])) + transmute(i16x16::from_fn(|i| vec[(15 - i) as usize])) } /// Sets packed 32-bit integers in returned vector with the supplied values. @@ -326,7 +298,7 @@ pub fn _mm256_set_epi32( e7: i32, ) -> __m256i { let vec = [e0, e1, e2, e3, e4, e5, e6, e7]; - BitVec::from_i32x8(i32x8::from_fn(|i| vec[(7 - i) as usize])) + transmute(i32x8::from_fn(|i| vec[(7 - i) as usize])) } /// Sets packed 64-bit integers in returned vector with the supplied values. @@ -335,7 +307,7 @@ pub fn _mm256_set_epi32( // This intrinsic has no corresponding instruction. pub fn _mm256_set_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i { let vec = [d, c, b, a]; - BitVec::from_i64x4(i64x4::from_fn(|i| vec[i as usize])) + transmute(i64x4::from_fn(|i| vec[i as usize])) } /// Broadcasts 8-bit integer `a` to all elements of returned vector. @@ -348,7 +320,7 @@ pub fn _mm256_set_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i { // This intrinsic has no corresponding instruction. pub fn _mm256_set1_epi8(val: i8) -> BitVec<256> { - BitVec::from_i8x32(i8x32::from_fn(|_| val)) + transmute(i8x32::from_fn(|_| val)) } /// Broadcasts 16-bit integer `a` to all elements of returned vector. @@ -361,7 +333,7 @@ pub fn _mm256_set1_epi8(val: i8) -> BitVec<256> { // This intrinsic has no corresponding instruction. pub fn _mm256_set1_epi16(a: i16) -> __m256i { - BitVec::from_i16x16(i16x16::from_fn(|_| a)) + transmute(i16x16::from_fn(|_| a)) } /// Broadcasts 32-bit integer `a` to all elements of returned vector. @@ -372,7 +344,7 @@ pub fn _mm256_set1_epi16(a: i16) -> __m256i { // This intrinsic has no corresponding instruction. pub fn _mm256_set1_epi32(a: i32) -> __m256i { - BitVec::from_i32x8(i32x8::from_fn(|_| a)) + transmute(i32x8::from_fn(|_| a)) } /// Broadcasts 64-bit integer `a` to all elements of returned vector. @@ -381,7 +353,7 @@ pub fn _mm256_set1_epi32(a: i32) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi64x) // This intrinsic has no corresponding instruction. pub fn _mm256_set1_epi64x(a: i64) -> __m256i { - BitVec::from_i64x4(i64x4::from_fn(|_| a)) + transmute(i64x4::from_fn(|_| a)) } pub fn _mm256_castps_si256(a: __m256) -> __m256i { @@ -417,10 +389,10 @@ pub fn _mm256_castsi256_si128(a: __m256i) -> __m128i { // instructions, thus it has zero latency. 
pub fn _mm256_castsi128_si256(a: __m128i) -> __m256i { - let a = BitVec::to_i64x2(a); + let a = a.as_i64x2(); let undefined = i64x2::from_fn(|_| 0); let dst: i64x4 = simd_shuffle(a, undefined, [0, 1, 2, 2]); - BitVec::from_i64x4(dst) + transmute(dst) } /// Sets packed __m256i returned vector with the supplied values. diff --git a/testable-simd-models/src/core_arch/x86/models/avx2.rs b/testable-simd-models/src/core_arch/x86/models/avx2.rs index c6ada5a406766..0565630149381 100644 --- a/testable-simd-models/src/core_arch/x86/models/avx2.rs +++ b/testable-simd-models/src/core_arch/x86/models/avx2.rs @@ -652,9 +652,9 @@ use super::types::*; /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi32) pub fn _mm256_abs_epi32(a: __m256i) -> __m256i { - let a = BitVec::to_i32x8(a); - let r = simd_select(simd_lt(a, i32x8::from_fn(|_| 0)), simd_neg(a), a); - BitVec::from_i32x8(r) + let a = a.as_i32x8(); + let r = simd_select(simd_lt(a, i32x8::ZERO()), simd_neg(a), a); + transmute(r) } /// Computes the absolute values of packed 16-bit integers in `a`. @@ -662,9 +662,9 @@ pub fn _mm256_abs_epi32(a: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi16) pub fn _mm256_abs_epi16(a: __m256i) -> __m256i { - let a = BitVec::to_i16x16(a); - let r = simd_select(simd_lt(a, i16x16::from_fn(|_| 0)), simd_neg(a), a); - BitVec::from_i16x16(r) + let a = a.as_i16x16(); + let r = simd_select(simd_lt(a, i16x16::ZERO()), simd_neg(a), a); + transmute(r) } /// Computes the absolute values of packed 8-bit integers in `a`. @@ -672,9 +672,9 @@ pub fn _mm256_abs_epi16(a: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi8) pub fn _mm256_abs_epi8(a: __m256i) -> __m256i { - let a = BitVec::to_i8x32(a); - let r = simd_select(simd_lt(a, i8x32::from_fn(|_| 0)), simd_neg(a), a); - BitVec::from_i8x32(r) + let a = a.as_i8x32(); + let r = simd_select(simd_lt(a, i8x32::ZERO()), simd_neg(a), a); + transmute(r) } /// Adds packed 64-bit integers in `a` and `b`. @@ -682,7 +682,7 @@ pub fn _mm256_abs_epi8(a: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi64) pub fn _mm256_add_epi64(a: __m256i, b: __m256i) -> __m256i { - BitVec::from_i64x4(simd_add(BitVec::to_i64x4(a), BitVec::to_i64x4(b))) + transmute(simd_add(a.as_i64x4(), b.as_i64x4())) } /// Adds packed 32-bit integers in `a` and `b`. @@ -690,7 +690,7 @@ pub fn _mm256_add_epi64(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi32) pub fn _mm256_add_epi32(a: __m256i, b: __m256i) -> __m256i { - BitVec::from_i32x8(simd_add(BitVec::to_i32x8(a), BitVec::to_i32x8(b))) + transmute(simd_add(a.as_i32x8(), b.as_i32x8())) } /// Adds packed 16-bit integers in `a` and `b`. @@ -698,7 +698,7 @@ pub fn _mm256_add_epi32(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi16) pub fn _mm256_add_epi16(a: __m256i, b: __m256i) -> __m256i { - BitVec::from_i16x16(simd_add(BitVec::to_i16x16(a), BitVec::to_i16x16(b))) + transmute(simd_add(a.as_i16x16(), b.as_i16x16())) } /// Adds packed 8-bit integers in `a` and `b`. 
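One subtlety worth a scalar sketch: the `_mm256_abs_epi*` models above combine `simd_lt`, `simd_neg`, and `simd_select`, and because the `simd_neg` model wraps instead of panicking, the minimum value of each lane comes back unchanged. The reference below restates that per-lane behaviour and is not part of the patch.

```rust
// Per-lane reference (sketch) for the abs models: equivalent to wrapping_abs,
// so i32::MIN maps to itself rather than overflowing.
fn abs_epi32_ref(x: i32) -> i32 {
    x.wrapping_abs()
}

fn main() {
    assert_eq!(abs_epi32_ref(-5), 5);
    assert_eq!(abs_epi32_ref(i32::MIN), i32::MIN);
}
```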
@@ -706,7 +706,7 @@ pub fn _mm256_add_epi16(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi8) pub fn _mm256_add_epi8(a: __m256i, b: __m256i) -> __m256i { - BitVec::from_i8x32(simd_add(BitVec::to_i8x32(a), BitVec::to_i8x32(b))) + transmute(simd_add(a.as_i8x32(), b.as_i8x32())) } /// Adds packed 8-bit integers in `a` and `b` using saturation. @@ -714,9 +714,9 @@ pub fn _mm256_add_epi8(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epi8) pub fn _mm256_adds_epi8(a: __m256i, b: __m256i) -> __m256i { - BitVec::from_i8x32(simd_saturating_add( - BitVec::to_i8x32(a), - BitVec::to_i8x32(b), + transmute(simd_saturating_add( + a.as_i8x32(), + b.as_i8x32(), )) } @@ -725,9 +725,9 @@ pub fn _mm256_adds_epi8(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epi16) pub fn _mm256_adds_epi16(a: __m256i, b: __m256i) -> __m256i { - BitVec::from_i16x16(simd_saturating_add( - BitVec::to_i16x16(a), - BitVec::to_i16x16(b), + transmute(simd_saturating_add( + a.as_i16x16(), + b.as_i16x16(), )) } @@ -736,7 +736,7 @@ pub fn _mm256_adds_epi16(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epu8) pub fn _mm256_adds_epu8(a: __m256i, b: __m256i) -> __m256i { - simd_saturating_add(BitVec::to_u8x32(a), BitVec::to_u8x32(b)).into() + simd_saturating_add(a.as_u8x32(), b.as_u8x32()).into() } /// Adds packed unsigned 16-bit integers in `a` and `b` using saturation. @@ -744,7 +744,7 @@ pub fn _mm256_adds_epu8(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epu16) pub fn _mm256_adds_epu16(a: __m256i, b: __m256i) -> __m256i { - simd_saturating_add(BitVec::to_u16x16(a), BitVec::to_u16x16(b)).into() + simd_saturating_add(a.as_u16x16(), b.as_u16x16()).into() } /// Concatenates pairs of 16-byte blocks in `a` and `b` into a 32-byte temporary @@ -766,8 +766,8 @@ pub fn _mm256_alignr_epi8(a: __m256i, b: __m256i) -> __m256i { (a, b) }; - let a = BitVec::to_i8x32(a); - let b = BitVec::to_i8x32(b); + let a = a.as_i8x32(); + let b = b.as_i8x32(); if IMM8 == 16 { return a.into(); @@ -913,7 +913,7 @@ pub fn _mm256_alignr_epi8(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_and_si256) pub fn _mm256_and_si256(a: __m256i, b: __m256i) -> __m256i { - simd_and(BitVec::to_i64x4(a), BitVec::to_i64x4(b)).into() + simd_and(a.as_i64x4(), b.as_i64x4()).into() } /// Computes the bitwise NOT of 256 bits (representing integer data) @@ -924,8 +924,8 @@ pub fn _mm256_and_si256(a: __m256i, b: __m256i) -> __m256i { pub fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i { let all_ones = _mm256_set1_epi8(-1); simd_and( - simd_xor(BitVec::to_i64x4(a), BitVec::to_i64x4(all_ones)), - BitVec::to_i64x4(b), + simd_xor(a.as_i64x4(),all_ones.as_i64x4()), + b.as_i64x4(), ) .into() } @@ -935,8 +935,8 @@ pub fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_avg_epu16) pub fn _mm256_avg_epu16(a: __m256i, b: __m256i) -> 
__m256i { - let a = simd_cast::<16, _, u32>(BitVec::to_u16x16(a)); - let b = simd_cast::<16, _, u32>(BitVec::to_u16x16(b)); + let a = simd_cast::<16, _, u32>(a.as_u16x16()); + let b = simd_cast::<16, _, u32>(b.as_u16x16()); let r = simd_shr(simd_add(simd_add(a, b), u32x16::splat(1)), u32x16::splat(1)); simd_cast::<16, _, u16>(r).into() } @@ -946,8 +946,8 @@ pub fn _mm256_avg_epu16(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_avg_epu8) pub fn _mm256_avg_epu8(a: __m256i, b: __m256i) -> __m256i { - let a = simd_cast::<32, _, u16>(BitVec::to_u8x32(a)); - let b = simd_cast::<32, _, u16>(BitVec::to_u8x32(b)); + let a = simd_cast::<32, _, u16>(a.as_u8x32()); + let b = simd_cast::<32, _, u16>(b.as_u8x32()); let r = simd_shr(simd_add(simd_add(a, b), u16x32::splat(1)), u16x32::splat(1)); simd_cast::<32, _, u8>(r).into() } @@ -957,8 +957,8 @@ pub fn _mm256_avg_epu8(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi32) pub fn _mm_blend_epi32(a: __m128i, b: __m128i) -> __m128i { - let a = BitVec::to_i32x4(a); - let b = BitVec::to_i32x4(b); + let a = a.as_i32x4(); + let b = b.as_i32x4(); let r: i32x4 = simd_shuffle( a, b, @@ -977,8 +977,8 @@ pub fn _mm_blend_epi32(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_epi32) pub fn _mm256_blend_epi32(a: __m256i, b: __m256i) -> __m256i { - let a = BitVec::to_i32x8(a); - let b = BitVec::to_i32x8(b); + let a = a.as_i32x8(); + let b = b.as_i32x8(); let r: i32x8 = simd_shuffle( a, b, @@ -1000,8 +1000,8 @@ pub fn _mm256_blend_epi32(a: __m256i, b: __m256i) -> __m256i { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_epi16) pub fn _mm256_blend_epi16(a: __m256i, b: __m256i) -> __m256i { - let a = BitVec::to_i16x16(a); - let b = BitVec::to_i16x16(b); + let a = a.as_i16x16(); + let b = b.as_i16x16(); let r: i16x16 = simd_shuffle( a, @@ -1032,8 +1032,8 @@ pub fn _mm256_blend_epi16(a: __m256i, b: __m256i) -> __m256i { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blendv_epi8) pub fn _mm256_blendv_epi8(a: __m256i, b: __m256i, mask: __m256i) -> __m256i { - let mask: i8x32 = simd_lt(BitVec::to_i8x32(mask), i8x32::from_fn(|_| 0)); - simd_select(mask, BitVec::to_i8x32(b), BitVec::to_i8x32(a)).into() + let mask: i8x32 = simd_lt(mask.as_i8x32(), i8x32::ZERO()); + simd_select(mask, b.as_i8x32(), a.as_i8x32()).into() } /// Broadcasts the low packed 8-bit integer from `a` to all elements of @@ -1041,7 +1041,7 @@ pub fn _mm256_blendv_epi8(a: __m256i, b: __m256i, mask: __m256i) -> __m256i { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastb_epi8) pub fn _mm_broadcastb_epi8(a: __m128i) -> __m128i { - let ret = simd_shuffle(BitVec::to_i8x16(a), i8x16::from_fn(|_| 0), [0_u32; 16]); + let ret = simd_shuffle(a.as_i8x16(), i8x16::ZERO(), [0_u32; 16]); ret.into() } @@ -1050,7 +1050,7 @@ pub fn _mm_broadcastb_epi8(a: __m128i) -> __m128i { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastb_epi8) pub fn _mm256_broadcastb_epi8(a: __m128i) -> __m256i { - let ret = 
simd_shuffle(BitVec::to_i8x16(a), i8x16::from_fn(|_| 0), [0_u32; 32]); + let ret = simd_shuffle(a.as_i8x16(), i8x16::ZERO(), [0_u32; 32]); ret.into() } @@ -1062,7 +1062,7 @@ pub fn _mm256_broadcastb_epi8(a: __m128i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastd_epi32) pub fn _mm_broadcastd_epi32(a: __m128i) -> __m128i { - let ret = simd_shuffle(BitVec::to_i32x4(a), i32x4::from_fn(|_| 0), [0_u32; 4]); + let ret = simd_shuffle(a.as_i32x4(), i32x4::ZERO(), [0_u32; 4]); ret.into() } @@ -1074,7 +1074,7 @@ pub fn _mm_broadcastd_epi32(a: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastd_epi32) pub fn _mm256_broadcastd_epi32(a: __m128i) -> __m256i { - let ret = simd_shuffle(BitVec::to_i32x4(a), i32x4::from_fn(|_| 0), [0_u32; 8]); + let ret = simd_shuffle(a.as_i32x4(), i32x4::ZERO(), [0_u32; 8]); ret.into() } @@ -1087,7 +1087,7 @@ pub fn _mm256_broadcastd_epi32(a: __m128i) -> __m256i { // See https://github.com/rust-lang/stdarch/issues/791 pub fn _mm_broadcastq_epi64(a: __m128i) -> __m128i { - let ret = simd_shuffle(BitVec::to_i64x2(a), BitVec::to_i64x2(a), [0_u32; 2]); + let ret = simd_shuffle(a.as_i64x2(), a.as_i64x2(), [0_u32; 2]); ret.into() } @@ -1097,7 +1097,7 @@ pub fn _mm_broadcastq_epi64(a: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastq_epi64) pub fn _mm256_broadcastq_epi64(a: __m128i) -> __m256i { - let ret = simd_shuffle(BitVec::to_i64x2(a), BitVec::to_i64x2(a), [0_u32; 4]); + let ret = simd_shuffle(a.as_i64x2(), a.as_i64x2(), [0_u32; 4]); ret.into() } @@ -1107,7 +1107,7 @@ pub fn _mm256_broadcastq_epi64(a: __m128i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastsi128_si256) pub fn _mm_broadcastsi128_si256(a: __m128i) -> __m256i { - let ret = simd_shuffle(BitVec::to_i64x2(a), i64x2::from_fn(|_| 0), [0, 1, 0, 1]); + let ret = simd_shuffle(a.as_i64x2(), i64x2::ZERO(), [0, 1, 0, 1]); ret.into() } @@ -1119,7 +1119,7 @@ pub fn _mm_broadcastsi128_si256(a: __m128i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastsi128_si256) pub fn _mm256_broadcastsi128_si256(a: __m128i) -> __m256i { - let ret = simd_shuffle(BitVec::to_i64x2(a), i64x2::from_fn(|_| 0), [0, 1, 0, 1]); + let ret = simd_shuffle(a.as_i64x2(), i64x2::ZERO(), [0, 1, 0, 1]); ret.into() } @@ -1129,7 +1129,7 @@ pub fn _mm256_broadcastsi128_si256(a: __m128i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastw_epi16) pub fn _mm_broadcastw_epi16(a: __m128i) -> __m128i { - let ret = simd_shuffle(BitVec::to_i16x8(a), i16x8::from_fn(|_| 0), [0_u32; 8]); + let ret = simd_shuffle(a.as_i16x8(), i16x8::ZERO(), [0_u32; 8]); ret.into() } @@ -1139,7 +1139,7 @@ pub fn _mm_broadcastw_epi16(a: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastw_epi16) pub fn _mm256_broadcastw_epi16(a: __m128i) -> __m256i { - let ret = simd_shuffle(BitVec::to_i16x8(a), i16x8::from_fn(|_| 0), [0_u32; 16]); + let ret = simd_shuffle(a.as_i16x8(), i16x8::ZERO(), [0_u32; 16]); ret.into() } @@ -1148,7 +1148,7 @@ pub fn _mm256_broadcastw_epi16(a: __m128i) -> 
__m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi64) pub fn _mm256_cmpeq_epi64(a: __m256i, b: __m256i) -> __m256i { - simd_eq(BitVec::to_i64x4(a), BitVec::to_i64x4(b)).into() + simd_eq(a.as_i64x4(), b.as_i64x4()).into() } /// Compares packed 32-bit integers in `a` and `b` for equality. @@ -1156,7 +1156,7 @@ pub fn _mm256_cmpeq_epi64(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi32) pub fn _mm256_cmpeq_epi32(a: __m256i, b: __m256i) -> __m256i { - simd_eq(BitVec::to_i32x8(a), BitVec::to_i32x8(b)).into() + simd_eq(a.as_i32x8(), b.as_i32x8()).into() } /// Compares packed 16-bit integers in `a` and `b` for equality. @@ -1164,7 +1164,7 @@ pub fn _mm256_cmpeq_epi32(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi16) pub fn _mm256_cmpeq_epi16(a: __m256i, b: __m256i) -> __m256i { - simd_eq(BitVec::to_i16x16(a), BitVec::to_i16x16(b)).into() + simd_eq(a.as_i16x16(), b.as_i16x16()).into() } /// Compares packed 8-bit integers in `a` and `b` for equality. @@ -1172,7 +1172,7 @@ pub fn _mm256_cmpeq_epi16(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi8) pub fn _mm256_cmpeq_epi8(a: __m256i, b: __m256i) -> __m256i { - simd_eq(BitVec::to_i8x32(a), BitVec::to_i8x32(b)).into() + simd_eq(a.as_i8x32(), b.as_i8x32()).into() } /// Compares packed 64-bit integers in `a` and `b` for greater-than. @@ -1180,7 +1180,7 @@ pub fn _mm256_cmpeq_epi8(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi64) pub fn _mm256_cmpgt_epi64(a: __m256i, b: __m256i) -> __m256i { - simd_gt(BitVec::to_i64x4(a), BitVec::to_i64x4(b)).into() + simd_gt(a.as_i64x4(), b.as_i64x4()).into() } /// Compares packed 32-bit integers in `a` and `b` for greater-than. @@ -1188,7 +1188,7 @@ pub fn _mm256_cmpgt_epi64(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi32) pub fn _mm256_cmpgt_epi32(a: __m256i, b: __m256i) -> __m256i { - simd_gt(BitVec::to_i32x8(a), BitVec::to_i32x8(b)).into() + simd_gt(a.as_i32x8(), b.as_i32x8()).into() } /// Compares packed 16-bit integers in `a` and `b` for greater-than. @@ -1196,7 +1196,7 @@ pub fn _mm256_cmpgt_epi32(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi16) pub fn _mm256_cmpgt_epi16(a: __m256i, b: __m256i) -> __m256i { - simd_gt(BitVec::to_i16x16(a), BitVec::to_i16x16(b)).into() + simd_gt(a.as_i16x16(), b.as_i16x16()).into() } /// Compares packed 8-bit integers in `a` and `b` for greater-than. @@ -1204,7 +1204,7 @@ pub fn _mm256_cmpgt_epi16(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi8) pub fn _mm256_cmpgt_epi8(a: __m256i, b: __m256i) -> __m256i { - simd_gt(BitVec::to_i8x32(a), BitVec::to_i8x32(b)).into() + simd_gt(a.as_i8x32(), b.as_i8x32()).into() } /// Sign-extend 16-bit integers to 32-bit integers. 
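// The cmpeq/cmpgt models above lean on `simd_eq`/`simd_gt` returning a lane
// mask: every lane is -1 (all bits set) where the predicate holds and 0
// elsewhere. A minimal scalar sketch of that convention, assuming a single
// i32 lane (the helper name is illustrative, not part of the models):
fn cmpgt_lane_sketch(a: i32, b: i32) -> i32 {
    if a > b { -1 } else { 0 }
}
// e.g. cmpgt_lane_sketch(3, 1) == -1 and cmpgt_lane_sketch(1, 3) == 0.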
@@ -1212,7 +1212,7 @@ pub fn _mm256_cmpgt_epi8(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi16_epi32) pub fn _mm256_cvtepi16_epi32(a: __m128i) -> __m256i { - simd_cast::<8, _, i32>(BitVec::to_i16x8(a)).into() + simd_cast::<8, _, i32>(a.as_i16x8()).into() } /// Sign-extend 16-bit integers to 64-bit integers. @@ -1220,7 +1220,7 @@ pub fn _mm256_cvtepi16_epi32(a: __m128i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi16_epi64) pub fn _mm256_cvtepi16_epi64(a: __m128i) -> __m256i { - let a = BitVec::to_i16x8(a); + let a = a.as_i16x8(); let v64: i16x4 = simd_shuffle(a, a, [0, 1, 2, 3]); simd_cast::<4, i16, i64>(v64).into() } @@ -1230,7 +1230,7 @@ pub fn _mm256_cvtepi16_epi64(a: __m128i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi32_epi64) pub fn _mm256_cvtepi32_epi64(a: __m128i) -> __m256i { - simd_cast::<4, i32, i64>(BitVec::to_i32x4(a)).into() + simd_cast::<4, i32, i64>(a.as_i32x4()).into() } /// Sign-extend 8-bit integers to 16-bit integers. @@ -1238,7 +1238,7 @@ pub fn _mm256_cvtepi32_epi64(a: __m128i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi8_epi16) pub fn _mm256_cvtepi8_epi16(a: __m128i) -> __m256i { - simd_cast::<16, i8, i16>(BitVec::to_i8x16(a)).into() + simd_cast::<16, i8, i16>(a.as_i8x16()).into() } /// Sign-extend 8-bit integers to 32-bit integers. @@ -1246,7 +1246,7 @@ pub fn _mm256_cvtepi8_epi16(a: __m128i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi8_epi32) pub fn _mm256_cvtepi8_epi32(a: __m128i) -> __m256i { - let a = BitVec::to_i8x16(a); + let a = a.as_i8x16(); let v64: i8x8 = simd_shuffle(a, a, [0, 1, 2, 3, 4, 5, 6, 7]); simd_cast::<8, i8, i32>(v64).into() } @@ -1255,7 +1255,7 @@ pub fn _mm256_cvtepi8_epi32(a: __m128i) -> __m256i { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi8_epi64) pub fn _mm256_cvtepi8_epi64(a: __m128i) -> __m256i { - let a = BitVec::to_i8x16(a); + let a = a.as_i8x16(); let v32: i8x4 = simd_shuffle(a, a, [0, 1, 2, 3]); simd_cast::<4, i8, i64>(v32).into() } @@ -1266,7 +1266,7 @@ pub fn _mm256_cvtepi8_epi64(a: __m128i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu16_epi32) pub fn _mm256_cvtepu16_epi32(a: __m128i) -> __m256i { - simd_cast::<8, u16, u32>(BitVec::to_u16x8(a)).into() + simd_cast::<8, u16, u32>(a.as_u16x8()).into() } /// Zero-extend the lower four unsigned 16-bit integers in `a` to 64-bit @@ -1275,7 +1275,7 @@ pub fn _mm256_cvtepu16_epi32(a: __m128i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu16_epi64) pub fn _mm256_cvtepu16_epi64(a: __m128i) -> __m256i { - let a = BitVec::to_u16x8(a); + let a = a.as_u16x8(); let v64: u16x4 = simd_shuffle(a, a, [0, 1, 2, 3]); simd_cast::<4, u16, u64>(v64).into() } @@ -1285,7 +1285,7 @@ pub fn _mm256_cvtepu16_epi64(a: __m128i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu32_epi64) pub fn _mm256_cvtepu32_epi64(a: __m128i) -> 
__m256i { - simd_cast::<4, u32, u64>(BitVec::to_u32x4(a)).into() + simd_cast::<4, u32, u64>(a.as_u32x4()).into() } /// Zero-extend unsigned 8-bit integers in `a` to 16-bit integers. @@ -1293,7 +1293,7 @@ pub fn _mm256_cvtepu32_epi64(a: __m128i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu8_epi16) pub fn _mm256_cvtepu8_epi16(a: __m128i) -> __m256i { - simd_cast::<16, u8, u16>(BitVec::to_u8x16(a)).into() + simd_cast::<16, u8, u16>(a.as_u8x16()).into() } /// Zero-extend the lower eight unsigned 8-bit integers in `a` to 32-bit @@ -1302,7 +1302,7 @@ pub fn _mm256_cvtepu8_epi16(a: __m128i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu8_epi32) pub fn _mm256_cvtepu8_epi32(a: __m128i) -> __m256i { - let a = BitVec::to_u8x16(a); + let a = a.as_u8x16(); let v64: u8x8 = simd_shuffle(a, a, [0, 1, 2, 3, 4, 5, 6, 7]); simd_cast::<8, u8, u32>(v64).into() } @@ -1313,7 +1313,7 @@ pub fn _mm256_cvtepu8_epi32(a: __m128i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu8_epi64) pub fn _mm256_cvtepu8_epi64(a: __m128i) -> __m256i { - let a = BitVec::to_u8x16(a); + let a = a.as_u8x16(); let v32: u8x4 = simd_shuffle(a, a, [0, 1, 2, 3]); simd_cast::<4, u8, u64>(v32).into() } @@ -1323,7 +1323,7 @@ pub fn _mm256_cvtepu8_epi64(a: __m128i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extracti128_si256) pub fn _mm256_extracti128_si256(a: __m256i) -> __m128i { - let a = BitVec::to_i64x4(a); + let a = a.as_i64x4(); let b = i64x4::from_fn(|_| 0); let dst: i64x2 = simd_shuffle(a, b, [[0, 1], [2, 3]][IMM1 as usize]); dst.into() @@ -1334,7 +1334,7 @@ pub fn _mm256_extracti128_si256(a: __m256i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_epi16) pub fn _mm256_hadd_epi16(a: __m256i, b: __m256i) -> __m256i { - phaddw(BitVec::to_i16x16(a), BitVec::to_i16x16(b)).into() + phaddw(a.as_i16x16(), b.as_i16x16()).into() } /// Horizontally adds adjacent pairs of 32-bit integers in `a` and `b`. @@ -1342,7 +1342,7 @@ pub fn _mm256_hadd_epi16(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_epi32) pub fn _mm256_hadd_epi32(a: __m256i, b: __m256i) -> __m256i { - phaddd(BitVec::to_i32x8(a), BitVec::to_i32x8(b)).into() + phaddd(a.as_i32x8(), b.as_i32x8()).into() } /// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b` @@ -1351,7 +1351,7 @@ pub fn _mm256_hadd_epi32(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadds_epi16) pub fn _mm256_hadds_epi16(a: __m256i, b: __m256i) -> __m256i { - phaddsw(BitVec::to_i16x16(a), BitVec::to_i16x16(b)).into() + phaddsw(a.as_i16x16(), b.as_i16x16()).into() } /// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`. 
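// The cvtepi*/cvtepu* models above reduce, lane by lane, to Rust's own
// widening casts: `as` from a signed source sign-extends, while routing
// through the unsigned type first zero-extends. A small sketch of that
// distinction on one lane (assertions only illustrate the cast rules):
fn extend_lane_sketch() {
    let x: i8 = -1;
    assert_eq!(x as i32, -1);          // sign-extend, as in the _mm256_cvtepi8_epi32 lanes
    assert_eq!((x as u8) as u32, 255); // zero-extend, as in the _mm256_cvtepu8_epi32 lanes
}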
@@ -1359,7 +1359,7 @@ pub fn _mm256_hadds_epi16(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_epi16) pub fn _mm256_hsub_epi16(a: __m256i, b: __m256i) -> __m256i { - phsubw(BitVec::to_i16x16(a), BitVec::to_i16x16(b)).into() + phsubw(a.as_i16x16(), b.as_i16x16()).into() } /// Horizontally subtract adjacent pairs of 32-bit integers in `a` and `b`. @@ -1367,7 +1367,7 @@ pub fn _mm256_hsub_epi16(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_epi32) pub fn _mm256_hsub_epi32(a: __m256i, b: __m256i) -> __m256i { - phsubd(BitVec::to_i32x8(a), BitVec::to_i32x8(b)).into() + phsubd(a.as_i32x8(), b.as_i32x8()).into() } /// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b` @@ -1376,7 +1376,7 @@ pub fn _mm256_hsub_epi32(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsubs_epi16) pub fn _mm256_hsubs_epi16(a: __m256i, b: __m256i) -> __m256i { - phsubsw(BitVec::to_i16x16(a), BitVec::to_i16x16(b)).into() + phsubsw(a.as_i16x16(), b.as_i16x16()).into() } /// Copies `a` to `dst`, then insert 128 bits (of integer data) from `b` at the @@ -1385,8 +1385,8 @@ pub fn _mm256_hsubs_epi16(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_inserti128_si256) pub fn _mm256_inserti128_si256(a: __m256i, b: __m128i) -> __m256i { - let a = BitVec::to_i64x4(a); - let b = BitVec::to_i64x4(_mm256_castsi128_si256(b)); + let a = a.as_i64x4(); + let b = _mm256_castsi128_si256(b).as_i64x4(); let dst: i64x4 = simd_shuffle(a, b, [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize]); dst.into() } @@ -1398,7 +1398,7 @@ pub fn _mm256_inserti128_si256(a: __m256i, b: __m128i) -> __m25 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_madd_epi16) pub fn _mm256_madd_epi16(a: __m256i, b: __m256i) -> __m256i { - pmaddwd(BitVec::to_i16x16(a), BitVec::to_i16x16(b)).into() + pmaddwd(a.as_i16x16(), b.as_i16x16()).into() } /// Vertically multiplies each unsigned 8-bit integer from `a` with the @@ -1409,7 +1409,7 @@ pub fn _mm256_madd_epi16(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maddubs_epi16) pub fn _mm256_maddubs_epi16(a: __m256i, b: __m256i) -> __m256i { - pmaddubsw(BitVec::to_u8x32(a), BitVec::to_u8x32(b)).into() + pmaddubsw(a.as_u8x32(), b.as_u8x32()).into() } /// Compares packed 16-bit integers in `a` and `b`, and returns the packed @@ -1418,8 +1418,8 @@ pub fn _mm256_maddubs_epi16(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epi16) pub fn _mm256_max_epi16(a: __m256i, b: __m256i) -> __m256i { - let a = BitVec::to_i16x16(a); - let b = BitVec::to_i16x16(b); + let a = a.as_i16x16(); + let b = b.as_i16x16(); simd_select::<16, i16, _>(simd_gt(a, b), a, b).into() } @@ -1429,8 +1429,8 @@ pub fn _mm256_max_epi16(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epi32) pub fn _mm256_max_epi32(a: __m256i, b: __m256i) -> __m256i { - let a = 
BitVec::to_i32x8(a); - let b = BitVec::to_i32x8(b); + let a = a.as_i32x8(); + let b = b.as_i32x8(); simd_select::<8, i32, _>(simd_gt(a, b), a, b).into() } @@ -1440,8 +1440,8 @@ pub fn _mm256_max_epi32(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epi8) pub fn _mm256_max_epi8(a: __m256i, b: __m256i) -> __m256i { - let a = BitVec::to_i8x32(a); - let b = BitVec::to_i8x32(b); + let a = a.as_i8x32(); + let b = b.as_i8x32(); simd_select::<32, i8, _>(simd_gt(a, b), a, b).into() } @@ -1451,8 +1451,8 @@ pub fn _mm256_max_epi8(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epu16) pub fn _mm256_max_epu16(a: __m256i, b: __m256i) -> __m256i { - let a = BitVec::to_u16x16(a); - let b = BitVec::to_u16x16(b); + let a = a.as_u16x16(); + let b = b.as_u16x16(); simd_select::<16, _, u16>(simd_gt(a, b), a, b).into() } @@ -1462,8 +1462,8 @@ pub fn _mm256_max_epu16(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epu32) pub fn _mm256_max_epu32(a: __m256i, b: __m256i) -> __m256i { - let a = BitVec::to_u32x8(a); - let b = BitVec::to_u32x8(b); + let a = a.as_u32x8(); + let b = b.as_u32x8(); simd_select::<8, _, u32>(simd_gt(a, b), a, b).into() } @@ -1473,8 +1473,8 @@ pub fn _mm256_max_epu32(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epu8) pub fn _mm256_max_epu8(a: __m256i, b: __m256i) -> __m256i { - let a = BitVec::to_u8x32(a); - let b = BitVec::to_u8x32(b); + let a = a.as_u8x32(); + let b = b.as_u8x32(); simd_select::<32, _, u8>(simd_gt(a, b), a, b).into() } @@ -1484,8 +1484,8 @@ pub fn _mm256_max_epu8(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epi16) pub fn _mm256_min_epi16(a: __m256i, b: __m256i) -> __m256i { - let a = BitVec::to_i16x16(a); - let b = BitVec::to_i16x16(b); + let a = a.as_i16x16(); + let b = b.as_i16x16(); simd_select::<16, _, i16>(simd_lt(a, b), a, b).into() } @@ -1495,8 +1495,8 @@ pub fn _mm256_min_epi16(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epi32) pub fn _mm256_min_epi32(a: __m256i, b: __m256i) -> __m256i { - let a = BitVec::to_i32x8(a); - let b = BitVec::to_i32x8(b); + let a = a.as_i32x8(); + let b = b.as_i32x8(); simd_select::<8, i32, _>(simd_lt(a, b), a, b).into() } @@ -1506,8 +1506,8 @@ pub fn _mm256_min_epi32(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epi8) pub fn _mm256_min_epi8(a: __m256i, b: __m256i) -> __m256i { - let a = BitVec::to_i8x32(a); - let b = BitVec::to_i8x32(b); + let a = a.as_i8x32(); + let b = b.as_i8x32(); simd_select::<32, i8, _>(simd_lt(a, b), a, b).into() } @@ -1517,8 +1517,8 @@ pub fn _mm256_min_epi8(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epu16) pub fn _mm256_min_epu16(a: __m256i, b: __m256i) -> __m256i { - let a = BitVec::to_u16x16(a); - let b = BitVec::to_u16x16(b); + let a = a.as_u16x16(); + let b = 
b.as_u16x16(); simd_select::<16, _, u16>(simd_lt(a, b), a, b).into() } @@ -1528,8 +1528,8 @@ pub fn _mm256_min_epu16(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epu32) pub fn _mm256_min_epu32(a: __m256i, b: __m256i) -> __m256i { - let a = BitVec::to_u32x8(a); - let b = BitVec::to_u32x8(b); + let a = a.as_u32x8(); + let b = b.as_u32x8(); simd_select::<8, _, u32>(simd_lt(a, b), a, b).into() } @@ -1539,8 +1539,8 @@ pub fn _mm256_min_epu32(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epu8) pub fn _mm256_min_epu8(a: __m256i, b: __m256i) -> __m256i { - let a = BitVec::to_u8x32(a); - let b = BitVec::to_u8x32(b); + let a = a.as_u8x32(); + let b = b.as_u8x32(); simd_select::<32, _, u8>(simd_lt(a, b), a, b).into() } @@ -1551,7 +1551,7 @@ pub fn _mm256_min_epu8(a: __m256i, b: __m256i) -> __m256i { pub fn _mm256_movemask_epi8(a: __m256i) -> i32 { let z = i8x32::from_fn(|_| 0); - let m: i8x32 = simd_lt(BitVec::to_i8x32(a), z); + let m: i8x32 = simd_lt(a.as_i8x32(), z); let r = simd_bitmask_little!(31, m, u32); r as i32 } @@ -1567,7 +1567,7 @@ pub fn _mm256_movemask_epi8(a: __m256i) -> i32 { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mpsadbw_epu8) pub fn _mm256_mpsadbw_epu8(a: __m256i, b: __m256i) -> __m256i { - mpsadbw(BitVec::to_u8x32(a), BitVec::to_u8x32(b), IMM8).into() + mpsadbw(a.as_u8x32(), b.as_u8x32(), IMM8).into() } /// Multiplies the low 32-bit integers from each packed 64-bit element in @@ -1578,8 +1578,8 @@ pub fn _mm256_mpsadbw_epu8(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_epi32) pub fn _mm256_mul_epi32(a: __m256i, b: __m256i) -> __m256i { - let a = simd_cast::<4, _, i64>(simd_cast::<4, _, i32>(BitVec::to_i64x4(a))); - let b = simd_cast::<4, _, i64>(simd_cast::<4, _, i32>(BitVec::to_i64x4(b))); + let a = simd_cast::<4, _, i64>(simd_cast::<4, _, i32>(a.as_i64x4())); + let b = simd_cast::<4, _, i64>(simd_cast::<4, _, i32>(b.as_i64x4())); simd_mul(a, b).into() } @@ -1591,8 +1591,8 @@ pub fn _mm256_mul_epi32(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_epu32) pub fn _mm256_mul_epu32(a: __m256i, b: __m256i) -> __m256i { - let a = BitVec::to_u64x4(a); - let b = BitVec::to_u64x4(b); + let a = a.as_u64x4(); + let b = b.as_u64x4(); let mask = u64x4::splat(u32::MAX.into()); BitVec::from_u64x4(simd_mul(simd_and(a, mask), simd_and(b, mask))) } @@ -1604,8 +1604,8 @@ pub fn _mm256_mul_epu32(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhi_epi16) pub fn _mm256_mulhi_epi16(a: __m256i, b: __m256i) -> __m256i { - let a = simd_cast::<16, _, i32>(BitVec::to_i16x16(a)); - let b = simd_cast::<16, _, i32>(BitVec::to_i16x16(b)); + let a = simd_cast::<16, _, i32>(a.as_i16x16()); + let b = simd_cast::<16, _, i32>(b.as_i16x16()); let r = simd_shr(simd_mul(a, b), i32x16::splat(16)); simd_cast::<16, i32, i16>(r).into() } @@ -1617,8 +1617,8 @@ pub fn _mm256_mulhi_epi16(a: __m256i, b: __m256i) -> __m256i { /// [Intel's 
documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhi_epu16) pub fn _mm256_mulhi_epu16(a: __m256i, b: __m256i) -> __m256i { - let a = simd_cast::<16, _, u32>(BitVec::to_u16x16(a)); - let b = simd_cast::<16, _, u32>(BitVec::to_u16x16(b)); + let a = simd_cast::<16, _, u32>(a.as_u16x16()); + let b = simd_cast::<16, _, u32>(b.as_u16x16()); let r = simd_shr(simd_mul(a, b), u32x16::splat(16)); simd_cast::<16, u32, u16>(r).into() } @@ -1630,7 +1630,7 @@ pub fn _mm256_mulhi_epu16(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mullo_epi16) pub fn _mm256_mullo_epi16(a: __m256i, b: __m256i) -> __m256i { - simd_mul(BitVec::to_i16x16(a), BitVec::to_i16x16(b)).into() + simd_mul(a.as_i16x16(), b.as_i16x16()).into() } /// Multiplies the packed 32-bit integers in `a` and `b`, producing @@ -1640,7 +1640,7 @@ pub fn _mm256_mullo_epi16(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mullo_epi32) pub fn _mm256_mullo_epi32(a: __m256i, b: __m256i) -> __m256i { - simd_mul(BitVec::to_i32x8(a), BitVec::to_i32x8(b)).into() + simd_mul(a.as_i32x8(), b.as_i32x8()).into() } /// Multiplies packed 16-bit integers in `a` and `b`, producing @@ -1651,7 +1651,7 @@ pub fn _mm256_mullo_epi32(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhrs_epi16) pub fn _mm256_mulhrs_epi16(a: __m256i, b: __m256i) -> __m256i { - pmulhrsw(BitVec::to_i16x16(a), BitVec::to_i16x16(b)).into() + pmulhrsw(a.as_i16x16(), b.as_i16x16()).into() } /// Computes the bitwise OR of 256 bits (representing integer data) in `a` @@ -1660,7 +1660,7 @@ pub fn _mm256_mulhrs_epi16(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_or_si256) pub fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i { - simd_or(BitVec::to_i32x8(a), BitVec::to_i32x8(b)).into() + simd_or(a.as_i32x8(), b.as_i32x8()).into() } /// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers @@ -1669,7 +1669,7 @@ pub fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi16) pub fn _mm256_packs_epi16(a: __m256i, b: __m256i) -> __m256i { - packsswb(BitVec::to_i16x16(a), BitVec::to_i16x16(b)).into() + packsswb(a.as_i16x16(), b.as_i16x16()).into() } /// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers @@ -1678,7 +1678,7 @@ pub fn _mm256_packs_epi16(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi32) pub fn _mm256_packs_epi32(a: __m256i, b: __m256i) -> __m256i { - packssdw(BitVec::to_i32x8(a), BitVec::to_i32x8(b)).into() + packssdw(a.as_i32x8(), b.as_i32x8()).into() } /// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers @@ -1687,7 +1687,7 @@ pub fn _mm256_packs_epi32(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi16) pub fn _mm256_packus_epi16(a: __m256i, b: __m256i) -> __m256i { - packuswb(BitVec::to_i16x16(a), 
BitVec::to_i16x16(b)).into() + packuswb(a.as_i16x16(), b.as_i16x16()).into() } /// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers @@ -1696,7 +1696,7 @@ pub fn _mm256_packus_epi16(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi32) pub fn _mm256_packus_epi32(a: __m256i, b: __m256i) -> __m256i { - packusdw(BitVec::to_i32x8(a), BitVec::to_i32x8(b)).into() + packusdw(a.as_i32x8(), b.as_i32x8()).into() } /// Permutes packed 32-bit integers from `a` according to the content of `b`. @@ -1707,7 +1707,7 @@ pub fn _mm256_packus_epi32(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar8x32_epi32) pub fn _mm256_permutevar8x32_epi32(a: __m256i, b: __m256i) -> __m256i { - permd(BitVec::to_u32x8(a), BitVec::to_u32x8(b)).into() + permd(a.as_u32x8(), b.as_u32x8()).into() } /// Permutes 64-bit integers from `a` using control mask `imm8`. @@ -1717,7 +1717,7 @@ pub fn _mm256_permutevar8x32_epi32(a: __m256i, b: __m256i) -> __m256i { pub fn _mm256_permute4x64_epi64(a: __m256i) -> __m256i { let zero = i64x4::from_fn(|_| 0); let r: i64x4 = simd_shuffle( - BitVec::to_i64x4(a), + a.as_i64x4(), zero, [ IMM8 as u32 & 0b11, @@ -1734,7 +1734,7 @@ pub fn _mm256_permute4x64_epi64(a: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2x128_si256) pub fn _mm256_permute2x128_si256(a: __m256i, b: __m256i) -> __m256i { - vperm2i128(BitVec::to_i64x4(a), BitVec::to_i64x4(b), IMM8 as i8).into() + vperm2i128(a.as_i64x4(), b.as_i64x4(), IMM8 as i8).into() } /// Computes the absolute differences of packed unsigned 8-bit integers in `a` @@ -1745,7 +1745,7 @@ pub fn _mm256_permute2x128_si256(a: __m256i, b: __m256i) -> __m /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sad_epu8) pub fn _mm256_sad_epu8(a: __m256i, b: __m256i) -> __m256i { - psadbw(BitVec::to_u8x32(a), BitVec::to_u8x32(b)).into() + psadbw(a.as_u8x32(), b.as_u8x32()).into() } /// Shuffles bytes from `a` according to the content of `b`. 
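// The packs/packus models above delegate to the `pack*` helpers, whose
// per-lane behaviour is saturation into the narrower type. A scalar sketch of
// the unsigned-saturating case used by _mm256_packus_epi16 (helper name is
// illustrative only):
fn packus_lane_sketch(x: i16) -> u8 {
    x.clamp(0, u8::MAX as i16) as u8
}
// e.g. packus_lane_sketch(-5) == 0 and packus_lane_sketch(300) == 255.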
@@ -1778,7 +1778,7 @@ pub fn _mm256_sad_epu8(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_epi8) pub fn _mm256_shuffle_epi8(a: __m256i, b: __m256i) -> __m256i { - pshufb(BitVec::to_u8x32(a), BitVec::to_u8x32(b)).into() + pshufb(a.as_u8x32(), b.as_u8x32()).into() } /// Shuffles 32-bit integers in 128-bit lanes of `a` using the control in @@ -1787,8 +1787,8 @@ pub fn _mm256_shuffle_epi8(a: __m256i, b: __m256i) -> __m256i { pub fn _mm256_shuffle_epi32(a: __m256i) -> __m256i { let r: i32x8 = simd_shuffle( - BitVec::to_i32x8(a), - BitVec::to_i32x8(a), + a.as_i32x8(), + a.as_i32x8(), [ MASK as u32 & 0b11, (MASK as u32 >> 2) & 0b11, @@ -1810,7 +1810,7 @@ pub fn _mm256_shuffle_epi32(a: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shufflehi_epi16) pub fn _mm256_shufflehi_epi16(a: __m256i) -> __m256i { - let a = BitVec::to_i16x16(a); + let a = a.as_i16x16(); let r: i16x16 = simd_shuffle( a, a, @@ -1843,7 +1843,7 @@ pub fn _mm256_shufflehi_epi16(a: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shufflelo_epi16) pub fn _mm256_shufflelo_epi16(a: __m256i) -> __m256i { - let a = BitVec::to_i16x16(a); + let a = a.as_i16x16(); let r: i16x16 = simd_shuffle( a, a, @@ -1876,7 +1876,7 @@ pub fn _mm256_shufflelo_epi16(a: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sign_epi16) pub fn _mm256_sign_epi16(a: __m256i, b: __m256i) -> __m256i { - psignw(BitVec::to_i16x16(a), BitVec::to_i16x16(b)).into() + psignw(a.as_i16x16(), b.as_i16x16()).into() } /// Negates packed 32-bit integers in `a` when the corresponding signed @@ -1886,7 +1886,7 @@ pub fn _mm256_sign_epi16(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sign_epi32) pub fn _mm256_sign_epi32(a: __m256i, b: __m256i) -> __m256i { - psignd(BitVec::to_i32x8(a), BitVec::to_i32x8(b)).into() + psignd(a.as_i32x8(), b.as_i32x8()).into() } /// Negates packed 8-bit integers in `a` when the corresponding signed @@ -1896,7 +1896,7 @@ pub fn _mm256_sign_epi32(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sign_epi8) pub fn _mm256_sign_epi8(a: __m256i, b: __m256i) -> __m256i { - psignb(BitVec::to_i8x32(a), BitVec::to_i8x32(b)).into() + psignb(a.as_i8x32(), b.as_i8x32()).into() } /// Shifts packed 16-bit integers in `a` left by `count` while @@ -1905,7 +1905,7 @@ pub fn _mm256_sign_epi8(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sll_epi16) pub fn _mm256_sll_epi16(a: __m256i, count: __m128i) -> __m256i { - psllw(BitVec::to_i16x16(a), BitVec::to_i16x8(count)).into() + psllw(a.as_i16x16(), count.as_i16x8()).into() } /// Shifts packed 32-bit integers in `a` left by `count` while @@ -1914,7 +1914,7 @@ pub fn _mm256_sll_epi16(a: __m256i, count: __m128i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sll_epi32) pub fn _mm256_sll_epi32(a: __m256i, count: __m128i) -> __m256i { - pslld(BitVec::to_i32x8(a), 
BitVec::to_i32x4(count)).into() + pslld(a.as_i32x8(), count.as_i32x4()).into() } /// Shifts packed 64-bit integers in `a` left by `count` while @@ -1923,7 +1923,7 @@ pub fn _mm256_sll_epi32(a: __m256i, count: __m128i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sll_epi64) pub fn _mm256_sll_epi64(a: __m256i, count: __m128i) -> __m256i { - psllq(BitVec::to_i64x4(a), BitVec::to_i64x2(count)).into() + psllq(a.as_i64x4(), count.as_i64x2()).into() } /// Shifts packed 16-bit integers in `a` left by `IMM8` while @@ -1935,7 +1935,7 @@ pub fn _mm256_slli_epi16(a: __m256i) -> __m256i { if IMM8 >= 16 { _mm256_setzero_si256() } else { - simd_shl(BitVec::to_u16x16(a), u16x16::splat(IMM8 as u16)).into() + simd_shl(a.as_u16x16(), u16x16::splat(IMM8 as u16)).into() } } @@ -1948,7 +1948,7 @@ pub fn _mm256_slli_epi32(a: __m256i) -> __m256i { if IMM8 >= 32 { _mm256_setzero_si256() } else { - simd_shl(BitVec::to_u32x8(a), u32x8::splat(IMM8 as u32)).into() + simd_shl(a.as_u32x8(), u32x8::splat(IMM8 as u32)).into() } } @@ -1961,7 +1961,7 @@ pub fn _mm256_slli_epi64(a: __m256i) -> __m256i { if IMM8 >= 64 { _mm256_setzero_si256() } else { - simd_shl(BitVec::to_u64x4(a), u64x4::splat(IMM8 as u64)).into() + simd_shl(a.as_u64x4(), u64x4::splat(IMM8 as u64)).into() } } @@ -1986,7 +1986,7 @@ pub fn _mm256_bslli_epi128(a: __m256i) -> __m256i { 32 + (i - shift) } } - let a = BitVec::to_i8x32(a); + let a = a.as_i8x32(); let r: i8x32 = simd_shuffle( i8x32::from_fn(|_| 0), a, @@ -2035,7 +2035,7 @@ pub fn _mm256_bslli_epi128(a: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sllv_epi32) pub fn _mm_sllv_epi32(a: __m128i, count: __m128i) -> __m128i { - psllvd(BitVec::to_i32x4(a), BitVec::to_i32x4(count)).into() + psllvd(a.as_i32x4(), count.as_i32x4()).into() } /// Shifts packed 32-bit integers in `a` left by the amount @@ -2045,7 +2045,7 @@ pub fn _mm_sllv_epi32(a: __m128i, count: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sllv_epi32) pub fn _mm256_sllv_epi32(a: __m256i, count: __m256i) -> __m256i { - psllvd256(BitVec::to_i32x8(a), BitVec::to_i32x8(count)).into() + psllvd256(a.as_i32x8(), count.as_i32x8()).into() } /// Shifts packed 64-bit integers in `a` left by the amount @@ -2055,7 +2055,7 @@ pub fn _mm256_sllv_epi32(a: __m256i, count: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sllv_epi64) pub fn _mm_sllv_epi64(a: __m128i, count: __m128i) -> __m128i { - psllvq(BitVec::to_i64x2(a), BitVec::to_i64x2(count)).into() + psllvq(a.as_i64x2(), count.as_i64x2()).into() } /// Shifts packed 64-bit integers in `a` left by the amount @@ -2065,7 +2065,7 @@ pub fn _mm_sllv_epi64(a: __m128i, count: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sllv_epi64) pub fn _mm256_sllv_epi64(a: __m256i, count: __m256i) -> __m256i { - psllvq256(BitVec::to_i64x4(a), BitVec::to_i64x4(count)).into() + psllvq256(a.as_i64x4(), count.as_i64x4()).into() } /// Shifts packed 16-bit integers in `a` right by `count` while @@ -2074,7 +2074,7 @@ pub fn _mm256_sllv_epi64(a: __m256i, count: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sra_epi16) pub fn 
_mm256_sra_epi16(a: __m256i, count: __m128i) -> __m256i { - psraw(BitVec::to_i16x16(a), BitVec::to_i16x8(count)).into() + psraw(a.as_i16x16(), count.as_i16x8()).into() } /// Shifts packed 32-bit integers in `a` right by `count` while @@ -2083,7 +2083,7 @@ pub fn _mm256_sra_epi16(a: __m256i, count: __m128i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sra_epi32) pub fn _mm256_sra_epi32(a: __m256i, count: __m128i) -> __m256i { - psrad(BitVec::to_i32x8(a), BitVec::to_i32x4(count)).into() + psrad(a.as_i32x8(), count.as_i32x4()).into() } /// Shifts packed 16-bit integers in `a` right by `IMM8` while @@ -2092,7 +2092,7 @@ pub fn _mm256_sra_epi32(a: __m256i, count: __m128i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srai_epi16) pub fn _mm256_srai_epi16(a: __m256i) -> __m256i { - simd_shr(BitVec::to_i16x16(a), i16x16::splat(IMM8.min(15) as i16)).into() + simd_shr(a.as_i16x16(), i16x16::splat(IMM8.min(15) as i16)).into() } /// Shifts packed 32-bit integers in `a` right by `IMM8` while @@ -2101,7 +2101,7 @@ pub fn _mm256_srai_epi16(a: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srai_epi32) pub fn _mm256_srai_epi32(a: __m256i) -> __m256i { - simd_shr(BitVec::to_i32x8(a), i32x8::splat(IMM8.min(31))).into() + simd_shr(a.as_i32x8(), i32x8::splat(IMM8.min(31))).into() } /// Shifts packed 32-bit integers in `a` right by the amount specified by the @@ -2110,7 +2110,7 @@ pub fn _mm256_srai_epi32(a: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srav_epi32) pub fn _mm_srav_epi32(a: __m128i, count: __m128i) -> __m128i { - psravd(BitVec::to_i32x4(a), BitVec::to_i32x4(count)).into() + psravd(a.as_i32x4(), count.as_i32x4()).into() } /// Shifts packed 32-bit integers in `a` right by the amount specified by the @@ -2119,7 +2119,7 @@ pub fn _mm_srav_epi32(a: __m128i, count: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srav_epi32) pub fn _mm256_srav_epi32(a: __m256i, count: __m256i) -> __m256i { - psravd256(BitVec::to_i32x8(a), BitVec::to_i32x8(count)).into() + psravd256(a.as_i32x8(), count.as_i32x8()).into() } /// Shifts 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros. 
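// The `IMM8.min(15)` / `IMM8.min(31)` clamps in the srai models above encode
// the rule that an arithmetic right shift by at least the lane width just
// replicates the sign bit, while also keeping the Rust shift amount in range.
// A scalar sketch of the 32-bit case (name is illustrative):
fn srai_epi32_lane_sketch(x: i32, imm8: u32) -> i32 {
    x >> imm8.min(31)
}
// e.g. srai_epi32_lane_sketch(-8, 40) == -1 and srai_epi32_lane_sketch(8, 40) == 0.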
@@ -2144,7 +2144,7 @@ pub fn _mm256_bsrli_epi128(a: __m256i) -> __m256i { } } - let a = BitVec::to_i8x32(a); + let a = a.as_i8x32(); let r: i8x32 = simd_shuffle( i8x32::from_fn(|_| 0), a, @@ -2193,7 +2193,7 @@ pub fn _mm256_bsrli_epi128(a: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srl_epi16) pub fn _mm256_srl_epi16(a: __m256i, count: __m128i) -> __m256i { - psrlw(BitVec::to_i16x16(a), BitVec::to_i16x8(count)).into() + psrlw(a.as_i16x16(), count.as_i16x8()).into() } /// Shifts packed 32-bit integers in `a` right by `count` while shifting in @@ -2202,7 +2202,7 @@ pub fn _mm256_srl_epi16(a: __m256i, count: __m128i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srl_epi32) pub fn _mm256_srl_epi32(a: __m256i, count: __m128i) -> __m256i { - psrld(BitVec::to_i32x8(a), BitVec::to_i32x4(count)).into() + psrld(a.as_i32x8(), count.as_i32x4()).into() } /// Shifts packed 64-bit integers in `a` right by `count` while shifting in @@ -2211,7 +2211,7 @@ pub fn _mm256_srl_epi32(a: __m256i, count: __m128i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srl_epi64) pub fn _mm256_srl_epi64(a: __m256i, count: __m128i) -> __m256i { - psrlq(BitVec::to_i64x4(a), BitVec::to_i64x2(count)).into() + psrlq(a.as_i64x4(), count.as_i64x2()).into() } /// Shifts packed 16-bit integers in `a` right by `IMM8` while shifting in @@ -2223,7 +2223,7 @@ pub fn _mm256_srli_epi16(a: __m256i) -> __m256i { if IMM8 >= 16 { _mm256_setzero_si256() } else { - simd_shr(BitVec::to_u16x16(a), u16x16::splat(IMM8 as u16)).into() + simd_shr(a.as_u16x16(), u16x16::splat(IMM8 as u16)).into() } } @@ -2236,7 +2236,7 @@ pub fn _mm256_srli_epi32(a: __m256i) -> __m256i { if IMM8 >= 32 { _mm256_setzero_si256() } else { - simd_shr(BitVec::to_u32x8(a), u32x8::splat(IMM8 as u32)).into() + simd_shr(a.as_u32x8(), u32x8::splat(IMM8 as u32)).into() } } @@ -2249,7 +2249,7 @@ pub fn _mm256_srli_epi64(a: __m256i) -> __m256i { if IMM8 >= 64 { _mm256_setzero_si256() } else { - simd_shr(BitVec::to_u64x4(a), u64x4::splat(IMM8 as u64)).into() + simd_shr(a.as_u64x4(), u64x4::splat(IMM8 as u64)).into() } } @@ -2259,7 +2259,7 @@ pub fn _mm256_srli_epi64(a: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srlv_epi32) pub fn _mm_srlv_epi32(a: __m128i, count: __m128i) -> __m128i { - psrlvd(BitVec::to_i32x4(a), BitVec::to_i32x4(count)).into() + psrlvd(a.as_i32x4(), count.as_i32x4()).into() } /// Shifts packed 32-bit integers in `a` right by the amount specified by @@ -2268,7 +2268,7 @@ pub fn _mm_srlv_epi32(a: __m128i, count: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srlv_epi32) pub fn _mm256_srlv_epi32(a: __m256i, count: __m256i) -> __m256i { - psrlvd256(BitVec::to_i32x8(a), BitVec::to_i32x8(count)).into() + psrlvd256(a.as_i32x8(), count.as_i32x8()).into() } /// Shifts packed 64-bit integers in `a` right by the amount specified by @@ -2277,7 +2277,7 @@ pub fn _mm256_srlv_epi32(a: __m256i, count: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srlv_epi64) pub fn _mm_srlv_epi64(a: __m128i, count: __m128i) -> __m128i { - psrlvq(BitVec::to_i64x2(a), 
BitVec::to_i64x2(count)).into() + psrlvq(a.as_i64x2(), count.as_i64x2()).into() } /// Shifts packed 64-bit integers in `a` right by the amount specified by @@ -2286,7 +2286,7 @@ pub fn _mm_srlv_epi64(a: __m128i, count: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srlv_epi64) pub fn _mm256_srlv_epi64(a: __m256i, count: __m256i) -> __m256i { - psrlvq256(BitVec::to_i64x4(a), BitVec::to_i64x4(count)).into() + psrlvq256(a.as_i64x4(), count.as_i64x4()).into() } /// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a` @@ -2294,7 +2294,7 @@ pub fn _mm256_srlv_epi64(a: __m256i, count: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi16) pub fn _mm256_sub_epi16(a: __m256i, b: __m256i) -> __m256i { - simd_sub(BitVec::to_i16x16(a), BitVec::to_i16x16(b)).into() + simd_sub(a.as_i16x16(), b.as_i16x16()).into() } /// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a` @@ -2302,7 +2302,7 @@ pub fn _mm256_sub_epi16(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi32) pub fn _mm256_sub_epi32(a: __m256i, b: __m256i) -> __m256i { - simd_sub(BitVec::to_i32x8(a), BitVec::to_i32x8(b)).into() + simd_sub(a.as_i32x8(), b.as_i32x8()).into() } /// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a` @@ -2310,7 +2310,7 @@ pub fn _mm256_sub_epi32(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi64) pub fn _mm256_sub_epi64(a: __m256i, b: __m256i) -> __m256i { - simd_sub(BitVec::to_i64x4(a), BitVec::to_i64x4(b)).into() + simd_sub(a.as_i64x4(), b.as_i64x4()).into() } /// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a` @@ -2318,7 +2318,7 @@ pub fn _mm256_sub_epi64(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi8) pub fn _mm256_sub_epi8(a: __m256i, b: __m256i) -> __m256i { - simd_sub(BitVec::to_i8x32(a), BitVec::to_i8x32(b)).into() + simd_sub(a.as_i8x32(), b.as_i8x32()).into() } /// Subtract packed 16-bit integers in `b` from packed 16-bit integers in @@ -2327,7 +2327,7 @@ pub fn _mm256_sub_epi8(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epi16) pub fn _mm256_subs_epi16(a: __m256i, b: __m256i) -> __m256i { - simd_saturating_sub(BitVec::to_i16x16(a), BitVec::to_i16x16(b)).into() + simd_saturating_sub(a.as_i16x16(), b.as_i16x16()).into() } /// Subtract packed 8-bit integers in `b` from packed 8-bit integers in @@ -2336,7 +2336,7 @@ pub fn _mm256_subs_epi16(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epi8) pub fn _mm256_subs_epi8(a: __m256i, b: __m256i) -> __m256i { - simd_saturating_sub(BitVec::to_i8x32(a), BitVec::to_i8x32(b)).into() + simd_saturating_sub(a.as_i8x32(), b.as_i8x32()).into() } /// Subtract packed unsigned 16-bit integers in `b` from packed 16-bit @@ -2345,7 +2345,7 @@ pub fn _mm256_subs_epi8(a: __m256i, b: __m256i) -> __m256i { /// [Intel's 
documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epu16) pub fn _mm256_subs_epu16(a: __m256i, b: __m256i) -> __m256i { - simd_saturating_sub(BitVec::to_u16x16(a), BitVec::to_u16x16(b)).into() + simd_saturating_sub(a.as_u16x16(), b.as_u16x16()).into() } /// Subtract packed unsigned 8-bit integers in `b` from packed 8-bit @@ -2354,7 +2354,7 @@ pub fn _mm256_subs_epu16(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epu8) pub fn _mm256_subs_epu8(a: __m256i, b: __m256i) -> __m256i { - simd_saturating_sub(BitVec::to_u8x32(a), BitVec::to_u8x32(b)).into() + simd_saturating_sub(a.as_u8x32(), b.as_u8x32()).into() } /// Unpacks and interleave 8-bit integers from the high half of each @@ -2363,7 +2363,7 @@ pub fn _mm256_subs_epu8(a: __m256i, b: __m256i) -> __m256i { pub fn _mm256_unpackhi_epi8(a: __m256i, b: __m256i) -> __m256i { #[rustfmt::skip] - let r: i8x32 = simd_shuffle(BitVec::to_i8x32(a), BitVec::to_i8x32(b), [ + let r: i8x32 = simd_shuffle(a.as_i8x32(), b.as_i8x32(), [ 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47, 24, 56, 25, 57, 26, 58, 27, 59, @@ -2378,7 +2378,7 @@ pub fn _mm256_unpackhi_epi8(a: __m256i, b: __m256i) -> __m256i { pub fn _mm256_unpacklo_epi8(a: __m256i, b: __m256i) -> __m256i { #[rustfmt::skip] - let r: i8x32 = simd_shuffle(BitVec::to_i8x32(a), BitVec::to_i8x32(b), [ + let r: i8x32 = simd_shuffle(a.as_i8x32(), b.as_i8x32(), [ 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 16, 48, 17, 49, 18, 50, 19, 51, @@ -2393,8 +2393,8 @@ pub fn _mm256_unpacklo_epi8(a: __m256i, b: __m256i) -> __m256i { pub fn _mm256_unpackhi_epi16(a: __m256i, b: __m256i) -> __m256i { let r: i16x16 = simd_shuffle( - BitVec::to_i16x16(a), - BitVec::to_i16x16(b), + a.as_i16x16(), + b.as_i16x16(), [4, 20, 5, 21, 6, 22, 7, 23, 12, 28, 13, 29, 14, 30, 15, 31], ); r.into() @@ -2406,8 +2406,8 @@ pub fn _mm256_unpackhi_epi16(a: __m256i, b: __m256i) -> __m256i { pub fn _mm256_unpacklo_epi16(a: __m256i, b: __m256i) -> __m256i { let r: i16x16 = simd_shuffle( - BitVec::to_i16x16(a), - BitVec::to_i16x16(b), + a.as_i16x16(), + b.as_i16x16(), [0, 16, 1, 17, 2, 18, 3, 19, 8, 24, 9, 25, 10, 26, 11, 27], ); r.into() @@ -2419,8 +2419,8 @@ pub fn _mm256_unpacklo_epi16(a: __m256i, b: __m256i) -> __m256i { pub fn _mm256_unpackhi_epi32(a: __m256i, b: __m256i) -> __m256i { let r: i32x8 = simd_shuffle( - BitVec::to_i32x8(a), - BitVec::to_i32x8(b), + a.as_i32x8(), + b.as_i32x8(), [2, 10, 3, 11, 6, 14, 7, 15], ); r.into() @@ -2432,8 +2432,8 @@ pub fn _mm256_unpackhi_epi32(a: __m256i, b: __m256i) -> __m256i { pub fn _mm256_unpacklo_epi32(a: __m256i, b: __m256i) -> __m256i { let r: i32x8 = simd_shuffle( - BitVec::to_i32x8(a), - BitVec::to_i32x8(b), + a.as_i32x8(), + b.as_i32x8(), [0, 8, 1, 9, 4, 12, 5, 13], ); r.into() @@ -2444,7 +2444,7 @@ pub fn _mm256_unpacklo_epi32(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi64) pub fn _mm256_unpackhi_epi64(a: __m256i, b: __m256i) -> __m256i { - let r: i64x4 = simd_shuffle(BitVec::to_i64x4(a), BitVec::to_i64x4(b), [1, 5, 3, 7]); + let r: i64x4 = simd_shuffle(a.as_i64x4(), b.as_i64x4(), [1, 5, 3, 7]); r.into() } @@ -2453,7 +2453,7 @@ pub fn _mm256_unpackhi_epi64(a: __m256i, b: __m256i) -> __m256i { /// [Intel's 
documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi64) pub fn _mm256_unpacklo_epi64(a: __m256i, b: __m256i) -> __m256i { - let r: i64x4 = simd_shuffle(BitVec::to_i64x4(a), BitVec::to_i64x4(b), [0, 4, 2, 6]); + let r: i64x4 = simd_shuffle(a.as_i64x4(), b.as_i64x4(), [0, 4, 2, 6]); r.into() } @@ -2463,7 +2463,7 @@ pub fn _mm256_unpacklo_epi64(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_xor_si256) pub fn _mm256_xor_si256(a: __m256i, b: __m256i) -> __m256i { - simd_xor(BitVec::to_i64x4(a), BitVec::to_i64x4(b)).into() + simd_xor(a.as_i64x4(), b.as_i64x4()).into() } /// Extracts an 8-bit integer from `a`, selected with `INDEX`. Returns a 32-bit @@ -2476,7 +2476,7 @@ pub fn _mm256_xor_si256(a: __m256i, b: __m256i) -> __m256i { // This intrinsic has no corresponding instruction. pub fn _mm256_extract_epi8(a: __m256i) -> i32 { - simd_extract(BitVec::to_u8x32(a), INDEX as u32) as u32 as i32 + simd_extract(a.as_u8x32(), INDEX as u32) as u32 as i32 } /// Extracts a 16-bit integer from `a`, selected with `INDEX`. Returns a 32-bit @@ -2489,5 +2489,5 @@ pub fn _mm256_extract_epi8(a: __m256i) -> i32 { // This intrinsic has no corresponding instruction. pub fn _mm256_extract_epi16(a: __m256i) -> i32 { - simd_extract(BitVec::to_u16x16(a), INDEX as u32) as u32 as i32 + simd_extract(a.as_u16x16(), INDEX as u32) as u32 as i32 } From 6dfd40cff019d1461c00ff444fd55256f0c48d8e Mon Sep 17 00:00:00 2001 From: karthikbhargavan Date: Sat, 26 Jul 2025 08:30:21 -0400 Subject: [PATCH 03/47] testing --- testable-simd-models/src/core_arch/x86/models/avx2.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/testable-simd-models/src/core_arch/x86/models/avx2.rs b/testable-simd-models/src/core_arch/x86/models/avx2.rs index 0565630149381..5b76fdaa64c74 100644 --- a/testable-simd-models/src/core_arch/x86/models/avx2.rs +++ b/testable-simd-models/src/core_arch/x86/models/avx2.rs @@ -333,7 +333,7 @@ mod c_extern { } pub fn pslld(a: i32x8, count: i32x4) -> i32x8 { - let count = (count[1] as u64) * 4294967296 + (count[0] as u64); + let count = ((count[1] as u32) as u64) * 4294967296 + ((count[0] as u32) as u64); i32x8::from_fn(|i| { if count > 31 { @@ -473,7 +473,7 @@ mod c_extern { } pub fn psrld(a: i32x8, count: i32x4) -> i32x8 { - let count = (count[1] as u64) * 4294967296 + (count[0] as u64); + let count = ((count[1] as u32) as u64) * 4294967296 + ((count[0] as u32) as u64); i32x8::from_fn(|i| { if count > 31 { @@ -589,10 +589,10 @@ mod c_extern { pub fn vperm2i128(a: i64x4, b: i64x4, imm8: i8) -> i64x4 { let a = i128x2::from_fn(|i| { - ((a[2 * i] as u32 as u128) + ((a[2 * i + 1] as u32 as u128) << 64)) as i128 + ((a[2 * i] as u64 as u128) + ((a[2 * i + 1] as u64 as u128) << 64)) as i128 }); let b = i128x2::from_fn(|i| { - ((b[2 * i] as u32 as u128) + ((b[2 * i + 1] as u32 as u128) << 64)) as i128 + ((b[2 * i] as u64 as u128) + ((b[2 * i + 1] as u64 as u128) << 64)) as i128 }); let imm8 = imm8 as u8 as u32 as i32; let r = i128x2::from_fn(|i| { From 888d53ff418f67411bbb5c21d79706c9baf8a023 Mon Sep 17 00:00:00 2001 From: karthikbhargavan Date: Sat, 26 Jul 2025 08:41:44 -0400 Subject: [PATCH 04/47] test fix --- testable-simd-models/src/abstractions/bit.rs | 11 +++++++++++ testable-simd-models/src/core_arch/x86/models/sse2.rs | 2 +- .../src/core_arch/x86/models/ssse3.rs | 4 ---- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git 
a/testable-simd-models/src/abstractions/bit.rs b/testable-simd-models/src/abstractions/bit.rs index 4fac19fdcd567..f4c1f99d42093 100644 --- a/testable-simd-models/src/abstractions/bit.rs +++ b/testable-simd-models/src/abstractions/bit.rs @@ -68,6 +68,17 @@ impl std::ops::BitXor for Bit { } } +impl std::ops::Not for Bit { + type Output = Self; + fn not(self) -> Self { + match self { + Bit::One => Bit::Zero, + Bit::Zero => Bit::One, + } + } +} + + impl std::ops::Neg for Bit { type Output = Self; fn neg(self) -> Self { diff --git a/testable-simd-models/src/core_arch/x86/models/sse2.rs b/testable-simd-models/src/core_arch/x86/models/sse2.rs index ae6a1b0b7f6d5..1fe801b7de340 100644 --- a/testable-simd-models/src/core_arch/x86/models/sse2.rs +++ b/testable-simd-models/src/core_arch/x86/models/sse2.rs @@ -781,7 +781,7 @@ pub fn _mm_and_si128(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_si128) pub fn _mm_andnot_si128(a: __m128i, b: __m128i) -> __m128i { - BitVec::from_fn(|i| BitVec::<128>::from_fn(|i| _mm_set1_epi8(-1)[i] ^ a[i])[i] & b[i]) + BitVec::from_fn(|i| (!a[i]) & b[i]) } /// Computes the bitwise OR of 128 bits (representing integer data) in `a` and diff --git a/testable-simd-models/src/core_arch/x86/models/ssse3.rs b/testable-simd-models/src/core_arch/x86/models/ssse3.rs index a4375f56a0672..92ab218710be6 100644 --- a/testable-simd-models/src/core_arch/x86/models/ssse3.rs +++ b/testable-simd-models/src/core_arch/x86/models/ssse3.rs @@ -235,10 +235,6 @@ pub fn _mm_alignr_epi8(a: __m128i, b: __m128i) -> __m128i { BitVec::to_i8x16(b), BitVec::to_i8x16(a), [ - mask(IMM8 as u32, 0), - mask(IMM8 as u32, 0), - mask(IMM8 as u32, 0), - mask(IMM8 as u32, 0), mask(IMM8 as u32, 0), mask(IMM8 as u32, 1), mask(IMM8 as u32, 2), From 19c54496738abba8d8a14c9fba7b7e23bfc440ed Mon Sep 17 00:00:00 2001 From: karthikbhargavan Date: Sat, 26 Jul 2025 17:02:10 -0400 Subject: [PATCH 05/47] removed bitvec --- .../src/core_arch/x86/models/avx.rs | 16 +- .../src/core_arch/x86/models/avx2.rs | 4 +- .../src/core_arch/x86/models/sse2.rs | 200 +++++++++--------- .../src/core_arch/x86/models/ssse3.rs | 42 ++-- 4 files changed, 129 insertions(+), 133 deletions(-) diff --git a/testable-simd-models/src/core_arch/x86/models/avx.rs b/testable-simd-models/src/core_arch/x86/models/avx.rs index 50886b5def283..d114290c44a05 100644 --- a/testable-simd-models/src/core_arch/x86/models/avx.rs +++ b/testable-simd-models/src/core_arch/x86/models/avx.rs @@ -14,7 +14,7 @@ //! [wiki]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions use super::types::*; -use crate::abstractions::{bit::Bit, bitvec::BitVec, simd::*}; +use crate::abstractions::simd::*; mod c_extern { use crate::abstractions::simd::*; @@ -163,12 +163,8 @@ pub fn _mm256_insert_epi16(a: __m256i, i: i16) -> __m256i { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_si256) pub fn _mm256_testz_si256(a: __m256i, b: __m256i) -> i32 { - let c = BitVec::<256>::from_fn(|i| match (a[i], b[i]) { - (Bit::One, Bit::One) => Bit::One, - _ => Bit::Zero, - }); - let all_zero = c.fold(true, |acc, bit| acc && bit == Bit::Zero); - if all_zero { + let c = __m256i::from_fn(|i| a[i] & b[i]); + if c == __m256i::ZERO() { 1 } else { 0 @@ -319,7 +315,7 @@ pub fn _mm256_set_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i { // This intrinsic has no corresponding instruction. 
-pub fn _mm256_set1_epi8(val: i8) -> BitVec<256> { +pub fn _mm256_set1_epi8(val: i8) -> __m256i { transmute(i8x32::from_fn(|_| val)) } @@ -377,7 +373,7 @@ pub fn _mm256_castsi256_ps(a: __m256i) -> __m256 { // instructions, thus it has zero latency. pub fn _mm256_castsi256_si128(a: __m256i) -> __m128i { - BitVec::from_fn(|i| a[i]) + __m128i::from_fn(|i| a[i]) } /// Casts vector of type __m128i to type __m256i; @@ -400,5 +396,5 @@ pub fn _mm256_castsi128_si256(a: __m128i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_m128i) pub fn _mm256_set_m128i(hi: __m128i, lo: __m128i) -> __m256i { - BitVec::from_fn(|i| if i < 128 { lo[i] } else { hi[i - 128] }) + __m256i::from_fn(|i| if i < 128 { lo[i] } else { hi[i - 128] }) } diff --git a/testable-simd-models/src/core_arch/x86/models/avx2.rs b/testable-simd-models/src/core_arch/x86/models/avx2.rs index 5b76fdaa64c74..82efd1fb53f05 100644 --- a/testable-simd-models/src/core_arch/x86/models/avx2.rs +++ b/testable-simd-models/src/core_arch/x86/models/avx2.rs @@ -19,7 +19,7 @@ //! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf //! [wiki_avx]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions //! [wiki_fma]: https://en.wikipedia.org/wiki/Fused_multiply-accumulate -use crate::abstractions::{bitvec::BitVec, simd::*}; +use crate::abstractions::simd::*; mod c_extern { use crate::abstractions::{bit::MachineInteger, simd::*}; @@ -1594,7 +1594,7 @@ pub fn _mm256_mul_epu32(a: __m256i, b: __m256i) -> __m256i { let a = a.as_u64x4(); let b = b.as_u64x4(); let mask = u64x4::splat(u32::MAX.into()); - BitVec::from_u64x4(simd_mul(simd_and(a, mask), simd_and(b, mask))) + __m256i::from_u64x4(simd_mul(simd_and(a, mask), simd_and(b, mask))) } /// Multiplies the packed 16-bit integers in `a` and `b`, producing diff --git a/testable-simd-models/src/core_arch/x86/models/sse2.rs b/testable-simd-models/src/core_arch/x86/models/sse2.rs index 1fe801b7de340..64dc661b9fc8f 100644 --- a/testable-simd-models/src/core_arch/x86/models/sse2.rs +++ b/testable-simd-models/src/core_arch/x86/models/sse2.rs @@ -1,6 +1,6 @@ //! Streaming SIMD Extensions 2 (SSE2) use super::types::*; -use crate::abstractions::{bit::Bit, bitvec::BitVec, simd::*}; +use crate::abstractions::simd::*; mod c_extern { use crate::abstractions::{bit::MachineInteger, simd::*}; pub fn packsswb(a: i16x8, b: i16x8) -> i8x16 { @@ -207,7 +207,7 @@ use c_extern::*; /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi8) pub fn _mm_add_epi8(a: __m128i, b: __m128i) -> __m128i { - simd_add(BitVec::to_i8x16(a), BitVec::to_i8x16(b)).into() + simd_add(a.as_i8x16(), b.as_i8x16()).into() } /// Adds packed 16-bit integers in `a` and `b`. @@ -215,7 +215,7 @@ pub fn _mm_add_epi8(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi16) pub fn _mm_add_epi16(a: __m128i, b: __m128i) -> __m128i { - BitVec::from_i16x8(simd_add(BitVec::to_i16x8(a), BitVec::to_i16x8(b))) + transmute(simd_add(b.as_i16x8(), a.as_i16x8())) } /// Adds packed 32-bit integers in `a` and `b`. 
@@ -223,7 +223,7 @@ pub fn _mm_add_epi16(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi32) pub fn _mm_add_epi32(a: __m128i, b: __m128i) -> __m128i { - simd_add(BitVec::to_i32x4(a), BitVec::to_i32x4(b)).into() + simd_add(b.as_i32x4(), a.as_i32x4()).into() } /// Adds packed 64-bit integers in `a` and `b`. @@ -231,7 +231,7 @@ pub fn _mm_add_epi32(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi64) pub fn _mm_add_epi64(a: __m128i, b: __m128i) -> __m128i { - simd_add(BitVec::to_i64x2(a), BitVec::to_i64x2(b)).into() + simd_add(b.as_i64x2(), a.as_i64x2()).into() } /// Adds packed 8-bit integers in `a` and `b` using saturation. @@ -239,7 +239,7 @@ pub fn _mm_add_epi64(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi8) pub fn _mm_adds_epi8(a: __m128i, b: __m128i) -> __m128i { - simd_saturating_add(BitVec::to_i8x16(a), BitVec::to_i8x16(b)).into() + simd_saturating_add(b.as_i8x16(), a.as_i8x16()).into() } /// Adds packed 16-bit integers in `a` and `b` using saturation. @@ -247,7 +247,7 @@ pub fn _mm_adds_epi8(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi16) pub fn _mm_adds_epi16(a: __m128i, b: __m128i) -> __m128i { - simd_saturating_add(BitVec::to_i16x8(a), BitVec::to_i16x8(b)).into() + simd_saturating_add(b.as_i16x8(), a.as_i16x8()).into() } /// Adds packed unsigned 8-bit integers in `a` and `b` using saturation. @@ -255,7 +255,7 @@ pub fn _mm_adds_epi16(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu8) pub fn _mm_adds_epu8(a: __m128i, b: __m128i) -> __m128i { - simd_saturating_add(BitVec::to_u8x16(a), BitVec::to_u8x16(b)).into() + simd_saturating_add(b.as_u8x16(), a.as_u8x16()).into() } /// Adds packed unsigned 16-bit integers in `a` and `b` using saturation. @@ -263,7 +263,7 @@ pub fn _mm_adds_epu8(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu16) pub fn _mm_adds_epu16(a: __m128i, b: __m128i) -> __m128i { - simd_saturating_add(BitVec::to_u16x8(a), BitVec::to_u16x8(b)).into() + simd_saturating_add(b.as_u16x8(), a.as_u16x8()).into() } /// Averages packed unsigned 8-bit integers in `a` and `b`. 
@@ -271,8 +271,8 @@ pub fn _mm_adds_epu16(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu8) pub fn _mm_avg_epu8(a: __m128i, b: __m128i) -> __m128i { - let a = simd_cast::<16, _, u16>(BitVec::to_u8x16(a)); - let b = simd_cast::<16, _, u16>(BitVec::to_u8x16(b)); + let a = simd_cast::<16, _, u16>(a.as_u8x16()); + let b = simd_cast::<16, _, u16>(b.as_u8x16()); let r = simd_shr(simd_add(simd_add(a, b), u16x16::splat(1)), u16x16::splat(1)); simd_cast::<16, _, u8>(r).into() } @@ -282,8 +282,8 @@ pub fn _mm_avg_epu8(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu16) pub fn _mm_avg_epu16(a: __m128i, b: __m128i) -> __m128i { - let a = simd_cast::<8, _, u32>(BitVec::to_u16x8(a)); - let b = simd_cast::<8, _, u32>(BitVec::to_u16x8(b)); + let a = simd_cast::<8, _, u32>(a.as_u16x8()); + let b = simd_cast::<8, _, u32>(b.as_u16x8()); let r = simd_shr(simd_add(simd_add(a, b), u32x8::splat(1)), u32x8::splat(1)); simd_cast::<8, _, u16>(r).into() } @@ -297,7 +297,7 @@ pub fn _mm_avg_epu16(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd_epi16) pub fn _mm_madd_epi16(a: __m128i, b: __m128i) -> __m128i { - pmaddwd(BitVec::to_i16x8(a), BitVec::to_i16x8(b)).into() + pmaddwd(b.as_i16x8(), a.as_i16x8()).into() } /// Compares packed 16-bit integers in `a` and `b`, and returns the packed @@ -306,8 +306,8 @@ pub fn _mm_madd_epi16(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi16) pub fn _mm_max_epi16(a: __m128i, b: __m128i) -> __m128i { - let a = BitVec::to_i16x8(a); - let b = BitVec::to_i16x8(b); + let a = a.as_i16x8(); + let b = b.as_i16x8(); simd_select(simd_gt(a, b), a, b).into() } @@ -317,8 +317,8 @@ pub fn _mm_max_epi16(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu8) pub fn _mm_max_epu8(a: __m128i, b: __m128i) -> __m128i { - let a = BitVec::to_u8x16(a); - let b = BitVec::to_u8x16(b); + let a = a.as_u8x16(); + let b = b.as_u8x16(); simd_select(simd_gt(a, b), a, b).into() } @@ -328,8 +328,8 @@ pub fn _mm_max_epu8(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi16) pub fn _mm_min_epi16(a: __m128i, b: __m128i) -> __m128i { - let a = BitVec::to_i16x8(a); - let b = BitVec::to_i16x8(b); + let a = a.as_i16x8(); + let b = b.as_i16x8(); simd_select(simd_lt(a, b), a, b).into() } @@ -339,8 +339,8 @@ pub fn _mm_min_epi16(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu8) pub fn _mm_min_epu8(a: __m128i, b: __m128i) -> __m128i { - let a = BitVec::to_u8x16(a); - let b = BitVec::to_u8x16(b); + let a = a.as_u8x16(); + let b = b.as_u8x16(); simd_select(simd_lt(a, b), a, b).into() } @@ -352,10 +352,10 @@ pub fn _mm_min_epu8(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epi16) pub fn _mm_mulhi_epi16(a: __m128i, b: __m128i) -> __m128i { - let a = simd_cast::<8, i16, i32>(BitVec::to_i16x8(a)); - let b 
= simd_cast::<8, i16, i32>(BitVec::to_i16x8(b)); + let a = simd_cast::<8, i16, i32>(a.as_i16x8()); + let b = simd_cast::<8, i16, i32>(b.as_i16x8()); let r = simd_shr(simd_mul(a, b), i32x8::splat(16)); - BitVec::from_i16x8(simd_cast::<8, i32, i16>(r)) + transmute(simd_cast::<8, i32, i16>(r)) } /// Multiplies the packed unsigned 16-bit integers in `a` and `b`. @@ -366,8 +366,8 @@ pub fn _mm_mulhi_epi16(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epu16) pub fn _mm_mulhi_epu16(a: __m128i, b: __m128i) -> __m128i { - let a = simd_cast::<8, _, u32>(BitVec::to_u16x8(a)); - let b = simd_cast::<8, _, u32>(BitVec::to_u16x8(b)); + let a = simd_cast::<8, _, u32>(a.as_u16x8()); + let b = simd_cast::<8, _, u32>(b.as_u16x8()); let r = simd_shr(simd_mul(a, b), u32x8::splat(16)); simd_cast::<8, u32, u16>(r).into() } @@ -380,7 +380,7 @@ pub fn _mm_mulhi_epu16(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi16) pub fn _mm_mullo_epi16(a: __m128i, b: __m128i) -> __m128i { - BitVec::from_i16x8(simd_mul(BitVec::to_i16x8(a), BitVec::to_i16x8(b))) + transmute(simd_mul(a.as_i16x8(), b.as_i16x8())) } /// Multiplies the low unsigned 32-bit integers from each packed 64-bit element @@ -391,8 +391,8 @@ pub fn _mm_mullo_epi16(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epu32) pub fn _mm_mul_epu32(a: __m128i, b: __m128i) -> __m128i { - let a = BitVec::to_u64x2(a); - let b = BitVec::to_u64x2(b); + let a = a.as_u64x2(); + let b = b.as_u64x2(); let mask = u64x2::splat(u32::MAX.into()); simd_mul(simd_and(a, mask), simd_and(b, mask)).into() } @@ -407,7 +407,7 @@ pub fn _mm_mul_epu32(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_epu8) pub fn _mm_sad_epu8(a: __m128i, b: __m128i) -> __m128i { - psadbw(BitVec::to_u8x16(a), BitVec::to_u8x16(b)).into() + psadbw(a.as_u8x16(), b.as_u8x16()).into() } /// Subtracts packed 8-bit integers in `b` from packed 8-bit integers in `a`. @@ -415,7 +415,7 @@ pub fn _mm_sad_epu8(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi8) pub fn _mm_sub_epi8(a: __m128i, b: __m128i) -> __m128i { - BitVec::from_i8x16(simd_sub(BitVec::to_i8x16(a), BitVec::to_i8x16(b))) + transmute(simd_sub(a.as_i8x16(), b.as_i8x16())) } /// Subtracts packed 16-bit integers in `b` from packed 16-bit integers in `a`. @@ -423,7 +423,7 @@ pub fn _mm_sub_epi8(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi16) pub fn _mm_sub_epi16(a: __m128i, b: __m128i) -> __m128i { - BitVec::from_i16x8(simd_sub(BitVec::to_i16x8(a), BitVec::to_i16x8(b))) + transmute(simd_sub(a.as_i16x8(), b.as_i16x8())) } /// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`. 
@@ -431,7 +431,7 @@ pub fn _mm_sub_epi16(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi32) pub fn _mm_sub_epi32(a: __m128i, b: __m128i) -> __m128i { - simd_sub(BitVec::to_i32x4(a), BitVec::to_i32x4(b)).into() + simd_sub(a.as_i32x4(), b.as_i32x4()).into() } /// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a`. @@ -439,7 +439,7 @@ pub fn _mm_sub_epi32(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi64) pub fn _mm_sub_epi64(a: __m128i, b: __m128i) -> __m128i { - simd_sub(BitVec::to_i64x2(a), BitVec::to_i64x2(b)).into() + simd_sub(a.as_i64x2(), b.as_i64x2()).into() } /// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a` @@ -448,7 +448,7 @@ pub fn _mm_sub_epi64(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi8) pub fn _mm_subs_epi8(a: __m128i, b: __m128i) -> __m128i { - simd_saturating_sub(BitVec::to_i8x16(a), BitVec::to_i8x16(b)).into() + simd_saturating_sub(a.as_i8x16(), b.as_i8x16()).into() } /// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a` @@ -457,7 +457,7 @@ pub fn _mm_subs_epi8(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi16) pub fn _mm_subs_epi16(a: __m128i, b: __m128i) -> __m128i { - simd_saturating_sub(BitVec::to_i16x8(a), BitVec::to_i16x8(b)).into() + simd_saturating_sub(a.as_i16x8(), b.as_i16x8()).into() } /// Subtract packed unsigned 8-bit integers in `b` from packed unsigned 8-bit @@ -466,7 +466,7 @@ pub fn _mm_subs_epi16(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu8) pub fn _mm_subs_epu8(a: __m128i, b: __m128i) -> __m128i { - simd_saturating_sub(BitVec::to_u8x16(a), BitVec::to_u8x16(b)).into() + simd_saturating_sub(a.as_u8x16(), b.as_u8x16()).into() } /// Subtract packed unsigned 16-bit integers in `b` from packed unsigned 16-bit @@ -475,7 +475,7 @@ pub fn _mm_subs_epu8(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu16) pub fn _mm_subs_epu16(a: __m128i, b: __m128i) -> __m128i { - simd_saturating_sub(BitVec::to_u16x8(a), BitVec::to_u16x8(b)).into() + simd_saturating_sub(a.as_u16x8(), b.as_u16x8()).into() } /// Shifts `a` left by `IMM8` bytes while shifting in zeros. 
@@ -501,7 +501,7 @@ fn _mm_slli_si128_impl(a: __m128i) -> __m128i { } (simd_shuffle( i8x16::from_fn(|_| 0), - BitVec::to_i8x16(a), + a.as_i8x16(), [ mask(IMM8, 0), mask(IMM8, 1), @@ -552,7 +552,7 @@ pub fn _mm_slli_epi16(a: __m128i) -> __m128i { if IMM8 >= 16 { _mm_setzero_si128() } else { - simd_shl(BitVec::to_u16x8(a), u16x8::splat(IMM8 as u16)).into() + simd_shl(a.as_u16x8(), u16x8::splat(IMM8 as u16)).into() } } @@ -562,7 +562,7 @@ pub fn _mm_slli_epi16(a: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi16) pub fn _mm_sll_epi16(a: __m128i, count: __m128i) -> __m128i { - psllw(BitVec::to_i16x8(a), BitVec::to_i16x8(count)).into() + psllw(a.as_i16x8(), count.as_i16x8()).into() } /// Shifts packed 32-bit integers in `a` left by `IMM8` while shifting in zeros. @@ -575,7 +575,7 @@ pub fn _mm_slli_epi32(a: __m128i) -> __m128i { if IMM8 >= 32 { _mm_setzero_si128() } else { - simd_shl(BitVec::to_u32x4(a), u32x4::splat(IMM8 as u32)).into() + simd_shl(a.as_u32x4(), u32x4::splat(IMM8 as u32)).into() } } @@ -585,7 +585,7 @@ pub fn _mm_slli_epi32(a: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi32) pub fn _mm_sll_epi32(a: __m128i, count: __m128i) -> __m128i { - pslld(BitVec::to_i32x4(a), BitVec::to_i32x4(count)).into() + pslld(a.as_i32x4(), count.as_i32x4()).into() } /// Shifts packed 64-bit integers in `a` left by `IMM8` while shifting in zeros. @@ -598,7 +598,7 @@ pub fn _mm_slli_epi64(a: __m128i) -> __m128i { if IMM8 >= 64 { _mm_setzero_si128() } else { - simd_shl(BitVec::to_u64x2(a), u64x2::splat(IMM8 as u64)).into() + simd_shl(a.as_u64x2(), u64x2::splat(IMM8 as u64)).into() } } @@ -608,7 +608,7 @@ pub fn _mm_slli_epi64(a: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi64) pub fn _mm_sll_epi64(a: __m128i, count: __m128i) -> __m128i { - psllq(BitVec::to_i64x2(a), BitVec::to_i64x2(count)).into() + psllq(a.as_i64x2(), count.as_i64x2()).into() } /// Shifts packed 16-bit integers in `a` right by `IMM8` while shifting in sign @@ -618,7 +618,7 @@ pub fn _mm_sll_epi64(a: __m128i, count: __m128i) -> __m128i { pub fn _mm_srai_epi16(a: __m128i) -> __m128i { // static_assert_uimm_bits!(IMM8, 8); - simd_shr(BitVec::to_i16x8(a), i16x8::splat(IMM8.min(15) as i16)).into() + simd_shr(a.as_i16x8(), i16x8::splat(IMM8.min(15) as i16)).into() } /// Shifts packed 16-bit integers in `a` right by `count` while shifting in sign @@ -627,7 +627,7 @@ pub fn _mm_srai_epi16(a: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi16) pub fn _mm_sra_epi16(a: __m128i, count: __m128i) -> __m128i { - psraw(BitVec::to_i16x8(a), BitVec::to_i16x8(count)).into() + psraw(a.as_i16x8(), count.as_i16x8()).into() } /// Shifts packed 32-bit integers in `a` right by `IMM8` while shifting in sign @@ -637,7 +637,7 @@ pub fn _mm_sra_epi16(a: __m128i, count: __m128i) -> __m128i { pub fn _mm_srai_epi32(a: __m128i) -> __m128i { // static_assert_uimm_bits!(IMM8, 8); - simd_shr(BitVec::to_i32x4(a), i32x4::splat(IMM8.min(31))).into() + simd_shr(a.as_i32x4(), i32x4::splat(IMM8.min(31))).into() } /// Shifts packed 32-bit integers in `a` right by `count` while shifting in sign @@ -646,7 +646,7 @@ pub fn _mm_srai_epi32(a: __m128i) -> __m128i { /// [Intel's 
documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi32) pub fn _mm_sra_epi32(a: __m128i, count: __m128i) -> __m128i { - psrad(BitVec::to_i32x4(a), BitVec::to_i32x4(count)).into() + psrad(a.as_i32x4(), count.as_i32x4()).into() } /// Shifts `a` right by `IMM8` bytes while shifting in zeros. @@ -670,7 +670,7 @@ fn _mm_srli_si128_impl(a: __m128i) -> __m128i { } } let x: i8x16 = simd_shuffle( - BitVec::to_i8x16(a), + a.as_i8x16(), i8x16::from_fn(|_| 0), [ mask(IMM8, 0), @@ -705,7 +705,7 @@ pub fn _mm_srli_epi16(a: __m128i) -> __m128i { if IMM8 >= 16 { _mm_setzero_si128() } else { - simd_shr(BitVec::to_u16x8(a), u16x8::splat(IMM8 as u16)).into() + simd_shr(a.as_u16x8(), u16x8::splat(IMM8 as u16)).into() } } @@ -715,7 +715,7 @@ pub fn _mm_srli_epi16(a: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi16) pub fn _mm_srl_epi16(a: __m128i, count: __m128i) -> __m128i { - psrlw(BitVec::to_i16x8(a), BitVec::to_i16x8(count)).into() + psrlw(a.as_i16x8(), count.as_i16x8()).into() } /// Shifts packed 32-bit integers in `a` right by `IMM8` while shifting in @@ -729,7 +729,7 @@ pub fn _mm_srli_epi32(a: __m128i) -> __m128i { if IMM8 >= 32 { _mm_setzero_si128() } else { - simd_shr(BitVec::to_u32x4(a), u32x4::splat(IMM8 as u32)).into() + simd_shr(a.as_u32x4(), u32x4::splat(IMM8 as u32)).into() } } @@ -739,7 +739,7 @@ pub fn _mm_srli_epi32(a: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi32) pub fn _mm_srl_epi32(a: __m128i, count: __m128i) -> __m128i { - psrld(BitVec::to_i32x4(a), BitVec::to_i32x4(count)).into() + psrld(a.as_i32x4(), count.as_i32x4()).into() } /// Shifts packed 64-bit integers in `a` right by `IMM8` while shifting in @@ -751,9 +751,9 @@ pub fn _mm_srli_epi64(a: __m128i) -> __m128i { // TODO // static_assert_uimm_bits!(IMM8, 8); if IMM8 >= 64 { - BitVec::from_fn(|_| Bit::Zero) + __m128i::ZERO() } else { - BitVec::from_u64x2(simd_shr(BitVec::to_u64x2(a), u64x2::splat(IMM8 as u64))) + __m128i::from_u64x2(simd_shr(a.as_u64x2(), u64x2::splat(IMM8 as u64))) } } @@ -763,7 +763,7 @@ pub fn _mm_srli_epi64(a: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi64) pub fn _mm_srl_epi64(a: __m128i, count: __m128i) -> __m128i { - psrlq(BitVec::to_i64x2(a), BitVec::to_i64x2(count)).into() + psrlq(a.as_i64x2(), count.as_i64x2()).into() } /// Computes the bitwise AND of 128 bits (representing integer data) in `a` and @@ -772,7 +772,7 @@ pub fn _mm_srl_epi64(a: __m128i, count: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_si128) pub fn _mm_and_si128(a: __m128i, b: __m128i) -> __m128i { - BitVec::from_fn(|i| a[i] & b[i]) + __m128i::from_fn(|i| a[i] & b[i]) } /// Computes the bitwise NOT of 128 bits (representing integer data) in `a` and @@ -781,7 +781,7 @@ pub fn _mm_and_si128(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_si128) pub fn _mm_andnot_si128(a: __m128i, b: __m128i) -> __m128i { - BitVec::from_fn(|i| (!a[i]) & b[i]) + __m128i::from_fn(|i| (!a[i]) & b[i]) } /// Computes the bitwise OR of 128 bits (representing integer data) in `a` and @@ -790,7 +790,7 @@ pub fn _mm_andnot_si128(a: __m128i, b: 
__m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_si128) pub fn _mm_or_si128(a: __m128i, b: __m128i) -> __m128i { - BitVec::from_fn(|i| a[i] | b[i]) + __m128i::from_fn(|i| a[i] | b[i]) } /// Computes the bitwise XOR of 128 bits (representing integer data) in `a` and @@ -799,7 +799,7 @@ pub fn _mm_or_si128(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_si128) pub fn _mm_xor_si128(a: __m128i, b: __m128i) -> __m128i { - BitVec::from_fn(|i| a[i] ^ b[i]) + __m128i::from_fn(|i| a[i] ^ b[i]) } /// Compares packed 8-bit integers in `a` and `b` for equality. @@ -807,7 +807,7 @@ pub fn _mm_xor_si128(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi8) pub fn _mm_cmpeq_epi8(a: __m128i, b: __m128i) -> __m128i { - (simd_eq(BitVec::to_i8x16(a), BitVec::to_i8x16(b))).into() + transmute(simd_eq(a.as_i8x16(), b.as_i8x16())) } /// Compares packed 16-bit integers in `a` and `b` for equality. @@ -815,7 +815,7 @@ pub fn _mm_cmpeq_epi8(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi16) pub fn _mm_cmpeq_epi16(a: __m128i, b: __m128i) -> __m128i { - (simd_eq(BitVec::to_i16x8(a), BitVec::to_i16x8(b))).into() + transmute (simd_eq(a.as_i16x8(), b.as_i16x8())) } /// Compares packed 32-bit integers in `a` and `b` for equality. @@ -823,7 +823,7 @@ pub fn _mm_cmpeq_epi16(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi32) pub fn _mm_cmpeq_epi32(a: __m128i, b: __m128i) -> __m128i { - (simd_eq(BitVec::to_i32x4(a), BitVec::to_i32x4(b))).into() + transmute(simd_eq(a.as_i32x4(), b.as_i32x4())) } /// Compares packed 8-bit integers in `a` and `b` for greater-than. @@ -831,7 +831,7 @@ pub fn _mm_cmpeq_epi32(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi8) pub fn _mm_cmpgt_epi8(a: __m128i, b: __m128i) -> __m128i { - (simd_gt(BitVec::to_i8x16(a), BitVec::to_i8x16(b))).into() + transmute(simd_gt(a.as_i8x16(), b.as_i8x16())) } /// Compares packed 16-bit integers in `a` and `b` for greater-than. @@ -839,7 +839,7 @@ pub fn _mm_cmpgt_epi8(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi16) pub fn _mm_cmpgt_epi16(a: __m128i, b: __m128i) -> __m128i { - (simd_gt(BitVec::to_i16x8(a), BitVec::to_i16x8(b))).into() + transmute(simd_gt(a.as_i16x8(), b.as_i16x8())) } /// Compares packed 32-bit integers in `a` and `b` for greater-than. @@ -847,7 +847,7 @@ pub fn _mm_cmpgt_epi16(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi32) pub fn _mm_cmpgt_epi32(a: __m128i, b: __m128i) -> __m128i { - (simd_gt(BitVec::to_i32x4(a), BitVec::to_i32x4(b))).into() + transmute(simd_gt(a.as_i32x4(), b.as_i32x4())) } /// Compares packed 8-bit integers in `a` and `b` for less-than. 
@@ -855,7 +855,7 @@ pub fn _mm_cmpgt_epi32(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi8) pub fn _mm_cmplt_epi8(a: __m128i, b: __m128i) -> __m128i { - (simd_lt(BitVec::to_i8x16(a), BitVec::to_i8x16(b))).into() + transmute(simd_lt(a.as_i8x16(), b.as_i8x16())) } /// Compares packed 16-bit integers in `a` and `b` for less-than. @@ -863,7 +863,7 @@ pub fn _mm_cmplt_epi8(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi16) pub fn _mm_cmplt_epi16(a: __m128i, b: __m128i) -> __m128i { - (simd_lt(BitVec::to_i16x8(a), BitVec::to_i16x8(b))).into() + transmute(simd_lt(a.as_i16x8(), b.as_i16x8())) } /// Compares packed 32-bit integers in `a` and `b` for less-than. @@ -871,7 +871,7 @@ pub fn _mm_cmplt_epi16(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi32) pub fn _mm_cmplt_epi32(a: __m128i, b: __m128i) -> __m128i { - (simd_lt(BitVec::to_i32x4(a), BitVec::to_i32x4(b))).into() + transmute(simd_lt(a.as_i32x4(), b.as_i32x4())) } pub fn _mm_cvtsi32_si128(a: i32) -> __m128i { @@ -883,7 +883,7 @@ pub fn _mm_cvtsi32_si128(a: i32) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si32) pub fn _mm_cvtsi128_si32(a: __m128i) -> i32 { - simd_extract(BitVec::to_i32x4(a), 0) + simd_extract(a.as_i32x4(), 0) } /// Sets packed 64-bit integers with the supplied values, from highest to @@ -903,7 +903,7 @@ pub fn _mm_set_epi64x(e1: i64, e0: i64) -> __m128i { // no particular instruction to test pub fn _mm_set_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i { let vec = [e0, e1, e2, e3]; - BitVec::from_i32x4(i32x4::from_fn(|i| vec[i as usize])) + transmute(i32x4::from_fn(|i| vec[i as usize])) } /// Sets packed 16-bit integers with the supplied values. @@ -923,7 +923,7 @@ pub fn _mm_set_epi16( e0: i16, ) -> __m128i { let vec = [e0, e1, e2, e3, e4, e5, e6, e7]; - BitVec::from_i16x8(i16x8::from_fn(|i| vec[i as usize])) + transmute(i16x8::from_fn(|i| vec[i as usize])) } /// Sets packed 8-bit integers with the supplied values. @@ -951,7 +951,7 @@ pub fn _mm_set_epi8( let vec = [ e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, ]; - BitVec::from_i8x16(i8x16::from_fn(|i| vec[i as usize])) + transmute(i8x16::from_fn(|i| vec[i as usize])) } /// Broadcasts 64-bit integer `a` to all elements. @@ -981,7 +981,7 @@ pub fn _mm_set1_epi32(a: i32) -> __m128i { // no particular instruction to test pub fn _mm_set1_epi16(a: i16) -> __m128i { - BitVec::from_i16x8(i16x8::from_fn(|_| a)) + transmute(i16x8::from_fn(|_| a)) } /// Broadcasts 8-bit integer `a` to all elements. 
@@ -1057,7 +1057,7 @@ pub fn _mm_setr_epi8( /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_si128) pub fn _mm_setzero_si128() -> __m128i { - BitVec::from_fn(|_| Bit::Zero) + __m128i::ZERO() } /// Returns a vector where the low element is extracted from `a` and its upper @@ -1068,7 +1068,7 @@ pub fn _mm_setzero_si128() -> __m128i { // FIXME movd on msvc, movd on i686 pub fn _mm_move_epi64(a: __m128i) -> __m128i { - let r: i64x2 = simd_shuffle(BitVec::to_i64x2(a), i64x2::from_fn(|_| 0), [0, 2]); + let r: i64x2 = simd_shuffle(a.as_i64x2(), i64x2::from_fn(|_| 0), [0, 2]); r.into() } @@ -1078,7 +1078,7 @@ pub fn _mm_move_epi64(a: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi16) pub fn _mm_packs_epi16(a: __m128i, b: __m128i) -> __m128i { - packsswb(BitVec::to_i16x8(a), BitVec::to_i16x8(b)).into() + packsswb(a.as_i16x8(), b.as_i16x8()).into() } /// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers @@ -1087,7 +1087,7 @@ pub fn _mm_packs_epi16(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32) pub fn _mm_packs_epi32(a: __m128i, b: __m128i) -> __m128i { - packssdw(BitVec::to_i32x4(a), BitVec::to_i32x4(b)).into() + packssdw(a.as_i32x4(), b.as_i32x4()).into() } /// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers @@ -1096,7 +1096,7 @@ pub fn _mm_packs_epi32(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi16) pub fn _mm_packus_epi16(a: __m128i, b: __m128i) -> __m128i { - packuswb(BitVec::to_i16x8(a), BitVec::to_i16x8(b)).into() + packuswb(a.as_i16x8(), b.as_i16x8()).into() } /// Returns the `imm8` element of `a`. @@ -1105,7 +1105,7 @@ pub fn _mm_packus_epi16(a: __m128i, b: __m128i) -> __m128i { pub fn _mm_extract_epi16(a: __m128i) -> i32 { // static_assert_uimm_bits!(IMM8, 3); - simd_extract(BitVec::to_u16x8(a), IMM8 as u32) as i32 + simd_extract(a.as_u16x8(), IMM8 as u32) as i32 } /// Returns a new vector where the `imm8` element of `a` is replaced with `i`. @@ -1114,7 +1114,7 @@ pub fn _mm_extract_epi16(a: __m128i) -> i32 { pub fn _mm_insert_epi16(a: __m128i, i: i32) -> __m128i { // static_assert_uimm_bits!(IMM8, 3); - simd_insert(BitVec::to_i16x8(a), IMM8 as u32, i as i16).into() + simd_insert(a.as_i16x8(), IMM8 as u32, i as i16).into() } /// Returns a mask of the most significant bit of each element in `a`. 
@@ -1123,7 +1123,7 @@ pub fn _mm_insert_epi16(a: __m128i, i: i32) -> __m128i { pub fn _mm_movemask_epi8(a: __m128i) -> i32 { let z = i8x16::from_fn(|_| 0); - let m: i8x16 = simd_lt(BitVec::to_i8x16(a), z); + let m: i8x16 = simd_lt(a.as_i8x16(), z); let r = simd_bitmask_little!(15, m, u16); r as u32 as i32 } @@ -1135,7 +1135,7 @@ pub fn _mm_movemask_epi8(a: __m128i) -> i32 { pub fn _mm_shuffle_epi32(a: __m128i) -> __m128i { // static_assert_uimm_bits!(IMM8, 8); - let a = BitVec::to_i32x4(a); + let a = a.as_i32x4(); let x: i32x4 = simd_shuffle( a, a, @@ -1160,7 +1160,7 @@ pub fn _mm_shuffle_epi32(a: __m128i) -> __m128i { pub fn _mm_shufflehi_epi16(a: __m128i) -> __m128i { // static_assert_uimm_bits!(IMM8, 8); - let a = BitVec::to_i16x8(a); + let a = a.as_i16x8(); let x: i16x8 = simd_shuffle( a, a, @@ -1189,7 +1189,7 @@ pub fn _mm_shufflehi_epi16(a: __m128i) -> __m128i { pub fn _mm_shufflelo_epi16(a: __m128i) -> __m128i { // static_assert_uimm_bits!(IMM8, 8); - let a = BitVec::to_i16x8(a); + let a = a.as_i16x8(); let x: i16x8 = simd_shuffle( a, a, @@ -1213,8 +1213,8 @@ pub fn _mm_shufflelo_epi16(a: __m128i) -> __m128i { pub fn _mm_unpackhi_epi8(a: __m128i, b: __m128i) -> __m128i { (simd_shuffle( - BitVec::to_i8x16(a), - BitVec::to_i8x16(b), + a.as_i8x16(), + b.as_i8x16(), [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31], )) .into() } @@ -1226,8 +1226,8 @@ pub fn _mm_unpackhi_epi8(a: __m128i, b: __m128i) -> __m128i { pub fn _mm_unpackhi_epi16(a: __m128i, b: __m128i) -> __m128i { let x = simd_shuffle( - BitVec::to_i16x8(a), - BitVec::to_i16x8(b), + a.as_i16x8(), + b.as_i16x8(), [4, 12, 5, 13, 6, 14, 7, 15], ); (x).into() } @@ -1238,7 +1238,7 @@ pub fn _mm_unpackhi_epi16(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi32) pub fn _mm_unpackhi_epi32(a: __m128i, b: __m128i) -> __m128i { - (simd_shuffle(BitVec::to_i32x4(a), BitVec::to_i32x4(b), [2, 6, 3, 7])).into() + (simd_shuffle(a.as_i32x4(), b.as_i32x4(), [2, 6, 3, 7])).into() } /// Unpacks and interleave 64-bit integers from the high half of `a` and `b`. @@ -1246,7 +1246,7 @@ pub fn _mm_unpackhi_epi32(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi64) pub fn _mm_unpackhi_epi64(a: __m128i, b: __m128i) -> __m128i { - (simd_shuffle(BitVec::to_i64x2(a), BitVec::to_i64x2(b), [1, 3])).into() + (simd_shuffle(a.as_i64x2(), b.as_i64x2(), [1, 3])).into() } /// Unpacks and interleave 8-bit integers from the low half of `a` and `b`. 
@@ -1255,8 +1255,8 @@ pub fn _mm_unpackhi_epi64(a: __m128i, b: __m128i) -> __m128i { pub fn _mm_unpacklo_epi8(a: __m128i, b: __m128i) -> __m128i { (simd_shuffle( - BitVec::to_i8x16(a), - BitVec::to_i8x16(b), + a.as_i8x16(), + b.as_i8x16(), [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23], )) .into() } @@ -1268,8 +1268,8 @@ pub fn _mm_unpacklo_epi8(a: __m128i, b: __m128i) -> __m128i { pub fn _mm_unpacklo_epi16(a: __m128i, b: __m128i) -> __m128i { let x = simd_shuffle( - BitVec::to_i16x8(a), - BitVec::to_i16x8(b), + a.as_i16x8(), + b.as_i16x8(), [0, 8, 1, 9, 2, 10, 3, 11], ); x.into() } @@ -1280,7 +1280,7 @@ pub fn _mm_unpacklo_epi16(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi32) pub fn _mm_unpacklo_epi32(a: __m128i, b: __m128i) -> __m128i { - simd_shuffle(BitVec::to_i32x4(a), BitVec::to_i32x4(b), [0, 4, 1, 5]).into() + simd_shuffle(a.as_i32x4(), b.as_i32x4(), [0, 4, 1, 5]).into() } /// Unpacks and interleave 64-bit integers from the low half of `a` and `b`. @@ -1288,7 +1288,7 @@ pub fn _mm_unpacklo_epi32(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi64) pub fn _mm_unpacklo_epi64(a: __m128i, b: __m128i) -> __m128i { - simd_shuffle(BitVec::to_i64x2(a), BitVec::to_i64x2(b), [0, 2]).into() + simd_shuffle(a.as_i64x2(), b.as_i64x2(), [0, 2]).into() } /// Returns vector of type __m128i with indeterminate elements.with indetermination elements. @@ -1299,5 +1299,5 @@ pub fn _mm_unpacklo_epi64(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_si128) pub fn _mm_undefined_si128() -> __m128i { - BitVec::from_fn(|_| Bit::Zero) + __m128i::ZERO() } diff --git a/testable-simd-models/src/core_arch/x86/models/ssse3.rs b/testable-simd-models/src/core_arch/x86/models/ssse3.rs index 92ab218710be6..7fc29262232a9 100644 --- a/testable-simd-models/src/core_arch/x86/models/ssse3.rs +++ b/testable-simd-models/src/core_arch/x86/models/ssse3.rs @@ -1,6 +1,6 @@ //! 
Supplemental Streaming SIMD Extensions 3 (SSSE3) -use crate::abstractions::{bitvec::BitVec, simd::*}; +use crate::abstractions::simd::*; use super::types::*; @@ -141,10 +141,10 @@ use c_extern::*; /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi8) pub fn _mm_abs_epi8(a: __m128i) -> __m128i { - let a = BitVec::to_i8x16(a); + let a = a.as_i8x16(); let zero = i8x16::from_fn(|_| 0); let r = simd_select(simd_lt(a, zero), simd_neg(a), a); - BitVec::from_i8x16(r) + transmute(r) } /// Computes the absolute value of each of the packed 16-bit signed integers in @@ -153,10 +153,10 @@ pub fn _mm_abs_epi8(a: __m128i) -> __m128i { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi16) pub fn _mm_abs_epi16(a: __m128i) -> __m128i { - let a = BitVec::to_i16x8(a); + let a = a.as_i16x8(); let zero = i16x8::from_fn(|_| 0); let r = simd_select(simd_lt(a, zero), simd_neg(a), a); - BitVec::from_i16x8(r) + transmute(r) } /// Computes the absolute value of each of the packed 32-bit signed integers in @@ -165,10 +165,10 @@ pub fn _mm_abs_epi16(a: __m128i) -> __m128i { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi32) pub fn _mm_abs_epi32(a: __m128i) -> __m128i { - let a = BitVec::to_i32x4(a); + let a = a.as_i32x4(); let zero = i32x4::from_fn(|_| 0); let r = simd_select(simd_lt(a, zero), simd_neg(a), a); - BitVec::from_i32x4(r) + transmute(r) } /// Shuffles bytes from `a` according to the content of `b`. @@ -198,7 +198,7 @@ pub fn _mm_abs_epi32(a: __m128i) -> __m128i { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi8) pub fn _mm_shuffle_epi8(a: __m128i, b: __m128i) -> __m128i { - BitVec::from_u8x16(pshufb128(BitVec::to_u8x16(a), BitVec::to_u8x16(b))) + transmute(pshufb128(a.as_u8x16(), b.as_u8x16())) } /// Concatenate 16-byte blocks in `a` and `b` into a 32-byte temporary result, @@ -232,8 +232,8 @@ pub fn _mm_alignr_epi8(a: __m128i, b: __m128i) -> __m128i { } let r: i8x16 = simd_shuffle( - BitVec::to_i8x16(b), - BitVec::to_i8x16(a), + b.as_i8x16(), + a.as_i8x16(), [ mask(IMM8 as u32, 0), mask(IMM8 as u32, 1), @@ -262,7 +262,7 @@ pub fn _mm_alignr_epi8(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi16) pub fn _mm_hadd_epi16(a: __m128i, b: __m128i) -> __m128i { - phaddw128(BitVec::to_i16x8(a), BitVec::to_i16x8(b)).into() + phaddw128(a.as_i16x8(), b.as_i16x8()).into() } /// Horizontally adds the adjacent pairs of values contained in 2 packed @@ -272,7 +272,7 @@ pub fn _mm_hadd_epi16(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_epi16) pub fn _mm_hadds_epi16(a: __m128i, b: __m128i) -> __m128i { - phaddsw128(BitVec::to_i16x8(a), BitVec::to_i16x8(b)).into() + phaddsw128(a.as_i16x8(), b.as_i16x8()).into() } /// Horizontally adds the adjacent pairs of values contained in 2 packed @@ -281,7 +281,7 @@ pub fn _mm_hadds_epi16(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi32) pub fn _mm_hadd_epi32(a: __m128i, b: __m128i) -> __m128i { - phaddd128(BitVec::to_i32x4(a), BitVec::to_i32x4(b)).into() + phaddd128(a.as_i32x4(), 
b.as_i32x4()).into() } /// Horizontally subtract the adjacent pairs of values contained in 2 @@ -290,7 +290,7 @@ pub fn _mm_hadd_epi32(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi16) pub fn _mm_hsub_epi16(a: __m128i, b: __m128i) -> __m128i { - phsubw128(BitVec::to_i16x8(a), BitVec::to_i16x8(b)).into() + phsubw128(a.as_i16x8(), b.as_i16x8()).into() } /// Horizontally subtract the adjacent pairs of values contained in 2 @@ -301,7 +301,7 @@ pub fn _mm_hsub_epi16(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_epi16) pub fn _mm_hsubs_epi16(a: __m128i, b: __m128i) -> __m128i { - phsubsw128(BitVec::to_i16x8(a), BitVec::to_i16x8(b)).into() + phsubsw128(a.as_i16x8(), b.as_i16x8()).into() } /// Horizontally subtract the adjacent pairs of values contained in 2 @@ -310,7 +310,7 @@ pub fn _mm_hsubs_epi16(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi32) pub fn _mm_hsub_epi32(a: __m128i, b: __m128i) -> __m128i { - phsubd128(BitVec::to_i32x4(a), BitVec::to_i32x4(b)).into() + phsubd128(a.as_i32x4(), b.as_i32x4()).into() } /// Multiplies corresponding pairs of packed 8-bit unsigned integer @@ -322,7 +322,7 @@ pub fn _mm_hsub_epi32(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_epi16) pub fn _mm_maddubs_epi16(a: __m128i, b: __m128i) -> __m128i { - pmaddubsw128(BitVec::to_u8x16(a), BitVec::to_i8x16(b)).into() + pmaddubsw128(a.as_u8x16(), b.as_i8x16()).into() } /// Multiplies packed 16-bit signed integer values, truncate the 32-bit @@ -332,7 +332,7 @@ pub fn _mm_maddubs_epi16(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_epi16) pub fn _mm_mulhrs_epi16(a: __m128i, b: __m128i) -> __m128i { - pmulhrsw128(BitVec::to_i16x8(a), BitVec::to_i16x8(b)).into() + pmulhrsw128(a.as_i16x8(), b.as_i16x8()).into() } /// Negates packed 8-bit integers in `a` when the corresponding signed 8-bit @@ -343,7 +343,7 @@ pub fn _mm_mulhrs_epi16(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi8) pub fn _mm_sign_epi8(a: __m128i, b: __m128i) -> __m128i { - psignb128(BitVec::to_i8x16(a), BitVec::to_i8x16(b)).into() + psignb128(a.as_i8x16(), b.as_i8x16()).into() } /// Negates packed 16-bit integers in `a` when the corresponding signed 16-bit @@ -354,7 +354,7 @@ pub fn _mm_sign_epi8(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi16) pub fn _mm_sign_epi16(a: __m128i, b: __m128i) -> __m128i { - psignw128(BitVec::to_i16x8(a), BitVec::to_i16x8(b)).into() + psignw128(a.as_i16x8(), b.as_i16x8()).into() } /// Negates packed 32-bit integers in `a` when the corresponding signed 32-bit @@ -365,5 +365,5 @@ pub fn _mm_sign_epi16(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi32) pub fn _mm_sign_epi32(a: __m128i, b: __m128i) -> __m128i { - psignd128(BitVec::to_i32x4(a), BitVec::to_i32x4(b)).into() + 
psignd128(a.as_i32x4(), b.as_i32x4()).into() } From 356e1e2b825c0268a64a558b4733d41ab9cf7f8b Mon Sep 17 00:00:00 2001 From: karthikbhargavan Date: Sat, 26 Jul 2025 17:02:21 -0400 Subject: [PATCH 06/47] fmt --- testable-simd-models/src/abstractions/bit.rs | 3 +- .../src/abstractions/funarr.rs | 2 +- testable-simd-models/src/abstractions/simd.rs | 5 ++-- .../src/core_arch/x86/models/avx.rs | 2 +- .../src/core_arch/x86/models/avx2.rs | 30 ++++--------------- .../src/core_arch/x86/models/sse2.rs | 14 ++------- 6 files changed, 14 insertions(+), 42 deletions(-) diff --git a/testable-simd-models/src/abstractions/bit.rs b/testable-simd-models/src/abstractions/bit.rs index f4c1f99d42093..75cfd7e755ccb 100644 --- a/testable-simd-models/src/abstractions/bit.rs +++ b/testable-simd-models/src/abstractions/bit.rs @@ -70,7 +70,7 @@ impl std::ops::BitXor for Bit { impl std::ops::Not for Bit { type Output = Self; - fn not(self) -> Self { + fn not(self) -> Self { match self { Bit::One => Bit::Zero, Bit::Zero => Bit::One, @@ -78,7 +78,6 @@ impl std::ops::Not for Bit { } } - impl std::ops::Neg for Bit { type Output = Self; fn neg(self) -> Self { diff --git a/testable-simd-models/src/abstractions/funarr.rs b/testable-simd-models/src/abstractions/funarr.rs index a57d6d2fd58d2..c5c5ece29f1f8 100644 --- a/testable-simd-models/src/abstractions/funarr.rs +++ b/testable-simd-models/src/abstractions/funarr.rs @@ -55,7 +55,7 @@ impl FunArray { } } -impl FunArray { +impl FunArray { #[allow(non_snake_case)] pub fn ZERO() -> Self { Self::from_fn(|_| T::ZEROS) diff --git a/testable-simd-models/src/abstractions/simd.rs b/testable-simd-models/src/abstractions/simd.rs index b0193748b554d..622690f263037 100644 --- a/testable-simd-models/src/abstractions/simd.rs +++ b/testable-simd-models/src/abstractions/simd.rs @@ -70,7 +70,6 @@ interpretations!(512; u32x16 [u32; 16], u16x32 [u16; 32], i32x16 [i32; 16], i16x interpretations!(64; i64x1 [i64; 1], i32x2 [i32; 2], i16x4 [i16; 4], i8x8 [i8; 8], u64x1 [u64; 1], u32x2 [u32; 2],u16x4 [u16; 4], u8x8 [u8; 8]); interpretations!(32; i8x4 [i8; 4], u8x4 [u8; 4]); - /// Inserts an element into a vector, returning the updated vector. 
/// /// # Safety @@ -942,6 +941,6 @@ pub fn simd_select>(a:T) -> U { +pub fn transmute>(a: T) -> U { a.into() -} \ No newline at end of file +} diff --git a/testable-simd-models/src/core_arch/x86/models/avx.rs b/testable-simd-models/src/core_arch/x86/models/avx.rs index d114290c44a05..fb7f5fe70cdbe 100644 --- a/testable-simd-models/src/core_arch/x86/models/avx.rs +++ b/testable-simd-models/src/core_arch/x86/models/avx.rs @@ -112,7 +112,7 @@ pub const _CMP_TRUE_US: i32 = 0x1f; /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_si256) pub fn _mm256_permute2f128_si256(a: __m256i, b: __m256i) -> __m256i { - // static_assert_uimm_bits!(IMM8, 8); + // static_assert_uimm_bits!(IMM8, 8); vperm2f128si256(a.as_i32x8(), b.as_i32x8(), IMM8 as i8).into() } diff --git a/testable-simd-models/src/core_arch/x86/models/avx2.rs b/testable-simd-models/src/core_arch/x86/models/avx2.rs index 82efd1fb53f05..cb57efb4d5a9d 100644 --- a/testable-simd-models/src/core_arch/x86/models/avx2.rs +++ b/testable-simd-models/src/core_arch/x86/models/avx2.rs @@ -318,7 +318,7 @@ mod c_extern { } pub fn psllw(a: i16x16, count: i16x8) -> i16x16 { - let count4 = (count[0] as u16) as u64; + let count4 = (count[0] as u16) as u64; let count3 = ((count[1] as u16) as u64) * 65536; let count2 = ((count[2] as u16) as u64) * 4294967296; let count1 = ((count[3] as u16) as u64) * 281474976710656; @@ -714,10 +714,7 @@ pub fn _mm256_add_epi8(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epi8) pub fn _mm256_adds_epi8(a: __m256i, b: __m256i) -> __m256i { - transmute(simd_saturating_add( - a.as_i8x32(), - b.as_i8x32(), - )) + transmute(simd_saturating_add(a.as_i8x32(), b.as_i8x32())) } /// Adds packed 16-bit integers in `a` and `b` using saturation. @@ -725,10 +722,7 @@ pub fn _mm256_adds_epi8(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epi16) pub fn _mm256_adds_epi16(a: __m256i, b: __m256i) -> __m256i { - transmute(simd_saturating_add( - a.as_i16x16(), - b.as_i16x16(), - )) + transmute(simd_saturating_add(a.as_i16x16(), b.as_i16x16())) } /// Adds packed unsigned 8-bit integers in `a` and `b` using saturation. @@ -923,11 +917,7 @@ pub fn _mm256_and_si256(a: __m256i, b: __m256i) -> __m256i { pub fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i { let all_ones = _mm256_set1_epi8(-1); - simd_and( - simd_xor(a.as_i64x4(),all_ones.as_i64x4()), - b.as_i64x4(), - ) - .into() + simd_and(simd_xor(a.as_i64x4(), all_ones.as_i64x4()), b.as_i64x4()).into() } /// Averages packed unsigned 16-bit integers in `a` and `b`. 
@@ -2418,11 +2408,7 @@ pub fn _mm256_unpacklo_epi16(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi32) pub fn _mm256_unpackhi_epi32(a: __m256i, b: __m256i) -> __m256i { - let r: i32x8 = simd_shuffle( - a.as_i32x8(), - b.as_i32x8(), - [2, 10, 3, 11, 6, 14, 7, 15], - ); + let r: i32x8 = simd_shuffle(a.as_i32x8(), b.as_i32x8(), [2, 10, 3, 11, 6, 14, 7, 15]); r.into() } @@ -2431,11 +2417,7 @@ pub fn _mm256_unpackhi_epi32(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi32) pub fn _mm256_unpacklo_epi32(a: __m256i, b: __m256i) -> __m256i { - let r: i32x8 = simd_shuffle( - a.as_i32x8(), - b.as_i32x8(), - [0, 8, 1, 9, 4, 12, 5, 13], - ); + let r: i32x8 = simd_shuffle(a.as_i32x8(), b.as_i32x8(), [0, 8, 1, 9, 4, 12, 5, 13]); r.into() } diff --git a/testable-simd-models/src/core_arch/x86/models/sse2.rs b/testable-simd-models/src/core_arch/x86/models/sse2.rs index 64dc661b9fc8f..711726550ea82 100644 --- a/testable-simd-models/src/core_arch/x86/models/sse2.rs +++ b/testable-simd-models/src/core_arch/x86/models/sse2.rs @@ -815,7 +815,7 @@ pub fn _mm_cmpeq_epi8(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi16) pub fn _mm_cmpeq_epi16(a: __m128i, b: __m128i) -> __m128i { - transmute (simd_eq(a.as_i16x8(), b.as_i16x8())) + transmute(simd_eq(a.as_i16x8(), b.as_i16x8())) } /// Compares packed 32-bit integers in `a` and `b` for equality. @@ -1225,11 +1225,7 @@ pub fn _mm_unpackhi_epi8(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi16) pub fn _mm_unpackhi_epi16(a: __m128i, b: __m128i) -> __m128i { - let x = simd_shuffle( - a.as_i16x8(), - b.as_i16x8(), - [4, 12, 5, 13, 6, 14, 7, 15], - ); + let x = simd_shuffle(a.as_i16x8(), b.as_i16x8(), [4, 12, 5, 13, 6, 14, 7, 15]); (x).into() } @@ -1267,11 +1263,7 @@ pub fn _mm_unpacklo_epi8(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi16) pub fn _mm_unpacklo_epi16(a: __m128i, b: __m128i) -> __m128i { - let x = simd_shuffle( - a.as_i16x8(), - b.as_i16x8(), - [0, 8, 1, 9, 2, 10, 3, 11], - ); + let x = simd_shuffle(a.as_i16x8(), b.as_i16x8(), [0, 8, 1, 9, 2, 10, 3, 11]); x.into() } From 73268aa7c12806487043511639ae316c578c2b43 Mon Sep 17 00:00:00 2001 From: karthikbhargavan Date: Tue, 29 Jul 2025 14:59:45 -0400 Subject: [PATCH 07/47] factor out handwritten models --- testable-simd-models/src/abstractions/bit.rs | 2 +- .../src/abstractions/bitvec.rs | 74 +++++----- .../src/abstractions/funarr.rs | 129 +++++++++++++++--- testable-simd-models/src/abstractions/mod.rs | 1 + testable-simd-models/src/abstractions/simd.rs | 65 +++++---- .../src/core_arch/x86/models/avx.rs | 43 ++---- .../src/core_arch/x86/models/avx2.rs | 14 +- .../src/core_arch/x86/models/mod.rs | 1 + .../src/core_arch/x86/models/ssse3.rs | 8 +- testable-simd-models/src/helpers.rs | 4 +- 10 files changed, 211 insertions(+), 130 deletions(-) diff --git a/testable-simd-models/src/abstractions/bit.rs b/testable-simd-models/src/abstractions/bit.rs index 75cfd7e755ccb..0fd7d6ec78771 100644 --- a/testable-simd-models/src/abstractions/bit.rs +++ 
b/testable-simd-models/src/abstractions/bit.rs @@ -208,7 +208,7 @@ impl Bit { if x >= 0 { Self::of_raw_int(x as u128, nth) } else { - Self::of_raw_int((2i128.pow(T::bits()) + x) as u128, nth) + Self::of_raw_int((2i128.pow(T::bits() as u32) + x) as u128, nth) } } } diff --git a/testable-simd-models/src/abstractions/bitvec.rs b/testable-simd-models/src/abstractions/bitvec.rs index 952c5a21cb1a6..f0384420e5812 100644 --- a/testable-simd-models/src/abstractions/bitvec.rs +++ b/testable-simd-models/src/abstractions/bitvec.rs @@ -15,9 +15,9 @@ use std::fmt::Formatter; /// making the bit pattern more human-readable. The type also implements indexing, /// allowing for easy access to individual bits. #[derive(Copy, Clone, Eq, PartialEq)] -pub struct BitVec(FunArray); +pub struct BitVec(FunArray); -impl BitVec { +impl BitVec { #[allow(non_snake_case)] pub fn ZERO() -> Self { Self::from_fn(|_| Bit::Zero) @@ -40,15 +40,15 @@ fn bit_slice_to_string(bits: &[Bit]) -> String { .into() } -impl core::fmt::Debug for BitVec { +impl core::fmt::Debug for BitVec { fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), std::fmt::Error> { write!(f, "{}", bit_slice_to_string(&self.0.as_vec())) } } -impl core::ops::Index for BitVec { +impl core::ops::Index for BitVec { type Output = Bit; - fn index(&self, index: u32) -> &Self::Output { + fn index(&self, index: usize) -> &Self::Output { self.0.get(index) } } @@ -82,19 +82,19 @@ fn int_from_bit_slice + MachineInteger + Copy>(bits: &[Bit]) -> }; n } -impl BitVec { +impl BitVec { /// Constructor for BitVec. `BitVec::::from_fn` constructs a bitvector out of a function that takes usizes smaller than `N` and produces bits. - pub fn from_fn Bit>(f: F) -> Self { + pub fn from_fn Bit>(f: F) -> Self { Self(FunArray::from_fn(f)) } /// Convert a slice of machine integers where only the `d` least significant bits are relevant. - pub fn from_slice + MachineInteger + Copy>(x: &[T], d: u32) -> Self { + pub fn from_slice + MachineInteger + Copy>(x: &[T], d: usize) -> Self { Self::from_fn(|i| Bit::of_int::(x[(i / d) as usize], (i % d) as u32)) } /// Construct a BitVec out of a machine integer. pub fn from_int + MachineInteger + Copy>(n: T) -> Self { - Self::from_slice::(&[n], T::bits() as u32) + Self::from_slice::(&[n], T::bits() as usize) } /// Convert a BitVec into a machine integer of type `T`. 
@@ -122,34 +122,34 @@ impl BitVec { } } -impl BitVec { - pub fn chunked_shift( - self, - shl: FunArray, - ) -> BitVec { - fn chunked_shift( - bitvec: BitVec, - shl: FunArray, - ) -> BitVec { - BitVec::from_fn(|i| { - let nth_bit = i % CHUNK; - let nth_chunk = i / CHUNK; - let shift: i128 = if nth_chunk < SHIFTS { - shl[nth_chunk] - } else { - 0 - }; - let local_index = (nth_bit as i128).wrapping_sub(shift); - if local_index < CHUNK as i128 && local_index >= 0 { - let local_index = local_index as u32; - bitvec[nth_chunk * CHUNK + local_index] - } else { - Bit::Zero - } - }) - } - chunked_shift::(self, shl) - } +impl BitVec { + // pub fn chunked_shift( + // self, + // shl: FunArray, + // ) -> BitVec { + // fn chunked_shift( + // bitvec: BitVec, + // shl: FunArray, + // ) -> BitVec { + // BitVec::from_fn(|i| { + // let nth_bit = i % CHUNK; + // let nth_chunk = i / CHUNK; + // let shift: i128 = if nth_chunk < SHIFTS { + // shl[nth_chunk] + // } else { + // 0 + // }; + // let local_index = (nth_bit as i128).wrapping_sub(shift); + // if local_index < CHUNK && local_index >= 0 { + // let local_index = local_index as u32; + // bitvec[nth_chunk * CHUNK + local_index] + // } else { + // Bit::Zero + // } + // }) + // } + // chunked_shift::(self, shl) + // } /// Folds over the array, accumulating a result. /// diff --git a/testable-simd-models/src/abstractions/funarr.rs b/testable-simd-models/src/abstractions/funarr.rs index c5c5ece29f1f8..95ad1ca04ecc7 100644 --- a/testable-simd-models/src/abstractions/funarr.rs +++ b/testable-simd-models/src/abstractions/funarr.rs @@ -4,26 +4,18 @@ use crate::abstractions::bit::MachineInteger; /// `FunArray` represents an array of `T` values of length `N`, where `N` is a compile-time constant. -/// Internally, it uses a fixed-length array of `Option` with a maximum capacity of 512 elements. -/// Unused elements beyond `N` are filled with `None`. #[derive(Copy, Clone, Eq, PartialEq)] -pub struct FunArray([Option; 512]); +pub struct FunArray([Option; N]); -impl FunArray { +impl FunArray { /// Gets a reference to the element at index `i`. - pub fn get(&self, i: u32) -> &T { - self.0[i as usize].as_ref().unwrap() + pub fn get(&self, i: usize) -> &T { + self.0[i].as_ref().unwrap() } /// Constructor for FunArray. `FunArray::from_fn` constructs a funarray out of a function that takes usizes smaller than `N` and produces an element of type T. 
- pub fn from_fn T>(f: F) -> Self { + pub fn from_fn T>(f: F) -> Self { // let vec = (0..N).map(f).collect(); - let arr = core::array::from_fn(|i| { - if (i as u32) < N { - Some(f(i as u32)) - } else { - None - } - }); + let arr = core::array::from_fn(|i| if i < N { Some(f(i)) } else { None }); Self(arr) } @@ -55,17 +47,17 @@ impl FunArray { } } -impl FunArray { +impl FunArray { #[allow(non_snake_case)] pub fn ZERO() -> Self { Self::from_fn(|_| T::ZEROS) } } -impl TryFrom> for FunArray { +impl TryFrom> for FunArray { type Error = (); fn try_from(v: Vec) -> Result { - if (v.len() as u32) < N { + if v.len() < N { Err(()) } else { Ok(Self::from_fn(|i| v[i as usize].clone())) @@ -73,16 +65,113 @@ impl TryFrom> for FunArray { } } -impl core::fmt::Debug for FunArray { +impl core::fmt::Debug for FunArray { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { write!(f, "{:?}", self.as_vec()) } } -impl core::ops::Index for FunArray { +impl core::ops::Index for FunArray { type Output = T; - fn index(&self, index: u32) -> &Self::Output { + fn index(&self, index: usize) -> &Self::Output { self.get(index) } } + +impl FunArray<1, T> { + pub fn new(x: T) -> Self { + let v = [x]; + Self::from_fn(|i| v[i]) + } +} + +impl FunArray<2, T> { + pub fn new(x0: T, x1: T) -> Self { + let v = [x0, x1]; + Self::from_fn(|i| v[i]) + } +} + +impl FunArray<4, T> { + pub fn new(x0: T, x1: T, x2: T, x3: T) -> Self { + let v = [x0, x1, x2, x3]; + Self::from_fn(|i| v[i]) + } +} + +impl FunArray<8, T> { + pub fn new(x0: T, x1: T, x2: T, x3: T, x4: T, x5: T, x6: T, x7: T) -> Self { + let v = [x0, x1, x2, x3, x4, x5, x6, x7]; + Self::from_fn(|i| v[i]) + } +} + +impl FunArray<16, T> { + pub fn new( + x0: T, + x1: T, + x2: T, + x3: T, + x4: T, + x5: T, + x6: T, + x7: T, + x8: T, + x9: T, + x10: T, + x11: T, + x12: T, + x13: T, + x14: T, + x15: T, + ) -> Self { + let v = [ + x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, + ]; + Self::from_fn(|i| v[i]) + } +} + +impl FunArray<32, T> { + pub fn new( + x0: T, + x1: T, + x2: T, + x3: T, + x4: T, + x5: T, + x6: T, + x7: T, + x8: T, + x9: T, + x10: T, + x11: T, + x12: T, + x13: T, + x14: T, + x15: T, + x16: T, + x17: T, + x18: T, + x19: T, + x20: T, + x21: T, + x22: T, + x23: T, + x24: T, + x25: T, + x26: T, + x27: T, + x28: T, + x29: T, + x30: T, + x31: T, + ) -> Self { + let v = [ + x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x18, + x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, x30, x31, + ]; + Self::from_fn(|i| v[i]) + } +} diff --git a/testable-simd-models/src/abstractions/mod.rs b/testable-simd-models/src/abstractions/mod.rs index b3018a8189569..b8b4de83d877d 100644 --- a/testable-simd-models/src/abstractions/mod.rs +++ b/testable-simd-models/src/abstractions/mod.rs @@ -23,4 +23,5 @@ pub mod bit; pub mod bitvec; pub mod funarr; +#[macro_use] pub mod simd; diff --git a/testable-simd-models/src/abstractions/simd.rs b/testable-simd-models/src/abstractions/simd.rs index 622690f263037..17997cc79be1d 100644 --- a/testable-simd-models/src/abstractions/simd.rs +++ b/testable-simd-models/src/abstractions/simd.rs @@ -20,7 +20,7 @@ macro_rules! 
interpretations { #[doc = concat!("Conversion from ", stringify!($ty), " vectors of size ", stringify!($m), "to bit vectors of size ", stringify!($n))] pub fn [< from_ $name >](iv: $name) -> BitVec<$n> { let vec: Vec<$ty> = iv.as_vec(); - Self::from_slice(&vec[..], <$ty>::bits() as u32) + Self::from_slice(&vec[..], <$ty>::bits() as usize) } #[doc = concat!("Conversion from bit vectors of size ", stringify!($n), " to ", stringify!($ty), " vectors of size ", stringify!($m))] pub fn [< to_ $name >](bv: BitVec<$n>) -> $name { @@ -75,8 +75,8 @@ interpretations!(32; i8x4 [i8; 4], u8x4 [u8; 4]); /// # Safety /// /// `idx` must be in-bounds of the vector, ie. idx < N -pub fn simd_insert(x: FunArray, idx: u32, val: T) -> FunArray { - FunArray::from_fn(|i| if i == idx { val } else { x[i] }) +pub fn simd_insert(x: FunArray, idx: u32, val: T) -> FunArray { + FunArray::from_fn(|i| if i == idx as usize { val } else { x[i] }) } /// Extracts an element from a vector. @@ -84,12 +84,12 @@ pub fn simd_insert(x: FunArray, idx: u32, val: T) - /// # Safety /// /// `idx` must be in-bounds of the vector, ie. idx < N -pub fn simd_extract(x: FunArray, idx: u32) -> T { - x.get(idx).clone() +pub fn simd_extract(x: FunArray, idx: u32) -> T { + x.get(idx as usize).clone() } /// Adds two vectors elementwise with wrapping on overflow/underflow. -pub fn simd_add( +pub fn simd_add( x: FunArray, y: FunArray, ) -> FunArray { @@ -97,7 +97,7 @@ pub fn simd_add( } /// Subtracts `rhs` from `lhs` elementwise with wrapping on overflow/underflow. -pub fn simd_sub( +pub fn simd_sub( x: FunArray, y: FunArray, ) -> FunArray { @@ -105,7 +105,7 @@ pub fn simd_sub( } /// Multiplies two vectors elementwise with wrapping on overflow/underflow. -pub fn simd_mul( +pub fn simd_mul( x: FunArray, y: FunArray, ) -> FunArray { @@ -115,14 +115,14 @@ pub fn simd_mul( /// Produces the elementwise absolute values. /// For vectors of unsigned integers it returns the vector untouched. /// If the element is the minimum value of a signed integer, it returns the element as is. -pub fn simd_abs(x: FunArray) -> FunArray { +pub fn simd_abs(x: FunArray) -> FunArray { FunArray::from_fn(|i| x[i].absolute_val()) } /// Produces the elementwise absolute difference of two vectors. /// Note: Absolute difference in this case is simply the element with the smaller value subtracted from the element with the larger value, with overflow/underflow. /// For example, if the elements are i8, the absolute difference of 255 and -2 is -255. -pub fn simd_abs_diff( +pub fn simd_abs_diff( x: FunArray, y: FunArray, ) -> FunArray { @@ -134,7 +134,7 @@ pub fn simd_abs_diff( /// # Safety /// /// Each element of `rhs` must be less than `::BITS`. -pub fn simd_shl( +pub fn simd_shl( x: FunArray, y: FunArray, ) -> FunArray::Output> { @@ -149,7 +149,7 @@ pub fn simd_shl( /// /// Each element of `rhs` must be less than `::BITS`. -pub fn simd_shr( +pub fn simd_shr( x: FunArray, y: FunArray, ) -> FunArray::Output> { @@ -158,7 +158,7 @@ pub fn simd_shr( /// "Ands" vectors elementwise. -pub fn simd_and( +pub fn simd_and( x: FunArray, y: FunArray, ) -> FunArray::Output> { @@ -167,7 +167,7 @@ pub fn simd_and( /// "Ors" vectors elementwise. -pub fn simd_or( +pub fn simd_or( x: FunArray, y: FunArray, ) -> FunArray::Output> { @@ -176,7 +176,7 @@ pub fn simd_or( /// "Exclusive ors" vectors elementwise. 
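For the elementwise operations modelled in this hunk, the lane-by-lane wrapping semantics can be sketched with plain arrays; `wrapping_add_lanes` below is an illustrative stand-in for `simd_add`, not the crate's API.

    // Each output lane is the wrapping sum of the corresponding input lanes.
    fn wrapping_add_lanes<const N: usize>(x: [u8; N], y: [u8; N]) -> [u8; N] {
        core::array::from_fn(|i| x[i].wrapping_add(y[i]))
    }

    fn main() {
        let a = [250u8, 1, 2, 3];
        let b = [10u8, 1, 1, 1];
        // Lane 0 wraps: 250 + 10 = 260, which is 4 modulo 256.
        assert_eq!(wrapping_add_lanes(a, b), [4, 2, 3, 4]);
    }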
-pub fn simd_xor( +pub fn simd_xor( x: FunArray, y: FunArray, ) -> FunArray::Output> { @@ -330,7 +330,9 @@ self_impls!(u8, u16, u32, u64, u128, i8, i16, i32, i64, i128); /// /// When casting from a wider number to a smaller number, the higher bits are removed. /// Otherwise, it extends the number, following signedness. -pub fn simd_cast>(x: FunArray) -> FunArray { +pub fn simd_cast>( + x: FunArray, +) -> FunArray { FunArray::from_fn(|i| T2::cast(x[i])) } @@ -338,7 +340,7 @@ pub fn simd_cast>(x: FunArray) /// /// Rust panics for `-::Min` due to overflow, but here, it just returns the element as is. -pub fn simd_neg::Output> + MachineInteger + Eq + Neg + Copy>( +pub fn simd_neg::Output> + MachineInteger + Eq + Neg + Copy>( x: FunArray, ) -> FunArray { FunArray::from_fn(|i| { @@ -353,7 +355,7 @@ pub fn simd_neg::Output> + MachineInteger + Eq /// /// Returns `0` (all zeros) for false and `!0` (all ones) for true. -pub fn simd_eq( +pub fn simd_eq( x: FunArray, y: FunArray, ) -> FunArray { @@ -364,7 +366,7 @@ pub fn simd_eq( /// /// Returns `0` (all zeros) for false and `!0` (all ones) for true. -pub fn simd_ne( +pub fn simd_ne( x: FunArray, y: FunArray, ) -> FunArray { @@ -375,7 +377,7 @@ pub fn simd_ne( /// /// Returns `0` (all zeros) for false and `!0` (all ones) for true. -pub fn simd_lt( +pub fn simd_lt( x: FunArray, y: FunArray, ) -> FunArray { @@ -386,7 +388,7 @@ pub fn simd_lt( /// /// Returns `0` (all zeros) for false and `!0` (all ones) for true. -pub fn simd_le( +pub fn simd_le( x: FunArray, y: FunArray, ) -> FunArray { @@ -397,7 +399,7 @@ pub fn simd_le( /// /// Returns `0` (all zeros) for false and `!0` (all ones) for true. -pub fn simd_gt( +pub fn simd_gt( x: FunArray, y: FunArray, ) -> FunArray { @@ -408,7 +410,7 @@ pub fn simd_gt( /// /// Returns `0` (all zeros) for false and `!0` (all ones) for true. -pub fn simd_ge( +pub fn simd_ge( x: FunArray, y: FunArray, ) -> FunArray { @@ -418,13 +420,13 @@ pub fn simd_ge( /// Shuffles two vectors by the indices in idx. /// /// For safety, `N2 <= N1 + N3` must hold. -pub fn simd_shuffle( +pub fn simd_shuffle( x: FunArray, y: FunArray, idx: [u32; N2], ) -> FunArray { FunArray::from_fn(|i| { - let i = idx[i as usize]; + let i = idx[i] as usize; if i < N1 { x[i] } else { @@ -435,7 +437,7 @@ pub fn simd_shuffle( /// Adds two vectors elementwise, with saturation. -pub fn simd_saturating_add( +pub fn simd_saturating_add( x: FunArray, y: FunArray, ) -> FunArray { @@ -444,7 +446,7 @@ pub fn simd_saturating_add( /// Subtracts `y` from `x` elementwise, with saturation. -pub fn simd_saturating_sub( +pub fn simd_saturating_sub( x: FunArray, y: FunArray, ) -> FunArray { @@ -926,7 +928,7 @@ pub(crate) use simd_bitmask_big; /// # Safety /// `mask` must only contain `0` and `!0`. -pub fn simd_select( +pub fn simd_select( mask: FunArray, if_true: FunArray, if_false: FunArray, @@ -944,3 +946,10 @@ pub fn simd_select>(a: T) -> U { a.into() } + +#[macro_export] +macro_rules! static_assert_uimm_bits { + ($imm:ident, $size:literal) => {}; +} + +pub use static_assert_uimm_bits; diff --git a/testable-simd-models/src/core_arch/x86/models/avx.rs b/testable-simd-models/src/core_arch/x86/models/avx.rs index fb7f5fe70cdbe..2402488c25c75 100644 --- a/testable-simd-models/src/core_arch/x86/models/avx.rs +++ b/testable-simd-models/src/core_arch/x86/models/avx.rs @@ -13,26 +13,10 @@ //! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf //! 
[wiki]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions +use super::avx_handwritten::*; use super::types::*; use crate::abstractions::simd::*; -mod c_extern { - use crate::abstractions::simd::*; - - pub fn vperm2f128si256(a: i32x8, b: i32x8, imm8: i8) -> i32x8 { - let temp = i128x2::from_fn(|i| match (imm8 as u8) >> (i * 4) { - 0 => (a[4 * i] as i128) + 16 * (a[4 * i + 1] as i128), - 1 => (a[4 * i + 2] as i128) + 16 * (a[4 * i + 3] as i128), - 2 => (b[4 * i] as i128) + 16 * (b[4 * i + 1] as i128), - 3 => (b[4 * i + 2] as i128) + 16 * (b[4 * i + 3] as i128), - _ => unreachable!(), - }); - - i32x8::from_fn(|i| (temp[if i < 4 { 0 } else { 1 }] >> (i % 4)) as i32) - } -} - -use c_extern::*; /// Blends packed single-precision (32-bit) floating-point elements from /// `a` and `b` using `c` as a mask. /// @@ -112,7 +96,7 @@ pub const _CMP_TRUE_US: i32 = 0x1f; /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_si256) pub fn _mm256_permute2f128_si256(a: __m256i, b: __m256i) -> __m256i { - // static_assert_uimm_bits!(IMM8, 8); + static_assert_uimm_bits!(IMM8, 8); vperm2f128si256(a.as_i32x8(), b.as_i32x8(), IMM8 as i8).into() } @@ -163,12 +147,7 @@ pub fn _mm256_insert_epi16(a: __m256i, i: i16) -> __m256i { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_si256) pub fn _mm256_testz_si256(a: __m256i, b: __m256i) -> i32 { - let c = __m256i::from_fn(|i| a[i] & b[i]); - if c == __m256i::ZERO() { - 1 - } else { - 0 - } + ptestz256(a.as_i64x4(), b.as_i64x4()) } /// Sets each bit of the returned mask based on the most significant bit of the @@ -240,11 +219,10 @@ pub fn _mm256_set_epi8( e30: i8, e31: i8, ) -> __m256i { - let vec = [ + transmute(i8x32::new( e00, e01, e02, e03, e04, e05, e06, e07, e08, e09, e10, e11, e12, e13, e14, e15, e16, e17, e18, e19, e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31, - ]; - transmute(i8x32::from_fn(|i| vec[(31 - i) as usize])) + )) } /// Sets packed 16-bit integers in returned vector with the supplied values. @@ -271,10 +249,9 @@ pub fn _mm256_set_epi16( e14: i16, e15: i16, ) -> __m256i { - let vec = [ + transmute(i16x16::new( e00, e01, e02, e03, e04, e05, e06, e07, e08, e09, e10, e11, e12, e13, e14, e15, - ]; - transmute(i16x16::from_fn(|i| vec[(15 - i) as usize])) + )) } /// Sets packed 32-bit integers in returned vector with the supplied values. @@ -293,8 +270,7 @@ pub fn _mm256_set_epi32( e6: i32, e7: i32, ) -> __m256i { - let vec = [e0, e1, e2, e3, e4, e5, e6, e7]; - transmute(i32x8::from_fn(|i| vec[(7 - i) as usize])) + transmute(i32x8::new(e0, e1, e2, e3, e4, e5, e6, e7)) } /// Sets packed 64-bit integers in returned vector with the supplied values. @@ -302,8 +278,7 @@ pub fn _mm256_set_epi32( /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi64x) // This intrinsic has no corresponding instruction. pub fn _mm256_set_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i { - let vec = [d, c, b, a]; - transmute(i64x4::from_fn(|i| vec[i as usize])) + transmute(i64x4::new(d, c, b, a)) } /// Broadcasts 8-bit integer `a` to all elements of returned vector. 
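The `_mm256_set_*` models in this hunk take their arguments highest lane first, so the last argument ends up in lane 0 (the least-significant element). A standalone sketch of that ordering, assuming only what the reversed-index construction above already shows; `set_epi64x` here is an illustrative stand-in, not the intrinsic itself.

    // Arguments are written e3..e0, but lane 0 holds e0.
    fn set_epi64x(e3: i64, e2: i64, e1: i64, e0: i64) -> [i64; 4] {
        [e0, e1, e2, e3]
    }

    fn main() {
        let v = set_epi64x(3, 2, 1, 0);
        assert_eq!(v, [0, 1, 2, 3]);
    }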
diff --git a/testable-simd-models/src/core_arch/x86/models/avx2.rs b/testable-simd-models/src/core_arch/x86/models/avx2.rs index cb57efb4d5a9d..3327be549bf5c 100644 --- a/testable-simd-models/src/core_arch/x86/models/avx2.rs +++ b/testable-simd-models/src/core_arch/x86/models/avx2.rs @@ -540,14 +540,14 @@ mod c_extern { if b[i] > 127 { 0 } else { - let index = (b[i] % 16) as u32; + let index = (b[i] % 16) as usize; a[index] } } else { if b[i] > 127 { 0 } else { - let index = (b[i] % 16) as u32; + let index = (b[i] % 16) as usize; a[index + 16] } } @@ -556,7 +556,7 @@ mod c_extern { pub fn permd(a: u32x8, b: u32x8) -> u32x8 { u32x8::from_fn(|i| { - let id = b[i] % 8; + let id = (b[i] % 8) as usize; a[id] }) } @@ -564,8 +564,8 @@ mod c_extern { pub fn mpsadbw(a: u8x32, b: u8x32, imm8: i32) -> u16x16 { u16x16::from_fn(|i| { if i < 8 { - let a_offset = (((imm8 & 4) >> 2) * 4) as u32; - let b_offset = ((imm8 & 3) * 4) as u32; + let a_offset = (((imm8 & 4) >> 2) * 4) as usize; + let b_offset = ((imm8 & 3) * 4) as usize; let k = a_offset + i; let l = b_offset; ((a[k].absolute_diff(b[l]) as i8) as u8 as u16) @@ -575,8 +575,8 @@ mod c_extern { } else { let i = i - 8; let imm8 = imm8 >> 3; - let a_offset = (((imm8 & 4) >> 2) * 4) as u32; - let b_offset = ((imm8 & 3) * 4) as u32; + let a_offset = (((imm8 & 4) >> 2) * 4) as usize; + let b_offset = ((imm8 & 3) * 4) as usize; let k = a_offset + i; let l = b_offset; ((a[16 + k].absolute_diff(b[16 + l]) as i8) as u8 as u16) diff --git a/testable-simd-models/src/core_arch/x86/models/mod.rs b/testable-simd-models/src/core_arch/x86/models/mod.rs index 95c9eb4061b6a..fc9b88d997e08 100644 --- a/testable-simd-models/src/core_arch/x86/models/mod.rs +++ b/testable-simd-models/src/core_arch/x86/models/mod.rs @@ -22,6 +22,7 @@ pub mod avx; pub mod avx2; +pub mod avx_handwritten; pub mod sse2; pub mod ssse3; diff --git a/testable-simd-models/src/core_arch/x86/models/ssse3.rs b/testable-simd-models/src/core_arch/x86/models/ssse3.rs index 7fc29262232a9..ff2c7713540f5 100644 --- a/testable-simd-models/src/core_arch/x86/models/ssse3.rs +++ b/testable-simd-models/src/core_arch/x86/models/ssse3.rs @@ -7,7 +7,13 @@ use super::types::*; mod c_extern { use crate::abstractions::simd::*; pub fn pshufb128(a: u8x16, b: u8x16) -> u8x16 { - u8x16::from_fn(|i| if b[i] > 127 { 0 } else { a[(b[i] % 16) as u32] }) + u8x16::from_fn(|i| { + if b[i] > 127 { + 0 + } else { + a[(b[i] % 16) as usize] + } + }) } pub fn phaddw128(a: i16x8, b: i16x8) -> i16x8 { diff --git a/testable-simd-models/src/helpers.rs b/testable-simd-models/src/helpers.rs index c2ea42b4b7ed6..d646d49f5dc64 100644 --- a/testable-simd-models/src/helpers.rs +++ b/testable-simd-models/src/helpers.rs @@ -38,13 +38,13 @@ pub mod test { crate::abstractions::bit::Bit::from(bool::random()) } } - impl HasRandom for BitVec { + impl HasRandom for BitVec { fn random() -> Self { Self::from_fn(|_| Bit::random()) } } - impl HasRandom for FunArray { + impl HasRandom for FunArray { fn random() -> Self { FunArray::from_fn(|_| T::random()) } From 4a228fb42871cdabd4199fc0480b157ac2bf0315 Mon Sep 17 00:00:00 2001 From: karthikbhargavan Date: Tue, 29 Jul 2025 16:44:47 -0400 Subject: [PATCH 08/47] fmt --- testable-simd-models/src/abstractions/simd.rs | 6 ++--- .../src/core_arch/x86/models/avx.rs | 14 ++++------- .../core_arch/x86/models/avx_handwritten.rs | 24 +++++++++++++++++++ .../src/core_arch/x86/models/mod.rs | 5 +++- 4 files changed, 36 insertions(+), 13 deletions(-) create mode 100644 
testable-simd-models/src/core_arch/x86/models/avx_handwritten.rs diff --git a/testable-simd-models/src/abstractions/simd.rs b/testable-simd-models/src/abstractions/simd.rs index 17997cc79be1d..d16763b19a17d 100644 --- a/testable-simd-models/src/abstractions/simd.rs +++ b/testable-simd-models/src/abstractions/simd.rs @@ -62,12 +62,12 @@ macro_rules! interpretations { } interpretations!(256; i32x8 [i32; 8], i64x4 [i64; 4], i16x16 [i16; 16], i128x2 [i128; 2], i8x32 [i8; 32], - u32x8 [u32; 8], u64x4 [u64; 4], u16x16 [u16; 16], u8x32 [u8; 32]); + u32x8 [u32; 8], u64x4 [u64; 4], u16x16 [u16; 16], u8x32 [u8; 32], f32x8 [f32; 8], f64x4 [f64; 4]); interpretations!(128; i32x4 [i32; 4], i64x2 [i64; 2], i16x8 [i16; 8], i128x1 [i128; 1], i8x16 [i8; 16], - u32x4 [u32; 4], u64x2 [u64; 2], u16x8 [u16; 8], u8x16 [u8; 16]); + u32x4 [u32; 4], u64x2 [u64; 2], u16x8 [u16; 8], u8x16 [u8; 16], f32x4 [f32; 4], f64x2 [f64; 2]); interpretations!(512; u32x16 [u32; 16], u16x32 [u16; 32], i32x16 [i32; 16], i16x32 [i16; 32]); -interpretations!(64; i64x1 [i64; 1], i32x2 [i32; 2], i16x4 [i16; 4], i8x8 [i8; 8], u64x1 [u64; 1], u32x2 [u32; 2],u16x4 [u16; 4], u8x8 [u8; 8]); +interpretations!(64; i64x1 [i64; 1], i32x2 [i32; 2], i16x4 [i16; 4], i8x8 [i8; 8], u64x1 [u64; 1], u32x2 [u32; 2],u16x4 [u16; 4], u8x8 [u8; 8], f32x2 [f32; 2]); interpretations!(32; i8x4 [i8; 4], u8x4 [u8; 4]); /// Inserts an element into a vector, returning the updated vector. diff --git a/testable-simd-models/src/core_arch/x86/models/avx.rs b/testable-simd-models/src/core_arch/x86/models/avx.rs index 2402488c25c75..20516f42c0683 100644 --- a/testable-simd-models/src/core_arch/x86/models/avx.rs +++ b/testable-simd-models/src/core_arch/x86/models/avx.rs @@ -13,9 +13,9 @@ //! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf //! [wiki]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions -use super::avx_handwritten::*; use super::types::*; use crate::abstractions::simd::*; +use super::avx_handwritten::*; /// Blends packed single-precision (32-bit) floating-point elements from /// `a` and `b` using `c` as a mask. @@ -147,7 +147,7 @@ pub fn _mm256_insert_epi16(a: __m256i, i: i16) -> __m256i { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_si256) pub fn _mm256_testz_si256(a: __m256i, b: __m256i) -> i32 { - ptestz256(a.as_i64x4(), b.as_i64x4()) + ptestz256(a.as_i64x4(), b.as_i64x4()) } /// Sets each bit of the returned mask based on the most significant bit of the @@ -219,10 +219,8 @@ pub fn _mm256_set_epi8( e30: i8, e31: i8, ) -> __m256i { - transmute(i8x32::new( - e00, e01, e02, e03, e04, e05, e06, e07, e08, e09, e10, e11, e12, e13, e14, e15, e16, e17, - e18, e19, e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31, - )) + transmute(i8x32::new(e00, e01, e02, e03, e04, e05, e06, e07, e08, e09, e10, e11, e12, e13, e14, e15, e16, e17, + e18, e19, e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31,)) } /// Sets packed 16-bit integers in returned vector with the supplied values. @@ -249,9 +247,7 @@ pub fn _mm256_set_epi16( e14: i16, e15: i16, ) -> __m256i { - transmute(i16x16::new( - e00, e01, e02, e03, e04, e05, e06, e07, e08, e09, e10, e11, e12, e13, e14, e15, - )) + transmute(i16x16::new(e00, e01, e02, e03, e04, e05, e06, e07, e08, e09, e10, e11, e12, e13, e14, e15)) } /// Sets packed 32-bit integers in returned vector with the supplied values. 
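The `ptestz256` helper that `_mm256_testz_si256` now calls models the ZF result of the underlying test instruction: it reports 1 exactly when the bitwise AND of the two inputs is all zeros. A dependency-free sketch of that check; the function name is illustrative.

    // Returns 1 when no bit is set in both inputs, 0 otherwise.
    fn testz(a: [i64; 4], b: [i64; 4]) -> i32 {
        let all_zero = a.iter().zip(b.iter()).all(|(x, y)| x & y == 0);
        if all_zero { 1 } else { 0 }
    }

    fn main() {
        assert_eq!(testz([1, 0, 0, 0], [2, 0, 0, 0]), 1); // disjoint bits
        assert_eq!(testz([3, 0, 0, 0], [2, 0, 0, 0]), 0); // bit 1 overlaps
    }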
diff --git a/testable-simd-models/src/core_arch/x86/models/avx_handwritten.rs b/testable-simd-models/src/core_arch/x86/models/avx_handwritten.rs new file mode 100644 index 0000000000000..d6427e3bc7a4b --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/models/avx_handwritten.rs @@ -0,0 +1,24 @@ +//! LLVM intrinsics used in AVX + +use crate::abstractions::simd::*; + +pub fn vperm2f128si256(a: i32x8, b: i32x8, imm8: i8) -> i32x8 { + let temp = i128x2::from_fn(|i| match (imm8 as u8) >> (i * 4) { + 0 => (a[4 * i] as i128) + 16 * (a[4 * i + 1] as i128), + 1 => (a[4 * i + 2] as i128) + 16 * (a[4 * i + 3] as i128), + 2 => (b[4 * i] as i128) + 16 * (b[4 * i + 1] as i128), + 3 => (b[4 * i + 2] as i128) + 16 * (b[4 * i + 3] as i128), + _ => unreachable!(), + }); + + i32x8::from_fn(|i| (temp[if i < 4 { 0 } else { 1 }] >> (i % 4)) as i32) +} + +pub fn ptestz256(a: i64x4, b: i64x4) -> i32 { + let c = i64x4::from_fn(|i| a[i] & b[i]); + if c == i64x4::ZERO() { + 1 + } else { + 0 + } +} diff --git a/testable-simd-models/src/core_arch/x86/models/mod.rs b/testable-simd-models/src/core_arch/x86/models/mod.rs index fc9b88d997e08..a65223cee0d14 100644 --- a/testable-simd-models/src/core_arch/x86/models/mod.rs +++ b/testable-simd-models/src/core_arch/x86/models/mod.rs @@ -28,11 +28,14 @@ pub mod ssse3; pub(crate) mod types { use crate::abstractions::bitvec::*; + use crate::abstractions::simd::*; #[allow(non_camel_case_types)] pub type __m256i = BitVec<256>; #[allow(non_camel_case_types)] - pub type __m256 = BitVec<256>; + pub type __m256d = f64x4; + #[allow(non_camel_case_types)] + pub type __m256 = f32x8; #[allow(non_camel_case_types)] pub type __m128i = BitVec<128>; } From a21609d36695c2816221dbaef3e89c83b928075b Mon Sep 17 00:00:00 2001 From: karthikbhargavan Date: Wed, 30 Jul 2025 05:54:03 -0400 Subject: [PATCH 09/47] test --- testable-simd-models/src/abstractions/bit.rs | 63 ++++++++++++++++--- .../src/abstractions/bitvec.rs | 43 ++----------- testable-simd-models/src/abstractions/simd.rs | 2 +- .../src/core_arch/x86/models/mod.rs | 5 +- 4 files changed, 65 insertions(+), 48 deletions(-) diff --git a/testable-simd-models/src/abstractions/bit.rs b/testable-simd-models/src/abstractions/bit.rs index 0fd7d6ec78771..45b3147694212 100644 --- a/testable-simd-models/src/abstractions/bit.rs +++ b/testable-simd-models/src/abstractions/bit.rs @@ -132,6 +132,10 @@ pub trait MachineInteger { /// Maximum value of the integer type. const MAX: Self; + /// Casts integer to u128 + fn to_u128(self) -> u128; + /// Casts u128 to integer + fn from_u128(x:u128) -> Self; /// Implements functionality for `simd_add` in `crate::abstractions::simd`. fn wrapping_add(self, rhs: Self) -> Self; /// Implements functionality for `simd_sub` in `crate::abstractions::simd`. @@ -158,6 +162,8 @@ macro_rules! generate_imachine_integer_impls { const MIN: $ty = $ty::MIN; const MAX: $ty = $ty::MAX; fn bits() -> u32 { $ty::BITS } + fn to_u128(self) -> u128 {self as u128} + fn from_u128(x:u128) -> Self {x as $ty} fn wrapping_add(self, rhs: Self) -> Self { self.wrapping_add(rhs) } fn wrapping_sub(self, rhs: Self) -> Self { self.wrapping_sub(rhs) } fn overflowing_mul(self, rhs: Self) -> Self { self.overflowing_mul(rhs).0 } @@ -181,6 +187,8 @@ macro_rules! 
generate_umachine_integer_impls { fn bits() -> u32 { $ty::BITS } + fn to_u128(self) -> u128 {self as u128} + fn from_u128(x:u128) -> Self {x as $ty} fn wrapping_add(self, rhs: Self) -> Self { self.wrapping_add(rhs) } fn wrapping_sub(self, rhs: Self) -> Self { self.wrapping_sub(rhs) } fn overflowing_mul(self, rhs: Self) -> Self { self.overflowing_mul(rhs).0 } @@ -194,6 +202,51 @@ macro_rules! generate_umachine_integer_impls { generate_imachine_integer_impls!(i8, i16, i32, i64, i128); generate_umachine_integer_impls!(u8, u16, u32, u64, u128); +macro_rules! unsupported_op { + () => { panic!("unsupported operation")} +} + +impl MachineInteger for f32 { + const SIGNED: bool = true; + const ZEROS: f32 = 0.0; + const ONES: f32 = !0u32 as f32; + const MIN: f32 = f32::MIN; + const MAX: f32 = f32::MAX; + + + fn bits() -> u32 { 32 } + fn to_u128(self) -> u128 {self as u128} + fn from_u128(x:u128) -> Self {x as f32} + fn wrapping_add(self, _rhs: Self) -> Self { unsupported_op!() } + fn wrapping_sub(self, _rhs: Self) -> Self { unsupported_op!() } + fn overflowing_mul(self, _rhs: Self) -> Self { unsupported_op!() } + fn saturating_add(self, _rhs: Self) -> Self { unsupported_op!()} + fn saturating_sub(self, _rhs: Self) -> Self { unsupported_op!()} + fn absolute_diff(self, rhs: Self) -> Self {if self > rhs {self - rhs} else {rhs - self}} + fn absolute_val(self) -> Self {self.abs()} +} + +impl MachineInteger for f64 { + const SIGNED: bool = true; + const ZEROS: f64 = 0.0; + const ONES: f64 = !0u64 as f64; + const MIN: f64 = f64::MIN; + const MAX: f64 = f64::MAX; + + + fn bits() -> u32 { 32 } + fn to_u128(self) -> u128 {self as u128} + fn from_u128(x:u128) -> Self {x as f64} + fn wrapping_add(self, _rhs: Self) -> Self { unsupported_op!() } + fn wrapping_sub(self, _rhs: Self) -> Self { unsupported_op!() } + fn overflowing_mul(self, _rhs: Self) -> Self { unsupported_op!() } + fn saturating_add(self, _rhs: Self) -> Self { unsupported_op!()} + fn saturating_sub(self, _rhs: Self) -> Self { unsupported_op!()} + fn absolute_diff(self, rhs: Self) -> Self {if self > rhs {self - rhs} else {rhs - self}} + fn absolute_val(self) -> Self {self.abs()} +} + + impl Bit { fn of_raw_int(x: u128, nth: u32) -> Self { if x / 2u128.pow(nth) % 2 == 1 { @@ -203,12 +256,8 @@ impl Bit { } } - pub fn of_int + MachineInteger>(x: T, nth: u32) -> Bit { - let x: i128 = x.into(); - if x >= 0 { - Self::of_raw_int(x as u128, nth) - } else { - Self::of_raw_int((2i128.pow(T::bits() as u32) + x) as u128, nth) - } + pub fn of_int(x: T, nth: u32) -> Bit { + let x: u128 = x.to_u128(); + Self::of_raw_int(x, nth) } } diff --git a/testable-simd-models/src/abstractions/bitvec.rs b/testable-simd-models/src/abstractions/bitvec.rs index f0384420e5812..11341d05449e3 100644 --- a/testable-simd-models/src/abstractions/bitvec.rs +++ b/testable-simd-models/src/abstractions/bitvec.rs @@ -63,7 +63,7 @@ fn u128_int_from_bit_slice(bits: &[Bit]) -> u128 { } /// Convert a bit slice into a machine integer of type `T`. -fn int_from_bit_slice + MachineInteger + Copy>(bits: &[Bit]) -> T { +fn int_from_bit_slice(bits: &[Bit]) -> T { debug_assert!(bits.len() <= T::bits() as usize); let result = if T::SIGNED { let is_negative = matches!(bits[T::bits() as usize - 1], Bit::One); @@ -76,10 +76,7 @@ fn int_from_bit_slice + MachineInteger + Copy>(bits: &[Bit]) -> } else { u128_int_from_bit_slice(bits) as i128 }; - let Ok(n) = result.try_into() else { - // Conversion must succeed as `result` is guaranteed to be in range due to the bit-length check. 
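The `Bit::of_int` change in this commit reads the n-th bit of a value after mapping negative inputs to their two's-complement encoding (2^bits + x). A small standalone sketch of that rule for `i8`; the function name is illustrative only.

    // Encode an i8 as its unsigned two's-complement value, then read bit `nth`.
    fn bit_of_i8(x: i8, nth: u32) -> u8 {
        let encoded: u32 = if x >= 0 { x as u32 } else { (256i32 + x as i32) as u32 };
        ((encoded >> nth) & 1) as u8
    }

    fn main() {
        // -2i8 encodes as 254 = 0b1111_1110: bit 0 is clear, bits 1..=7 are set.
        assert_eq!(bit_of_i8(-2, 0), 0);
        assert_eq!(bit_of_i8(-2, 1), 1);
        assert_eq!(bit_of_i8(-2, 7), 1);
    }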
- unreachable!() - }; + let n = T::from_u128(result as u128); n } impl BitVec { @@ -88,22 +85,22 @@ impl BitVec { Self(FunArray::from_fn(f)) } /// Convert a slice of machine integers where only the `d` least significant bits are relevant. - pub fn from_slice + MachineInteger + Copy>(x: &[T], d: usize) -> Self { + pub fn from_slice(x: &[T], d: usize) -> Self { Self::from_fn(|i| Bit::of_int::(x[(i / d) as usize], (i % d) as u32)) } /// Construct a BitVec out of a machine integer. - pub fn from_int + MachineInteger + Copy>(n: T) -> Self { + pub fn from_int(n: T) -> Self { Self::from_slice::(&[n], T::bits() as usize) } /// Convert a BitVec into a machine integer of type `T`. - pub fn to_int + MachineInteger + Copy>(self) -> T { + pub fn to_int(self) -> T { int_from_bit_slice(&self.0.as_vec()) } /// Convert a BitVec into a vector of machine integers of type `T`. - pub fn to_vec + MachineInteger + Copy>(&self) -> Vec { + pub fn to_vec(&self) -> Vec { self.0 .as_vec() .chunks(T::bits() as usize) @@ -123,34 +120,6 @@ impl BitVec { } impl BitVec { - // pub fn chunked_shift( - // self, - // shl: FunArray, - // ) -> BitVec { - // fn chunked_shift( - // bitvec: BitVec, - // shl: FunArray, - // ) -> BitVec { - // BitVec::from_fn(|i| { - // let nth_bit = i % CHUNK; - // let nth_chunk = i / CHUNK; - // let shift: i128 = if nth_chunk < SHIFTS { - // shl[nth_chunk] - // } else { - // 0 - // }; - // let local_index = (nth_bit as i128).wrapping_sub(shift); - // if local_index < CHUNK && local_index >= 0 { - // let local_index = local_index as u32; - // bitvec[nth_chunk * CHUNK + local_index] - // } else { - // Bit::Zero - // } - // }) - // } - // chunked_shift::(self, shl) - // } - /// Folds over the array, accumulating a result. /// /// # Arguments diff --git a/testable-simd-models/src/abstractions/simd.rs b/testable-simd-models/src/abstractions/simd.rs index d16763b19a17d..9b07f8c4114de 100644 --- a/testable-simd-models/src/abstractions/simd.rs +++ b/testable-simd-models/src/abstractions/simd.rs @@ -928,7 +928,7 @@ pub(crate) use simd_bitmask_big; /// # Safety /// `mask` must only contain `0` and `!0`. 
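`simd_select` picks each lane from `if_true` when the mask lane is `!0` and from `if_false` when it is `0`. A minimal sketch of that behaviour over plain arrays; the helper name is illustrative, not the crate's API.

    // Lane-wise select: an all-ones mask lane chooses `t`, a zero lane chooses `f`.
    fn select_lanes<const N: usize>(mask: [i32; N], t: [i32; N], f: [i32; N]) -> [i32; N] {
        core::array::from_fn(|i| if mask[i] == !0 { t[i] } else { f[i] })
    }

    fn main() {
        let mask = [!0, 0, !0, 0];
        assert_eq!(select_lanes(mask, [1, 2, 3, 4], [9, 9, 9, 9]), [1, 9, 3, 9]);
    }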
-pub fn simd_select( +pub fn simd_select( mask: FunArray, if_true: FunArray, if_false: FunArray, diff --git a/testable-simd-models/src/core_arch/x86/models/mod.rs b/testable-simd-models/src/core_arch/x86/models/mod.rs index a65223cee0d14..7cd247e27f181 100644 --- a/testable-simd-models/src/core_arch/x86/models/mod.rs +++ b/testable-simd-models/src/core_arch/x86/models/mod.rs @@ -28,14 +28,13 @@ pub mod ssse3; pub(crate) mod types { use crate::abstractions::bitvec::*; - use crate::abstractions::simd::*; #[allow(non_camel_case_types)] pub type __m256i = BitVec<256>; #[allow(non_camel_case_types)] - pub type __m256d = f64x4; + pub type __m256d = BitVec<256>; #[allow(non_camel_case_types)] - pub type __m256 = f32x8; + pub type __m256 = BitVec<256>; #[allow(non_camel_case_types)] pub type __m128i = BitVec<128>; } From b12d0e270420f26b7f8d4128acb57f5db33a4046 Mon Sep 17 00:00:00 2001 From: karthikbhargavan Date: Wed, 30 Jul 2025 05:54:31 -0400 Subject: [PATCH 10/47] script --- testable-simd-models/modelize/Cargo.toml | 8 +++ testable-simd-models/modelize/src/main.rs | 69 +++++++++++++++++++++++ 2 files changed, 77 insertions(+) create mode 100644 testable-simd-models/modelize/Cargo.toml create mode 100644 testable-simd-models/modelize/src/main.rs diff --git a/testable-simd-models/modelize/Cargo.toml b/testable-simd-models/modelize/Cargo.toml new file mode 100644 index 0000000000000..3e2ae5f44eefc --- /dev/null +++ b/testable-simd-models/modelize/Cargo.toml @@ -0,0 +1,8 @@ +[package] +name = "modelize" +version = "0.1.0" +edition = "2024" + +[dependencies] +prettyplease = "0.2.36" +syn = { version = "2", features = ["full"] } diff --git a/testable-simd-models/modelize/src/main.rs b/testable-simd-models/modelize/src/main.rs new file mode 100644 index 0000000000000..af19d06e1f90f --- /dev/null +++ b/testable-simd-models/modelize/src/main.rs @@ -0,0 +1,69 @@ +use syn::{parse_file, Item, File}; +use std::fs; +use std::env; + +fn remove_all_attributes(input_file_path: &str, handwritten_module: &str, output_file_path: &str) -> Result<(), Box> { + let source_code = fs::read_to_string(input_file_path)?; + let mut syntax_tree: File = parse_file(&source_code)?; + + syntax_tree.items.retain(|item| + match item { + Item::Use(_) => false, + _ => true + } + ); + + let use_abstractions: Item = syn::parse_quote! { + use crate::abstractions::simd::*; + }; + + let use_types: Item = syn::parse_quote! { + use super::types::*; + }; + + let use_handwritten: Item = syn::parse_quote! { + use super::avx_handwritten::*; + }; + + syntax_tree.items.insert(0, use_handwritten); + syntax_tree.items.insert(0, use_types); + syntax_tree.items.insert(0, use_abstractions); + + // Clear attributes from the file's top-level items + for item in &mut syntax_tree.items { + match item { + Item::Fn(item_fn) => { + item_fn.attrs.retain(|attr| attr.path().is_ident("doc")); + }, + Item::Struct(item_struct) => { + item_struct.attrs.clear(); + for field in &mut item_struct.fields { + field.attrs.retain(|attr| attr.path().is_ident("doc")); + } + }, + Item::Enum(item_enum) => { + item_enum.attrs.clear(); + for variant in &mut item_enum.variants { + variant.attrs.retain(|attr| attr.path().is_ident("doc")); + } + }, + // Add more cases for other Item types (e.g., Item::Mod, Item::Impl, etc.) + _ => { + // For other item types, if they have an 'attrs' field, clear it. + // This requires more specific matching or a helper trait. + } + } + } + +// let output_tokens = quote! 
{ #syntax_tree }; + let formatted_string = prettyplease::unparse(&syntax_tree); + + fs::write(output_file_path, formatted_string)?; + + Ok(()) +} + +fn main() -> Result<(), Box> { + let args: Vec = env::args().collect(); + remove_all_attributes(&args[1], &args[2], &args[3]) +} From 9f5e22ebdb60ab1799d9d888f850a38b32565f07 Mon Sep 17 00:00:00 2001 From: karthikbhargavan Date: Wed, 30 Jul 2025 07:04:09 -0400 Subject: [PATCH 11/47] fix bit::of_int --- testable-simd-models/src/abstractions/bit.rs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/testable-simd-models/src/abstractions/bit.rs b/testable-simd-models/src/abstractions/bit.rs index 45b3147694212..6e6ccc30031db 100644 --- a/testable-simd-models/src/abstractions/bit.rs +++ b/testable-simd-models/src/abstractions/bit.rs @@ -256,8 +256,11 @@ impl Bit { } } - pub fn of_int(x: T, nth: u32) -> Bit { - let x: u128 = x.to_u128(); - Self::of_raw_int(x, nth) + pub fn of_int(x: T, nth: u32) -> Bit { + if x >= T::ZEROS { + Self::of_raw_int(x.to_u128(), nth) + } else { + Self::of_raw_int((2u128.pow(T::bits() as u32) + x.to_u128()) as u128, nth) + } } } From 44d819d571e374790fac1539c4b28ba9b2dabc8f Mon Sep 17 00:00:00 2001 From: karthikbhargavan Date: Wed, 30 Jul 2025 07:50:58 -0400 Subject: [PATCH 12/47] Revert "factor out handwritten models" This reverts commit 73268aa7c12806487043511639ae316c578c2b43. --- testable-simd-models/modelize/Cargo.toml | 8 -- testable-simd-models/modelize/src/main.rs | 69 ---------- testable-simd-models/src/abstractions/bit.rs | 62 +-------- .../src/abstractions/bitvec.rs | 61 +++++++-- .../src/abstractions/funarr.rs | 129 +++--------------- testable-simd-models/src/abstractions/mod.rs | 1 - testable-simd-models/src/abstractions/simd.rs | 71 +++++----- .../src/core_arch/x86/models/avx.rs | 45 ++++-- .../src/core_arch/x86/models/avx2.rs | 14 +- .../core_arch/x86/models/avx_handwritten.rs | 24 ---- .../src/core_arch/x86/models/mod.rs | 3 - .../src/core_arch/x86/models/ssse3.rs | 8 +- testable-simd-models/src/helpers.rs | 4 +- 13 files changed, 149 insertions(+), 350 deletions(-) delete mode 100644 testable-simd-models/modelize/Cargo.toml delete mode 100644 testable-simd-models/modelize/src/main.rs delete mode 100644 testable-simd-models/src/core_arch/x86/models/avx_handwritten.rs diff --git a/testable-simd-models/modelize/Cargo.toml b/testable-simd-models/modelize/Cargo.toml deleted file mode 100644 index 3e2ae5f44eefc..0000000000000 --- a/testable-simd-models/modelize/Cargo.toml +++ /dev/null @@ -1,8 +0,0 @@ -[package] -name = "modelize" -version = "0.1.0" -edition = "2024" - -[dependencies] -prettyplease = "0.2.36" -syn = { version = "2", features = ["full"] } diff --git a/testable-simd-models/modelize/src/main.rs b/testable-simd-models/modelize/src/main.rs deleted file mode 100644 index af19d06e1f90f..0000000000000 --- a/testable-simd-models/modelize/src/main.rs +++ /dev/null @@ -1,69 +0,0 @@ -use syn::{parse_file, Item, File}; -use std::fs; -use std::env; - -fn remove_all_attributes(input_file_path: &str, handwritten_module: &str, output_file_path: &str) -> Result<(), Box> { - let source_code = fs::read_to_string(input_file_path)?; - let mut syntax_tree: File = parse_file(&source_code)?; - - syntax_tree.items.retain(|item| - match item { - Item::Use(_) => false, - _ => true - } - ); - - let use_abstractions: Item = syn::parse_quote! { - use crate::abstractions::simd::*; - }; - - let use_types: Item = syn::parse_quote! 
{ - use super::types::*; - }; - - let use_handwritten: Item = syn::parse_quote! { - use super::avx_handwritten::*; - }; - - syntax_tree.items.insert(0, use_handwritten); - syntax_tree.items.insert(0, use_types); - syntax_tree.items.insert(0, use_abstractions); - - // Clear attributes from the file's top-level items - for item in &mut syntax_tree.items { - match item { - Item::Fn(item_fn) => { - item_fn.attrs.retain(|attr| attr.path().is_ident("doc")); - }, - Item::Struct(item_struct) => { - item_struct.attrs.clear(); - for field in &mut item_struct.fields { - field.attrs.retain(|attr| attr.path().is_ident("doc")); - } - }, - Item::Enum(item_enum) => { - item_enum.attrs.clear(); - for variant in &mut item_enum.variants { - variant.attrs.retain(|attr| attr.path().is_ident("doc")); - } - }, - // Add more cases for other Item types (e.g., Item::Mod, Item::Impl, etc.) - _ => { - // For other item types, if they have an 'attrs' field, clear it. - // This requires more specific matching or a helper trait. - } - } - } - -// let output_tokens = quote! { #syntax_tree }; - let formatted_string = prettyplease::unparse(&syntax_tree); - - fs::write(output_file_path, formatted_string)?; - - Ok(()) -} - -fn main() -> Result<(), Box> { - let args: Vec = env::args().collect(); - remove_all_attributes(&args[1], &args[2], &args[3]) -} diff --git a/testable-simd-models/src/abstractions/bit.rs b/testable-simd-models/src/abstractions/bit.rs index 6e6ccc30031db..75cfd7e755ccb 100644 --- a/testable-simd-models/src/abstractions/bit.rs +++ b/testable-simd-models/src/abstractions/bit.rs @@ -132,10 +132,6 @@ pub trait MachineInteger { /// Maximum value of the integer type. const MAX: Self; - /// Casts integer to u128 - fn to_u128(self) -> u128; - /// Casts u128 to integer - fn from_u128(x:u128) -> Self; /// Implements functionality for `simd_add` in `crate::abstractions::simd`. fn wrapping_add(self, rhs: Self) -> Self; /// Implements functionality for `simd_sub` in `crate::abstractions::simd`. @@ -162,8 +158,6 @@ macro_rules! generate_imachine_integer_impls { const MIN: $ty = $ty::MIN; const MAX: $ty = $ty::MAX; fn bits() -> u32 { $ty::BITS } - fn to_u128(self) -> u128 {self as u128} - fn from_u128(x:u128) -> Self {x as $ty} fn wrapping_add(self, rhs: Self) -> Self { self.wrapping_add(rhs) } fn wrapping_sub(self, rhs: Self) -> Self { self.wrapping_sub(rhs) } fn overflowing_mul(self, rhs: Self) -> Self { self.overflowing_mul(rhs).0 } @@ -187,8 +181,6 @@ macro_rules! generate_umachine_integer_impls { fn bits() -> u32 { $ty::BITS } - fn to_u128(self) -> u128 {self as u128} - fn from_u128(x:u128) -> Self {x as $ty} fn wrapping_add(self, rhs: Self) -> Self { self.wrapping_add(rhs) } fn wrapping_sub(self, rhs: Self) -> Self { self.wrapping_sub(rhs) } fn overflowing_mul(self, rhs: Self) -> Self { self.overflowing_mul(rhs).0 } @@ -202,51 +194,6 @@ macro_rules! generate_umachine_integer_impls { generate_imachine_integer_impls!(i8, i16, i32, i64, i128); generate_umachine_integer_impls!(u8, u16, u32, u64, u128); -macro_rules! 
unsupported_op { - () => { panic!("unsupported operation")} -} - -impl MachineInteger for f32 { - const SIGNED: bool = true; - const ZEROS: f32 = 0.0; - const ONES: f32 = !0u32 as f32; - const MIN: f32 = f32::MIN; - const MAX: f32 = f32::MAX; - - - fn bits() -> u32 { 32 } - fn to_u128(self) -> u128 {self as u128} - fn from_u128(x:u128) -> Self {x as f32} - fn wrapping_add(self, _rhs: Self) -> Self { unsupported_op!() } - fn wrapping_sub(self, _rhs: Self) -> Self { unsupported_op!() } - fn overflowing_mul(self, _rhs: Self) -> Self { unsupported_op!() } - fn saturating_add(self, _rhs: Self) -> Self { unsupported_op!()} - fn saturating_sub(self, _rhs: Self) -> Self { unsupported_op!()} - fn absolute_diff(self, rhs: Self) -> Self {if self > rhs {self - rhs} else {rhs - self}} - fn absolute_val(self) -> Self {self.abs()} -} - -impl MachineInteger for f64 { - const SIGNED: bool = true; - const ZEROS: f64 = 0.0; - const ONES: f64 = !0u64 as f64; - const MIN: f64 = f64::MIN; - const MAX: f64 = f64::MAX; - - - fn bits() -> u32 { 32 } - fn to_u128(self) -> u128 {self as u128} - fn from_u128(x:u128) -> Self {x as f64} - fn wrapping_add(self, _rhs: Self) -> Self { unsupported_op!() } - fn wrapping_sub(self, _rhs: Self) -> Self { unsupported_op!() } - fn overflowing_mul(self, _rhs: Self) -> Self { unsupported_op!() } - fn saturating_add(self, _rhs: Self) -> Self { unsupported_op!()} - fn saturating_sub(self, _rhs: Self) -> Self { unsupported_op!()} - fn absolute_diff(self, rhs: Self) -> Self {if self > rhs {self - rhs} else {rhs - self}} - fn absolute_val(self) -> Self {self.abs()} -} - - impl Bit { fn of_raw_int(x: u128, nth: u32) -> Self { if x / 2u128.pow(nth) % 2 == 1 { @@ -256,11 +203,12 @@ impl Bit { } } - pub fn of_int(x: T, nth: u32) -> Bit { - if x >= T::ZEROS { - Self::of_raw_int(x.to_u128(), nth) + pub fn of_int + MachineInteger>(x: T, nth: u32) -> Bit { + let x: i128 = x.into(); + if x >= 0 { + Self::of_raw_int(x as u128, nth) } else { - Self::of_raw_int((2u128.pow(T::bits() as u32) + x.to_u128()) as u128, nth) + Self::of_raw_int((2i128.pow(T::bits()) + x) as u128, nth) } } } diff --git a/testable-simd-models/src/abstractions/bitvec.rs b/testable-simd-models/src/abstractions/bitvec.rs index 11341d05449e3..952c5a21cb1a6 100644 --- a/testable-simd-models/src/abstractions/bitvec.rs +++ b/testable-simd-models/src/abstractions/bitvec.rs @@ -15,9 +15,9 @@ use std::fmt::Formatter; /// making the bit pattern more human-readable. The type also implements indexing, /// allowing for easy access to individual bits. #[derive(Copy, Clone, Eq, PartialEq)] -pub struct BitVec(FunArray); +pub struct BitVec(FunArray); -impl BitVec { +impl BitVec { #[allow(non_snake_case)] pub fn ZERO() -> Self { Self::from_fn(|_| Bit::Zero) @@ -40,15 +40,15 @@ fn bit_slice_to_string(bits: &[Bit]) -> String { .into() } -impl core::fmt::Debug for BitVec { +impl core::fmt::Debug for BitVec { fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), std::fmt::Error> { write!(f, "{}", bit_slice_to_string(&self.0.as_vec())) } } -impl core::ops::Index for BitVec { +impl core::ops::Index for BitVec { type Output = Bit; - fn index(&self, index: usize) -> &Self::Output { + fn index(&self, index: u32) -> &Self::Output { self.0.get(index) } } @@ -63,7 +63,7 @@ fn u128_int_from_bit_slice(bits: &[Bit]) -> u128 { } /// Convert a bit slice into a machine integer of type `T`. 
-fn int_from_bit_slice(bits: &[Bit]) -> T { +fn int_from_bit_slice + MachineInteger + Copy>(bits: &[Bit]) -> T { debug_assert!(bits.len() <= T::bits() as usize); let result = if T::SIGNED { let is_negative = matches!(bits[T::bits() as usize - 1], Bit::One); @@ -76,31 +76,34 @@ fn int_from_bit_slice(bits: &[Bit]) -> T { } else { u128_int_from_bit_slice(bits) as i128 }; - let n = T::from_u128(result as u128); + let Ok(n) = result.try_into() else { + // Conversion must succeed as `result` is guaranteed to be in range due to the bit-length check. + unreachable!() + }; n } -impl BitVec { +impl BitVec { /// Constructor for BitVec. `BitVec::::from_fn` constructs a bitvector out of a function that takes usizes smaller than `N` and produces bits. - pub fn from_fn Bit>(f: F) -> Self { + pub fn from_fn Bit>(f: F) -> Self { Self(FunArray::from_fn(f)) } /// Convert a slice of machine integers where only the `d` least significant bits are relevant. - pub fn from_slice(x: &[T], d: usize) -> Self { + pub fn from_slice + MachineInteger + Copy>(x: &[T], d: u32) -> Self { Self::from_fn(|i| Bit::of_int::(x[(i / d) as usize], (i % d) as u32)) } /// Construct a BitVec out of a machine integer. - pub fn from_int(n: T) -> Self { - Self::from_slice::(&[n], T::bits() as usize) + pub fn from_int + MachineInteger + Copy>(n: T) -> Self { + Self::from_slice::(&[n], T::bits() as u32) } /// Convert a BitVec into a machine integer of type `T`. - pub fn to_int(self) -> T { + pub fn to_int + MachineInteger + Copy>(self) -> T { int_from_bit_slice(&self.0.as_vec()) } /// Convert a BitVec into a vector of machine integers of type `T`. - pub fn to_vec(&self) -> Vec { + pub fn to_vec + MachineInteger + Copy>(&self) -> Vec { self.0 .as_vec() .chunks(T::bits() as usize) @@ -119,7 +122,35 @@ impl BitVec { } } -impl BitVec { +impl BitVec { + pub fn chunked_shift( + self, + shl: FunArray, + ) -> BitVec { + fn chunked_shift( + bitvec: BitVec, + shl: FunArray, + ) -> BitVec { + BitVec::from_fn(|i| { + let nth_bit = i % CHUNK; + let nth_chunk = i / CHUNK; + let shift: i128 = if nth_chunk < SHIFTS { + shl[nth_chunk] + } else { + 0 + }; + let local_index = (nth_bit as i128).wrapping_sub(shift); + if local_index < CHUNK as i128 && local_index >= 0 { + let local_index = local_index as u32; + bitvec[nth_chunk * CHUNK + local_index] + } else { + Bit::Zero + } + }) + } + chunked_shift::(self, shl) + } + /// Folds over the array, accumulating a result. /// /// # Arguments diff --git a/testable-simd-models/src/abstractions/funarr.rs b/testable-simd-models/src/abstractions/funarr.rs index 95ad1ca04ecc7..c5c5ece29f1f8 100644 --- a/testable-simd-models/src/abstractions/funarr.rs +++ b/testable-simd-models/src/abstractions/funarr.rs @@ -4,18 +4,26 @@ use crate::abstractions::bit::MachineInteger; /// `FunArray` represents an array of `T` values of length `N`, where `N` is a compile-time constant. +/// Internally, it uses a fixed-length array of `Option` with a maximum capacity of 512 elements. +/// Unused elements beyond `N` are filled with `None`. #[derive(Copy, Clone, Eq, PartialEq)] -pub struct FunArray([Option; N]); +pub struct FunArray([Option; 512]); -impl FunArray { +impl FunArray { /// Gets a reference to the element at index `i`. - pub fn get(&self, i: usize) -> &T { - self.0[i].as_ref().unwrap() + pub fn get(&self, i: u32) -> &T { + self.0[i as usize].as_ref().unwrap() } /// Constructor for FunArray. `FunArray::from_fn` constructs a funarray out of a function that takes usizes smaller than `N` and produces an element of type T. 
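The restored `chunked_shift` above views the bitvector as `CHUNK`-bit lanes and shifts each lane left independently, zero-filling positions whose source bit falls outside the lane. A standalone sketch of that reading over an LSB-first bit slice; names and the slice encoding are illustrative assumptions, not the crate's API.

    // Shift each `chunk`-bit lane left by its own amount, filling with zeros.
    fn chunked_shl(bits: &[u8], chunk: usize, shifts: &[i128]) -> Vec<u8> {
        (0..bits.len())
            .map(|i| {
                let (lane, pos) = (i / chunk, (i % chunk) as i128);
                let s = shifts.get(lane).copied().unwrap_or(0);
                let src = pos - s; // bit that moves into position `pos`
                if src >= 0 && src < chunk as i128 {
                    bits[lane * chunk + src as usize]
                } else {
                    0
                }
            })
            .collect()
    }

    fn main() {
        // Two 4-bit lanes 0b0011 and 0b0101 (LSB first), shifted left by 1 and 2.
        let v = chunked_shl(&[1, 1, 0, 0, 1, 0, 1, 0], 4, &[1, 2]);
        assert_eq!(v, vec![0, 1, 1, 0, 0, 0, 1, 0]);
    }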
- pub fn from_fn T>(f: F) -> Self { + pub fn from_fn T>(f: F) -> Self { // let vec = (0..N).map(f).collect(); - let arr = core::array::from_fn(|i| if i < N { Some(f(i)) } else { None }); + let arr = core::array::from_fn(|i| { + if (i as u32) < N { + Some(f(i as u32)) + } else { + None + } + }); Self(arr) } @@ -47,17 +55,17 @@ impl FunArray { } } -impl FunArray { +impl FunArray { #[allow(non_snake_case)] pub fn ZERO() -> Self { Self::from_fn(|_| T::ZEROS) } } -impl TryFrom> for FunArray { +impl TryFrom> for FunArray { type Error = (); fn try_from(v: Vec) -> Result { - if v.len() < N { + if (v.len() as u32) < N { Err(()) } else { Ok(Self::from_fn(|i| v[i as usize].clone())) @@ -65,113 +73,16 @@ impl TryFrom> for FunArray { } } -impl core::fmt::Debug for FunArray { +impl core::fmt::Debug for FunArray { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { write!(f, "{:?}", self.as_vec()) } } -impl core::ops::Index for FunArray { +impl core::ops::Index for FunArray { type Output = T; - fn index(&self, index: usize) -> &Self::Output { + fn index(&self, index: u32) -> &Self::Output { self.get(index) } } - -impl FunArray<1, T> { - pub fn new(x: T) -> Self { - let v = [x]; - Self::from_fn(|i| v[i]) - } -} - -impl FunArray<2, T> { - pub fn new(x0: T, x1: T) -> Self { - let v = [x0, x1]; - Self::from_fn(|i| v[i]) - } -} - -impl FunArray<4, T> { - pub fn new(x0: T, x1: T, x2: T, x3: T) -> Self { - let v = [x0, x1, x2, x3]; - Self::from_fn(|i| v[i]) - } -} - -impl FunArray<8, T> { - pub fn new(x0: T, x1: T, x2: T, x3: T, x4: T, x5: T, x6: T, x7: T) -> Self { - let v = [x0, x1, x2, x3, x4, x5, x6, x7]; - Self::from_fn(|i| v[i]) - } -} - -impl FunArray<16, T> { - pub fn new( - x0: T, - x1: T, - x2: T, - x3: T, - x4: T, - x5: T, - x6: T, - x7: T, - x8: T, - x9: T, - x10: T, - x11: T, - x12: T, - x13: T, - x14: T, - x15: T, - ) -> Self { - let v = [ - x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, - ]; - Self::from_fn(|i| v[i]) - } -} - -impl FunArray<32, T> { - pub fn new( - x0: T, - x1: T, - x2: T, - x3: T, - x4: T, - x5: T, - x6: T, - x7: T, - x8: T, - x9: T, - x10: T, - x11: T, - x12: T, - x13: T, - x14: T, - x15: T, - x16: T, - x17: T, - x18: T, - x19: T, - x20: T, - x21: T, - x22: T, - x23: T, - x24: T, - x25: T, - x26: T, - x27: T, - x28: T, - x29: T, - x30: T, - x31: T, - ) -> Self { - let v = [ - x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x18, - x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, x30, x31, - ]; - Self::from_fn(|i| v[i]) - } -} diff --git a/testable-simd-models/src/abstractions/mod.rs b/testable-simd-models/src/abstractions/mod.rs index b8b4de83d877d..b3018a8189569 100644 --- a/testable-simd-models/src/abstractions/mod.rs +++ b/testable-simd-models/src/abstractions/mod.rs @@ -23,5 +23,4 @@ pub mod bit; pub mod bitvec; pub mod funarr; -#[macro_use] pub mod simd; diff --git a/testable-simd-models/src/abstractions/simd.rs b/testable-simd-models/src/abstractions/simd.rs index 9b07f8c4114de..622690f263037 100644 --- a/testable-simd-models/src/abstractions/simd.rs +++ b/testable-simd-models/src/abstractions/simd.rs @@ -20,7 +20,7 @@ macro_rules! 
interpretations { #[doc = concat!("Conversion from ", stringify!($ty), " vectors of size ", stringify!($m), "to bit vectors of size ", stringify!($n))] pub fn [< from_ $name >](iv: $name) -> BitVec<$n> { let vec: Vec<$ty> = iv.as_vec(); - Self::from_slice(&vec[..], <$ty>::bits() as usize) + Self::from_slice(&vec[..], <$ty>::bits() as u32) } #[doc = concat!("Conversion from bit vectors of size ", stringify!($n), " to ", stringify!($ty), " vectors of size ", stringify!($m))] pub fn [< to_ $name >](bv: BitVec<$n>) -> $name { @@ -62,12 +62,12 @@ macro_rules! interpretations { } interpretations!(256; i32x8 [i32; 8], i64x4 [i64; 4], i16x16 [i16; 16], i128x2 [i128; 2], i8x32 [i8; 32], - u32x8 [u32; 8], u64x4 [u64; 4], u16x16 [u16; 16], u8x32 [u8; 32], f32x8 [f32; 8], f64x4 [f64; 4]); + u32x8 [u32; 8], u64x4 [u64; 4], u16x16 [u16; 16], u8x32 [u8; 32]); interpretations!(128; i32x4 [i32; 4], i64x2 [i64; 2], i16x8 [i16; 8], i128x1 [i128; 1], i8x16 [i8; 16], - u32x4 [u32; 4], u64x2 [u64; 2], u16x8 [u16; 8], u8x16 [u8; 16], f32x4 [f32; 4], f64x2 [f64; 2]); + u32x4 [u32; 4], u64x2 [u64; 2], u16x8 [u16; 8], u8x16 [u8; 16]); interpretations!(512; u32x16 [u32; 16], u16x32 [u16; 32], i32x16 [i32; 16], i16x32 [i16; 32]); -interpretations!(64; i64x1 [i64; 1], i32x2 [i32; 2], i16x4 [i16; 4], i8x8 [i8; 8], u64x1 [u64; 1], u32x2 [u32; 2],u16x4 [u16; 4], u8x8 [u8; 8], f32x2 [f32; 2]); +interpretations!(64; i64x1 [i64; 1], i32x2 [i32; 2], i16x4 [i16; 4], i8x8 [i8; 8], u64x1 [u64; 1], u32x2 [u32; 2],u16x4 [u16; 4], u8x8 [u8; 8]); interpretations!(32; i8x4 [i8; 4], u8x4 [u8; 4]); /// Inserts an element into a vector, returning the updated vector. @@ -75,8 +75,8 @@ interpretations!(32; i8x4 [i8; 4], u8x4 [u8; 4]); /// # Safety /// /// `idx` must be in-bounds of the vector, ie. idx < N -pub fn simd_insert(x: FunArray, idx: u32, val: T) -> FunArray { - FunArray::from_fn(|i| if i == idx as usize { val } else { x[i] }) +pub fn simd_insert(x: FunArray, idx: u32, val: T) -> FunArray { + FunArray::from_fn(|i| if i == idx { val } else { x[i] }) } /// Extracts an element from a vector. @@ -84,12 +84,12 @@ pub fn simd_insert(x: FunArray, idx: u32, val: T) /// # Safety /// /// `idx` must be in-bounds of the vector, ie. idx < N -pub fn simd_extract(x: FunArray, idx: u32) -> T { - x.get(idx as usize).clone() +pub fn simd_extract(x: FunArray, idx: u32) -> T { + x.get(idx).clone() } /// Adds two vectors elementwise with wrapping on overflow/underflow. -pub fn simd_add( +pub fn simd_add( x: FunArray, y: FunArray, ) -> FunArray { @@ -97,7 +97,7 @@ pub fn simd_add( } /// Subtracts `rhs` from `lhs` elementwise with wrapping on overflow/underflow. -pub fn simd_sub( +pub fn simd_sub( x: FunArray, y: FunArray, ) -> FunArray { @@ -105,7 +105,7 @@ pub fn simd_sub( } /// Multiplies two vectors elementwise with wrapping on overflow/underflow. -pub fn simd_mul( +pub fn simd_mul( x: FunArray, y: FunArray, ) -> FunArray { @@ -115,14 +115,14 @@ pub fn simd_mul( /// Produces the elementwise absolute values. /// For vectors of unsigned integers it returns the vector untouched. /// If the element is the minimum value of a signed integer, it returns the element as is. -pub fn simd_abs(x: FunArray) -> FunArray { +pub fn simd_abs(x: FunArray) -> FunArray { FunArray::from_fn(|i| x[i].absolute_val()) } /// Produces the elementwise absolute difference of two vectors. /// Note: Absolute difference in this case is simply the element with the smaller value subtracted from the element with the larger value, with overflow/underflow. 
/// For example, if the elements are i8, the absolute difference of 255 and -2 is -255. -pub fn simd_abs_diff( +pub fn simd_abs_diff( x: FunArray, y: FunArray, ) -> FunArray { @@ -134,7 +134,7 @@ pub fn simd_abs_diff( /// # Safety /// /// Each element of `rhs` must be less than `::BITS`. -pub fn simd_shl( +pub fn simd_shl( x: FunArray, y: FunArray, ) -> FunArray::Output> { @@ -149,7 +149,7 @@ pub fn simd_shl( /// /// Each element of `rhs` must be less than `::BITS`. -pub fn simd_shr( +pub fn simd_shr( x: FunArray, y: FunArray, ) -> FunArray::Output> { @@ -158,7 +158,7 @@ pub fn simd_shr( /// "Ands" vectors elementwise. -pub fn simd_and( +pub fn simd_and( x: FunArray, y: FunArray, ) -> FunArray::Output> { @@ -167,7 +167,7 @@ pub fn simd_and( /// "Ors" vectors elementwise. -pub fn simd_or( +pub fn simd_or( x: FunArray, y: FunArray, ) -> FunArray::Output> { @@ -176,7 +176,7 @@ pub fn simd_or( /// "Exclusive ors" vectors elementwise. -pub fn simd_xor( +pub fn simd_xor( x: FunArray, y: FunArray, ) -> FunArray::Output> { @@ -330,9 +330,7 @@ self_impls!(u8, u16, u32, u64, u128, i8, i16, i32, i64, i128); /// /// When casting from a wider number to a smaller number, the higher bits are removed. /// Otherwise, it extends the number, following signedness. -pub fn simd_cast>( - x: FunArray, -) -> FunArray { +pub fn simd_cast>(x: FunArray) -> FunArray { FunArray::from_fn(|i| T2::cast(x[i])) } @@ -340,7 +338,7 @@ pub fn simd_cast>( /// /// Rust panics for `-::Min` due to overflow, but here, it just returns the element as is. -pub fn simd_neg::Output> + MachineInteger + Eq + Neg + Copy>( +pub fn simd_neg::Output> + MachineInteger + Eq + Neg + Copy>( x: FunArray, ) -> FunArray { FunArray::from_fn(|i| { @@ -355,7 +353,7 @@ pub fn simd_neg::Output> + MachineInteger + E /// /// Returns `0` (all zeros) for false and `!0` (all ones) for true. -pub fn simd_eq( +pub fn simd_eq( x: FunArray, y: FunArray, ) -> FunArray { @@ -366,7 +364,7 @@ pub fn simd_eq( /// /// Returns `0` (all zeros) for false and `!0` (all ones) for true. -pub fn simd_ne( +pub fn simd_ne( x: FunArray, y: FunArray, ) -> FunArray { @@ -377,7 +375,7 @@ pub fn simd_ne( /// /// Returns `0` (all zeros) for false and `!0` (all ones) for true. -pub fn simd_lt( +pub fn simd_lt( x: FunArray, y: FunArray, ) -> FunArray { @@ -388,7 +386,7 @@ pub fn simd_lt( /// /// Returns `0` (all zeros) for false and `!0` (all ones) for true. -pub fn simd_le( +pub fn simd_le( x: FunArray, y: FunArray, ) -> FunArray { @@ -399,7 +397,7 @@ pub fn simd_le( /// /// Returns `0` (all zeros) for false and `!0` (all ones) for true. -pub fn simd_gt( +pub fn simd_gt( x: FunArray, y: FunArray, ) -> FunArray { @@ -410,7 +408,7 @@ pub fn simd_gt( /// /// Returns `0` (all zeros) for false and `!0` (all ones) for true. -pub fn simd_ge( +pub fn simd_ge( x: FunArray, y: FunArray, ) -> FunArray { @@ -420,13 +418,13 @@ pub fn simd_ge( /// Shuffles two vectors by the indices in idx. /// /// For safety, `N2 <= N1 + N3` must hold. -pub fn simd_shuffle( +pub fn simd_shuffle( x: FunArray, y: FunArray, idx: [u32; N2], ) -> FunArray { FunArray::from_fn(|i| { - let i = idx[i] as usize; + let i = idx[i as usize]; if i < N1 { x[i] } else { @@ -437,7 +435,7 @@ pub fn simd_shuffle( /// Adds two vectors elementwise, with saturation. -pub fn simd_saturating_add( +pub fn simd_saturating_add( x: FunArray, y: FunArray, ) -> FunArray { @@ -446,7 +444,7 @@ pub fn simd_saturating_add( /// Subtracts `y` from `x` elementwise, with saturation. 
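The saturating operations modelled here clamp at the type's bounds instead of wrapping. A quick standalone illustration with `u8` lanes, using plain arrays in place of the crate's vector types.

    fn main() {
        let x: [u8; 4] = [250, 5, 0, 128];
        let y: [u8; 4] = [10, 10, 1, 128];
        let add: [u8; 4] = core::array::from_fn(|i| x[i].saturating_add(y[i]));
        let sub: [u8; 4] = core::array::from_fn(|i| x[i].saturating_sub(y[i]));
        assert_eq!(add, [255, 15, 1, 255]); // 250+10 and 128+128 clamp to 255
        assert_eq!(sub, [240, 0, 0, 0]);    // 5-10 and 0-1 clamp to 0
    }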
-pub fn simd_saturating_sub( +pub fn simd_saturating_sub( x: FunArray, y: FunArray, ) -> FunArray { @@ -928,7 +926,7 @@ pub(crate) use simd_bitmask_big; /// # Safety /// `mask` must only contain `0` and `!0`. -pub fn simd_select( +pub fn simd_select( mask: FunArray, if_true: FunArray, if_false: FunArray, @@ -946,10 +944,3 @@ pub fn simd_select( pub fn transmute>(a: T) -> U { a.into() } - -#[macro_export] -macro_rules! static_assert_uimm_bits { - ($imm:ident, $size:literal) => {}; -} - -pub use static_assert_uimm_bits; diff --git a/testable-simd-models/src/core_arch/x86/models/avx.rs b/testable-simd-models/src/core_arch/x86/models/avx.rs index 20516f42c0683..fb7f5fe70cdbe 100644 --- a/testable-simd-models/src/core_arch/x86/models/avx.rs +++ b/testable-simd-models/src/core_arch/x86/models/avx.rs @@ -15,8 +15,24 @@ use super::types::*; use crate::abstractions::simd::*; -use super::avx_handwritten::*; +mod c_extern { + use crate::abstractions::simd::*; + + pub fn vperm2f128si256(a: i32x8, b: i32x8, imm8: i8) -> i32x8 { + let temp = i128x2::from_fn(|i| match (imm8 as u8) >> (i * 4) { + 0 => (a[4 * i] as i128) + 16 * (a[4 * i + 1] as i128), + 1 => (a[4 * i + 2] as i128) + 16 * (a[4 * i + 3] as i128), + 2 => (b[4 * i] as i128) + 16 * (b[4 * i + 1] as i128), + 3 => (b[4 * i + 2] as i128) + 16 * (b[4 * i + 3] as i128), + _ => unreachable!(), + }); + + i32x8::from_fn(|i| (temp[if i < 4 { 0 } else { 1 }] >> (i % 4)) as i32) + } +} + +use c_extern::*; /// Blends packed single-precision (32-bit) floating-point elements from /// `a` and `b` using `c` as a mask. /// @@ -96,7 +112,7 @@ pub const _CMP_TRUE_US: i32 = 0x1f; /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_si256) pub fn _mm256_permute2f128_si256(a: __m256i, b: __m256i) -> __m256i { - static_assert_uimm_bits!(IMM8, 8); + // static_assert_uimm_bits!(IMM8, 8); vperm2f128si256(a.as_i32x8(), b.as_i32x8(), IMM8 as i8).into() } @@ -147,7 +163,12 @@ pub fn _mm256_insert_epi16(a: __m256i, i: i16) -> __m256i { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_si256) pub fn _mm256_testz_si256(a: __m256i, b: __m256i) -> i32 { - ptestz256(a.as_i64x4(), b.as_i64x4()) + let c = __m256i::from_fn(|i| a[i] & b[i]); + if c == __m256i::ZERO() { + 1 + } else { + 0 + } } /// Sets each bit of the returned mask based on the most significant bit of the @@ -219,8 +240,11 @@ pub fn _mm256_set_epi8( e30: i8, e31: i8, ) -> __m256i { - transmute(i8x32::new(e00, e01, e02, e03, e04, e05, e06, e07, e08, e09, e10, e11, e12, e13, e14, e15, e16, e17, - e18, e19, e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31,)) + let vec = [ + e00, e01, e02, e03, e04, e05, e06, e07, e08, e09, e10, e11, e12, e13, e14, e15, e16, e17, + e18, e19, e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31, + ]; + transmute(i8x32::from_fn(|i| vec[(31 - i) as usize])) } /// Sets packed 16-bit integers in returned vector with the supplied values. @@ -247,7 +271,10 @@ pub fn _mm256_set_epi16( e14: i16, e15: i16, ) -> __m256i { - transmute(i16x16::new(e00, e01, e02, e03, e04, e05, e06, e07, e08, e09, e10, e11, e12, e13, e14, e15)) + let vec = [ + e00, e01, e02, e03, e04, e05, e06, e07, e08, e09, e10, e11, e12, e13, e14, e15, + ]; + transmute(i16x16::from_fn(|i| vec[(15 - i) as usize])) } /// Sets packed 32-bit integers in returned vector with the supplied values. 
@@ -266,7 +293,8 @@ pub fn _mm256_set_epi32( e6: i32, e7: i32, ) -> __m256i { - transmute(i32x8::new(e0, e1, e2, e3, e4, e5, e6, e7)) + let vec = [e0, e1, e2, e3, e4, e5, e6, e7]; + transmute(i32x8::from_fn(|i| vec[(7 - i) as usize])) } /// Sets packed 64-bit integers in returned vector with the supplied values. @@ -274,7 +302,8 @@ pub fn _mm256_set_epi32( /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi64x) // This intrinsic has no corresponding instruction. pub fn _mm256_set_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i { - transmute(i64x4::new(d, c, b, a)) + let vec = [d, c, b, a]; + transmute(i64x4::from_fn(|i| vec[i as usize])) } /// Broadcasts 8-bit integer `a` to all elements of returned vector. diff --git a/testable-simd-models/src/core_arch/x86/models/avx2.rs b/testable-simd-models/src/core_arch/x86/models/avx2.rs index 3327be549bf5c..cb57efb4d5a9d 100644 --- a/testable-simd-models/src/core_arch/x86/models/avx2.rs +++ b/testable-simd-models/src/core_arch/x86/models/avx2.rs @@ -540,14 +540,14 @@ mod c_extern { if b[i] > 127 { 0 } else { - let index = (b[i] % 16) as usize; + let index = (b[i] % 16) as u32; a[index] } } else { if b[i] > 127 { 0 } else { - let index = (b[i] % 16) as usize; + let index = (b[i] % 16) as u32; a[index + 16] } } @@ -556,7 +556,7 @@ mod c_extern { pub fn permd(a: u32x8, b: u32x8) -> u32x8 { u32x8::from_fn(|i| { - let id = (b[i] % 8) as usize; + let id = b[i] % 8; a[id] }) } @@ -564,8 +564,8 @@ mod c_extern { pub fn mpsadbw(a: u8x32, b: u8x32, imm8: i32) -> u16x16 { u16x16::from_fn(|i| { if i < 8 { - let a_offset = (((imm8 & 4) >> 2) * 4) as usize; - let b_offset = ((imm8 & 3) * 4) as usize; + let a_offset = (((imm8 & 4) >> 2) * 4) as u32; + let b_offset = ((imm8 & 3) * 4) as u32; let k = a_offset + i; let l = b_offset; ((a[k].absolute_diff(b[l]) as i8) as u8 as u16) @@ -575,8 +575,8 @@ mod c_extern { } else { let i = i - 8; let imm8 = imm8 >> 3; - let a_offset = (((imm8 & 4) >> 2) * 4) as usize; - let b_offset = ((imm8 & 3) * 4) as usize; + let a_offset = (((imm8 & 4) >> 2) * 4) as u32; + let b_offset = ((imm8 & 3) * 4) as u32; let k = a_offset + i; let l = b_offset; ((a[16 + k].absolute_diff(b[16 + l]) as i8) as u8 as u16) diff --git a/testable-simd-models/src/core_arch/x86/models/avx_handwritten.rs b/testable-simd-models/src/core_arch/x86/models/avx_handwritten.rs deleted file mode 100644 index d6427e3bc7a4b..0000000000000 --- a/testable-simd-models/src/core_arch/x86/models/avx_handwritten.rs +++ /dev/null @@ -1,24 +0,0 @@ -//! 
LLVM intrinsics used in AVX - -use crate::abstractions::simd::*; - -pub fn vperm2f128si256(a: i32x8, b: i32x8, imm8: i8) -> i32x8 { - let temp = i128x2::from_fn(|i| match (imm8 as u8) >> (i * 4) { - 0 => (a[4 * i] as i128) + 16 * (a[4 * i + 1] as i128), - 1 => (a[4 * i + 2] as i128) + 16 * (a[4 * i + 3] as i128), - 2 => (b[4 * i] as i128) + 16 * (b[4 * i + 1] as i128), - 3 => (b[4 * i + 2] as i128) + 16 * (b[4 * i + 3] as i128), - _ => unreachable!(), - }); - - i32x8::from_fn(|i| (temp[if i < 4 { 0 } else { 1 }] >> (i % 4)) as i32) -} - -pub fn ptestz256(a: i64x4, b: i64x4) -> i32 { - let c = i64x4::from_fn(|i| a[i] & b[i]); - if c == i64x4::ZERO() { - 1 - } else { - 0 - } -} diff --git a/testable-simd-models/src/core_arch/x86/models/mod.rs b/testable-simd-models/src/core_arch/x86/models/mod.rs index 7cd247e27f181..95c9eb4061b6a 100644 --- a/testable-simd-models/src/core_arch/x86/models/mod.rs +++ b/testable-simd-models/src/core_arch/x86/models/mod.rs @@ -22,7 +22,6 @@ pub mod avx; pub mod avx2; -pub mod avx_handwritten; pub mod sse2; pub mod ssse3; @@ -32,8 +31,6 @@ pub(crate) mod types { #[allow(non_camel_case_types)] pub type __m256i = BitVec<256>; #[allow(non_camel_case_types)] - pub type __m256d = BitVec<256>; - #[allow(non_camel_case_types)] pub type __m256 = BitVec<256>; #[allow(non_camel_case_types)] pub type __m128i = BitVec<128>; diff --git a/testable-simd-models/src/core_arch/x86/models/ssse3.rs b/testable-simd-models/src/core_arch/x86/models/ssse3.rs index ff2c7713540f5..7fc29262232a9 100644 --- a/testable-simd-models/src/core_arch/x86/models/ssse3.rs +++ b/testable-simd-models/src/core_arch/x86/models/ssse3.rs @@ -7,13 +7,7 @@ use super::types::*; mod c_extern { use crate::abstractions::simd::*; pub fn pshufb128(a: u8x16, b: u8x16) -> u8x16 { - u8x16::from_fn(|i| { - if b[i] > 127 { - 0 - } else { - a[(b[i] % 16) as usize] - } - }) + u8x16::from_fn(|i| if b[i] > 127 { 0 } else { a[(b[i] % 16) as u32] }) } pub fn phaddw128(a: i16x8, b: i16x8) -> i16x8 { diff --git a/testable-simd-models/src/helpers.rs b/testable-simd-models/src/helpers.rs index d646d49f5dc64..c2ea42b4b7ed6 100644 --- a/testable-simd-models/src/helpers.rs +++ b/testable-simd-models/src/helpers.rs @@ -38,13 +38,13 @@ pub mod test { crate::abstractions::bit::Bit::from(bool::random()) } } - impl HasRandom for BitVec { + impl HasRandom for BitVec { fn random() -> Self { Self::from_fn(|_| Bit::random()) } } - impl HasRandom for FunArray { + impl HasRandom for FunArray { fn random() -> Self { FunArray::from_fn(|_| T::random()) } From 320a0898c59416a0a34c6fb53ae31338302da0e3 Mon Sep 17 00:00:00 2001 From: karthikbhargavan Date: Wed, 30 Jul 2025 13:56:56 +0200 Subject: [PATCH 13/47] fix for sse2 --- .../src/core_arch/x86/models/sse2.rs | 54 +++++++++---------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/testable-simd-models/src/core_arch/x86/models/sse2.rs b/testable-simd-models/src/core_arch/x86/models/sse2.rs index 711726550ea82..9e3a14cfadddb 100644 --- a/testable-simd-models/src/core_arch/x86/models/sse2.rs +++ b/testable-simd-models/src/core_arch/x86/models/sse2.rs @@ -215,7 +215,7 @@ pub fn _mm_add_epi8(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi16) pub fn _mm_add_epi16(a: __m128i, b: __m128i) -> __m128i { - transmute(simd_add(b.as_i16x8(), a.as_i16x8())) + transmute(simd_add(a.as_i16x8(), b.as_i16x8())) } /// Adds packed 32-bit integers in `a` and `b`. 
@@ -223,7 +223,7 @@ pub fn _mm_add_epi16(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi32) pub fn _mm_add_epi32(a: __m128i, b: __m128i) -> __m128i { - simd_add(b.as_i32x4(), a.as_i32x4()).into() + simd_add(a.as_i32x4(), b.as_i32x4()).into() } /// Adds packed 64-bit integers in `a` and `b`. @@ -231,7 +231,7 @@ pub fn _mm_add_epi32(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi64) pub fn _mm_add_epi64(a: __m128i, b: __m128i) -> __m128i { - simd_add(b.as_i64x2(), a.as_i64x2()).into() + simd_add(a.as_i64x2(), b.as_i64x2()).into() } /// Adds packed 8-bit integers in `a` and `b` using saturation. @@ -239,7 +239,7 @@ pub fn _mm_add_epi64(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi8) pub fn _mm_adds_epi8(a: __m128i, b: __m128i) -> __m128i { - simd_saturating_add(b.as_i8x16(), a.as_i8x16()).into() + simd_saturating_add(a.as_i8x16(), b.as_i8x16()).into() } /// Adds packed 16-bit integers in `a` and `b` using saturation. @@ -247,7 +247,7 @@ pub fn _mm_adds_epi8(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi16) pub fn _mm_adds_epi16(a: __m128i, b: __m128i) -> __m128i { - simd_saturating_add(b.as_i16x8(), a.as_i16x8()).into() + simd_saturating_add(a.as_i16x8(), b.as_i16x8()).into() } /// Adds packed unsigned 8-bit integers in `a` and `b` using saturation. @@ -255,7 +255,7 @@ pub fn _mm_adds_epi16(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu8) pub fn _mm_adds_epu8(a: __m128i, b: __m128i) -> __m128i { - simd_saturating_add(b.as_u8x16(), a.as_u8x16()).into() + simd_saturating_add(a.as_u8x16(), b.as_u8x16()).into() } /// Adds packed unsigned 16-bit integers in `a` and `b` using saturation. @@ -263,7 +263,7 @@ pub fn _mm_adds_epu8(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu16) pub fn _mm_adds_epu16(a: __m128i, b: __m128i) -> __m128i { - simd_saturating_add(b.as_u16x8(), a.as_u16x8()).into() + simd_saturating_add(a.as_u16x8(), b.as_u16x8()).into() } /// Averages packed unsigned 8-bit integers in `a` and `b`. 
@@ -297,7 +297,7 @@ pub fn _mm_avg_epu16(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd_epi16) pub fn _mm_madd_epi16(a: __m128i, b: __m128i) -> __m128i { - pmaddwd(b.as_i16x8(), a.as_i16x8()).into() + pmaddwd(a.as_i16x8(), b.as_i16x8()).into() } /// Compares packed 16-bit integers in `a` and `b`, and returns the packed @@ -380,7 +380,7 @@ pub fn _mm_mulhi_epu16(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi16) pub fn _mm_mullo_epi16(a: __m128i, b: __m128i) -> __m128i { - transmute(simd_mul(b.as_i16x8(), a.as_i16x8())) + transmute(simd_mul(a.as_i16x8(), b.as_i16x8())) } /// Multiplies the low unsigned 32-bit integers from each packed 64-bit element @@ -407,7 +407,7 @@ pub fn _mm_mul_epu32(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_epu8) pub fn _mm_sad_epu8(a: __m128i, b: __m128i) -> __m128i { - psadbw(b.as_u8x16(), a.as_u8x16()).into() + psadbw(a.as_u8x16(), b.as_u8x16()).into() } /// Subtracts packed 8-bit integers in `b` from packed 8-bit integers in `a`. @@ -415,7 +415,7 @@ pub fn _mm_sad_epu8(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi8) pub fn _mm_sub_epi8(a: __m128i, b: __m128i) -> __m128i { - transmute(simd_sub(b.as_i8x16(), a.as_i8x16())) + transmute(simd_sub(a.as_i8x16(), b.as_i8x16())) } /// Subtracts packed 16-bit integers in `b` from packed 16-bit integers in `a`. @@ -423,7 +423,7 @@ pub fn _mm_sub_epi8(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi16) pub fn _mm_sub_epi16(a: __m128i, b: __m128i) -> __m128i { - transmute(simd_sub(b.as_i16x8(), a.as_i16x8())) + transmute(simd_sub(a.as_i16x8(), b.as_i16x8())) } /// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`. @@ -431,7 +431,7 @@ pub fn _mm_sub_epi16(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi32) pub fn _mm_sub_epi32(a: __m128i, b: __m128i) -> __m128i { - simd_sub(b.as_i32x4(), a.as_i32x4()).into() + simd_sub(a.as_i32x4(), b.as_i32x4()).into() } /// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a`. 
@@ -439,7 +439,7 @@ pub fn _mm_sub_epi32(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi64) pub fn _mm_sub_epi64(a: __m128i, b: __m128i) -> __m128i { - simd_sub(b.as_i64x2(), a.as_i64x2()).into() + simd_sub(a.as_i64x2(), b.as_i64x2()).into() } /// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a` @@ -448,7 +448,7 @@ pub fn _mm_sub_epi64(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi8) pub fn _mm_subs_epi8(a: __m128i, b: __m128i) -> __m128i { - simd_saturating_sub(b.as_i8x16(), a.as_i8x16()).into() + simd_saturating_sub(a.as_i8x16(), b.as_i8x16()).into() } /// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a` @@ -457,7 +457,7 @@ pub fn _mm_subs_epi8(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi16) pub fn _mm_subs_epi16(a: __m128i, b: __m128i) -> __m128i { - simd_saturating_sub(b.as_i16x8(), a.as_i16x8()).into() + simd_saturating_sub(a.as_i16x8(), b.as_i16x8()).into() } /// Subtract packed unsigned 8-bit integers in `b` from packed unsigned 8-bit @@ -466,7 +466,7 @@ pub fn _mm_subs_epi16(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu8) pub fn _mm_subs_epu8(a: __m128i, b: __m128i) -> __m128i { - simd_saturating_sub(b.as_u8x16(), a.as_u8x16()).into() + simd_saturating_sub(a.as_u8x16(), b.as_u8x16()).into() } /// Subtract packed unsigned 16-bit integers in `b` from packed unsigned 16-bit @@ -475,7 +475,7 @@ pub fn _mm_subs_epu8(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu16) pub fn _mm_subs_epu16(a: __m128i, b: __m128i) -> __m128i { - simd_saturating_sub(b.as_u16x8(), a.as_u16x8()).into() + simd_saturating_sub(a.as_u16x8(), b.as_u16x8()).into() } /// Shifts `a` left by `IMM8` bytes while shifting in zeros. @@ -627,7 +627,7 @@ pub fn _mm_srai_epi16(a: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi16) pub fn _mm_sra_epi16(a: __m128i, count: __m128i) -> __m128i { - psraw(count.as_i16x8(), a.as_i16x8()).into() + psraw(a.as_i16x8(), count.as_i16x8()).into() } /// Shifts packed 32-bit integers in `a` right by `IMM8` while shifting in sign @@ -646,7 +646,7 @@ pub fn _mm_srai_epi32(a: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi32) pub fn _mm_sra_epi32(a: __m128i, count: __m128i) -> __m128i { - psrad(count.as_i32x4(), a.as_i32x4()).into() + psrad(a.as_i32x4(), count.as_i32x4()).into() } /// Shifts `a` right by `IMM8` bytes while shifting in zeros. 
@@ -1078,7 +1078,7 @@ pub fn _mm_move_epi64(a: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi16) pub fn _mm_packs_epi16(a: __m128i, b: __m128i) -> __m128i { - packsswb(b.as_i16x8(), a.as_i16x8()).into() + packsswb(a.as_i16x8(), b.as_i16x8()).into() } /// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers @@ -1087,7 +1087,7 @@ pub fn _mm_packs_epi16(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32) pub fn _mm_packs_epi32(a: __m128i, b: __m128i) -> __m128i { - packssdw(b.as_i32x4(), a.as_i32x4()).into() + packssdw(a.as_i32x4(), b.as_i32x4()).into() } /// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers @@ -1096,7 +1096,7 @@ pub fn _mm_packs_epi32(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi16) pub fn _mm_packus_epi16(a: __m128i, b: __m128i) -> __m128i { - packuswb(b.as_i16x8(), a.as_i16x8()).into() + packuswb(a.as_i16x8(), b.as_i16x8()).into() } /// Returns the `imm8` element of `a`. @@ -1234,7 +1234,7 @@ pub fn _mm_unpackhi_epi16(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi32) pub fn _mm_unpackhi_epi32(a: __m128i, b: __m128i) -> __m128i { - (simd_shuffle(b.as_i32x4(), a.as_i32x4(), [2, 6, 3, 7])).into() + (simd_shuffle(a.as_i32x4(), b.as_i32x4(), [2, 6, 3, 7])).into() } /// Unpacks and interleave 64-bit integers from the high half of `a` and `b`. @@ -1242,7 +1242,7 @@ pub fn _mm_unpackhi_epi32(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi64) pub fn _mm_unpackhi_epi64(a: __m128i, b: __m128i) -> __m128i { - (simd_shuffle(b.as_i64x2(), a.as_i64x2(), [1, 3])).into() + (simd_shuffle(a.as_i64x2(), b.as_i64x2(), [1, 3])).into() } /// Unpacks and interleave 8-bit integers from the low half of `a` and `b`. @@ -1272,7 +1272,7 @@ pub fn _mm_unpacklo_epi16(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi32) pub fn _mm_unpacklo_epi32(a: __m128i, b: __m128i) -> __m128i { - simd_shuffle(b.as_i32x4(), a.as_i32x4(), [0, 4, 1, 5]).into() + simd_shuffle(a.as_i32x4(), b.as_i32x4(), [0, 4, 1, 5]).into() } /// Unpacks and interleave 64-bit integers from the low half of `a` and `b`. @@ -1280,7 +1280,7 @@ pub fn _mm_unpacklo_epi32(a: __m128i, b: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi64) pub fn _mm_unpacklo_epi64(a: __m128i, b: __m128i) -> __m128i { - simd_shuffle(b.as_i64x2(), a.as_i64x2(), [0, 2]).into() + simd_shuffle(a.as_i64x2(), b.as_i64x2(), [0, 2]).into() } /// Returns vector of type __m128i with indeterminate elements.with indetermination elements. 
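The operand-order fixes in this patch are only observable for non-commutative operations (subtraction, saturating subtraction, shifts, packs and unpacks); for the additions the old `(b, a)` order happened to produce the same result, which is why only differential tests on asymmetric intrinsics catch the swap. A minimal sketch of the difference, using plain integers rather than the model types (illustrative only, not part of the patch):

    let a: [i16; 8] = [100, 200, 300, 400, 500, 600, 700, 800];
    let b: [i16; 8] = [1, 2, 3, 4, 5, 6, 7, 8];
    // What `_mm_sub_epi16(a, b)` must compute: a[i] - b[i].
    let fixed: Vec<i16> = a.iter().zip(&b).map(|(x, y)| x.wrapping_sub(*y)).collect();
    // What the old (b, a) argument order effectively modelled: b[i] - a[i].
    let swapped: Vec<i16> = b.iter().zip(&a).map(|(x, y)| x.wrapping_sub(*y)).collect();
    assert_ne!(fixed, swapped);
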
From 54d8fdf08c023f9fbcc4a122a83a901a28a36427 Mon Sep 17 00:00:00 2001 From: karthikbhargavan Date: Wed, 30 Jul 2025 17:18:01 -0400 Subject: [PATCH 14/47] split numeric --- testable-simd-models/src/abstractions/bit.rs | 33 +++++++++++-------- .../src/abstractions/bitvec.rs | 14 ++++---- testable-simd-models/src/abstractions/simd.rs | 6 ++-- 3 files changed, 29 insertions(+), 24 deletions(-) diff --git a/testable-simd-models/src/abstractions/bit.rs b/testable-simd-models/src/abstractions/bit.rs index 75cfd7e755ccb..b8fbfce7f420b 100644 --- a/testable-simd-models/src/abstractions/bit.rs +++ b/testable-simd-models/src/abstractions/bit.rs @@ -116,11 +116,11 @@ impl From for Bit { } } -/// A trait for types that represent machine integers. -pub trait MachineInteger { - /// The size of this integer type in bits. - fn bits() -> u32; +/// A trait for integers and floats +pub trait MachineNumeric { + /// The size of this integer type in bits. + const BITS: u32; /// The signedness of this integer type. const SIGNED: bool; /// Element of the integer type with every bit as 0. @@ -131,7 +131,10 @@ pub trait MachineInteger { const MIN: Self; /// Maximum value of the integer type. const MAX: Self; +} +/// A trait for types that represent machine integers. +pub trait MachineInteger : MachineNumeric { /// Implements functionality for `simd_add` in `crate::abstractions::simd`. fn wrapping_add(self, rhs: Self) -> Self; /// Implements functionality for `simd_sub` in `crate::abstractions::simd`. @@ -145,26 +148,28 @@ pub trait MachineInteger { /// Implements functionality for `simd_abs_diff` in `crate::abstractions::simd`. fn absolute_diff(self, rhs: Self) -> Self; /// Implements functionality for `simd_abs` in `crate::abstractions::simd`. - fn absolute_val(self) -> Self; + fn wrapping_abs(self) -> Self; } macro_rules! generate_imachine_integer_impls { ($($ty:ident),*) => { $( - impl MachineInteger for $ty { + impl MachineNumeric for $ty { + const BITS: u32 = $ty::BITS; const SIGNED: bool = true; const ZEROS: $ty = 0; const ONES: $ty = -1; const MIN: $ty = $ty::MIN; const MAX: $ty = $ty::MAX; - fn bits() -> u32 { $ty::BITS } + } + impl MachineInteger for $ty { fn wrapping_add(self, rhs: Self) -> Self { self.wrapping_add(rhs) } fn wrapping_sub(self, rhs: Self) -> Self { self.wrapping_sub(rhs) } fn overflowing_mul(self, rhs: Self) -> Self { self.overflowing_mul(rhs).0 } fn saturating_add(self, rhs: Self) -> Self { self.saturating_add(rhs)} fn saturating_sub(self, rhs: Self) -> Self { self.saturating_sub(rhs) } fn absolute_diff(self, rhs: Self) -> Self {if self > rhs {$ty::wrapping_sub(self, rhs)} else {$ty::wrapping_sub(rhs, self)}} - fn absolute_val(self) -> Self {if self == $ty::MIN {self} else {self.abs()}} + fn wrapping_abs(self) -> Self {if self == $ty::MIN {self} else {self.abs()}} })* }; } @@ -172,22 +177,22 @@ macro_rules! generate_imachine_integer_impls { macro_rules! 
generate_umachine_integer_impls { ($($ty:ident),*) => { $( - impl MachineInteger for $ty { + impl MachineNumeric for $ty { + const BITS: u32 = $ty::BITS; const SIGNED: bool = false; const ZEROS: $ty = 0; const ONES: $ty = $ty::MAX; const MIN: $ty = $ty::MIN; const MAX: $ty = $ty::MAX; - - - fn bits() -> u32 { $ty::BITS } + } + impl MachineInteger for $ty { fn wrapping_add(self, rhs: Self) -> Self { self.wrapping_add(rhs) } fn wrapping_sub(self, rhs: Self) -> Self { self.wrapping_sub(rhs) } fn overflowing_mul(self, rhs: Self) -> Self { self.overflowing_mul(rhs).0 } fn saturating_add(self, rhs: Self) -> Self { self.saturating_add(rhs)} fn saturating_sub(self, rhs: Self) -> Self { self.saturating_sub(rhs)} fn absolute_diff(self, rhs: Self) -> Self {if self > rhs {self - rhs} else {rhs - self}} - fn absolute_val(self) -> Self {self} + fn wrapping_abs(self) -> Self {self} })* }; } @@ -208,7 +213,7 @@ impl Bit { if x >= 0 { Self::of_raw_int(x as u128, nth) } else { - Self::of_raw_int((2i128.pow(T::bits()) + x) as u128, nth) + Self::of_raw_int((2i128.pow(T::BITS) + x) as u128, nth) } } } diff --git a/testable-simd-models/src/abstractions/bitvec.rs b/testable-simd-models/src/abstractions/bitvec.rs index 952c5a21cb1a6..e66d60b5bce8a 100644 --- a/testable-simd-models/src/abstractions/bitvec.rs +++ b/testable-simd-models/src/abstractions/bitvec.rs @@ -1,5 +1,5 @@ //! This module provides a specification-friendly bit vector type. -use super::bit::{Bit, MachineInteger}; +use super::bit::{Bit, MachineInteger, MachineNumeric}; use super::funarr::*; use std::fmt::Formatter; @@ -64,12 +64,12 @@ fn u128_int_from_bit_slice(bits: &[Bit]) -> u128 { /// Convert a bit slice into a machine integer of type `T`. fn int_from_bit_slice + MachineInteger + Copy>(bits: &[Bit]) -> T { - debug_assert!(bits.len() <= T::bits() as usize); + debug_assert!(bits.len() <= T::BITS as usize); let result = if T::SIGNED { - let is_negative = matches!(bits[T::bits() as usize - 1], Bit::One); - let s = u128_int_from_bit_slice(&bits[0..T::bits() as usize - 1]) as i128; + let is_negative = matches!(bits[T::BITS as usize - 1], Bit::One); + let s = u128_int_from_bit_slice(&bits[0..T::BITS as usize - 1]) as i128; if is_negative { - s + (-2i128).pow(T::bits() - 1) + s + (-2i128).pow(T::BITS - 1) } else { s } @@ -94,7 +94,7 @@ impl BitVec { /// Construct a BitVec out of a machine integer. pub fn from_int + MachineInteger + Copy>(n: T) -> Self { - Self::from_slice::(&[n], T::bits() as u32) + Self::from_slice::(&[n], T::BITS as u32) } /// Convert a BitVec into a machine integer of type `T`. @@ -106,7 +106,7 @@ impl BitVec { pub fn to_vec + MachineInteger + Copy>(&self) -> Vec { self.0 .as_vec() - .chunks(T::bits() as usize) + .chunks(T::BITS as usize) .map(int_from_bit_slice) .collect() } diff --git a/testable-simd-models/src/abstractions/simd.rs b/testable-simd-models/src/abstractions/simd.rs index 622690f263037..3601512c392ba 100644 --- a/testable-simd-models/src/abstractions/simd.rs +++ b/testable-simd-models/src/abstractions/simd.rs @@ -2,7 +2,7 @@ //! //! Operations are defined on FunArrs. -use crate::abstractions::{bit::MachineInteger, bitvec::*, funarr::*}; +use crate::abstractions::{bit::*, bitvec::*, funarr::*}; use std::convert::*; use std::ops::*; @@ -20,7 +20,7 @@ macro_rules! 
interpretations { #[doc = concat!("Conversion from ", stringify!($ty), " vectors of size ", stringify!($m), "to bit vectors of size ", stringify!($n))] pub fn [< from_ $name >](iv: $name) -> BitVec<$n> { let vec: Vec<$ty> = iv.as_vec(); - Self::from_slice(&vec[..], <$ty>::bits() as u32) + Self::from_slice(&vec[..], <$ty>::BITS as u32) } #[doc = concat!("Conversion from bit vectors of size ", stringify!($n), " to ", stringify!($ty), " vectors of size ", stringify!($m))] pub fn [< to_ $name >](bv: BitVec<$n>) -> $name { @@ -116,7 +116,7 @@ pub fn simd_mul( /// For vectors of unsigned integers it returns the vector untouched. /// If the element is the minimum value of a signed integer, it returns the element as is. pub fn simd_abs(x: FunArray) -> FunArray { - FunArray::from_fn(|i| x[i].absolute_val()) + FunArray::from_fn(|i| x[i].wrapping_abs()) } /// Produces the elementwise absolute difference of two vectors. From b740804bfd2bb88ee66c3dffd221a9feca7d31b3 Mon Sep 17 00:00:00 2001 From: karthikbhargavan Date: Wed, 30 Jul 2025 17:50:42 -0400 Subject: [PATCH 15/47] nth_bit --- testable-simd-models/src/abstractions/bit.rs | 47 +++++++++++++------ .../src/abstractions/bitvec.rs | 18 +++---- testable-simd-models/src/abstractions/simd.rs | 2 +- .../src/core_arch/x86/models/avx2.rs | 18 +++---- .../src/core_arch/x86/models/sse2.rs | 2 +- 5 files changed, 50 insertions(+), 37 deletions(-) diff --git a/testable-simd-models/src/abstractions/bit.rs b/testable-simd-models/src/abstractions/bit.rs index b8fbfce7f420b..906a998bde75a 100644 --- a/testable-simd-models/src/abstractions/bit.rs +++ b/testable-simd-models/src/abstractions/bit.rs @@ -20,7 +20,7 @@ //! use testable_simd_models::abstractions::bit::{Bit, MachineInteger}; //! //! // Extract the 3rd bit (0-indexed) from an integer. -//! let bit = Bit::of_int(42, 2); +//! let bit = Bit::nth_bit(42, 2); //! println!("The extracted bit is: {:?}", bit); //! //! // Convert Bit to a primitive integer type. @@ -131,6 +131,10 @@ pub trait MachineNumeric { const MIN: Self; /// Maximum value of the integer type. const MAX: Self; + /// Raw transmutation of bits to u128 + fn to_u128(self) -> u128; + /// Raw transmutation of bits from u128 + fn from_u128(x:u128) -> Self; } /// A trait for types that represent machine integers. @@ -146,7 +150,7 @@ pub trait MachineInteger : MachineNumeric { /// Implements functionality for `simd_saturating_sub` in `crate::abstractions::simd`. fn saturating_sub(self, rhs: Self) -> Self; /// Implements functionality for `simd_abs_diff` in `crate::abstractions::simd`. - fn absolute_diff(self, rhs: Self) -> Self; + fn wrapping_abs_diff(self, rhs: Self) -> Self; /// Implements functionality for `simd_abs` in `crate::abstractions::simd`. fn wrapping_abs(self) -> Self; } @@ -161,6 +165,8 @@ macro_rules! generate_imachine_integer_impls { const ONES: $ty = -1; const MIN: $ty = $ty::MIN; const MAX: $ty = $ty::MAX; + fn to_u128(self) -> u128 {self as u128} + fn from_u128(x:u128) -> Self {x as $ty} } impl MachineInteger for $ty { fn wrapping_add(self, rhs: Self) -> Self { self.wrapping_add(rhs) } @@ -168,7 +174,7 @@ macro_rules! 
generate_imachine_integer_impls { fn overflowing_mul(self, rhs: Self) -> Self { self.overflowing_mul(rhs).0 } fn saturating_add(self, rhs: Self) -> Self { self.saturating_add(rhs)} fn saturating_sub(self, rhs: Self) -> Self { self.saturating_sub(rhs) } - fn absolute_diff(self, rhs: Self) -> Self {if self > rhs {$ty::wrapping_sub(self, rhs)} else {$ty::wrapping_sub(rhs, self)}} + fn wrapping_abs_diff(self, rhs: Self) -> Self {if self > rhs {$ty::wrapping_sub(self, rhs)} else {$ty::wrapping_sub(rhs, self)}} fn wrapping_abs(self) -> Self {if self == $ty::MIN {self} else {self.abs()}} })* }; @@ -184,6 +190,8 @@ macro_rules! generate_umachine_integer_impls { const ONES: $ty = $ty::MAX; const MIN: $ty = $ty::MIN; const MAX: $ty = $ty::MAX; + fn to_u128(self) -> u128 {self as u128} + fn from_u128(x:u128) -> Self {x as $ty} } impl MachineInteger for $ty { fn wrapping_add(self, rhs: Self) -> Self { self.wrapping_add(rhs) } @@ -191,7 +199,7 @@ macro_rules! generate_umachine_integer_impls { fn overflowing_mul(self, rhs: Self) -> Self { self.overflowing_mul(rhs).0 } fn saturating_add(self, rhs: Self) -> Self { self.saturating_add(rhs)} fn saturating_sub(self, rhs: Self) -> Self { self.saturating_sub(rhs)} - fn absolute_diff(self, rhs: Self) -> Self {if self > rhs {self - rhs} else {rhs - self}} + fn wrapping_abs_diff(self, rhs: Self) -> Self {if self > rhs {self - rhs} else {rhs - self}} fn wrapping_abs(self) -> Self {self} })* }; @@ -200,20 +208,29 @@ generate_imachine_integer_impls!(i8, i16, i32, i64, i128); generate_umachine_integer_impls!(u8, u16, u32, u64, u128); impl Bit { - fn of_raw_int(x: u128, nth: u32) -> Self { - if x / 2u128.pow(nth) % 2 == 1 { + pub fn nth_bit(x: T, nth: usize) -> Self { + if (x.to_u128() >> nth) % 2 == 1 { Self::One } else { Self::Zero } } - - pub fn of_int + MachineInteger>(x: T, nth: u32) -> Bit { - let x: i128 = x.into(); - if x >= 0 { - Self::of_raw_int(x as u128, nth) - } else { - Self::of_raw_int((2i128.pow(T::BITS) + x) as u128, nth) - } - } } + +// impl Bit { +// fn of_raw_int(x: u128, nth: u32) -> Self { +// if x / 2u128.pow(nth) % 2 == 1 { +// Self::One +// } else { +// Self::Zero +// } +// } + +// pub fn of_int(x: T, nth: u32) -> Bit { +// if x >= T::ZEROS { +// Self::of_raw_int(x.to_u128() , nth) +// } else { +// Self::of_raw_int((2i128.pow(T::BITS) + x) as u128, nth) +// } +// } +// } diff --git a/testable-simd-models/src/abstractions/bitvec.rs b/testable-simd-models/src/abstractions/bitvec.rs index e66d60b5bce8a..bd6dcca254aa5 100644 --- a/testable-simd-models/src/abstractions/bitvec.rs +++ b/testable-simd-models/src/abstractions/bitvec.rs @@ -63,7 +63,7 @@ fn u128_int_from_bit_slice(bits: &[Bit]) -> u128 { } /// Convert a bit slice into a machine integer of type `T`. -fn int_from_bit_slice + MachineInteger + Copy>(bits: &[Bit]) -> T { +fn int_from_bit_slice(bits: &[Bit]) -> T { debug_assert!(bits.len() <= T::BITS as usize); let result = if T::SIGNED { let is_negative = matches!(bits[T::BITS as usize - 1], Bit::One); @@ -76,11 +76,7 @@ fn int_from_bit_slice + MachineInteger + Copy>(bits: &[Bit]) -> } else { u128_int_from_bit_slice(bits) as i128 }; - let Ok(n) = result.try_into() else { - // Conversion must succeed as `result` is guaranteed to be in range due to the bit-length check. - unreachable!() - }; - n + T::from_u128(result as u128) } impl BitVec { /// Constructor for BitVec. `BitVec::::from_fn` constructs a bitvector out of a function that takes usizes smaller than `N` and produces bits. 
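The new `MachineNumeric::{to_u128, from_u128}` pair treats every scalar as its raw bit pattern, so `Bit::nth_bit` and `int_from_bit_slice` now handle signed and unsigned types uniformly through plain casts. A small sanity sketch (illustrative only, not part of the patch):

    use testable_simd_models::abstractions::bit::{Bit, MachineNumeric};

    // 42 = 0b101010: bit 1 is set, bit 2 is clear.
    assert!(matches!(Bit::nth_bit(42u8, 1), Bit::One));
    assert!(matches!(Bit::nth_bit(42u8, 2), Bit::Zero));
    // Two's complement: after the raw cast, every bit of -1i8 reads as 1.
    assert!(matches!(Bit::nth_bit(-1i8, 7), Bit::One));
    assert_eq!((-1i8).to_u128() as u8, 0xff);
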
@@ -88,22 +84,22 @@ impl BitVec { Self(FunArray::from_fn(f)) } /// Convert a slice of machine integers where only the `d` least significant bits are relevant. - pub fn from_slice + MachineInteger + Copy>(x: &[T], d: u32) -> Self { - Self::from_fn(|i| Bit::of_int::(x[(i / d) as usize], (i % d) as u32)) + pub fn from_slice(x: &[T], d: u32) -> Self { + Self::from_fn(|i| Bit::nth_bit::(x[(i / d) as usize], (i % d) as usize)) } /// Construct a BitVec out of a machine integer. - pub fn from_int + MachineInteger + Copy>(n: T) -> Self { + pub fn from_int(n: T) -> Self { Self::from_slice::(&[n], T::BITS as u32) } /// Convert a BitVec into a machine integer of type `T`. - pub fn to_int + MachineInteger + Copy>(self) -> T { + pub fn to_int(self) -> T { int_from_bit_slice(&self.0.as_vec()) } /// Convert a BitVec into a vector of machine integers of type `T`. - pub fn to_vec + MachineInteger + Copy>(&self) -> Vec { + pub fn to_vec(&self) -> Vec { self.0 .as_vec() .chunks(T::BITS as usize) diff --git a/testable-simd-models/src/abstractions/simd.rs b/testable-simd-models/src/abstractions/simd.rs index 3601512c392ba..44469a748dd99 100644 --- a/testable-simd-models/src/abstractions/simd.rs +++ b/testable-simd-models/src/abstractions/simd.rs @@ -126,7 +126,7 @@ pub fn simd_abs_diff( x: FunArray, y: FunArray, ) -> FunArray { - FunArray::from_fn(|i| x[i].absolute_diff(y[i])) + FunArray::from_fn(|i| x[i].wrapping_abs_diff(y[i])) } /// Shifts vector left elementwise, with UB on overflow. diff --git a/testable-simd-models/src/core_arch/x86/models/avx2.rs b/testable-simd-models/src/core_arch/x86/models/avx2.rs index cb57efb4d5a9d..69f3650b823ca 100644 --- a/testable-simd-models/src/core_arch/x86/models/avx2.rs +++ b/testable-simd-models/src/core_arch/x86/models/avx2.rs @@ -568,10 +568,10 @@ mod c_extern { let b_offset = ((imm8 & 3) * 4) as u32; let k = a_offset + i; let l = b_offset; - ((a[k].absolute_diff(b[l]) as i8) as u8 as u16) - + ((a[k + 1].absolute_diff(b[l + 1]) as i8) as u8 as u16) - + ((a[k + 2].absolute_diff(b[l + 2]) as i8) as u8 as u16) - + ((a[k + 3].absolute_diff(b[l + 3]) as i8) as u8 as u16) + ((a[k].wrapping_abs_diff(b[l]) as i8) as u8 as u16) + + ((a[k + 1].wrapping_abs_diff(b[l + 1]) as i8) as u8 as u16) + + ((a[k + 2].wrapping_abs_diff(b[l + 2]) as i8) as u8 as u16) + + ((a[k + 3].wrapping_abs_diff(b[l + 3]) as i8) as u8 as u16) } else { let i = i - 8; let imm8 = imm8 >> 3; @@ -579,10 +579,10 @@ mod c_extern { let b_offset = ((imm8 & 3) * 4) as u32; let k = a_offset + i; let l = b_offset; - ((a[16 + k].absolute_diff(b[16 + l]) as i8) as u8 as u16) - + ((a[16 + k + 1].absolute_diff(b[16 + l + 1]) as i8) as u8 as u16) - + ((a[16 + k + 2].absolute_diff(b[16 + l + 2]) as i8) as u8 as u16) - + ((a[16 + k + 3].absolute_diff(b[16 + l + 3]) as i8) as u8 as u16) + ((a[16 + k].wrapping_abs_diff(b[16 + l]) as i8) as u8 as u16) + + ((a[16 + k + 1].wrapping_abs_diff(b[16 + l + 1]) as i8) as u8 as u16) + + ((a[16 + k + 2].wrapping_abs_diff(b[16 + l + 2]) as i8) as u8 as u16) + + ((a[16 + k + 3].wrapping_abs_diff(b[16 + l + 3]) as i8) as u8 as u16) } }) } @@ -629,7 +629,7 @@ mod c_extern { } pub fn psadbw(a: u8x32, b: u8x32) -> u64x4 { - let tmp = u8x32::from_fn(|i| a[i].absolute_diff(b[i])); + let tmp = u8x32::from_fn(|i| a[i].wrapping_abs_diff(b[i])); u64x4::from_fn(|i| { (tmp[i * 8] as u16) .wrapping_add(tmp[i * 8 + 1] as u16) diff --git a/testable-simd-models/src/core_arch/x86/models/sse2.rs b/testable-simd-models/src/core_arch/x86/models/sse2.rs index 9e3a14cfadddb..ab3eba7ded5fa 100644 --- 
a/testable-simd-models/src/core_arch/x86/models/sse2.rs +++ b/testable-simd-models/src/core_arch/x86/models/sse2.rs @@ -30,7 +30,7 @@ mod c_extern { }) } pub fn psadbw(a: u8x16, b: u8x16) -> u64x2 { - let tmp = u8x16::from_fn(|i| a[i].absolute_diff(b[i])); + let tmp = u8x16::from_fn(|i| a[i].wrapping_abs_diff(b[i])); u64x2::from_fn(|i| { (tmp[i * 8] as u16) .wrapping_add(tmp[i * 8 + 1] as u16) From 2fae2c7a389bc1cd2cdc5296aaf7edf921e4d379 Mon Sep 17 00:00:00 2001 From: karthikbhargavan Date: Wed, 30 Jul 2025 17:53:09 -0400 Subject: [PATCH 16/47] bit clean --- testable-simd-models/src/abstractions/bit.rs | 22 ++----------------- .../src/abstractions/bitvec.rs | 2 +- testable-simd-models/src/abstractions/simd.rs | 2 +- 3 files changed, 4 insertions(+), 22 deletions(-) diff --git a/testable-simd-models/src/abstractions/bit.rs b/testable-simd-models/src/abstractions/bit.rs index 906a998bde75a..0a7d337ca86d1 100644 --- a/testable-simd-models/src/abstractions/bit.rs +++ b/testable-simd-models/src/abstractions/bit.rs @@ -134,11 +134,11 @@ pub trait MachineNumeric { /// Raw transmutation of bits to u128 fn to_u128(self) -> u128; /// Raw transmutation of bits from u128 - fn from_u128(x:u128) -> Self; + fn from_u128(x: u128) -> Self; } /// A trait for types that represent machine integers. -pub trait MachineInteger : MachineNumeric { +pub trait MachineInteger: MachineNumeric { /// Implements functionality for `simd_add` in `crate::abstractions::simd`. fn wrapping_add(self, rhs: Self) -> Self; /// Implements functionality for `simd_sub` in `crate::abstractions::simd`. @@ -216,21 +216,3 @@ impl Bit { } } } - -// impl Bit { -// fn of_raw_int(x: u128, nth: u32) -> Self { -// if x / 2u128.pow(nth) % 2 == 1 { -// Self::One -// } else { -// Self::Zero -// } -// } - -// pub fn of_int(x: T, nth: u32) -> Bit { -// if x >= T::ZEROS { -// Self::of_raw_int(x.to_u128() , nth) -// } else { -// Self::of_raw_int((2i128.pow(T::BITS) + x) as u128, nth) -// } -// } -// } diff --git a/testable-simd-models/src/abstractions/bitvec.rs b/testable-simd-models/src/abstractions/bitvec.rs index bd6dcca254aa5..279a02c148911 100644 --- a/testable-simd-models/src/abstractions/bitvec.rs +++ b/testable-simd-models/src/abstractions/bitvec.rs @@ -1,5 +1,5 @@ //! This module provides a specification-friendly bit vector type. -use super::bit::{Bit, MachineInteger, MachineNumeric}; +use super::bit::{Bit, MachineInteger}; use super::funarr::*; use std::fmt::Formatter; diff --git a/testable-simd-models/src/abstractions/simd.rs b/testable-simd-models/src/abstractions/simd.rs index 44469a748dd99..12caf6c41e25e 100644 --- a/testable-simd-models/src/abstractions/simd.rs +++ b/testable-simd-models/src/abstractions/simd.rs @@ -20,7 +20,7 @@ macro_rules! 
interpretations { #[doc = concat!("Conversion from ", stringify!($ty), " vectors of size ", stringify!($m), "to bit vectors of size ", stringify!($n))] pub fn [< from_ $name >](iv: $name) -> BitVec<$n> { let vec: Vec<$ty> = iv.as_vec(); - Self::from_slice(&vec[..], <$ty>::BITS as u32) + Self::from_slice(&vec[..], <$ty>::BITS as u32) } #[doc = concat!("Conversion from bit vectors of size ", stringify!($n), " to ", stringify!($ty), " vectors of size ", stringify!($m))] pub fn [< to_ $name >](bv: BitVec<$n>) -> $name { From 8a35c4c00545a7f8f07dbe2b9cbaffaed7a3ddf1 Mon Sep 17 00:00:00 2001 From: karthikbhargavan Date: Wed, 30 Jul 2025 18:04:57 -0400 Subject: [PATCH 17/47] introduced floats --- testable-simd-models/src/abstractions/bit.rs | 25 ++++++++++++++++++- .../src/abstractions/bitvec.rs | 12 ++++----- testable-simd-models/src/abstractions/simd.rs | 6 ++--- 3 files changed, 33 insertions(+), 10 deletions(-) diff --git a/testable-simd-models/src/abstractions/bit.rs b/testable-simd-models/src/abstractions/bit.rs index 0a7d337ca86d1..23d5e0a6fb2ef 100644 --- a/testable-simd-models/src/abstractions/bit.rs +++ b/testable-simd-models/src/abstractions/bit.rs @@ -207,8 +207,31 @@ macro_rules! generate_umachine_integer_impls { generate_imachine_integer_impls!(i8, i16, i32, i64, i128); generate_umachine_integer_impls!(u8, u16, u32, u64, u128); +impl MachineNumeric for f32 { + const BITS: u32 = 32; + const SIGNED: bool = false; + const ZEROS: f32 = 0.0; + const ONES: f32 = f32::from_bits(0xffffffffu32); + const MIN: f32 = f32::MIN; + const MAX: f32 = f32::MAX; + fn to_u128(self) -> u128 {self.to_bits() as u128} + fn from_u128(x:u128) -> Self {f32::from_bits(x as u32)} +} + +impl MachineNumeric for f64 { + const BITS: u32 = 64; + const SIGNED: bool = false; + const ZEROS: f64 = 0.0; + const ONES: f64 = f64::from_bits(0xffffffffffffffffu64); + const MIN: f64 = f64::MIN; + const MAX: f64 = f64::MAX; + fn to_u128(self) -> u128 {self.to_bits() as u128} + fn from_u128(x:u128) -> Self {f64::from_bits(x as u64)} +} + + impl Bit { - pub fn nth_bit(x: T, nth: usize) -> Self { + pub fn nth_bit(x: T, nth: usize) -> Self { if (x.to_u128() >> nth) % 2 == 1 { Self::One } else { diff --git a/testable-simd-models/src/abstractions/bitvec.rs b/testable-simd-models/src/abstractions/bitvec.rs index 279a02c148911..ac73749482e37 100644 --- a/testable-simd-models/src/abstractions/bitvec.rs +++ b/testable-simd-models/src/abstractions/bitvec.rs @@ -1,5 +1,5 @@ //! This module provides a specification-friendly bit vector type. -use super::bit::{Bit, MachineInteger}; +use super::bit::{Bit, MachineNumeric}; use super::funarr::*; use std::fmt::Formatter; @@ -63,7 +63,7 @@ fn u128_int_from_bit_slice(bits: &[Bit]) -> u128 { } /// Convert a bit slice into a machine integer of type `T`. -fn int_from_bit_slice(bits: &[Bit]) -> T { +fn int_from_bit_slice(bits: &[Bit]) -> T { debug_assert!(bits.len() <= T::BITS as usize); let result = if T::SIGNED { let is_negative = matches!(bits[T::BITS as usize - 1], Bit::One); @@ -84,22 +84,22 @@ impl BitVec { Self(FunArray::from_fn(f)) } /// Convert a slice of machine integers where only the `d` least significant bits are relevant. - pub fn from_slice(x: &[T], d: u32) -> Self { + pub fn from_slice(x: &[T], d: u32) -> Self { Self::from_fn(|i| Bit::nth_bit::(x[(i / d) as usize], (i % d) as usize)) } /// Construct a BitVec out of a machine integer. 
- pub fn from_int(n: T) -> Self { + pub fn from_int(n: T) -> Self { Self::from_slice::(&[n], T::BITS as u32) } /// Convert a BitVec into a machine integer of type `T`. - pub fn to_int(self) -> T { + pub fn to_int(self) -> T { int_from_bit_slice(&self.0.as_vec()) } /// Convert a BitVec into a vector of machine integers of type `T`. - pub fn to_vec(&self) -> Vec { + pub fn to_vec(&self) -> Vec { self.0 .as_vec() .chunks(T::BITS as usize) diff --git a/testable-simd-models/src/abstractions/simd.rs b/testable-simd-models/src/abstractions/simd.rs index 12caf6c41e25e..9ef66e929b97e 100644 --- a/testable-simd-models/src/abstractions/simd.rs +++ b/testable-simd-models/src/abstractions/simd.rs @@ -62,12 +62,12 @@ macro_rules! interpretations { } interpretations!(256; i32x8 [i32; 8], i64x4 [i64; 4], i16x16 [i16; 16], i128x2 [i128; 2], i8x32 [i8; 32], - u32x8 [u32; 8], u64x4 [u64; 4], u16x16 [u16; 16], u8x32 [u8; 32]); + u32x8 [u32; 8], u64x4 [u64; 4], u16x16 [u16; 16], u8x32 [u8; 32], f32x8 [f32; 8], f64x4 [f64; 4]); interpretations!(128; i32x4 [i32; 4], i64x2 [i64; 2], i16x8 [i16; 8], i128x1 [i128; 1], i8x16 [i8; 16], - u32x4 [u32; 4], u64x2 [u64; 2], u16x8 [u16; 8], u8x16 [u8; 16]); + u32x4 [u32; 4], u64x2 [u64; 2], u16x8 [u16; 8], u8x16 [u8; 16], f32x4 [f32; 4], f64x2 [f64; 2]); interpretations!(512; u32x16 [u32; 16], u16x32 [u16; 32], i32x16 [i32; 16], i16x32 [i16; 32]); -interpretations!(64; i64x1 [i64; 1], i32x2 [i32; 2], i16x4 [i16; 4], i8x8 [i8; 8], u64x1 [u64; 1], u32x2 [u32; 2],u16x4 [u16; 4], u8x8 [u8; 8]); +interpretations!(64; i64x1 [i64; 1], i32x2 [i32; 2], i16x4 [i16; 4], i8x8 [i8; 8], u64x1 [u64; 1], u32x2 [u32; 2],u16x4 [u16; 4], u8x8 [u8; 8], f32x2 [f32; 2], f64x1 [f64; 1]); interpretations!(32; i8x4 [i8; 4], u8x4 [u8; 4]); /// Inserts an element into a vector, returning the updated vector. 
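With `MachineNumeric` now implemented for `f32` and `f64`, float lanes travel through a `BitVec` as raw IEEE-754 bit patterns; `to_u128`/`from_u128` transmute the bits and never perform a numeric conversion. A small sketch of the intended round-trip, assuming the `f32x4` interpretation generated above (illustrative only, not part of the patch):

    use testable_simd_models::abstractions::{bitvec::BitVec, simd::*};

    let v = f32x4::from_fn(|i| i as f32 + 0.5);
    let bits = BitVec::<128>::from_f32x4(v);
    let back = BitVec::<128>::to_f32x4(bits);
    for i in 0..4u32 {
        // Compare bit patterns, not float values, to keep NaN handling exact.
        assert_eq!(back[i].to_bits(), (i as f32 + 0.5).to_bits());
    }
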
From 338b109808958bd4fd68d32f50bced0c4c0fc0fb Mon Sep 17 00:00:00 2001 From: karthikbhargavan Date: Wed, 30 Jul 2025 18:05:11 -0400 Subject: [PATCH 18/47] fmt --- testable-simd-models/src/abstractions/bit.rs | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/testable-simd-models/src/abstractions/bit.rs b/testable-simd-models/src/abstractions/bit.rs index 23d5e0a6fb2ef..f8b67f2ca20f1 100644 --- a/testable-simd-models/src/abstractions/bit.rs +++ b/testable-simd-models/src/abstractions/bit.rs @@ -214,8 +214,12 @@ impl MachineNumeric for f32 { const ONES: f32 = f32::from_bits(0xffffffffu32); const MIN: f32 = f32::MIN; const MAX: f32 = f32::MAX; - fn to_u128(self) -> u128 {self.to_bits() as u128} - fn from_u128(x:u128) -> Self {f32::from_bits(x as u32)} + fn to_u128(self) -> u128 { + self.to_bits() as u128 + } + fn from_u128(x: u128) -> Self { + f32::from_bits(x as u32) + } } impl MachineNumeric for f64 { @@ -225,11 +229,14 @@ impl MachineNumeric for f64 { const ONES: f64 = f64::from_bits(0xffffffffffffffffu64); const MIN: f64 = f64::MIN; const MAX: f64 = f64::MAX; - fn to_u128(self) -> u128 {self.to_bits() as u128} - fn from_u128(x:u128) -> Self {f64::from_bits(x as u64)} + fn to_u128(self) -> u128 { + self.to_bits() as u128 + } + fn from_u128(x: u128) -> Self { + f64::from_bits(x as u64) + } } - impl Bit { pub fn nth_bit(x: T, nth: usize) -> Self { if (x.to_u128() >> nth) % 2 == 1 { From 20831984c3cbd4a2d54dc2fe19ccd273d6087d73 Mon Sep 17 00:00:00 2001 From: karthikbhargavan Date: Wed, 30 Jul 2025 18:19:22 -0400 Subject: [PATCH 19/47] restored static asserts --- testable-simd-models/src/abstractions/mod.rs | 1 + testable-simd-models/src/abstractions/simd.rs | 4 -- .../src/abstractions/utilities.rs | 59 +++++++++++++++++++ .../src/core_arch/x86/models/avx.rs | 27 ++------- .../src/core_arch/x86/models/avx2.rs | 1 + .../core_arch/x86/models/avx_handwritten.rs | 22 +++++++ .../src/core_arch/x86/models/mod.rs | 1 + .../src/core_arch/x86/models/sse2.rs | 1 + .../src/core_arch/x86/models/ssse3.rs | 2 +- 9 files changed, 92 insertions(+), 26 deletions(-) create mode 100644 testable-simd-models/src/abstractions/utilities.rs create mode 100644 testable-simd-models/src/core_arch/x86/models/avx_handwritten.rs diff --git a/testable-simd-models/src/abstractions/mod.rs b/testable-simd-models/src/abstractions/mod.rs index b3018a8189569..4f840ab60235d 100644 --- a/testable-simd-models/src/abstractions/mod.rs +++ b/testable-simd-models/src/abstractions/mod.rs @@ -24,3 +24,4 @@ pub mod bit; pub mod bitvec; pub mod funarr; pub mod simd; +pub mod utilities; diff --git a/testable-simd-models/src/abstractions/simd.rs b/testable-simd-models/src/abstractions/simd.rs index 9ef66e929b97e..2c55a9bcd74d6 100644 --- a/testable-simd-models/src/abstractions/simd.rs +++ b/testable-simd-models/src/abstractions/simd.rs @@ -940,7 +940,3 @@ pub fn simd_select>(a: T) -> U { - a.into() -} diff --git a/testable-simd-models/src/abstractions/utilities.rs b/testable-simd-models/src/abstractions/utilities.rs new file mode 100644 index 0000000000000..2e9f31f2ae8dd --- /dev/null +++ b/testable-simd-models/src/abstractions/utilities.rs @@ -0,0 +1,59 @@ +/// Converts one type to another +pub fn transmute>(a: T) -> U { + a.into() +} + +#[allow(unused)] +#[macro_export] +macro_rules! static_assert { + ($e:expr) => { + const { + assert!($e); + } + }; + ($e:expr, $msg:expr) => { + const { + assert!($e, $msg); + } + }; +} + +#[allow(unused_macros)] +#[macro_export] +macro_rules! 
static_assert_uimm_bits { + ($imm:ident, $bits:expr) => { + // `0 <= $imm` produces a warning if the immediate has an unsigned type + #[allow(unused_comparisons)] + { + static_assert!( + 0 <= $imm && $imm < (1 << $bits), + concat!( + stringify!($imm), + " doesn't fit in ", + stringify!($bits), + " bits", + ) + ) + } + }; +} + +#[allow(unused_macros)] +#[macro_export] +macro_rules! static_assert_simm_bits { + ($imm:ident, $bits:expr) => { + static_assert!( + (-1 << ($bits - 1)) - 1 <= $imm && $imm < (1 << ($bits - 1)), + concat!( + stringify!($imm), + " doesn't fit in ", + stringify!($bits), + " bits", + ) + ) + }; +} + +pub use static_assert; +pub use static_assert_uimm_bits; +pub use static_assert_simm_bits; \ No newline at end of file diff --git a/testable-simd-models/src/core_arch/x86/models/avx.rs b/testable-simd-models/src/core_arch/x86/models/avx.rs index fb7f5fe70cdbe..6c063419903cb 100644 --- a/testable-simd-models/src/core_arch/x86/models/avx.rs +++ b/testable-simd-models/src/core_arch/x86/models/avx.rs @@ -14,25 +14,10 @@ //! [wiki]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions use super::types::*; +use crate::abstractions::utilities::*; use crate::abstractions::simd::*; +use super::avx_handwritten::*; -mod c_extern { - use crate::abstractions::simd::*; - - pub fn vperm2f128si256(a: i32x8, b: i32x8, imm8: i8) -> i32x8 { - let temp = i128x2::from_fn(|i| match (imm8 as u8) >> (i * 4) { - 0 => (a[4 * i] as i128) + 16 * (a[4 * i + 1] as i128), - 1 => (a[4 * i + 2] as i128) + 16 * (a[4 * i + 3] as i128), - 2 => (b[4 * i] as i128) + 16 * (b[4 * i + 1] as i128), - 3 => (b[4 * i + 2] as i128) + 16 * (b[4 * i + 3] as i128), - _ => unreachable!(), - }); - - i32x8::from_fn(|i| (temp[if i < 4 { 0 } else { 1 }] >> (i % 4)) as i32) - } -} - -use c_extern::*; /// Blends packed single-precision (32-bit) floating-point elements from /// `a` and `b` using `c` as a mask. /// @@ -112,7 +97,7 @@ pub const _CMP_TRUE_US: i32 = 0x1f; /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_si256) pub fn _mm256_permute2f128_si256(a: __m256i, b: __m256i) -> __m256i { - // static_assert_uimm_bits!(IMM8, 8); + static_assert_uimm_bits!(IMM8, 8); vperm2f128si256(a.as_i32x8(), b.as_i32x8(), IMM8 as i8).into() } @@ -122,7 +107,7 @@ pub fn _mm256_permute2f128_si256(a: __m256i, b: __m256i) -> __m /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insertf128_si256) pub fn _mm256_insertf128_si256(a: __m256i, b: __m128i) -> __m256i { - // // static_assert_uimm_bits!(IMM1, 1); + static_assert_uimm_bits!(IMM1, 1); let dst: i64x4 = simd_shuffle( a.as_i64x4(), @@ -140,7 +125,7 @@ pub fn _mm256_insertf128_si256(a: __m256i, b: __m128i) -> __m25 // This intrinsic has no corresponding instruction. pub fn _mm256_insert_epi8(a: __m256i, i: i8) -> __m256i { - // // static_assert_uimm_bits!(INDEX, 5); + static_assert_uimm_bits!(INDEX, 5); simd_insert(a.as_i8x32(), INDEX as u32, i).into() } @@ -152,7 +137,7 @@ pub fn _mm256_insert_epi8(a: __m256i, i: i8) -> __m256i { // This intrinsic has no corresponding instruction. 
pub fn _mm256_insert_epi16(a: __m256i, i: i16) -> __m256i { - // // static_assert_uimm_bits!(INDEX, 4); + static_assert_uimm_bits!(INDEX, 4); simd_insert(a.as_i16x16(), INDEX as u32, i).into() } diff --git a/testable-simd-models/src/core_arch/x86/models/avx2.rs b/testable-simd-models/src/core_arch/x86/models/avx2.rs index 69f3650b823ca..279e5ee28c11c 100644 --- a/testable-simd-models/src/core_arch/x86/models/avx2.rs +++ b/testable-simd-models/src/core_arch/x86/models/avx2.rs @@ -19,6 +19,7 @@ //! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf //! [wiki_avx]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions //! [wiki_fma]: https://en.wikipedia.org/wiki/Fused_multiply-accumulate +use crate::abstractions::utilities::*; use crate::abstractions::simd::*; mod c_extern { diff --git a/testable-simd-models/src/core_arch/x86/models/avx_handwritten.rs b/testable-simd-models/src/core_arch/x86/models/avx_handwritten.rs new file mode 100644 index 0000000000000..9846020475c65 --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/models/avx_handwritten.rs @@ -0,0 +1,22 @@ +use crate::abstractions::simd::*; + +pub fn vperm2f128si256(a: i32x8, b: i32x8, imm8: i8) -> i32x8 { + let temp = i128x2::from_fn(|i| match (imm8 as u8) >> (i * 4) { + 0 => (a[4 * i] as i128) + 16 * (a[4 * i + 1] as i128), + 1 => (a[4 * i + 2] as i128) + 16 * (a[4 * i + 3] as i128), + 2 => (b[4 * i] as i128) + 16 * (b[4 * i + 1] as i128), + 3 => (b[4 * i + 2] as i128) + 16 * (b[4 * i + 3] as i128), + _ => unreachable!(), + }); + + i32x8::from_fn(|i| (temp[if i < 4 { 0 } else { 1 }] >> (i % 4)) as i32) +} + +pub fn ptestz256(a: i64x4, b: i64x4) -> i32 { + let c = i64x4::from_fn(|i| a[i] & b[i]); + if c == i64x4::ZERO() { + 1 + } else { + 0 + } +} \ No newline at end of file diff --git a/testable-simd-models/src/core_arch/x86/models/mod.rs b/testable-simd-models/src/core_arch/x86/models/mod.rs index 95c9eb4061b6a..bf13b10d5ac2f 100644 --- a/testable-simd-models/src/core_arch/x86/models/mod.rs +++ b/testable-simd-models/src/core_arch/x86/models/mod.rs @@ -20,6 +20,7 @@ //! In general, it is best to gain an idea of how an implementation should be written by looking //! at how other functions are implemented. Also see `core::arch::x86` for [reference](https://github.com/rust-lang/stdarch/tree/master/crates/core_arch). +pub mod avx_handwritten; pub mod avx; pub mod avx2; pub mod sse2; diff --git a/testable-simd-models/src/core_arch/x86/models/sse2.rs b/testable-simd-models/src/core_arch/x86/models/sse2.rs index ab3eba7ded5fa..cc1d1c0624c59 100644 --- a/testable-simd-models/src/core_arch/x86/models/sse2.rs +++ b/testable-simd-models/src/core_arch/x86/models/sse2.rs @@ -1,5 +1,6 @@ //! Streaming SIMD Extensions 2 (SSE2) use super::types::*; +use crate::abstractions::utilities::*; use crate::abstractions::simd::*; mod c_extern { use crate::abstractions::{bit::MachineInteger, simd::*}; diff --git a/testable-simd-models/src/core_arch/x86/models/ssse3.rs b/testable-simd-models/src/core_arch/x86/models/ssse3.rs index 7fc29262232a9..ccb7f2e799362 100644 --- a/testable-simd-models/src/core_arch/x86/models/ssse3.rs +++ b/testable-simd-models/src/core_arch/x86/models/ssse3.rs @@ -1,5 +1,5 @@ //! 
Supplemental Streaming SIMD Extensions 3 (SSSE3) - +use crate::abstractions::utilities::*; use crate::abstractions::simd::*; use super::types::*; From 2af9eaafac55df3db34aa0f9b91af7e802d26e28 Mon Sep 17 00:00:00 2001 From: karthikbhargavan Date: Wed, 30 Jul 2025 18:21:24 -0400 Subject: [PATCH 20/47] new --- .../src/abstractions/funarr.rs | 97 +++++++++++++++++++ 1 file changed, 97 insertions(+) diff --git a/testable-simd-models/src/abstractions/funarr.rs b/testable-simd-models/src/abstractions/funarr.rs index c5c5ece29f1f8..9d3b6f3de93ba 100644 --- a/testable-simd-models/src/abstractions/funarr.rs +++ b/testable-simd-models/src/abstractions/funarr.rs @@ -86,3 +86,100 @@ impl core::ops::Index for FunArray { self.get(index) } } + +impl FunArray<1, T> { + pub fn new(x: T) -> Self { + let v = [x]; + Self::from_fn(|i| v[i as usize]) + } +} + +impl FunArray<2, T> { + pub fn new(x0: T, x1: T) -> Self { + let v = [x0, x1]; + Self::from_fn(|i| v[i as usize]) + } +} + +impl FunArray<4, T> { + pub fn new(x0: T, x1: T, x2: T, x3: T) -> Self { + let v = [x0, x1, x2, x3]; + Self::from_fn(|i| v[i as usize]) + } +} + +impl FunArray<8, T> { + pub fn new(x0: T, x1: T, x2: T, x3: T, x4: T, x5: T, x6: T, x7: T) -> Self { + let v = [x0, x1, x2, x3, x4, x5, x6, x7]; + Self::from_fn(|i| v[i as usize]) + } +} + +impl FunArray<16, T> { + pub fn new( + x0: T, + x1: T, + x2: T, + x3: T, + x4: T, + x5: T, + x6: T, + x7: T, + x8: T, + x9: T, + x10: T, + x11: T, + x12: T, + x13: T, + x14: T, + x15: T, + ) -> Self { + let v = [ + x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, + ]; + Self::from_fn(|i| v[i as usize]) + } +} + +impl FunArray<32, T> { + pub fn new( + x0: T, + x1: T, + x2: T, + x3: T, + x4: T, + x5: T, + x6: T, + x7: T, + x8: T, + x9: T, + x10: T, + x11: T, + x12: T, + x13: T, + x14: T, + x15: T, + x16: T, + x17: T, + x18: T, + x19: T, + x20: T, + x21: T, + x22: T, + x23: T, + x24: T, + x25: T, + x26: T, + x27: T, + x28: T, + x29: T, + x30: T, + x31: T, + ) -> Self { + let v = [ + x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x18, + x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, x30, x31, + ]; + Self::from_fn(|i| v[i as usize]) + } +} \ No newline at end of file From 81014c1331a73bc29d9f97e09b7de2f278069a8c Mon Sep 17 00:00:00 2001 From: karthikbhargavan Date: Wed, 30 Jul 2025 18:49:44 -0400 Subject: [PATCH 21/47] using new for simd --- testable-simd-models/modelize/Cargo.toml | 9 + testable-simd-models/modelize/src/main.rs | 69 +++++ testable-simd-models/src/abstractions/simd.rs | 2 +- .../src/core_arch/x86/models/avx.rs | 238 ++++++++++++++++-- .../src/core_arch/x86/models/mod.rs | 2 + 5 files changed, 293 insertions(+), 27 deletions(-) create mode 100644 testable-simd-models/modelize/Cargo.toml create mode 100644 testable-simd-models/modelize/src/main.rs diff --git a/testable-simd-models/modelize/Cargo.toml b/testable-simd-models/modelize/Cargo.toml new file mode 100644 index 0000000000000..f1b4ab6152565 --- /dev/null +++ b/testable-simd-models/modelize/Cargo.toml @@ -0,0 +1,9 @@ +[package] +name = "modelize" +version = "0.1.0" +edition = "2024" + +[dependencies] +prettyplease = "0.2.36" +syn = { version = "2", features = ["full"] } + diff --git a/testable-simd-models/modelize/src/main.rs b/testable-simd-models/modelize/src/main.rs new file mode 100644 index 0000000000000..af19d06e1f90f --- /dev/null +++ b/testable-simd-models/modelize/src/main.rs @@ -0,0 +1,69 @@ +use syn::{parse_file, Item, File}; +use std::fs; +use std::env; + +fn 
remove_all_attributes(input_file_path: &str, handwritten_module: &str, output_file_path: &str) -> Result<(), Box> { + let source_code = fs::read_to_string(input_file_path)?; + let mut syntax_tree: File = parse_file(&source_code)?; + + syntax_tree.items.retain(|item| + match item { + Item::Use(_) => false, + _ => true + } + ); + + let use_abstractions: Item = syn::parse_quote! { + use crate::abstractions::simd::*; + }; + + let use_types: Item = syn::parse_quote! { + use super::types::*; + }; + + let use_handwritten: Item = syn::parse_quote! { + use super::avx_handwritten::*; + }; + + syntax_tree.items.insert(0, use_handwritten); + syntax_tree.items.insert(0, use_types); + syntax_tree.items.insert(0, use_abstractions); + + // Clear attributes from the file's top-level items + for item in &mut syntax_tree.items { + match item { + Item::Fn(item_fn) => { + item_fn.attrs.retain(|attr| attr.path().is_ident("doc")); + }, + Item::Struct(item_struct) => { + item_struct.attrs.clear(); + for field in &mut item_struct.fields { + field.attrs.retain(|attr| attr.path().is_ident("doc")); + } + }, + Item::Enum(item_enum) => { + item_enum.attrs.clear(); + for variant in &mut item_enum.variants { + variant.attrs.retain(|attr| attr.path().is_ident("doc")); + } + }, + // Add more cases for other Item types (e.g., Item::Mod, Item::Impl, etc.) + _ => { + // For other item types, if they have an 'attrs' field, clear it. + // This requires more specific matching or a helper trait. + } + } + } + +// let output_tokens = quote! { #syntax_tree }; + let formatted_string = prettyplease::unparse(&syntax_tree); + + fs::write(output_file_path, formatted_string)?; + + Ok(()) +} + +fn main() -> Result<(), Box> { + let args: Vec = env::args().collect(); + remove_all_attributes(&args[1], &args[2], &args[3]) +} diff --git a/testable-simd-models/src/abstractions/simd.rs b/testable-simd-models/src/abstractions/simd.rs index 2c55a9bcd74d6..962c6022ab917 100644 --- a/testable-simd-models/src/abstractions/simd.rs +++ b/testable-simd-models/src/abstractions/simd.rs @@ -926,7 +926,7 @@ pub(crate) use simd_bitmask_big; /// # Safety /// `mask` must only contain `0` and `!0`. -pub fn simd_select( +pub fn simd_select( mask: FunArray, if_true: FunArray, if_false: FunArray, diff --git a/testable-simd-models/src/core_arch/x86/models/avx.rs b/testable-simd-models/src/core_arch/x86/models/avx.rs index 6c063419903cb..0f072ffe12c9a 100644 --- a/testable-simd-models/src/core_arch/x86/models/avx.rs +++ b/testable-simd-models/src/core_arch/x86/models/avx.rs @@ -27,6 +27,15 @@ pub fn _mm256_blendv_ps(a: __m256, b: __m256, c: __m256) -> __m256 { transmute(simd_select(mask, b.as_i32x8(), a.as_i32x8())) } +/// Blends packed double-precision (64-bit) floating-point elements from +/// `a` and `b` using `c` as a mask. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blendv_pd) +pub fn _mm256_blendv_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d { + let mask: i64x4 = simd_lt(c.as_i64x4(), i64x4::ZERO()); + transmute(simd_select(mask, b.as_f64x4(), a.as_f64x4())) +} + /// Equal (ordered, non-signaling) pub const _CMP_EQ_OQ: i32 = 0x00; /// Less-than (ordered, signaling) @@ -98,23 +107,20 @@ pub const _CMP_TRUE_US: i32 = 0x1f; /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_si256) pub fn _mm256_permute2f128_si256(a: __m256i, b: __m256i) -> __m256i { static_assert_uimm_bits!(IMM8, 8); - vperm2f128si256(a.as_i32x8(), b.as_i32x8(), IMM8 as i8).into() + transmute(vperm2f128si256(a.as_i32x8(), b.as_i32x8(), IMM8 as i8)) } /// Copies `a` to result, then inserts 128 bits from `b` into result /// at the location specified by `imm8`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insertf128_si256) - pub fn _mm256_insertf128_si256(a: __m256i, b: __m128i) -> __m256i { static_assert_uimm_bits!(IMM1, 1); - let dst: i64x4 = simd_shuffle( - a.as_i64x4(), - _mm256_castsi128_si256(b).as_i64x4(), - [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize], + a.as_i64x4(), _mm256_castsi128_si256(b).as_i64x4(), [[4, 5, 2, 3], [0, 1, 4, + 5]] [IMM1 as usize], ); - dst.into() + transmute(dst) } /// Copies `a` to result, and inserts the 8-bit integer `i` into result @@ -148,12 +154,7 @@ pub fn _mm256_insert_epi16(a: __m256i, i: i16) -> __m256i { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_si256) pub fn _mm256_testz_si256(a: __m256i, b: __m256i) -> i32 { - let c = __m256i::from_fn(|i| a[i] & b[i]); - if c == __m256i::ZERO() { - 1 - } else { - 0 - } + ptestz256(a.as_i64x4(), b.as_i64x4()) } /// Sets each bit of the returned mask based on the most significant bit of the @@ -185,6 +186,149 @@ pub fn _mm256_setzero_si256() -> __m256i { __m256i::ZERO() } +/// Sets packed 8-bit integers in returned vector with the supplied values in +/// reverse order. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi8) +pub fn _mm256_setr_epi8( + e00: i8, + e01: i8, + e02: i8, + e03: i8, + e04: i8, + e05: i8, + e06: i8, + e07: i8, + e08: i8, + e09: i8, + e10: i8, + e11: i8, + e12: i8, + e13: i8, + e14: i8, + e15: i8, + e16: i8, + e17: i8, + e18: i8, + e19: i8, + e20: i8, + e21: i8, + e22: i8, + e23: i8, + e24: i8, + e25: i8, + e26: i8, + e27: i8, + e28: i8, + e29: i8, + e30: i8, + e31: i8, +) -> __m256i { + transmute( + i8x32::new( + e00, + e01, + e02, + e03, + e04, + e05, + e06, + e07, + e08, + e09, + e10, + e11, + e12, + e13, + e14, + e15, + e16, + e17, + e18, + e19, + e20, + e21, + e22, + e23, + e24, + e25, + e26, + e27, + e28, + e29, + e30, + e31, + ), + ) +} +/// Sets packed 16-bit integers in returned vector with the supplied values in +/// reverse order. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi16) +pub fn _mm256_setr_epi16( + e00: i16, + e01: i16, + e02: i16, + e03: i16, + e04: i16, + e05: i16, + e06: i16, + e07: i16, + e08: i16, + e09: i16, + e10: i16, + e11: i16, + e12: i16, + e13: i16, + e14: i16, + e15: i16, +) -> __m256i { + + transmute( + i16x16::new( + e00, + e01, + e02, + e03, + e04, + e05, + e06, + e07, + e08, + e09, + e10, + e11, + e12, + e13, + e14, + e15, + ), + ) +} +/// Sets packed 32-bit integers in returned vector with the supplied values in +/// reverse order. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi32) +pub fn _mm256_setr_epi32( + e0: i32, + e1: i32, + e2: i32, + e3: i32, + e4: i32, + e5: i32, + e6: i32, + e7: i32, +) -> __m256i { + transmute(i32x8::new(e0, e1, e2, e3, e4, e5, e6, e7)) +} +/// Sets packed 64-bit integers in returned vector with the supplied values in +/// reverse order. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi64x) +pub fn _mm256_setr_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i { + transmute(i64x4::new(a, b, c, d)) +} + /// Sets packed 8-bit integers in returned vector with the supplied values. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi8) @@ -225,13 +369,43 @@ pub fn _mm256_set_epi8( e30: i8, e31: i8, ) -> __m256i { - let vec = [ - e00, e01, e02, e03, e04, e05, e06, e07, e08, e09, e10, e11, e12, e13, e14, e15, e16, e17, - e18, e19, e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31, - ]; - transmute(i8x32::from_fn(|i| vec[(31 - i) as usize])) + _mm256_setr_epi8( + e31, + e30, + e29, + e28, + e27, + e26, + e25, + e24, + e23, + e22, + e21, + e20, + e19, + e18, + e17, + e16, + e15, + e14, + e13, + e12, + e11, + e10, + e09, + e08, + e07, + e06, + e05, + e04, + e03, + e02, + e01, + e00, + ) } + /// Sets packed 16-bit integers in returned vector with the supplied values. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi16) @@ -256,10 +430,24 @@ pub fn _mm256_set_epi16( e14: i16, e15: i16, ) -> __m256i { - let vec = [ - e00, e01, e02, e03, e04, e05, e06, e07, e08, e09, e10, e11, e12, e13, e14, e15, - ]; - transmute(i16x16::from_fn(|i| vec[(15 - i) as usize])) + _mm256_setr_epi16( + e15, + e14, + e13, + e12, + e11, + e10, + e09, + e08, + e07, + e06, + e05, + e04, + e03, + e02, + e01, + e00, + ) } /// Sets packed 32-bit integers in returned vector with the supplied values. @@ -278,8 +466,7 @@ pub fn _mm256_set_epi32( e6: i32, e7: i32, ) -> __m256i { - let vec = [e0, e1, e2, e3, e4, e5, e6, e7]; - transmute(i32x8::from_fn(|i| vec[(7 - i) as usize])) + _mm256_setr_epi32(e7, e6, e5, e4, e3, e2, e1, e0) } /// Sets packed 64-bit integers in returned vector with the supplied values. @@ -287,8 +474,7 @@ pub fn _mm256_set_epi32( /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi64x) // This intrinsic has no corresponding instruction. pub fn _mm256_set_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i { - let vec = [d, c, b, a]; - transmute(i64x4::from_fn(|i| vec[i as usize])) + _mm256_setr_epi64x(d, c, b, a) } /// Broadcasts 8-bit integer `a` to all elements of returned vector. 
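The `_mm256_set_*` models added above are defined by argument reversal on top of the corresponding `_mm256_setr_*` models. A hypothetical sanity check like the one below (not part of this patch series; the module path is assumed from the file layout `src/core_arch/x86/models/avx.rs`) makes the intended element ordering concrete: `set` lists elements from the most significant lane down, while `setr` lists them in memory order.

#[cfg(test)]
mod set_vs_setr_sketch {
    // Assumed path; adjust to wherever the avx model module actually lives.
    use crate::core_arch::x86::models::avx::{_mm256_set_epi32, _mm256_setr_epi32};

    #[test]
    fn epi32_argument_order() {
        // Reversing the argument list should yield the identical 256-bit value.
        let hi_to_lo = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
        let lo_to_hi = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
        assert_eq!(hi_to_lo, lo_to_hi);
    }
}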
diff --git a/testable-simd-models/src/core_arch/x86/models/mod.rs b/testable-simd-models/src/core_arch/x86/models/mod.rs index bf13b10d5ac2f..2e18216b69321 100644 --- a/testable-simd-models/src/core_arch/x86/models/mod.rs +++ b/testable-simd-models/src/core_arch/x86/models/mod.rs @@ -34,5 +34,7 @@ pub(crate) mod types { #[allow(non_camel_case_types)] pub type __m256 = BitVec<256>; #[allow(non_camel_case_types)] + pub type __m256d = BitVec<256>; + #[allow(non_camel_case_types)] pub type __m128i = BitVec<128>; } From d0880c1269aa90e12284a05a676376d543eb36c9 Mon Sep 17 00:00:00 2001 From: karthikbhargavan Date: Wed, 30 Jul 2025 20:21:31 -0400 Subject: [PATCH 22/47] bringing models closer --- testable-simd-models/modelize/src/main.rs | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/testable-simd-models/modelize/src/main.rs b/testable-simd-models/modelize/src/main.rs index af19d06e1f90f..f9cd0b160a5e4 100644 --- a/testable-simd-models/modelize/src/main.rs +++ b/testable-simd-models/modelize/src/main.rs @@ -1,4 +1,4 @@ -use syn::{parse_file, Item, File}; +use syn::{parse_file, Item, File, Stmt, Expr, ExprBlock}; use std::fs; use std::env; @@ -9,6 +9,7 @@ fn remove_all_attributes(input_file_path: &str, handwritten_module: &str, output syntax_tree.items.retain(|item| match item { Item::Use(_) => false, + Item::Fn(item_fn) => item_fn.sig.unsafety.is_none(), _ => true } ); @@ -34,6 +35,13 @@ fn remove_all_attributes(input_file_path: &str, handwritten_module: &str, output match item { Item::Fn(item_fn) => { item_fn.attrs.retain(|attr| attr.path().is_ident("doc")); + for stmt in &mut item_fn.block.stmts { + match stmt { + Stmt::Expr(Expr::Unsafe(u), tok) => *stmt = Stmt::Expr(Expr::Block( + ExprBlock {attrs : Vec::new(), label : None, block : u.block.clone()}), *tok), + _ => () + } + } }, Item::Struct(item_struct) => { item_struct.attrs.clear(); From 4db7a3c0e0338d976e6f4cacd1737566b3b15653 Mon Sep 17 00:00:00 2001 From: karthikbhargavan Date: Wed, 30 Jul 2025 20:34:44 -0400 Subject: [PATCH 23/47] avx2 edits --- .../src/core_arch/x86/models/avx2.rs | 890 +++--------------- .../src/core_arch/x86/models/mod.rs | 1 + 2 files changed, 135 insertions(+), 756 deletions(-) diff --git a/testable-simd-models/src/core_arch/x86/models/avx2.rs b/testable-simd-models/src/core_arch/x86/models/avx2.rs index 279e5ee28c11c..9bedd3975624d 100644 --- a/testable-simd-models/src/core_arch/x86/models/avx2.rs +++ b/testable-simd-models/src/core_arch/x86/models/avx2.rs @@ -22,631 +22,9 @@ use crate::abstractions::utilities::*; use crate::abstractions::simd::*; -mod c_extern { - use crate::abstractions::{bit::MachineInteger, simd::*}; - pub fn phaddw(a: i16x16, b: i16x16) -> i16x16 { - i16x16::from_fn(|i| { - if i < 4 { - a[2 * i].wrapping_add(a[2 * i + 1]) - } else if i < 8 { - b[2 * (i - 4)].wrapping_add(b[2 * (i - 4) + 1]) - } else if i < 12 { - a[2 * (i - 4)].wrapping_add(a[2 * (i - 4) + 1]) - } else { - b[2 * (i - 8)].wrapping_add(b[2 * (i - 8) + 1]) - } - }) - } - - pub fn phaddd(a: i32x8, b: i32x8) -> i32x8 { - i32x8::from_fn(|i| { - if i < 2 { - a[2 * i].wrapping_add(a[2 * i + 1]) - } else if i < 4 { - b[2 * (i - 2)].wrapping_add(b[2 * (i - 2) + 1]) - } else if i < 6 { - a[2 * (i - 2)].wrapping_add(a[2 * (i - 2) + 1]) - } else { - b[2 * (i - 4)].wrapping_add(b[2 * (i - 4) + 1]) - } - }) - } - - pub fn phaddsw(a: i16x16, b: i16x16) -> i16x16 { - i16x16::from_fn(|i| { - if i < 4 { - a[2 * i].saturating_add(a[2 * i + 1]) - } else if i < 8 { - b[2 * (i - 4)].saturating_add(b[2 * (i - 4) + 1]) - 
} else if i < 12 { - a[2 * (i - 4)].saturating_add(a[2 * (i - 4) + 1]) - } else { - b[2 * (i - 8)].saturating_add(b[2 * (i - 8) + 1]) - } - }) - } - - pub fn phsubw(a: i16x16, b: i16x16) -> i16x16 { - i16x16::from_fn(|i| { - if i < 4 { - a[2 * i].wrapping_sub(a[2 * i + 1]) - } else if i < 8 { - b[2 * (i - 4)].wrapping_sub(b[2 * (i - 4) + 1]) - } else if i < 12 { - a[2 * (i - 4)].wrapping_sub(a[2 * (i - 4) + 1]) - } else { - b[2 * (i - 8)].wrapping_sub(b[2 * (i - 8) + 1]) - } - }) - } - - pub fn phsubd(a: i32x8, b: i32x8) -> i32x8 { - i32x8::from_fn(|i| { - if i < 2 { - a[2 * i].wrapping_sub(a[2 * i + 1]) - } else if i < 4 { - b[2 * (i - 2)].wrapping_sub(b[2 * (i - 2) + 1]) - } else if i < 6 { - a[2 * (i - 2)].wrapping_sub(a[2 * (i - 2) + 1]) - } else { - b[2 * (i - 4)].wrapping_sub(b[2 * (i - 4) + 1]) - } - }) - } - - pub fn phsubsw(a: i16x16, b: i16x16) -> i16x16 { - i16x16::from_fn(|i| { - if i < 4 { - a[2 * i].saturating_sub(a[2 * i + 1]) - } else if i < 8 { - b[2 * (i - 4)].saturating_sub(b[2 * (i - 4) + 1]) - } else if i < 12 { - a[2 * (i - 4)].saturating_sub(a[2 * (i - 4) + 1]) - } else { - b[2 * (i - 8)].saturating_sub(b[2 * (i - 8) + 1]) - } - }) - } - pub fn pmaddwd(a: i16x16, b: i16x16) -> i32x8 { - i32x8::from_fn(|i| { - (a[2 * i] as i32) * (b[2 * i] as i32) + (a[2 * i + 1] as i32) * (b[2 * i + 1] as i32) - }) - } - - pub fn pmaddubsw(a: u8x32, b: u8x32) -> i16x16 { - i16x16::from_fn(|i| { - ((a[2 * i] as u8 as u16 as i16) * (b[2 * i] as i8 as i16)) - .saturating_add((a[2 * i + 1] as u8 as u16 as i16) * (b[2 * i + 1] as i8 as i16)) - }) - } - pub fn packsswb(a: i16x16, b: i16x16) -> i8x32 { - i8x32::from_fn(|i| { - if i < 8 { - if a[i] > (i8::MAX as i16) { - i8::MAX - } else if a[i] < (i8::MIN as i16) { - i8::MIN - } else { - a[i] as i8 - } - } else if i < 16 { - if b[i - 8] > (i8::MAX as i16) { - i8::MAX - } else if b[i - 8] < (i8::MIN as i16) { - i8::MIN - } else { - b[i - 8] as i8 - } - } else if i < 24 { - if a[i - 8] > (i8::MAX as i16) { - i8::MAX - } else if a[i - 8] < (i8::MIN as i16) { - i8::MIN - } else { - a[i - 8] as i8 - } - } else { - if b[i - 16] > (i8::MAX as i16) { - i8::MAX - } else if b[i - 16] < (i8::MIN as i16) { - i8::MIN - } else { - b[i - 16] as i8 - } - } - }) - } - - pub fn packssdw(a: i32x8, b: i32x8) -> i16x16 { - i16x16::from_fn(|i| { - if i < 4 { - if a[i] > (i16::MAX as i32) { - i16::MAX - } else if a[i] < (i16::MIN as i32) { - i16::MIN - } else { - a[i] as i16 - } - } else if i < 8 { - if b[i - 4] > (i16::MAX as i32) { - i16::MAX - } else if b[i - 4] < (i16::MIN as i32) { - i16::MIN - } else { - b[i - 4] as i16 - } - } else if i < 12 { - if a[i - 4] > (i16::MAX as i32) { - i16::MAX - } else if a[i - 4] < (i16::MIN as i32) { - i16::MIN - } else { - a[i - 4] as i16 - } - } else { - if b[i - 8] > (i16::MAX as i32) { - i16::MAX - } else if b[i - 8] < (i16::MIN as i32) { - i16::MIN - } else { - b[i - 8] as i16 - } - } - }) - } - - pub fn packuswb(a: i16x16, b: i16x16) -> u8x32 { - u8x32::from_fn(|i| { - if i < 8 { - if a[i] > (u8::MAX as i16) { - u8::MAX - } else if a[i] < (u8::MIN as i16) { - u8::MIN - } else { - a[i] as u8 - } - } else if i < 16 { - if b[i - 8] > (u8::MAX as i16) { - u8::MAX - } else if b[i - 8] < (u8::MIN as i16) { - u8::MIN - } else { - b[i - 8] as u8 - } - } else if i < 24 { - if a[i - 8] > (u8::MAX as i16) { - u8::MAX - } else if a[i - 8] < (u8::MIN as i16) { - u8::MIN - } else { - a[i - 8] as u8 - } - } else { - if b[i - 16] > (u8::MAX as i16) { - u8::MAX - } else if b[i - 16] < (u8::MIN as i16) { - u8::MIN - } else { - b[i - 16] 
as u8 - } - } - }) - } - - pub fn packusdw(a: i32x8, b: i32x8) -> u16x16 { - u16x16::from_fn(|i| { - if i < 4 { - if a[i] > (u16::MAX as i32) { - u16::MAX - } else if a[i] < (u16::MIN as i32) { - u16::MIN - } else { - a[i] as u16 - } - } else if i < 8 { - if b[i - 4] > (u16::MAX as i32) { - u16::MAX - } else if b[i - 4] < (u16::MIN as i32) { - u16::MIN - } else { - b[i - 4] as u16 - } - } else if i < 12 { - if a[i - 4] > (u16::MAX as i32) { - u16::MAX - } else if a[i - 4] < (u16::MIN as i32) { - u16::MIN - } else { - a[i - 4] as u16 - } - } else { - if b[i - 8] > (u16::MAX as i32) { - u16::MAX - } else if b[i - 8] < (u16::MIN as i32) { - u16::MIN - } else { - b[i - 8] as u16 - } - } - }) - } - - pub fn psignb(a: i8x32, b: i8x32) -> i8x32 { - i8x32::from_fn(|i| { - if b[i] < 0 { - if a[i] == i8::MIN { - a[i] - } else { - -a[i] - } - } else if b[i] > 0 { - a[i] - } else { - 0 - } - }) - } - pub fn psignw(a: i16x16, b: i16x16) -> i16x16 { - i16x16::from_fn(|i| { - if b[i] < 0 { - if a[i] == i16::MIN { - a[i] - } else { - -a[i] - } - } else if b[i] > 0 { - a[i] - } else { - 0 - } - }) - } - - pub fn psignd(a: i32x8, b: i32x8) -> i32x8 { - i32x8::from_fn(|i| { - if b[i] < 0 { - if a[i] == i32::MIN { - a[i] - } else { - -a[i] - } - } else if b[i] > 0 { - a[i] - } else { - 0 - } - }) - } - - pub fn psllw(a: i16x16, count: i16x8) -> i16x16 { - let count4 = (count[0] as u16) as u64; - let count3 = ((count[1] as u16) as u64) * 65536; - let count2 = ((count[2] as u16) as u64) * 4294967296; - let count1 = ((count[3] as u16) as u64) * 281474976710656; - let count = count1 + count2 + count3 + count4; - i16x16::from_fn(|i| { - if count > 15 { - 0 - } else { - ((a[i] as u16) << count) as i16 - } - }) - } - - pub fn pslld(a: i32x8, count: i32x4) -> i32x8 { - let count = ((count[1] as u32) as u64) * 4294967296 + ((count[0] as u32) as u64); - - i32x8::from_fn(|i| { - if count > 31 { - 0 - } else { - ((a[i] as u32) << count) as i32 - } - }) - } - pub fn psllq(a: i64x4, count: i64x2) -> i64x4 { - let count = count[0] as u32; - - i64x4::from_fn(|i| { - if count > 63 { - 0 - } else { - ((a[i] as u32) << count) as i64 - } - }) - } - - pub fn psllvd(a: i32x4, count: i32x4) -> i32x4 { - i32x4::from_fn(|i| { - if count[i] > 31 || count[i] < 0 { - 0 - } else { - ((a[i] as u32) << count[i]) as i32 - } - }) - } - pub fn psllvd256(a: i32x8, count: i32x8) -> i32x8 { - i32x8::from_fn(|i| { - if count[i] > 31 || count[i] < 0 { - 0 - } else { - ((a[i] as u32) << count[i]) as i32 - } - }) - } - - pub fn psllvq(a: i64x2, count: i64x2) -> i64x2 { - i64x2::from_fn(|i| { - if count[i] > 63 || count[i] < 0 { - 0 - } else { - ((a[i] as u32) << count[i]) as i64 - } - }) - } - pub fn psllvq256(a: i64x4, count: i64x4) -> i64x4 { - i64x4::from_fn(|i| { - if count[i] > 63 || count[i] < 0 { - 0 - } else { - ((a[i] as u32) << count[i]) as i64 - } - }) - } - - pub fn psraw(a: i16x16, count: i16x8) -> i16x16 { - let count = ((count[3] as u16) as u64) * 281474976710656 - + ((count[2] as u16) as u64) * 4294967296 - + ((count[1] as u16) as u64) * 65536 - + ((count[0] as u16) as u64); - - i16x16::from_fn(|i| { - if count > 15 { - if a[i] < 0 { - -1 - } else { - 0 - } - } else { - a[i] >> count - } - }) - } - - pub fn psrad(a: i32x8, count: i32x4) -> i32x8 { - let count = ((count[1] as u32) as u64) * 4294967296 + ((count[0] as u32) as u64); - - i32x8::from_fn(|i| { - if count > 31 { - if a[i] < 0 { - -1 - } else { - 0 - } - } else { - a[i] << count - } - }) - } - - pub fn psravd(a: i32x4, count: i32x4) -> i32x4 { - i32x4::from_fn(|i| { - if 
count[i] > 31 || count[i] < 0 { - if a[i] < 0 { - -1 - } else { - 0 - } - } else { - a[i] >> count[i] - } - }) - } - - pub fn psravd256(a: i32x8, count: i32x8) -> i32x8 { - dbg!(a, count); - i32x8::from_fn(|i| { - if count[i] > 31 || count[i] < 0 { - if a[i] < 0 { - -1 - } else { - 0 - } - } else { - a[i] >> count[i] - } - }) - } - - pub fn psrlw(a: i16x16, count: i16x8) -> i16x16 { - let count = (count[3] as u16 as u64) * 281474976710656 - + (count[2] as u16 as u64) * 4294967296 - + (count[1] as u16 as u64) * 65536 - + (count[0] as u16 as u64); - - i16x16::from_fn(|i| { - if count > 15 { - 0 - } else { - ((a[i] as u16) >> count) as i16 - } - }) - } - - pub fn psrld(a: i32x8, count: i32x4) -> i32x8 { - let count = ((count[1] as u32) as u64) * 4294967296 + ((count[0] as u32) as u64); - - i32x8::from_fn(|i| { - if count > 31 { - 0 - } else { - ((a[i] as u32) >> count) as i32 - } - }) - } - - pub fn psrlq(a: i64x4, count: i64x2) -> i64x4 { - let count: u64 = count[0] as u64; - - i64x4::from_fn(|i| { - if count > 63 { - 0 - } else { - ((a[i] as u32) >> count) as i64 - } - }) - } - - pub fn psrlvd(a: i32x4, count: i32x4) -> i32x4 { - i32x4::from_fn(|i| { - if count[i] > 31 || count[i] < 0 { - 0 - } else { - ((a[i] as u32) >> count[i]) as i32 - } - }) - } - pub fn psrlvd256(a: i32x8, count: i32x8) -> i32x8 { - i32x8::from_fn(|i| { - if count[i] > 31 || count[i] < 0 { - 0 - } else { - ((a[i] as u32) >> count[i]) as i32 - } - }) - } - - pub fn psrlvq(a: i64x2, count: i64x2) -> i64x2 { - i64x2::from_fn(|i| { - if count[i] > 63 || count[i] < 0 { - 0 - } else { - ((a[i] as u32) >> count[i]) as i64 - } - }) - } - pub fn psrlvq256(a: i64x4, count: i64x4) -> i64x4 { - i64x4::from_fn(|i| { - if count[i] > 63 || count[i] < 0 { - 0 - } else { - ((a[i] as u32) >> count[i]) as i64 - } - }) - } - - pub fn pshufb(a: u8x32, b: u8x32) -> u8x32 { - u8x32::from_fn(|i| { - if i < 16 { - if b[i] > 127 { - 0 - } else { - let index = (b[i] % 16) as u32; - a[index] - } - } else { - if b[i] > 127 { - 0 - } else { - let index = (b[i] % 16) as u32; - a[index + 16] - } - } - }) - } - - pub fn permd(a: u32x8, b: u32x8) -> u32x8 { - u32x8::from_fn(|i| { - let id = b[i] % 8; - a[id] - }) - } - - pub fn mpsadbw(a: u8x32, b: u8x32, imm8: i32) -> u16x16 { - u16x16::from_fn(|i| { - if i < 8 { - let a_offset = (((imm8 & 4) >> 2) * 4) as u32; - let b_offset = ((imm8 & 3) * 4) as u32; - let k = a_offset + i; - let l = b_offset; - ((a[k].wrapping_abs_diff(b[l]) as i8) as u8 as u16) - + ((a[k + 1].wrapping_abs_diff(b[l + 1]) as i8) as u8 as u16) - + ((a[k + 2].wrapping_abs_diff(b[l + 2]) as i8) as u8 as u16) - + ((a[k + 3].wrapping_abs_diff(b[l + 3]) as i8) as u8 as u16) - } else { - let i = i - 8; - let imm8 = imm8 >> 3; - let a_offset = (((imm8 & 4) >> 2) * 4) as u32; - let b_offset = ((imm8 & 3) * 4) as u32; - let k = a_offset + i; - let l = b_offset; - ((a[16 + k].wrapping_abs_diff(b[16 + l]) as i8) as u8 as u16) - + ((a[16 + k + 1].wrapping_abs_diff(b[16 + l + 1]) as i8) as u8 as u16) - + ((a[16 + k + 2].wrapping_abs_diff(b[16 + l + 2]) as i8) as u8 as u16) - + ((a[16 + k + 3].wrapping_abs_diff(b[16 + l + 3]) as i8) as u8 as u16) - } - }) - } - - pub fn vperm2i128(a: i64x4, b: i64x4, imm8: i8) -> i64x4 { - let a = i128x2::from_fn(|i| { - ((a[2 * i] as u64 as u128) + ((a[2 * i + 1] as u64 as u128) << 64)) as i128 - }); - let b = i128x2::from_fn(|i| { - ((b[2 * i] as u64 as u128) + ((b[2 * i + 1] as u64 as u128) << 64)) as i128 - }); - let imm8 = imm8 as u8 as u32 as i32; - let r = i128x2::from_fn(|i| { - let control = imm8 >> (i * 
4); - if (control >> 3) % 2 == 1 { - 0 - } else { - match control % 4 { - 0 => a[0], - 1 => a[1], - 2 => b[0], - 3 => b[1], - _ => unreachable!(), - } - } - }); - i64x4::from_fn(|i| { - let index = i >> 1; - let hilo = i.rem_euclid(2); - let val = r[index]; - if hilo == 0 { - i64::cast(val) - } else { - i64::cast(val >> 64) - } - }) - } - pub fn pmulhrsw(a: i16x16, b: i16x16) -> i16x16 { - i16x16::from_fn(|i| { - let temp = (a[i] as i32) * (b[i] as i32); - let temp = (temp >> 14).wrapping_add(1) >> 1; - temp as i16 - }) - } - - pub fn psadbw(a: u8x32, b: u8x32) -> u64x4 { - let tmp = u8x32::from_fn(|i| a[i].wrapping_abs_diff(b[i])); - u64x4::from_fn(|i| { - (tmp[i * 8] as u16) - .wrapping_add(tmp[i * 8 + 1] as u16) - .wrapping_add(tmp[i * 8 + 2] as u16) - .wrapping_add(tmp[i * 8 + 3] as u16) - .wrapping_add(tmp[i * 8 + 4] as u16) - .wrapping_add(tmp[i * 8 + 5] as u16) - .wrapping_add(tmp[i * 8 + 6] as u16) - .wrapping_add(tmp[i * 8 + 7] as u16) as u64 - }) - } -} -use c_extern::*; - use super::avx::*; use super::types::*; +use super::avx2_handwritten::*; /// Computes the absolute values of packed 32-bit integers in `a`. /// @@ -731,7 +109,7 @@ pub fn _mm256_adds_epi16(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epu8) pub fn _mm256_adds_epu8(a: __m256i, b: __m256i) -> __m256i { - simd_saturating_add(a.as_u8x32(), b.as_u8x32()).into() + transmute(simd_saturating_add(a.as_u8x32(), b.as_u8x32())) } /// Adds packed unsigned 16-bit integers in `a` and `b` using saturation. @@ -739,7 +117,7 @@ pub fn _mm256_adds_epu8(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epu16) pub fn _mm256_adds_epu16(a: __m256i, b: __m256i) -> __m256i { - simd_saturating_add(a.as_u16x16(), b.as_u16x16()).into() + transmute(simd_saturating_add(a.as_u16x16(), b.as_u16x16())) } /// Concatenates pairs of 16-byte blocks in `a` and `b` into a 32-byte temporary @@ -765,7 +143,7 @@ pub fn _mm256_alignr_epi8(a: __m256i, b: __m256i) -> __m256i { let b = b.as_i8x32(); if IMM8 == 16 { - return a.into(); + return transmute(a); } let r: i8x32 = match IMM8 % 16 { @@ -899,7 +277,7 @@ pub fn _mm256_alignr_epi8(a: __m256i, b: __m256i) -> __m256i { ), _ => unreachable!(), }; - r.into() + transmute(r) } /// Computes the bitwise AND of 256 bits (representing integer data) @@ -908,7 +286,7 @@ pub fn _mm256_alignr_epi8(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_and_si256) pub fn _mm256_and_si256(a: __m256i, b: __m256i) -> __m256i { - simd_and(a.as_i64x4(), b.as_i64x4()).into() + transmute(simd_and(a.as_i64x4(), b.as_i64x4())) } /// Computes the bitwise NOT of 256 bits (representing integer data) @@ -918,7 +296,7 @@ pub fn _mm256_and_si256(a: __m256i, b: __m256i) -> __m256i { pub fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i { let all_ones = _mm256_set1_epi8(-1); - simd_and(simd_xor(a.as_i64x4(), all_ones.as_i64x4()), b.as_i64x4()).into() + transmute(simd_and(simd_xor(a.as_i64x4(), all_ones.as_i64x4()), b.as_i64x4())) } /// Averages packed unsigned 16-bit integers in `a` and `b`. 
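The widening in the `_mm256_avg_epu16`/`_mm256_avg_epu8` hunks below is what keeps the rounding average exact. A scalar analogue (illustrative only, not code from this patch) shows why the cast to a wider type happens before adding 1 and halving:

fn avg_u16_scalar(a: u16, b: u16) -> u16 {
    // Widen to u32 so `a + b + 1` cannot wrap before the halving shift.
    (((a as u32) + (b as u32) + 1) >> 1) as u16
}

#[test]
fn avg_u16_scalar_rounds_up_without_overflow() {
    assert_eq!(avg_u16_scalar(u16::MAX, u16::MAX), u16::MAX);
    assert_eq!(avg_u16_scalar(0, 1), 1); // rounds toward +infinity, as VPAVGW does
}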
@@ -929,7 +307,7 @@ pub fn _mm256_avg_epu16(a: __m256i, b: __m256i) -> __m256i { let a = simd_cast::<16, _, u32>(a.as_u16x16()); let b = simd_cast::<16, _, u32>(b.as_u16x16()); let r = simd_shr(simd_add(simd_add(a, b), u32x16::splat(1)), u32x16::splat(1)); - simd_cast::<16, _, u16>(r).into() + transmute(simd_cast::<16, _, u16>(r)) } /// Averages packed unsigned 8-bit integers in `a` and `b`. @@ -940,7 +318,7 @@ pub fn _mm256_avg_epu8(a: __m256i, b: __m256i) -> __m256i { let a = simd_cast::<32, _, u16>(a.as_u8x32()); let b = simd_cast::<32, _, u16>(b.as_u8x32()); let r = simd_shr(simd_add(simd_add(a, b), u16x32::splat(1)), u16x32::splat(1)); - simd_cast::<32, _, u8>(r).into() + transmute(simd_cast::<32, _, u8>(r)) } /// Blends packed 32-bit integers from `a` and `b` using control mask `IMM4`. @@ -960,7 +338,7 @@ pub fn _mm_blend_epi32(a: __m128i, b: __m128i) -> __m128i { [3, 3, 7, 7][(IMM4 as usize >> 2) & 0b11], ], ); - r.into() + transmute(r) } /// Blends packed 32-bit integers from `a` and `b` using control mask `IMM8`. @@ -984,7 +362,7 @@ pub fn _mm256_blend_epi32(a: __m256i, b: __m256i) -> __m256i { [7, 7, 15, 15][(IMM8 as usize >> 6) & 0b11], ], ); - r.into() + transmute(r) } /// Blends packed 16-bit integers from `a` and `b` using control mask `IMM8`. @@ -1016,7 +394,7 @@ pub fn _mm256_blend_epi16(a: __m256i, b: __m256i) -> __m256i { [15, 15, 31, 31][(IMM8 as usize >> 6) & 0b11], ], ); - r.into() + transmute(r) } /// Blends packed 8-bit integers from `a` and `b` using `mask`. @@ -1024,7 +402,7 @@ pub fn _mm256_blend_epi16(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blendv_epi8) pub fn _mm256_blendv_epi8(a: __m256i, b: __m256i, mask: __m256i) -> __m256i { let mask: i8x32 = simd_lt(mask.as_i8x32(), i8x32::ZERO()); - simd_select(mask, b.as_i8x32(), a.as_i8x32()).into() + transmute(simd_select(mask, b.as_i8x32(), a.as_i8x32())) } /// Broadcasts the low packed 8-bit integer from `a` to all elements of @@ -1033,7 +411,7 @@ pub fn _mm256_blendv_epi8(a: __m256i, b: __m256i, mask: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastb_epi8) pub fn _mm_broadcastb_epi8(a: __m128i) -> __m128i { let ret = simd_shuffle(a.as_i8x16(), i8x16::ZERO(), [0_u32; 16]); - ret.into() + transmute(ret) } /// Broadcasts the low packed 8-bit integer from `a` to all elements of @@ -1042,7 +420,7 @@ pub fn _mm_broadcastb_epi8(a: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastb_epi8) pub fn _mm256_broadcastb_epi8(a: __m128i) -> __m256i { let ret = simd_shuffle(a.as_i8x16(), i8x16::ZERO(), [0_u32; 32]); - ret.into() + transmute(ret) } // N.B., `simd_shuffle4` with integer data types for `a` and `b` is @@ -1054,7 +432,7 @@ pub fn _mm256_broadcastb_epi8(a: __m128i) -> __m256i { pub fn _mm_broadcastd_epi32(a: __m128i) -> __m128i { let ret = simd_shuffle(a.as_i32x4(), i32x4::ZERO(), [0_u32; 4]); - ret.into() + transmute(ret) } // N.B., `simd_shuffle4`` with integer data types for `a` and `b` is @@ -1066,7 +444,7 @@ pub fn _mm_broadcastd_epi32(a: __m128i) -> __m128i { pub fn _mm256_broadcastd_epi32(a: __m128i) -> __m256i { let ret = simd_shuffle(a.as_i32x4(), i32x4::ZERO(), [0_u32; 8]); - ret.into() + transmute(ret) } /// Broadcasts the low packed 64-bit integer from `a` to all elements of @@ -1079,7 +457,7 @@ pub fn 
_mm256_broadcastd_epi32(a: __m128i) -> __m256i { pub fn _mm_broadcastq_epi64(a: __m128i) -> __m128i { let ret = simd_shuffle(a.as_i64x2(), a.as_i64x2(), [0_u32; 2]); - ret.into() + transmute(ret) } /// Broadcasts the low packed 64-bit integer from `a` to all elements of @@ -1089,7 +467,7 @@ pub fn _mm_broadcastq_epi64(a: __m128i) -> __m128i { pub fn _mm256_broadcastq_epi64(a: __m128i) -> __m256i { let ret = simd_shuffle(a.as_i64x2(), a.as_i64x2(), [0_u32; 4]); - ret.into() + transmute(ret) } /// Broadcasts 128 bits of integer data from a to all 128-bit lanes in @@ -1099,7 +477,7 @@ pub fn _mm256_broadcastq_epi64(a: __m128i) -> __m256i { pub fn _mm_broadcastsi128_si256(a: __m128i) -> __m256i { let ret = simd_shuffle(a.as_i64x2(), i64x2::ZERO(), [0, 1, 0, 1]); - ret.into() + transmute(ret) } // N.B., `broadcastsi128_si256` is often compiled to `vinsertf128` or @@ -1111,7 +489,7 @@ pub fn _mm_broadcastsi128_si256(a: __m128i) -> __m256i { pub fn _mm256_broadcastsi128_si256(a: __m128i) -> __m256i { let ret = simd_shuffle(a.as_i64x2(), i64x2::ZERO(), [0, 1, 0, 1]); - ret.into() + transmute(ret) } /// Broadcasts the low packed 16-bit integer from a to all elements of @@ -1121,7 +499,7 @@ pub fn _mm256_broadcastsi128_si256(a: __m128i) -> __m256i { pub fn _mm_broadcastw_epi16(a: __m128i) -> __m128i { let ret = simd_shuffle(a.as_i16x8(), i16x8::ZERO(), [0_u32; 8]); - ret.into() + transmute(ret) } /// Broadcasts the low packed 16-bit integer from a to all elements of @@ -1131,7 +509,7 @@ pub fn _mm_broadcastw_epi16(a: __m128i) -> __m128i { pub fn _mm256_broadcastw_epi16(a: __m128i) -> __m256i { let ret = simd_shuffle(a.as_i16x8(), i16x8::ZERO(), [0_u32; 16]); - ret.into() + transmute(ret) } /// Compares packed 64-bit integers in `a` and `b` for equality. @@ -1139,7 +517,7 @@ pub fn _mm256_broadcastw_epi16(a: __m128i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi64) pub fn _mm256_cmpeq_epi64(a: __m256i, b: __m256i) -> __m256i { - simd_eq(a.as_i64x4(), b.as_i64x4()).into() + transmute(simd_eq(a.as_i64x4(), b.as_i64x4())) } /// Compares packed 32-bit integers in `a` and `b` for equality. @@ -1147,7 +525,7 @@ pub fn _mm256_cmpeq_epi64(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi32) pub fn _mm256_cmpeq_epi32(a: __m256i, b: __m256i) -> __m256i { - simd_eq(a.as_i32x8(), b.as_i32x8()).into() + transmute(simd_eq(a.as_i32x8(), b.as_i32x8())) } /// Compares packed 16-bit integers in `a` and `b` for equality. @@ -1155,7 +533,7 @@ pub fn _mm256_cmpeq_epi32(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi16) pub fn _mm256_cmpeq_epi16(a: __m256i, b: __m256i) -> __m256i { - simd_eq(a.as_i16x16(), b.as_i16x16()).into() + transmute(simd_eq(a.as_i16x16(), b.as_i16x16())) } /// Compares packed 8-bit integers in `a` and `b` for equality. @@ -1163,7 +541,7 @@ pub fn _mm256_cmpeq_epi16(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi8) pub fn _mm256_cmpeq_epi8(a: __m256i, b: __m256i) -> __m256i { - simd_eq(a.as_i8x32(), b.as_i8x32()).into() + transmute(simd_eq(a.as_i8x32(), b.as_i8x32())) } /// Compares packed 64-bit integers in `a` and `b` for greater-than. 
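The comparison models above and below return a full mask per element: lanes where the predicate holds become all ones, the rest become all zeros. A hypothetical check (not in this patch; module paths assumed from the file layout) pins that down for `_mm256_cmpeq_epi32`:

#[cfg(test)]
mod cmpeq_mask_sketch {
    // Assumed paths for the model modules.
    use crate::core_arch::x86::models::avx::{_mm256_set1_epi8, _mm256_setr_epi32};
    use crate::core_arch::x86::models::avx2::_mm256_cmpeq_epi32;

    #[test]
    fn equal_lanes_become_all_ones() {
        let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        // Every lane compares equal, so every lane is -1; as a bit pattern
        // that is the same 256-bit value as broadcasting the byte -1.
        assert_eq!(_mm256_cmpeq_epi32(a, a), _mm256_set1_epi8(-1));
    }
}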
@@ -1171,7 +549,7 @@ pub fn _mm256_cmpeq_epi8(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi64) pub fn _mm256_cmpgt_epi64(a: __m256i, b: __m256i) -> __m256i { - simd_gt(a.as_i64x4(), b.as_i64x4()).into() + transmute(simd_gt(a.as_i64x4(), b.as_i64x4())) } /// Compares packed 32-bit integers in `a` and `b` for greater-than. @@ -1179,7 +557,7 @@ pub fn _mm256_cmpgt_epi64(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi32) pub fn _mm256_cmpgt_epi32(a: __m256i, b: __m256i) -> __m256i { - simd_gt(a.as_i32x8(), b.as_i32x8()).into() + transmute(simd_gt(a.as_i32x8(), b.as_i32x8())) } /// Compares packed 16-bit integers in `a` and `b` for greater-than. @@ -1187,7 +565,7 @@ pub fn _mm256_cmpgt_epi32(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi16) pub fn _mm256_cmpgt_epi16(a: __m256i, b: __m256i) -> __m256i { - simd_gt(a.as_i16x16(), b.as_i16x16()).into() + transmute(simd_gt(a.as_i16x16(), b.as_i16x16())) } /// Compares packed 8-bit integers in `a` and `b` for greater-than. @@ -1195,7 +573,7 @@ pub fn _mm256_cmpgt_epi16(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi8) pub fn _mm256_cmpgt_epi8(a: __m256i, b: __m256i) -> __m256i { - simd_gt(a.as_i8x32(), b.as_i8x32()).into() + transmute(simd_gt(a.as_i8x32(), b.as_i8x32())) } /// Sign-extend 16-bit integers to 32-bit integers. @@ -1203,7 +581,7 @@ pub fn _mm256_cmpgt_epi8(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi16_epi32) pub fn _mm256_cvtepi16_epi32(a: __m128i) -> __m256i { - simd_cast::<8, _, i32>(a.as_i16x8()).into() + transmute(simd_cast::<8, _, i32>(a.as_i16x8())) } /// Sign-extend 16-bit integers to 64-bit integers. @@ -1213,7 +591,7 @@ pub fn _mm256_cvtepi16_epi32(a: __m128i) -> __m256i { pub fn _mm256_cvtepi16_epi64(a: __m128i) -> __m256i { let a = a.as_i16x8(); let v64: i16x4 = simd_shuffle(a, a, [0, 1, 2, 3]); - simd_cast::<4, i16, i64>(v64).into() + transmute(simd_cast::<4, i16, i64>(v64)) } /// Sign-extend 32-bit integers to 64-bit integers. @@ -1221,7 +599,7 @@ pub fn _mm256_cvtepi16_epi64(a: __m128i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi32_epi64) pub fn _mm256_cvtepi32_epi64(a: __m128i) -> __m256i { - simd_cast::<4, i32, i64>(a.as_i32x4()).into() + transmute(simd_cast::<4, i32, i64>(a.as_i32x4())) } /// Sign-extend 8-bit integers to 16-bit integers. @@ -1229,7 +607,7 @@ pub fn _mm256_cvtepi32_epi64(a: __m128i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi8_epi16) pub fn _mm256_cvtepi8_epi16(a: __m128i) -> __m256i { - simd_cast::<16, i8, i16>(a.as_i8x16()).into() + transmute(simd_cast::<16, i8, i16>(a.as_i8x16())) } /// Sign-extend 8-bit integers to 32-bit integers. 
@@ -1239,7 +617,7 @@ pub fn _mm256_cvtepi8_epi16(a: __m128i) -> __m256i { pub fn _mm256_cvtepi8_epi32(a: __m128i) -> __m256i { let a = a.as_i8x16(); let v64: i8x8 = simd_shuffle(a, a, [0, 1, 2, 3, 4, 5, 6, 7]); - simd_cast::<8, i8, i32>(v64).into() + transmute(simd_cast::<8, i8, i32>(v64)) } /// Sign-extend 8-bit integers to 64-bit integers. @@ -1248,7 +626,7 @@ pub fn _mm256_cvtepi8_epi32(a: __m128i) -> __m256i { pub fn _mm256_cvtepi8_epi64(a: __m128i) -> __m256i { let a = a.as_i8x16(); let v32: i8x4 = simd_shuffle(a, a, [0, 1, 2, 3]); - simd_cast::<4, i8, i64>(v32).into() + transmute(simd_cast::<4, i8, i64>(v32)) } /// Zeroes extend packed unsigned 16-bit integers in `a` to packed 32-bit @@ -1257,7 +635,7 @@ pub fn _mm256_cvtepi8_epi64(a: __m128i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu16_epi32) pub fn _mm256_cvtepu16_epi32(a: __m128i) -> __m256i { - simd_cast::<8, u16, u32>(a.as_u16x8()).into() + transmute(simd_cast::<8, u16, u32>(a.as_u16x8())) } /// Zero-extend the lower four unsigned 16-bit integers in `a` to 64-bit @@ -1268,7 +646,7 @@ pub fn _mm256_cvtepu16_epi32(a: __m128i) -> __m256i { pub fn _mm256_cvtepu16_epi64(a: __m128i) -> __m256i { let a = a.as_u16x8(); let v64: u16x4 = simd_shuffle(a, a, [0, 1, 2, 3]); - simd_cast::<4, u16, u64>(v64).into() + transmute(simd_cast::<4, u16, u64>(v64)) } /// Zero-extend unsigned 32-bit integers in `a` to 64-bit integers. @@ -1276,7 +654,7 @@ pub fn _mm256_cvtepu16_epi64(a: __m128i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu32_epi64) pub fn _mm256_cvtepu32_epi64(a: __m128i) -> __m256i { - simd_cast::<4, u32, u64>(a.as_u32x4()).into() + transmute(simd_cast::<4, u32, u64>(a.as_u32x4())) } /// Zero-extend unsigned 8-bit integers in `a` to 16-bit integers. @@ -1284,7 +662,7 @@ pub fn _mm256_cvtepu32_epi64(a: __m128i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu8_epi16) pub fn _mm256_cvtepu8_epi16(a: __m128i) -> __m256i { - simd_cast::<16, u8, u16>(a.as_u8x16()).into() + transmute(simd_cast::<16, u8, u16>(a.as_u8x16())) } /// Zero-extend the lower eight unsigned 8-bit integers in `a` to 32-bit @@ -1295,7 +673,7 @@ pub fn _mm256_cvtepu8_epi16(a: __m128i) -> __m256i { pub fn _mm256_cvtepu8_epi32(a: __m128i) -> __m256i { let a = a.as_u8x16(); let v64: u8x8 = simd_shuffle(a, a, [0, 1, 2, 3, 4, 5, 6, 7]); - simd_cast::<8, u8, u32>(v64).into() + transmute(simd_cast::<8, u8, u32>(v64)) } /// Zero-extend the lower four unsigned 8-bit integers in `a` to 64-bit @@ -1306,7 +684,7 @@ pub fn _mm256_cvtepu8_epi32(a: __m128i) -> __m256i { pub fn _mm256_cvtepu8_epi64(a: __m128i) -> __m256i { let a = a.as_u8x16(); let v32: u8x4 = simd_shuffle(a, a, [0, 1, 2, 3]); - simd_cast::<4, u8, u64>(v32).into() + transmute(simd_cast::<4, u8, u64>(v32)) } /// Extracts 128 bits (of integer data) from `a` selected with `IMM1`. @@ -1317,7 +695,7 @@ pub fn _mm256_extracti128_si256(a: __m256i) -> __m128i { let a = a.as_i64x4(); let b = i64x4::from_fn(|_| 0); let dst: i64x2 = simd_shuffle(a, b, [[0, 1], [2, 3]][IMM1 as usize]); - dst.into() + transmute(dst) } /// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b`. 
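The horizontal-add models that follow interleave their results per 128-bit lane: within each lane, the pair-sums of `a` come first, then the pair-sums of `b`. A hypothetical example of that layout for `_mm256_hadd_epi16` (not part of the patch; module paths assumed):

#[cfg(test)]
mod hadd_layout_sketch {
    use crate::core_arch::x86::models::avx::_mm256_setr_epi16;
    use crate::core_arch::x86::models::avx2::_mm256_hadd_epi16;

    #[test]
    fn hadd_epi16_lane_layout() {
        let a = _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let b = _mm256_setr_epi16(
            100, 100, 100, 100, 100, 100, 100, 100,
            100, 100, 100, 100, 100, 100, 100, 100,
        );
        // Lane 0: pair-sums of a[0..8], then of b[0..8]; lane 1 repeats the
        // pattern for the upper halves of `a` and `b`.
        let expected = _mm256_setr_epi16(
            1, 5, 9, 13, 200, 200, 200, 200,
            17, 21, 25, 29, 200, 200, 200, 200,
        );
        assert_eq!(_mm256_hadd_epi16(a, b), expected);
    }
}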
@@ -1325,7 +703,7 @@ pub fn _mm256_extracti128_si256(a: __m256i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_epi16) pub fn _mm256_hadd_epi16(a: __m256i, b: __m256i) -> __m256i { - phaddw(a.as_i16x16(), b.as_i16x16()).into() + transmute(phaddw(a.as_i16x16(), b.as_i16x16())) } /// Horizontally adds adjacent pairs of 32-bit integers in `a` and `b`. @@ -1333,7 +711,7 @@ pub fn _mm256_hadd_epi16(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_epi32) pub fn _mm256_hadd_epi32(a: __m256i, b: __m256i) -> __m256i { - phaddd(a.as_i32x8(), b.as_i32x8()).into() + transmute(phaddd(a.as_i32x8(), b.as_i32x8())) } /// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b` @@ -1342,7 +720,7 @@ pub fn _mm256_hadd_epi32(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadds_epi16) pub fn _mm256_hadds_epi16(a: __m256i, b: __m256i) -> __m256i { - phaddsw(a.as_i16x16(), b.as_i16x16()).into() + transmute(phaddsw(a.as_i16x16(), b.as_i16x16())) } /// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`. @@ -1350,7 +728,7 @@ pub fn _mm256_hadds_epi16(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_epi16) pub fn _mm256_hsub_epi16(a: __m256i, b: __m256i) -> __m256i { - phsubw(a.as_i16x16(), b.as_i16x16()).into() + transmute(phsubw(a.as_i16x16(), b.as_i16x16())) } /// Horizontally subtract adjacent pairs of 32-bit integers in `a` and `b`. @@ -1358,7 +736,7 @@ pub fn _mm256_hsub_epi16(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_epi32) pub fn _mm256_hsub_epi32(a: __m256i, b: __m256i) -> __m256i { - phsubd(a.as_i32x8(), b.as_i32x8()).into() + transmute(phsubd(a.as_i32x8(), b.as_i32x8())) } /// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b` @@ -1367,7 +745,7 @@ pub fn _mm256_hsub_epi32(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsubs_epi16) pub fn _mm256_hsubs_epi16(a: __m256i, b: __m256i) -> __m256i { - phsubsw(a.as_i16x16(), b.as_i16x16()).into() + transmute(phsubsw(a.as_i16x16(), b.as_i16x16())) } /// Copies `a` to `dst`, then insert 128 bits (of integer data) from `b` at the @@ -1379,7 +757,7 @@ pub fn _mm256_inserti128_si256(a: __m256i, b: __m128i) -> __m25 let a = a.as_i64x4(); let b = _mm256_castsi128_si256(b).as_i64x4(); let dst: i64x4 = simd_shuffle(a, b, [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize]); - dst.into() + transmute(dst) } /// Multiplies packed signed 16-bit integers in `a` and `b`, producing @@ -1389,7 +767,7 @@ pub fn _mm256_inserti128_si256(a: __m256i, b: __m128i) -> __m25 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_madd_epi16) pub fn _mm256_madd_epi16(a: __m256i, b: __m256i) -> __m256i { - pmaddwd(a.as_i16x16(), b.as_i16x16()).into() + transmute(pmaddwd(a.as_i16x16(), b.as_i16x16())) } /// Vertically multiplies each unsigned 8-bit integer from `a` with the @@ -1400,7 +778,7 @@ pub fn _mm256_madd_epi16(a: __m256i, b: __m256i) -> __m256i { /// [Intel's 
documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maddubs_epi16) pub fn _mm256_maddubs_epi16(a: __m256i, b: __m256i) -> __m256i { - pmaddubsw(a.as_u8x32(), b.as_u8x32()).into() + transmute(pmaddubsw(a.as_u8x32(), b.as_u8x32())) } /// Compares packed 16-bit integers in `a` and `b`, and returns the packed @@ -1411,7 +789,7 @@ pub fn _mm256_maddubs_epi16(a: __m256i, b: __m256i) -> __m256i { pub fn _mm256_max_epi16(a: __m256i, b: __m256i) -> __m256i { let a = a.as_i16x16(); let b = b.as_i16x16(); - simd_select::<16, i16, _>(simd_gt(a, b), a, b).into() + transmute(simd_select::<16, i16, _>(simd_gt(a, b), a, b)) } /// Compares packed 32-bit integers in `a` and `b`, and returns the packed @@ -1422,7 +800,7 @@ pub fn _mm256_max_epi16(a: __m256i, b: __m256i) -> __m256i { pub fn _mm256_max_epi32(a: __m256i, b: __m256i) -> __m256i { let a = a.as_i32x8(); let b = b.as_i32x8(); - simd_select::<8, i32, _>(simd_gt(a, b), a, b).into() + transmute(simd_select::<8, i32, _>(simd_gt(a, b), a, b)) } /// Compares packed 8-bit integers in `a` and `b`, and returns the packed @@ -1433,7 +811,7 @@ pub fn _mm256_max_epi32(a: __m256i, b: __m256i) -> __m256i { pub fn _mm256_max_epi8(a: __m256i, b: __m256i) -> __m256i { let a = a.as_i8x32(); let b = b.as_i8x32(); - simd_select::<32, i8, _>(simd_gt(a, b), a, b).into() + transmute(simd_select::<32, i8, _>(simd_gt(a, b), a, b)) } /// Compares packed unsigned 16-bit integers in `a` and `b`, and returns @@ -1444,7 +822,7 @@ pub fn _mm256_max_epi8(a: __m256i, b: __m256i) -> __m256i { pub fn _mm256_max_epu16(a: __m256i, b: __m256i) -> __m256i { let a = a.as_u16x16(); let b = b.as_u16x16(); - simd_select::<16, _, u16>(simd_gt(a, b), a, b).into() + transmute(simd_select::<16, _, u16>(simd_gt(a, b), a, b)) } /// Compares packed unsigned 32-bit integers in `a` and `b`, and returns @@ -1455,7 +833,7 @@ pub fn _mm256_max_epu16(a: __m256i, b: __m256i) -> __m256i { pub fn _mm256_max_epu32(a: __m256i, b: __m256i) -> __m256i { let a = a.as_u32x8(); let b = b.as_u32x8(); - simd_select::<8, _, u32>(simd_gt(a, b), a, b).into() + transmute(simd_select::<8, _, u32>(simd_gt(a, b), a, b)) } /// Compares packed unsigned 8-bit integers in `a` and `b`, and returns @@ -1466,7 +844,7 @@ pub fn _mm256_max_epu32(a: __m256i, b: __m256i) -> __m256i { pub fn _mm256_max_epu8(a: __m256i, b: __m256i) -> __m256i { let a = a.as_u8x32(); let b = b.as_u8x32(); - simd_select::<32, _, u8>(simd_gt(a, b), a, b).into() + transmute(simd_select::<32, _, u8>(simd_gt(a, b), a, b)) } /// Compares packed 16-bit integers in `a` and `b`, and returns the packed @@ -1477,7 +855,7 @@ pub fn _mm256_max_epu8(a: __m256i, b: __m256i) -> __m256i { pub fn _mm256_min_epi16(a: __m256i, b: __m256i) -> __m256i { let a = a.as_i16x16(); let b = b.as_i16x16(); - simd_select::<16, _, i16>(simd_lt(a, b), a, b).into() + transmute(simd_select::<16, _, i16>(simd_lt(a, b), a, b)) } /// Compares packed 32-bit integers in `a` and `b`, and returns the packed @@ -1488,7 +866,7 @@ pub fn _mm256_min_epi16(a: __m256i, b: __m256i) -> __m256i { pub fn _mm256_min_epi32(a: __m256i, b: __m256i) -> __m256i { let a = a.as_i32x8(); let b = b.as_i32x8(); - simd_select::<8, i32, _>(simd_lt(a, b), a, b).into() + transmute(simd_select::<8, i32, _>(simd_lt(a, b), a, b)) } /// Compares packed 8-bit integers in `a` and `b`, and returns the packed @@ -1499,7 +877,7 @@ pub fn _mm256_min_epi32(a: __m256i, b: __m256i) -> __m256i { pub fn _mm256_min_epi8(a: __m256i, b: __m256i) -> __m256i { let a = a.as_i8x32(); let b = 
b.as_i8x32(); - simd_select::<32, i8, _>(simd_lt(a, b), a, b).into() + transmute(simd_select::<32, i8, _>(simd_lt(a, b), a, b)) } /// Compares packed unsigned 16-bit integers in `a` and `b`, and returns @@ -1510,7 +888,7 @@ pub fn _mm256_min_epi8(a: __m256i, b: __m256i) -> __m256i { pub fn _mm256_min_epu16(a: __m256i, b: __m256i) -> __m256i { let a = a.as_u16x16(); let b = b.as_u16x16(); - simd_select::<16, _, u16>(simd_lt(a, b), a, b).into() + transmute(simd_select::<16, _, u16>(simd_lt(a, b), a, b)) } /// Compares packed unsigned 32-bit integers in `a` and `b`, and returns @@ -1521,7 +899,7 @@ pub fn _mm256_min_epu16(a: __m256i, b: __m256i) -> __m256i { pub fn _mm256_min_epu32(a: __m256i, b: __m256i) -> __m256i { let a = a.as_u32x8(); let b = b.as_u32x8(); - simd_select::<8, _, u32>(simd_lt(a, b), a, b).into() + transmute(simd_select::<8, _, u32>(simd_lt(a, b), a, b)) } /// Compares packed unsigned 8-bit integers in `a` and `b`, and returns @@ -1532,7 +910,7 @@ pub fn _mm256_min_epu32(a: __m256i, b: __m256i) -> __m256i { pub fn _mm256_min_epu8(a: __m256i, b: __m256i) -> __m256i { let a = a.as_u8x32(); let b = b.as_u8x32(); - simd_select::<32, _, u8>(simd_lt(a, b), a, b).into() + transmute(simd_select::<32, _, u8>(simd_lt(a, b), a, b)) } /// Creates mask from the most significant bit of each 8-bit element in `a`, @@ -1558,7 +936,7 @@ pub fn _mm256_movemask_epi8(a: __m256i) -> i32 { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mpsadbw_epu8) pub fn _mm256_mpsadbw_epu8(a: __m256i, b: __m256i) -> __m256i { - mpsadbw(a.as_u8x32(), b.as_u8x32(), IMM8).into() + transmute(mpsadbw(a.as_u8x32(), b.as_u8x32(), IMM8)) } /// Multiplies the low 32-bit integers from each packed 64-bit element in @@ -1571,7 +949,7 @@ pub fn _mm256_mpsadbw_epu8(a: __m256i, b: __m256i) -> __m256i { pub fn _mm256_mul_epi32(a: __m256i, b: __m256i) -> __m256i { let a = simd_cast::<4, _, i64>(simd_cast::<4, _, i32>(a.as_i64x4())); let b = simd_cast::<4, _, i64>(simd_cast::<4, _, i32>(b.as_i64x4())); - simd_mul(a, b).into() + transmute(simd_mul(a, b)) } /// Multiplies the low unsigned 32-bit integers from each packed 64-bit @@ -1585,7 +963,7 @@ pub fn _mm256_mul_epu32(a: __m256i, b: __m256i) -> __m256i { let a = a.as_u64x4(); let b = b.as_u64x4(); let mask = u64x4::splat(u32::MAX.into()); - __m256i::from_u64x4(simd_mul(simd_and(a, mask), simd_and(b, mask))) + transmute(simd_mul(simd_and(a, mask), simd_and(b, mask))) } /// Multiplies the packed 16-bit integers in `a` and `b`, producing @@ -1598,7 +976,7 @@ pub fn _mm256_mulhi_epi16(a: __m256i, b: __m256i) -> __m256i { let a = simd_cast::<16, _, i32>(a.as_i16x16()); let b = simd_cast::<16, _, i32>(b.as_i16x16()); let r = simd_shr(simd_mul(a, b), i32x16::splat(16)); - simd_cast::<16, i32, i16>(r).into() + transmute(simd_cast::<16, i32, i16>(r)) } /// Multiplies the packed unsigned 16-bit integers in `a` and `b`, producing @@ -1611,7 +989,7 @@ pub fn _mm256_mulhi_epu16(a: __m256i, b: __m256i) -> __m256i { let a = simd_cast::<16, _, u32>(a.as_u16x16()); let b = simd_cast::<16, _, u32>(b.as_u16x16()); let r = simd_shr(simd_mul(a, b), u32x16::splat(16)); - simd_cast::<16, u32, u16>(r).into() + transmute(simd_cast::<16, u32, u16>(r)) } /// Multiplies the packed 16-bit integers in `a` and `b`, producing @@ -1621,7 +999,7 @@ pub fn _mm256_mulhi_epu16(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mullo_epi16) pub fn 
_mm256_mullo_epi16(a: __m256i, b: __m256i) -> __m256i { - simd_mul(a.as_i16x16(), b.as_i16x16()).into() + transmute(simd_mul(a.as_i16x16(), b.as_i16x16())) } /// Multiplies the packed 32-bit integers in `a` and `b`, producing @@ -1631,7 +1009,7 @@ pub fn _mm256_mullo_epi16(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mullo_epi32) pub fn _mm256_mullo_epi32(a: __m256i, b: __m256i) -> __m256i { - simd_mul(a.as_i32x8(), b.as_i32x8()).into() + transmute(simd_mul(a.as_i32x8(), b.as_i32x8())) } /// Multiplies packed 16-bit integers in `a` and `b`, producing @@ -1642,7 +1020,7 @@ pub fn _mm256_mullo_epi32(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhrs_epi16) pub fn _mm256_mulhrs_epi16(a: __m256i, b: __m256i) -> __m256i { - pmulhrsw(a.as_i16x16(), b.as_i16x16()).into() + transmute(pmulhrsw(a.as_i16x16(), b.as_i16x16())) } /// Computes the bitwise OR of 256 bits (representing integer data) in `a` @@ -1651,7 +1029,7 @@ pub fn _mm256_mulhrs_epi16(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_or_si256) pub fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i { - simd_or(a.as_i32x8(), b.as_i32x8()).into() + transmute(simd_or(a.as_i32x8(), b.as_i32x8())) } /// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers @@ -1660,7 +1038,7 @@ pub fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi16) pub fn _mm256_packs_epi16(a: __m256i, b: __m256i) -> __m256i { - packsswb(a.as_i16x16(), b.as_i16x16()).into() + transmute(packsswb(a.as_i16x16(), b.as_i16x16())) } /// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers @@ -1669,7 +1047,7 @@ pub fn _mm256_packs_epi16(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi32) pub fn _mm256_packs_epi32(a: __m256i, b: __m256i) -> __m256i { - packssdw(a.as_i32x8(), b.as_i32x8()).into() + transmute(packssdw(a.as_i32x8(), b.as_i32x8())) } /// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers @@ -1678,7 +1056,7 @@ pub fn _mm256_packs_epi32(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi16) pub fn _mm256_packus_epi16(a: __m256i, b: __m256i) -> __m256i { - packuswb(a.as_i16x16(), b.as_i16x16()).into() + transmute(packuswb(a.as_i16x16(), b.as_i16x16())) } /// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers @@ -1687,7 +1065,7 @@ pub fn _mm256_packus_epi16(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi32) pub fn _mm256_packus_epi32(a: __m256i, b: __m256i) -> __m256i { - packusdw(a.as_i32x8(), b.as_i32x8()).into() + transmute(packusdw(a.as_i32x8(), b.as_i32x8())) } /// Permutes packed 32-bit integers from `a` according to the content of `b`. 
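`_mm256_permutevar8x32_epi32` in the hunk below is modeled via `permd`, i.e. `result[i] = a[b[i] % 8]`, so an index vector counting down from 7 simply reverses the eight lanes. A hypothetical check (not in this patch; module paths assumed):

#[cfg(test)]
mod permutevar_sketch {
    use crate::core_arch::x86::models::avx::_mm256_setr_epi32;
    use crate::core_arch::x86::models::avx2::_mm256_permutevar8x32_epi32;

    #[test]
    fn reversing_index_vector_reverses_lanes() {
        let a = _mm256_setr_epi32(10, 11, 12, 13, 14, 15, 16, 17);
        let idx = _mm256_setr_epi32(7, 6, 5, 4, 3, 2, 1, 0);
        // Each output lane i is a[idx[i] % 8].
        assert_eq!(
            _mm256_permutevar8x32_epi32(a, idx),
            _mm256_setr_epi32(17, 16, 15, 14, 13, 12, 11, 10)
        );
    }
}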
@@ -1698,7 +1076,7 @@ pub fn _mm256_packus_epi32(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar8x32_epi32) pub fn _mm256_permutevar8x32_epi32(a: __m256i, b: __m256i) -> __m256i { - permd(a.as_u32x8(), b.as_u32x8()).into() + transmute(permd(a.as_u32x8(), b.as_u32x8())) } /// Permutes 64-bit integers from `a` using control mask `imm8`. @@ -1717,7 +1095,7 @@ pub fn _mm256_permute4x64_epi64(a: __m256i) -> __m256i { (IMM8 as u32 >> 6) & 0b11, ], ); - r.into() + transmute(r) } /// Shuffles 128-bits of integer data selected by `imm8` from `a` and `b`. @@ -1725,7 +1103,7 @@ pub fn _mm256_permute4x64_epi64(a: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2x128_si256) pub fn _mm256_permute2x128_si256(a: __m256i, b: __m256i) -> __m256i { - vperm2i128(a.as_i64x4(), b.as_i64x4(), IMM8 as i8).into() + transmute(vperm2i128(a.as_i64x4(), b.as_i64x4(), IMM8 as i8)) } /// Computes the absolute differences of packed unsigned 8-bit integers in `a` @@ -1736,7 +1114,7 @@ pub fn _mm256_permute2x128_si256(a: __m256i, b: __m256i) -> __m /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sad_epu8) pub fn _mm256_sad_epu8(a: __m256i, b: __m256i) -> __m256i { - psadbw(a.as_u8x32(), b.as_u8x32()).into() + transmute(psadbw(a.as_u8x32(), b.as_u8x32())) } /// Shuffles bytes from `a` according to the content of `b`. @@ -1769,7 +1147,7 @@ pub fn _mm256_sad_epu8(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_epi8) pub fn _mm256_shuffle_epi8(a: __m256i, b: __m256i) -> __m256i { - pshufb(a.as_u8x32(), b.as_u8x32()).into() + transmute(pshufb(a.as_u8x32(), b.as_u8x32())) } /// Shuffles 32-bit integers in 128-bit lanes of `a` using the control in @@ -1791,7 +1169,7 @@ pub fn _mm256_shuffle_epi32(a: __m256i) -> __m256i { ((MASK as u32 >> 6) & 0b11) + 4, ], ); - r.into() + transmute(r) } /// Shuffles 16-bit integers in the high 64 bits of 128-bit lanes of `a` using @@ -1824,7 +1202,7 @@ pub fn _mm256_shufflehi_epi16(a: __m256i) -> __m256i { 12 + ((IMM8 as u32 >> 6) & 0b11), ], ); - r.into() + transmute(r) } /// Shuffles 16-bit integers in the low 64 bits of 128-bit lanes of `a` using @@ -1857,7 +1235,7 @@ pub fn _mm256_shufflelo_epi16(a: __m256i) -> __m256i { 15, ], ); - r.into() + transmute(r) } /// Negates packed 16-bit integers in `a` when the corresponding signed @@ -1867,7 +1245,7 @@ pub fn _mm256_shufflelo_epi16(a: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sign_epi16) pub fn _mm256_sign_epi16(a: __m256i, b: __m256i) -> __m256i { - psignw(a.as_i16x16(), b.as_i16x16()).into() + transmute(psignw(a.as_i16x16(), b.as_i16x16())) } /// Negates packed 32-bit integers in `a` when the corresponding signed @@ -1877,7 +1255,7 @@ pub fn _mm256_sign_epi16(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sign_epi32) pub fn _mm256_sign_epi32(a: __m256i, b: __m256i) -> __m256i { - psignd(a.as_i32x8(), b.as_i32x8()).into() + transmute(psignd(a.as_i32x8(), b.as_i32x8())) } /// Negates packed 8-bit integers in `a` when the corresponding signed @@ -1887,7 +1265,7 @@ pub fn 
_mm256_sign_epi32(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sign_epi8) pub fn _mm256_sign_epi8(a: __m256i, b: __m256i) -> __m256i { - psignb(a.as_i8x32(), b.as_i8x32()).into() + transmute(psignb(a.as_i8x32(), b.as_i8x32())) } /// Shifts packed 16-bit integers in `a` left by `count` while @@ -1896,7 +1274,7 @@ pub fn _mm256_sign_epi8(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sll_epi16) pub fn _mm256_sll_epi16(a: __m256i, count: __m128i) -> __m256i { - psllw(a.as_i16x16(), count.as_i16x8()).into() + transmute(psllw(a.as_i16x16(), count.as_i16x8())) } /// Shifts packed 32-bit integers in `a` left by `count` while @@ -1905,7 +1283,7 @@ pub fn _mm256_sll_epi16(a: __m256i, count: __m128i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sll_epi32) pub fn _mm256_sll_epi32(a: __m256i, count: __m128i) -> __m256i { - pslld(a.as_i32x8(), count.as_i32x4()).into() + transmute(pslld(a.as_i32x8(), count.as_i32x4())) } /// Shifts packed 64-bit integers in `a` left by `count` while @@ -1914,7 +1292,7 @@ pub fn _mm256_sll_epi32(a: __m256i, count: __m128i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sll_epi64) pub fn _mm256_sll_epi64(a: __m256i, count: __m128i) -> __m256i { - psllq(a.as_i64x4(), count.as_i64x2()).into() + transmute(psllq(a.as_i64x4(), count.as_i64x2())) } /// Shifts packed 16-bit integers in `a` left by `IMM8` while @@ -1926,7 +1304,7 @@ pub fn _mm256_slli_epi16(a: __m256i) -> __m256i { if IMM8 >= 16 { _mm256_setzero_si256() } else { - simd_shl(a.as_u16x16(), u16x16::splat(IMM8 as u16)).into() + transmute(simd_shl(a.as_u16x16(), u16x16::splat(IMM8 as u16))) } } @@ -1939,7 +1317,7 @@ pub fn _mm256_slli_epi32(a: __m256i) -> __m256i { if IMM8 >= 32 { _mm256_setzero_si256() } else { - simd_shl(a.as_u32x8(), u32x8::splat(IMM8 as u32)).into() + transmute(simd_shl(a.as_u32x8(), u32x8::splat(IMM8 as u32))) } } @@ -1952,7 +1330,7 @@ pub fn _mm256_slli_epi64(a: __m256i) -> __m256i { if IMM8 >= 64 { _mm256_setzero_si256() } else { - simd_shl(a.as_u64x4(), u64x4::splat(IMM8 as u64)).into() + transmute(simd_shl(a.as_u64x4(), u64x4::splat(IMM8 as u64))) } } @@ -2016,7 +1394,7 @@ pub fn _mm256_bslli_epi128(a: __m256i) -> __m256i { mask(IMM8, 31) as u32, ], ); - r.into() + transmute(r) } /// Shifts packed 32-bit integers in `a` left by the amount @@ -2026,7 +1404,7 @@ pub fn _mm256_bslli_epi128(a: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sllv_epi32) pub fn _mm_sllv_epi32(a: __m128i, count: __m128i) -> __m128i { - psllvd(a.as_i32x4(), count.as_i32x4()).into() + transmute(psllvd(a.as_i32x4(), count.as_i32x4())) } /// Shifts packed 32-bit integers in `a` left by the amount @@ -2036,7 +1414,7 @@ pub fn _mm_sllv_epi32(a: __m128i, count: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sllv_epi32) pub fn _mm256_sllv_epi32(a: __m256i, count: __m256i) -> __m256i { - psllvd256(a.as_i32x8(), count.as_i32x8()).into() + transmute(psllvd256(a.as_i32x8(), count.as_i32x8())) } /// Shifts packed 64-bit integers in `a` left by the amount @@ -2046,7 +1424,7 @@ pub fn 
_mm256_sllv_epi32(a: __m256i, count: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sllv_epi64) pub fn _mm_sllv_epi64(a: __m128i, count: __m128i) -> __m128i { - psllvq(a.as_i64x2(), count.as_i64x2()).into() + transmute(psllvq(a.as_i64x2(), count.as_i64x2())) } /// Shifts packed 64-bit integers in `a` left by the amount @@ -2056,7 +1434,7 @@ pub fn _mm_sllv_epi64(a: __m128i, count: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sllv_epi64) pub fn _mm256_sllv_epi64(a: __m256i, count: __m256i) -> __m256i { - psllvq256(a.as_i64x4(), count.as_i64x4()).into() + transmute(psllvq256(a.as_i64x4(), count.as_i64x4())) } /// Shifts packed 16-bit integers in `a` right by `count` while @@ -2065,7 +1443,7 @@ pub fn _mm256_sllv_epi64(a: __m256i, count: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sra_epi16) pub fn _mm256_sra_epi16(a: __m256i, count: __m128i) -> __m256i { - psraw(a.as_i16x16(), count.as_i16x8()).into() + transmute(psraw(a.as_i16x16(), count.as_i16x8())) } /// Shifts packed 32-bit integers in `a` right by `count` while @@ -2074,7 +1452,7 @@ pub fn _mm256_sra_epi16(a: __m256i, count: __m128i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sra_epi32) pub fn _mm256_sra_epi32(a: __m256i, count: __m128i) -> __m256i { - psrad(a.as_i32x8(), count.as_i32x4()).into() + transmute(psrad(a.as_i32x8(), count.as_i32x4())) } /// Shifts packed 16-bit integers in `a` right by `IMM8` while @@ -2083,7 +1461,7 @@ pub fn _mm256_sra_epi32(a: __m256i, count: __m128i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srai_epi16) pub fn _mm256_srai_epi16(a: __m256i) -> __m256i { - simd_shr(a.as_i16x16(), i16x16::splat(IMM8.min(15) as i16)).into() + transmute(simd_shr(a.as_i16x16(), i16x16::splat(IMM8.min(15) as i16))) } /// Shifts packed 32-bit integers in `a` right by `IMM8` while @@ -2092,7 +1470,7 @@ pub fn _mm256_srai_epi16(a: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srai_epi32) pub fn _mm256_srai_epi32(a: __m256i) -> __m256i { - simd_shr(a.as_i32x8(), i32x8::splat(IMM8.min(31))).into() + transmute(simd_shr(a.as_i32x8(), i32x8::splat(IMM8.min(31)))) } /// Shifts packed 32-bit integers in `a` right by the amount specified by the @@ -2101,7 +1479,7 @@ pub fn _mm256_srai_epi32(a: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srav_epi32) pub fn _mm_srav_epi32(a: __m128i, count: __m128i) -> __m128i { - psravd(a.as_i32x4(), count.as_i32x4()).into() + transmute(psravd(a.as_i32x4(), count.as_i32x4())) } /// Shifts packed 32-bit integers in `a` right by the amount specified by the @@ -2110,7 +1488,7 @@ pub fn _mm_srav_epi32(a: __m128i, count: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srav_epi32) pub fn _mm256_srav_epi32(a: __m256i, count: __m256i) -> __m256i { - psravd256(a.as_i32x8(), count.as_i32x8()).into() + transmute(psravd256(a.as_i32x8(), count.as_i32x8())) } /// Shifts 128-bit lanes in `a` right by `imm8` bytes while shifting in 
zeros. @@ -2175,7 +1553,7 @@ pub fn _mm256_bsrli_epi128(a: __m256i) -> __m256i { ], ); - r.into() + transmute(r) } /// Shifts packed 16-bit integers in `a` right by `count` while shifting in @@ -2184,7 +1562,7 @@ pub fn _mm256_bsrli_epi128(a: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srl_epi16) pub fn _mm256_srl_epi16(a: __m256i, count: __m128i) -> __m256i { - psrlw(a.as_i16x16(), count.as_i16x8()).into() + transmute(psrlw(a.as_i16x16(), count.as_i16x8())) } /// Shifts packed 32-bit integers in `a` right by `count` while shifting in @@ -2193,7 +1571,7 @@ pub fn _mm256_srl_epi16(a: __m256i, count: __m128i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srl_epi32) pub fn _mm256_srl_epi32(a: __m256i, count: __m128i) -> __m256i { - psrld(a.as_i32x8(), count.as_i32x4()).into() + transmute(psrld(a.as_i32x8(), count.as_i32x4())) } /// Shifts packed 64-bit integers in `a` right by `count` while shifting in @@ -2202,7 +1580,7 @@ pub fn _mm256_srl_epi32(a: __m256i, count: __m128i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srl_epi64) pub fn _mm256_srl_epi64(a: __m256i, count: __m128i) -> __m256i { - psrlq(a.as_i64x4(), count.as_i64x2()).into() + transmute(psrlq(a.as_i64x4(), count.as_i64x2())) } /// Shifts packed 16-bit integers in `a` right by `IMM8` while shifting in @@ -2214,7 +1592,7 @@ pub fn _mm256_srli_epi16(a: __m256i) -> __m256i { if IMM8 >= 16 { _mm256_setzero_si256() } else { - simd_shr(a.as_u16x16(), u16x16::splat(IMM8 as u16)).into() + transmute(simd_shr(a.as_u16x16(), u16x16::splat(IMM8 as u16))) } } @@ -2227,7 +1605,7 @@ pub fn _mm256_srli_epi32(a: __m256i) -> __m256i { if IMM8 >= 32 { _mm256_setzero_si256() } else { - simd_shr(a.as_u32x8(), u32x8::splat(IMM8 as u32)).into() + transmute(simd_shr(a.as_u32x8(), u32x8::splat(IMM8 as u32))) } } @@ -2240,7 +1618,7 @@ pub fn _mm256_srli_epi64(a: __m256i) -> __m256i { if IMM8 >= 64 { _mm256_setzero_si256() } else { - simd_shr(a.as_u64x4(), u64x4::splat(IMM8 as u64)).into() + transmute(simd_shr(a.as_u64x4(), u64x4::splat(IMM8 as u64))) } } @@ -2250,7 +1628,7 @@ pub fn _mm256_srli_epi64(a: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srlv_epi32) pub fn _mm_srlv_epi32(a: __m128i, count: __m128i) -> __m128i { - psrlvd(a.as_i32x4(), count.as_i32x4()).into() + transmute(psrlvd(a.as_i32x4(), count.as_i32x4())) } /// Shifts packed 32-bit integers in `a` right by the amount specified by @@ -2259,7 +1637,7 @@ pub fn _mm_srlv_epi32(a: __m128i, count: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srlv_epi32) pub fn _mm256_srlv_epi32(a: __m256i, count: __m256i) -> __m256i { - psrlvd256(a.as_i32x8(), count.as_i32x8()).into() + transmute(psrlvd256(a.as_i32x8(), count.as_i32x8())) } /// Shifts packed 64-bit integers in `a` right by the amount specified by @@ -2268,7 +1646,7 @@ pub fn _mm256_srlv_epi32(a: __m256i, count: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srlv_epi64) pub fn _mm_srlv_epi64(a: __m128i, count: __m128i) -> __m128i { - psrlvq(a.as_i64x2(), count.as_i64x2()).into() + transmute(psrlvq(a.as_i64x2(), count.as_i64x2())) } /// Shifts 
packed 64-bit integers in `a` right by the amount specified by @@ -2277,7 +1655,7 @@ pub fn _mm_srlv_epi64(a: __m128i, count: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srlv_epi64) pub fn _mm256_srlv_epi64(a: __m256i, count: __m256i) -> __m256i { - psrlvq256(a.as_i64x4(), count.as_i64x4()).into() + transmute(psrlvq256(a.as_i64x4(), count.as_i64x4())) } /// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a` @@ -2285,7 +1663,7 @@ pub fn _mm256_srlv_epi64(a: __m256i, count: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi16) pub fn _mm256_sub_epi16(a: __m256i, b: __m256i) -> __m256i { - simd_sub(a.as_i16x16(), b.as_i16x16()).into() + transmute(simd_sub(a.as_i16x16(), b.as_i16x16())) } /// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a` @@ -2293,7 +1671,7 @@ pub fn _mm256_sub_epi16(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi32) pub fn _mm256_sub_epi32(a: __m256i, b: __m256i) -> __m256i { - simd_sub(a.as_i32x8(), b.as_i32x8()).into() + transmute(simd_sub(a.as_i32x8(), b.as_i32x8())) } /// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a` @@ -2301,7 +1679,7 @@ pub fn _mm256_sub_epi32(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi64) pub fn _mm256_sub_epi64(a: __m256i, b: __m256i) -> __m256i { - simd_sub(a.as_i64x4(), b.as_i64x4()).into() + transmute(simd_sub(a.as_i64x4(), b.as_i64x4())) } /// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a` @@ -2309,7 +1687,7 @@ pub fn _mm256_sub_epi64(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi8) pub fn _mm256_sub_epi8(a: __m256i, b: __m256i) -> __m256i { - simd_sub(a.as_i8x32(), b.as_i8x32()).into() + transmute(simd_sub(a.as_i8x32(), b.as_i8x32())) } /// Subtract packed 16-bit integers in `b` from packed 16-bit integers in @@ -2318,7 +1696,7 @@ pub fn _mm256_sub_epi8(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epi16) pub fn _mm256_subs_epi16(a: __m256i, b: __m256i) -> __m256i { - simd_saturating_sub(a.as_i16x16(), b.as_i16x16()).into() + transmute(simd_saturating_sub(a.as_i16x16(), b.as_i16x16())) } /// Subtract packed 8-bit integers in `b` from packed 8-bit integers in @@ -2327,7 +1705,7 @@ pub fn _mm256_subs_epi16(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epi8) pub fn _mm256_subs_epi8(a: __m256i, b: __m256i) -> __m256i { - simd_saturating_sub(a.as_i8x32(), b.as_i8x32()).into() + transmute(simd_saturating_sub(a.as_i8x32(), b.as_i8x32())) } /// Subtract packed unsigned 16-bit integers in `b` from packed 16-bit @@ -2336,7 +1714,7 @@ pub fn _mm256_subs_epi8(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epu16) pub fn _mm256_subs_epu16(a: __m256i, b: __m256i) -> __m256i { - simd_saturating_sub(a.as_u16x16(), 
b.as_u16x16()).into() + transmute(simd_saturating_sub(a.as_u16x16(), b.as_u16x16())) } /// Subtract packed unsigned 8-bit integers in `b` from packed 8-bit @@ -2345,7 +1723,7 @@ pub fn _mm256_subs_epu16(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epu8) pub fn _mm256_subs_epu8(a: __m256i, b: __m256i) -> __m256i { - simd_saturating_sub(a.as_u8x32(), b.as_u8x32()).into() + transmute(simd_saturating_sub(a.as_u8x32(), b.as_u8x32())) } /// Unpacks and interleave 8-bit integers from the high half of each @@ -2360,7 +1738,7 @@ pub fn _mm256_unpackhi_epi8(a: __m256i, b: __m256i) -> __m256i { 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63, ]); - r.into() + transmute(r) } /// Unpacks and interleave 8-bit integers from the low half of each @@ -2375,7 +1753,7 @@ pub fn _mm256_unpacklo_epi8(a: __m256i, b: __m256i) -> __m256i { 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55, ]); - r.into() + transmute(r) } /// Unpacks and interleave 16-bit integers from the high half of each @@ -2388,7 +1766,7 @@ pub fn _mm256_unpackhi_epi16(a: __m256i, b: __m256i) -> __m256i { b.as_i16x16(), [4, 20, 5, 21, 6, 22, 7, 23, 12, 28, 13, 29, 14, 30, 15, 31], ); - r.into() + transmute(r) } /// Unpacks and interleave 16-bit integers from the low half of each @@ -2401,7 +1779,7 @@ pub fn _mm256_unpacklo_epi16(a: __m256i, b: __m256i) -> __m256i { b.as_i16x16(), [0, 16, 1, 17, 2, 18, 3, 19, 8, 24, 9, 25, 10, 26, 11, 27], ); - r.into() + transmute(r) } /// Unpacks and interleave 32-bit integers from the high half of each @@ -2410,7 +1788,7 @@ pub fn _mm256_unpacklo_epi16(a: __m256i, b: __m256i) -> __m256i { pub fn _mm256_unpackhi_epi32(a: __m256i, b: __m256i) -> __m256i { let r: i32x8 = simd_shuffle(a.as_i32x8(), b.as_i32x8(), [2, 10, 3, 11, 6, 14, 7, 15]); - r.into() + transmute(r) } /// Unpacks and interleave 32-bit integers from the low half of each @@ -2419,7 +1797,7 @@ pub fn _mm256_unpackhi_epi32(a: __m256i, b: __m256i) -> __m256i { pub fn _mm256_unpacklo_epi32(a: __m256i, b: __m256i) -> __m256i { let r: i32x8 = simd_shuffle(a.as_i32x8(), b.as_i32x8(), [0, 8, 1, 9, 4, 12, 5, 13]); - r.into() + transmute(r) } /// Unpacks and interleave 64-bit integers from the high half of each @@ -2428,7 +1806,7 @@ pub fn _mm256_unpacklo_epi32(a: __m256i, b: __m256i) -> __m256i { pub fn _mm256_unpackhi_epi64(a: __m256i, b: __m256i) -> __m256i { let r: i64x4 = simd_shuffle(a.as_i64x4(), b.as_i64x4(), [1, 5, 3, 7]); - r.into() + transmute(r) } /// Unpacks and interleave 64-bit integers from the low half of each @@ -2437,7 +1815,7 @@ pub fn _mm256_unpackhi_epi64(a: __m256i, b: __m256i) -> __m256i { pub fn _mm256_unpacklo_epi64(a: __m256i, b: __m256i) -> __m256i { let r: i64x4 = simd_shuffle(a.as_i64x4(), b.as_i64x4(), [0, 4, 2, 6]); - r.into() + transmute(r) } /// Computes the bitwise XOR of 256 bits (representing integer data) @@ -2446,7 +1824,7 @@ pub fn _mm256_unpacklo_epi64(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_xor_si256) pub fn _mm256_xor_si256(a: __m256i, b: __m256i) -> __m256i { - simd_xor(a.as_i64x4(), b.as_i64x4()).into() + transmute(simd_xor(a.as_i64x4(), b.as_i64x4())) } /// Extracts an 8-bit integer from `a`, selected with `INDEX`. 
Returns a 32-bit diff --git a/testable-simd-models/src/core_arch/x86/models/mod.rs b/testable-simd-models/src/core_arch/x86/models/mod.rs index 2e18216b69321..b0a5b65c24a1b 100644 --- a/testable-simd-models/src/core_arch/x86/models/mod.rs +++ b/testable-simd-models/src/core_arch/x86/models/mod.rs @@ -22,6 +22,7 @@ pub mod avx_handwritten; pub mod avx; +pub mod avx2_handwritten; pub mod avx2; pub mod sse2; pub mod ssse3; From c4780782a0ed06707d45b36be2c999058538f22b Mon Sep 17 00:00:00 2001 From: karthikbhargavan Date: Wed, 30 Jul 2025 21:11:55 -0400 Subject: [PATCH 24/47] avx2 handwritten --- testable-simd-models/modelize/src/main.rs | 24 +- .../core_arch/x86/models/avx2_handwritten.rs | 620 ++++++++++++++++++ 2 files changed, 632 insertions(+), 12 deletions(-) create mode 100644 testable-simd-models/src/core_arch/x86/models/avx2_handwritten.rs diff --git a/testable-simd-models/modelize/src/main.rs b/testable-simd-models/modelize/src/main.rs index f9cd0b160a5e4..9010fd83907e4 100644 --- a/testable-simd-models/modelize/src/main.rs +++ b/testable-simd-models/modelize/src/main.rs @@ -14,21 +14,21 @@ fn remove_all_attributes(input_file_path: &str, handwritten_module: &str, output } ); - let use_abstractions: Item = syn::parse_quote! { - use crate::abstractions::simd::*; - }; + // let use_abstractions: Item = syn::parse_quote! { + // use crate::abstractions::simd::*; + // }; - let use_types: Item = syn::parse_quote! { - use super::types::*; - }; + // let use_types: Item = syn::parse_quote! { + // use super::types::*; + // }; - let use_handwritten: Item = syn::parse_quote! { - use super::avx_handwritten::*; - }; + // let use_handwritten: Item = syn::parse_quote! { + // use super::avx_handwritten::*; + // }; - syntax_tree.items.insert(0, use_handwritten); - syntax_tree.items.insert(0, use_types); - syntax_tree.items.insert(0, use_abstractions); + // syntax_tree.items.insert(0, use_handwritten); + // syntax_tree.items.insert(0, use_types); + // syntax_tree.items.insert(0, use_abstractions); // Clear attributes from the file's top-level items for item in &mut syntax_tree.items { diff --git a/testable-simd-models/src/core_arch/x86/models/avx2_handwritten.rs b/testable-simd-models/src/core_arch/x86/models/avx2_handwritten.rs new file mode 100644 index 0000000000000..8053daaee923a --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/models/avx2_handwritten.rs @@ -0,0 +1,620 @@ +use crate::abstractions::{bit::MachineInteger, simd::*}; +pub fn phaddw(a: i16x16, b: i16x16) -> i16x16 { + i16x16::from_fn(|i| { + if i < 4 { + a[2 * i].wrapping_add(a[2 * i + 1]) + } else if i < 8 { + b[2 * (i - 4)].wrapping_add(b[2 * (i - 4) + 1]) + } else if i < 12 { + a[2 * (i - 4)].wrapping_add(a[2 * (i - 4) + 1]) + } else { + b[2 * (i - 8)].wrapping_add(b[2 * (i - 8) + 1]) + } + }) +} + +pub fn phaddd(a: i32x8, b: i32x8) -> i32x8 { + i32x8::from_fn(|i| { + if i < 2 { + a[2 * i].wrapping_add(a[2 * i + 1]) + } else if i < 4 { + b[2 * (i - 2)].wrapping_add(b[2 * (i - 2) + 1]) + } else if i < 6 { + a[2 * (i - 2)].wrapping_add(a[2 * (i - 2) + 1]) + } else { + b[2 * (i - 4)].wrapping_add(b[2 * (i - 4) + 1]) + } + }) +} + +pub fn phaddsw(a: i16x16, b: i16x16) -> i16x16 { + i16x16::from_fn(|i| { + if i < 4 { + a[2 * i].saturating_add(a[2 * i + 1]) + } else if i < 8 { + b[2 * (i - 4)].saturating_add(b[2 * (i - 4) + 1]) + } else if i < 12 { + a[2 * (i - 4)].saturating_add(a[2 * (i - 4) + 1]) + } else { + b[2 * (i - 8)].saturating_add(b[2 * (i - 8) + 1]) + } + }) +} + +pub fn phsubw(a: i16x16, b: i16x16) -> i16x16 { + 
i16x16::from_fn(|i| { + if i < 4 { + a[2 * i].wrapping_sub(a[2 * i + 1]) + } else if i < 8 { + b[2 * (i - 4)].wrapping_sub(b[2 * (i - 4) + 1]) + } else if i < 12 { + a[2 * (i - 4)].wrapping_sub(a[2 * (i - 4) + 1]) + } else { + b[2 * (i - 8)].wrapping_sub(b[2 * (i - 8) + 1]) + } + }) +} + +pub fn phsubd(a: i32x8, b: i32x8) -> i32x8 { + i32x8::from_fn(|i| { + if i < 2 { + a[2 * i].wrapping_sub(a[2 * i + 1]) + } else if i < 4 { + b[2 * (i - 2)].wrapping_sub(b[2 * (i - 2) + 1]) + } else if i < 6 { + a[2 * (i - 2)].wrapping_sub(a[2 * (i - 2) + 1]) + } else { + b[2 * (i - 4)].wrapping_sub(b[2 * (i - 4) + 1]) + } + }) +} + +pub fn phsubsw(a: i16x16, b: i16x16) -> i16x16 { + i16x16::from_fn(|i| { + if i < 4 { + a[2 * i].saturating_sub(a[2 * i + 1]) + } else if i < 8 { + b[2 * (i - 4)].saturating_sub(b[2 * (i - 4) + 1]) + } else if i < 12 { + a[2 * (i - 4)].saturating_sub(a[2 * (i - 4) + 1]) + } else { + b[2 * (i - 8)].saturating_sub(b[2 * (i - 8) + 1]) + } + }) +} +pub fn pmaddwd(a: i16x16, b: i16x16) -> i32x8 { + i32x8::from_fn(|i| { + (a[2 * i] as i32) * (b[2 * i] as i32) + (a[2 * i + 1] as i32) * (b[2 * i + 1] as i32) + }) +} + +pub fn pmaddubsw(a: u8x32, b: u8x32) -> i16x16 { + i16x16::from_fn(|i| { + ((a[2 * i] as u8 as u16 as i16) * (b[2 * i] as i8 as i16)) + .saturating_add((a[2 * i + 1] as u8 as u16 as i16) * (b[2 * i + 1] as i8 as i16)) + }) +} +pub fn packsswb(a: i16x16, b: i16x16) -> i8x32 { + i8x32::from_fn(|i| { + if i < 8 { + if a[i] > (i8::MAX as i16) { + i8::MAX + } else if a[i] < (i8::MIN as i16) { + i8::MIN + } else { + a[i] as i8 + } + } else if i < 16 { + if b[i - 8] > (i8::MAX as i16) { + i8::MAX + } else if b[i - 8] < (i8::MIN as i16) { + i8::MIN + } else { + b[i - 8] as i8 + } + } else if i < 24 { + if a[i - 8] > (i8::MAX as i16) { + i8::MAX + } else if a[i - 8] < (i8::MIN as i16) { + i8::MIN + } else { + a[i - 8] as i8 + } + } else { + if b[i - 16] > (i8::MAX as i16) { + i8::MAX + } else if b[i - 16] < (i8::MIN as i16) { + i8::MIN + } else { + b[i - 16] as i8 + } + } + }) +} + +pub fn packssdw(a: i32x8, b: i32x8) -> i16x16 { + i16x16::from_fn(|i| { + if i < 4 { + if a[i] > (i16::MAX as i32) { + i16::MAX + } else if a[i] < (i16::MIN as i32) { + i16::MIN + } else { + a[i] as i16 + } + } else if i < 8 { + if b[i - 4] > (i16::MAX as i32) { + i16::MAX + } else if b[i - 4] < (i16::MIN as i32) { + i16::MIN + } else { + b[i - 4] as i16 + } + } else if i < 12 { + if a[i - 4] > (i16::MAX as i32) { + i16::MAX + } else if a[i - 4] < (i16::MIN as i32) { + i16::MIN + } else { + a[i - 4] as i16 + } + } else { + if b[i - 8] > (i16::MAX as i32) { + i16::MAX + } else if b[i - 8] < (i16::MIN as i32) { + i16::MIN + } else { + b[i - 8] as i16 + } + } + }) +} + +pub fn packuswb(a: i16x16, b: i16x16) -> u8x32 { + u8x32::from_fn(|i| { + if i < 8 { + if a[i] > (u8::MAX as i16) { + u8::MAX + } else if a[i] < (u8::MIN as i16) { + u8::MIN + } else { + a[i] as u8 + } + } else if i < 16 { + if b[i - 8] > (u8::MAX as i16) { + u8::MAX + } else if b[i - 8] < (u8::MIN as i16) { + u8::MIN + } else { + b[i - 8] as u8 + } + } else if i < 24 { + if a[i - 8] > (u8::MAX as i16) { + u8::MAX + } else if a[i - 8] < (u8::MIN as i16) { + u8::MIN + } else { + a[i - 8] as u8 + } + } else { + if b[i - 16] > (u8::MAX as i16) { + u8::MAX + } else if b[i - 16] < (u8::MIN as i16) { + u8::MIN + } else { + b[i - 16] as u8 + } + } + }) +} + +pub fn packusdw(a: i32x8, b: i32x8) -> u16x16 { + u16x16::from_fn(|i| { + if i < 4 { + if a[i] > (u16::MAX as i32) { + u16::MAX + } else if a[i] < (u16::MIN as i32) { + u16::MIN + } else { 
+                a[i] as u16
+            }
+        } else if i < 8 {
+            if b[i - 4] > (u16::MAX as i32) {
+                u16::MAX
+            } else if b[i - 4] < (u16::MIN as i32) {
+                u16::MIN
+            } else {
+                b[i - 4] as u16
+            }
+        } else if i < 12 {
+            if a[i - 4] > (u16::MAX as i32) {
+                u16::MAX
+            } else if a[i - 4] < (u16::MIN as i32) {
+                u16::MIN
+            } else {
+                a[i - 4] as u16
+            }
+        } else {
+            if b[i - 8] > (u16::MAX as i32) {
+                u16::MAX
+            } else if b[i - 8] < (u16::MIN as i32) {
+                u16::MIN
+            } else {
+                b[i - 8] as u16
+            }
+        }
+    })
+}
+
+pub fn psignb(a: i8x32, b: i8x32) -> i8x32 {
+    i8x32::from_fn(|i| {
+        if b[i] < 0 {
+            if a[i] == i8::MIN {
+                a[i]
+            } else {
+                -a[i]
+            }
+        } else if b[i] > 0 {
+            a[i]
+        } else {
+            0
+        }
+    })
+}
+pub fn psignw(a: i16x16, b: i16x16) -> i16x16 {
+    i16x16::from_fn(|i| {
+        if b[i] < 0 {
+            if a[i] == i16::MIN {
+                a[i]
+            } else {
+                -a[i]
+            }
+        } else if b[i] > 0 {
+            a[i]
+        } else {
+            0
+        }
+    })
+}
+
+pub fn psignd(a: i32x8, b: i32x8) -> i32x8 {
+    i32x8::from_fn(|i| {
+        if b[i] < 0 {
+            if a[i] == i32::MIN {
+                a[i]
+            } else {
+                -a[i]
+            }
+        } else if b[i] > 0 {
+            a[i]
+        } else {
+            0
+        }
+    })
+}
+
+pub fn psllw(a: i16x16, count: i16x8) -> i16x16 {
+    let count4 = (count[0] as u16) as u64;
+    let count3 = ((count[1] as u16) as u64) * 65536;
+    let count2 = ((count[2] as u16) as u64) * 4294967296;
+    let count1 = ((count[3] as u16) as u64) * 281474976710656;
+    let count = count1 + count2 + count3 + count4;
+    i16x16::from_fn(|i| {
+        if count > 15 {
+            0
+        } else {
+            ((a[i] as u16) << count) as i16
+        }
+    })
+}
+
+pub fn pslld(a: i32x8, count: i32x4) -> i32x8 {
+    let count = ((count[1] as u32) as u64) * 4294967296 + ((count[0] as u32) as u64);
+
+    i32x8::from_fn(|i| {
+        if count > 31 {
+            0
+        } else {
+            ((a[i] as u32) << count) as i32
+        }
+    })
+}
+pub fn psllq(a: i64x4, count: i64x2) -> i64x4 {
+    let count = count[0] as u64;
+
+    i64x4::from_fn(|i| {
+        if count > 63 {
+            0
+        } else {
+            ((a[i] as u64) << count) as i64
+        }
+    })
+}
+
+pub fn psllvd(a: i32x4, count: i32x4) -> i32x4 {
+    i32x4::from_fn(|i| {
+        if count[i] > 31 || count[i] < 0 {
+            0
+        } else {
+            ((a[i] as u32) << count[i]) as i32
+        }
+    })
+}
+pub fn psllvd256(a: i32x8, count: i32x8) -> i32x8 {
+    i32x8::from_fn(|i| {
+        if count[i] > 31 || count[i] < 0 {
+            0
+        } else {
+            ((a[i] as u32) << count[i]) as i32
+        }
+    })
+}
+
+pub fn psllvq(a: i64x2, count: i64x2) -> i64x2 {
+    i64x2::from_fn(|i| {
+        if count[i] > 63 || count[i] < 0 {
+            0
+        } else {
+            ((a[i] as u64) << count[i]) as i64
+        }
+    })
+}
+pub fn psllvq256(a: i64x4, count: i64x4) -> i64x4 {
+    i64x4::from_fn(|i| {
+        if count[i] > 63 || count[i] < 0 {
+            0
+        } else {
+            ((a[i] as u64) << count[i]) as i64
+        }
+    })
+}
+
+pub fn psraw(a: i16x16, count: i16x8) -> i16x16 {
+    let count = ((count[3] as u16) as u64) * 281474976710656
+        + ((count[2] as u16) as u64) * 4294967296
+        + ((count[1] as u16) as u64) * 65536
+        + ((count[0] as u16) as u64);
+
+    i16x16::from_fn(|i| {
+        if count > 15 {
+            if a[i] < 0 {
+                -1
+            } else {
+                0
+            }
+        } else {
+            a[i] >> count
+        }
+    })
+}
+
+pub fn psrad(a: i32x8, count: i32x4) -> i32x8 {
+    let count = ((count[1] as u32) as u64) * 4294967296 + ((count[0] as u32) as u64);
+
+    i32x8::from_fn(|i| {
+        if count > 31 {
+            if a[i] < 0 {
+                -1
+            } else {
+                0
+            }
+        } else {
+            a[i] >> count
+        }
+    })
+}
+
+pub fn psravd(a: i32x4, count: i32x4) -> i32x4 {
+    i32x4::from_fn(|i| {
+        if count[i] > 31 || count[i] < 0 {
+            if a[i] < 0 {
+                -1
+            } else {
+                0
+            }
+        } else {
+            a[i] >> count[i]
+        }
+    })
+}
+
+pub fn psravd256(a: i32x8, count: i32x8) -> i32x8 {
+    i32x8::from_fn(|i| {
+        if count[i] > 31 || count[i] < 0 {
+            if a[i] < 0 {
+                -1
+            } else {
+                0
+            }
+        } else {
+            a[i] >> count[i]
+        }
+    })
+}
+
+pub fn psrlw(a: i16x16, count: i16x8) -> i16x16 {
+    let count = (count[3] as u16 as u64) * 281474976710656
+        + (count[2] as u16 as u64) * 4294967296
+        + (count[1] as u16 as u64) * 65536
+        + (count[0] as u16 as u64);
+
+    i16x16::from_fn(|i| {
+        if count > 15 {
+            0
+        } else {
+            ((a[i] as u16) >> count) as i16
+        }
+    })
+}
+
+pub fn psrld(a: i32x8, count: i32x4) -> i32x8 {
+    let count = ((count[1] as u32) as u64) * 4294967296 + ((count[0] as u32) as u64);
+
+    i32x8::from_fn(|i| {
+        if count > 31 {
+            0
+        } else {
+            ((a[i] as u32) >> count) as i32
+        }
+    })
+}
+
+pub fn psrlq(a: i64x4, count: i64x2) -> i64x4 {
+    let count: u64 = count[0] as u64;
+
+    i64x4::from_fn(|i| {
+        if count > 63 {
+            0
+        } else {
+            ((a[i] as u64) >> count) as i64
+        }
+    })
+}
+
+pub fn psrlvd(a: i32x4, count: i32x4) -> i32x4 {
+    i32x4::from_fn(|i| {
+        if count[i] > 31 || count[i] < 0 {
+            0
+        } else {
+            ((a[i] as u32) >> count[i]) as i32
+        }
+    })
+}
+
+pub fn psrlvd256(a: i32x8, count: i32x8) -> i32x8 {
+    i32x8::from_fn(|i| {
+        if count[i] > 31 || count[i] < 0 {
+            0
+        } else {
+            ((a[i] as u32) >> count[i]) as i32
+        }
+    })
+}
+
+pub fn psrlvq(a: i64x2, count: i64x2) -> i64x2 {
+    i64x2::from_fn(|i| {
+        if count[i] > 63 || count[i] < 0 {
+            0
+        } else {
+            ((a[i] as u64) >> count[i]) as i64
+        }
+    })
+}
+pub fn psrlvq256(a: i64x4, count: i64x4) -> i64x4 {
+    i64x4::from_fn(|i| {
+        if count[i] > 63 || count[i] < 0 {
+            0
+        } else {
+            ((a[i] as u64) >> count[i]) as i64
+        }
+    })
+}
+
+pub fn pshufb(a: u8x32, b: u8x32) -> u8x32 {
+    u8x32::from_fn(|i| {
+        if i < 16 {
+            if b[i] > 127 {
+                0
+            } else {
+                let index = (b[i] % 16) as u32;
+                a[index]
+            }
+        } else {
+            if b[i] > 127 {
+                0
+            } else {
+                let index = (b[i] % 16) as u32;
+                a[index + 16]
+            }
+        }
+    })
+}
+
+pub fn permd(a: u32x8, b: u32x8) -> u32x8 {
+    u32x8::from_fn(|i| {
+        let id = b[i] % 8;
+        a[id]
+    })
+}
+
+pub fn mpsadbw(a: u8x32, b: u8x32, imm8: i32) -> u16x16 {
+    u16x16::from_fn(|i| {
+        if i < 8 {
+            let a_offset = (((imm8 & 4) >> 2) * 4) as u32;
+            let b_offset = ((imm8 & 3) * 4) as u32;
+            let k = a_offset + i;
+            let l = b_offset;
+            ((a[k].wrapping_abs_diff(b[l]) as i8) as u8 as u16)
+                + ((a[k + 1].wrapping_abs_diff(b[l + 1]) as i8) as u8 as u16)
+                + ((a[k + 2].wrapping_abs_diff(b[l + 2]) as i8) as u8 as u16)
+                + ((a[k + 3].wrapping_abs_diff(b[l + 3]) as i8) as u8 as u16)
+        } else {
+            let i = i - 8;
+            let imm8 = imm8 >> 3;
+            let a_offset = (((imm8 & 4) >> 2) * 4) as u32;
+            let b_offset = ((imm8 & 3) * 4) as u32;
+            let k = a_offset + i;
+            let l = b_offset;
+            ((a[16 + k].wrapping_abs_diff(b[16 + l]) as i8) as u8 as u16)
+                + ((a[16 + k + 1].wrapping_abs_diff(b[16 + l + 1]) as i8) as u8 as u16)
+                + ((a[16 + k + 2].wrapping_abs_diff(b[16 + l + 2]) as i8) as u8 as u16)
+                + ((a[16 + k + 3].wrapping_abs_diff(b[16 + l + 3]) as i8) as u8 as u16)
+        }
+    })
+}
+
+pub fn vperm2i128(a: i64x4, b: i64x4, imm8: i8) -> i64x4 {
+    let a = i128x2::from_fn(|i| {
+        ((a[2 * i] as u64 as u128) + ((a[2 * i + 1] as u64 as u128) << 64)) as i128
+    });
+    let b = i128x2::from_fn(|i| {
+        ((b[2 * i] as u64 as u128) + ((b[2 * i + 1] as u64 as u128) << 64)) as i128
+    });
+    let imm8 = imm8 as u8 as u32 as i32;
+    let r = i128x2::from_fn(|i| {
+        let control = imm8 >> (i * 4);
+        if (control >> 3) % 2 == 1 {
+            0
+        } else {
+            match control % 4 {
+                0 => a[0],
+                1 => a[1],
+                2 => b[0],
+                3 => b[1],
+                _ => unreachable!(),
+            }
+        }
+    });
+    i64x4::from_fn(|i| {
+        let index = i >> 1;
+        let hilo = i.rem_euclid(2);
+        let val = r[index];
+        if
hilo == 0 { + i64::cast(val) + } else { + i64::cast(val >> 64) + } + }) +} +pub fn pmulhrsw(a: i16x16, b: i16x16) -> i16x16 { + i16x16::from_fn(|i| { + let temp = (a[i] as i32) * (b[i] as i32); + let temp = (temp >> 14).wrapping_add(1) >> 1; + temp as i16 + }) +} + +pub fn psadbw(a: u8x32, b: u8x32) -> u64x4 { + let tmp = u8x32::from_fn(|i| a[i].wrapping_abs_diff(b[i])); + u64x4::from_fn(|i| { + (tmp[i * 8] as u16) + .wrapping_add(tmp[i * 8 + 1] as u16) + .wrapping_add(tmp[i * 8 + 2] as u16) + .wrapping_add(tmp[i * 8 + 3] as u16) + .wrapping_add(tmp[i * 8 + 4] as u16) + .wrapping_add(tmp[i * 8 + 5] as u16) + .wrapping_add(tmp[i * 8 + 6] as u16) + .wrapping_add(tmp[i * 8 + 7] as u16) as u64 + }) +} \ No newline at end of file From bdfa68fc385eb4eb7569e6965bed579efc64c4dc Mon Sep 17 00:00:00 2001 From: karthikbhargavan Date: Wed, 30 Jul 2025 21:15:11 -0400 Subject: [PATCH 25/47] handwritten modules --- .../src/core_arch/x86/models/mod.rs | 2 + .../src/core_arch/x86/models/sse2.rs | 201 +----------------- .../core_arch/x86/models/sse2_handwritten.rs | 196 +++++++++++++++++ .../src/core_arch/x86/models/ssse3.rs | 134 +----------- .../core_arch/x86/models/ssse3_handwritten.rs | 127 +++++++++++ 5 files changed, 328 insertions(+), 332 deletions(-) create mode 100644 testable-simd-models/src/core_arch/x86/models/sse2_handwritten.rs create mode 100644 testable-simd-models/src/core_arch/x86/models/ssse3_handwritten.rs diff --git a/testable-simd-models/src/core_arch/x86/models/mod.rs b/testable-simd-models/src/core_arch/x86/models/mod.rs index b0a5b65c24a1b..baee88b47d671 100644 --- a/testable-simd-models/src/core_arch/x86/models/mod.rs +++ b/testable-simd-models/src/core_arch/x86/models/mod.rs @@ -24,7 +24,9 @@ pub mod avx_handwritten; pub mod avx; pub mod avx2_handwritten; pub mod avx2; +pub mod sse2_handwritten; pub mod sse2; +pub mod ssse3_handwritten; pub mod ssse3; pub(crate) mod types { diff --git a/testable-simd-models/src/core_arch/x86/models/sse2.rs b/testable-simd-models/src/core_arch/x86/models/sse2.rs index cc1d1c0624c59..878c4c08ebe28 100644 --- a/testable-simd-models/src/core_arch/x86/models/sse2.rs +++ b/testable-simd-models/src/core_arch/x86/models/sse2.rs @@ -2,206 +2,7 @@ use super::types::*; use crate::abstractions::utilities::*; use crate::abstractions::simd::*; -mod c_extern { - use crate::abstractions::{bit::MachineInteger, simd::*}; - pub fn packsswb(a: i16x8, b: i16x8) -> i8x16 { - i8x16::from_fn(|i| { - if i < 8 { - if a[i] > (i8::MAX as i16) { - i8::MAX - } else if a[i] < (i8::MIN as i16) { - i8::MIN - } else { - a[i] as i8 - } - } else { - if b[i - 8] > (i8::MAX as i16) { - i8::MAX - } else if b[i - 8] < (i8::MIN as i16) { - i8::MIN - } else { - b[i - 8] as i8 - } - } - }) - } - pub fn pmaddwd(a: i16x8, b: i16x8) -> i32x4 { - i32x4::from_fn(|i| { - (a[2 * i] as i32) * (b[2 * i] as i32) + (a[2 * i + 1] as i32) * (b[2 * i + 1] as i32) - }) - } - pub fn psadbw(a: u8x16, b: u8x16) -> u64x2 { - let tmp = u8x16::from_fn(|i| a[i].wrapping_abs_diff(b[i])); - u64x2::from_fn(|i| { - (tmp[i * 8] as u16) - .wrapping_add(tmp[i * 8 + 1] as u16) - .wrapping_add(tmp[i * 8 + 2] as u16) - .wrapping_add(tmp[i * 8 + 3] as u16) - .wrapping_add(tmp[i * 8 + 4] as u16) - .wrapping_add(tmp[i * 8 + 5] as u16) - .wrapping_add(tmp[i * 8 + 6] as u16) - .wrapping_add(tmp[i * 8 + 7] as u16) as u64 - }) - } - pub fn psllw(a: i16x8, count: i16x8) -> i16x8 { - let count4: u64 = (count[0] as u16) as u64; - let count3: u64 = ((count[1] as u16) as u64) * 65536; - let count2: u64 = ((count[2] as u16) as u64) * 
4294967296; - let count1: u64 = ((count[3] as u16) as u64) * 281474976710656; - let count = count1 + count2 + count3 + count4; - i16x8::from_fn(|i| { - if count > 15 { - 0 - } else { - ((a[i] as u16) << count) as i16 - } - }) - } - - pub fn pslld(a: i32x4, count: i32x4) -> i32x4 { - let count: u64 = ((count[1] as u32) as u64) * 4294967296 + ((count[0] as u32) as u64); - - i32x4::from_fn(|i| { - if count > 31 { - 0 - } else { - ((a[i] as u32) << count) as i32 - } - }) - } - - pub fn psllq(a: i64x2, count: i64x2) -> i64x2 { - let count: u64 = count[0] as u64; - - i64x2::from_fn(|i| { - if count > 63 { - 0 - } else { - ((a[i] as u64) << count) as i64 - } - }) - } - - pub fn psraw(a: i16x8, count: i16x8) -> i16x8 { - let count: u64 = ((count[3] as u16) as u64) * 281474976710656 - + ((count[2] as u16) as u64) * 4294967296 - + ((count[1] as u16) as u64) * 65536 - + ((count[0] as u16) as u64); - - i16x8::from_fn(|i| { - if count > 15 { - if a[i] < 0 { - -1 - } else { - 0 - } - } else { - a[i] >> count - } - }) - } - - pub fn psrad(a: i32x4, count: i32x4) -> i32x4 { - let count: u64 = ((count[1] as u32) as u64) * 4294967296 + ((count[0] as u32) as u64); - - i32x4::from_fn(|i| { - if count > 31 { - if a[i] < 0 { - -1 - } else { - 0 - } - } else { - a[i] << count - } - }) - } - - pub fn psrlw(a: i16x8, count: i16x8) -> i16x8 { - let count: u64 = (count[3] as u16 as u64) * 281474976710656 - + (count[2] as u16 as u64) * 4294967296 - + (count[1] as u16 as u64) * 65536 - + (count[0] as u16 as u64); - - i16x8::from_fn(|i| { - if count > 15 { - 0 - } else { - ((a[i] as u16) >> count) as i16 - } - }) - } - - pub fn psrld(a: i32x4, count: i32x4) -> i32x4 { - let count: u64 = (count[1] as u32 as u64) * 4294967296 + (count[0] as u32 as u64); - - i32x4::from_fn(|i| { - if count > 31 { - 0 - } else { - ((a[i] as u32) >> count) as i32 - } - }) - } - - pub fn psrlq(a: i64x2, count: i64x2) -> i64x2 { - let count: u64 = count[0] as u64; - - i64x2::from_fn(|i| { - if count > 63 { - 0 - } else { - ((a[i] as u64) >> count) as i64 - } - }) - } - - pub fn packssdw(a: i32x4, b: i32x4) -> i16x8 { - i16x8::from_fn(|i| { - if i < 4 { - if a[i] > (i16::MAX as i32) { - i16::MAX - } else if a[i] < (i16::MIN as i32) { - i16::MIN - } else { - a[i] as i16 - } - } else { - if b[i - 4] > (i16::MAX as i32) { - i16::MAX - } else if b[i - 4] < (i16::MIN as i32) { - i16::MIN - } else { - b[i - 4] as i16 - } - } - }) - } - - pub fn packuswb(a: i16x8, b: i16x8) -> u8x16 { - u8x16::from_fn(|i| { - if i < 8 { - if a[i] > (u8::MAX as i16) { - u8::MAX - } else if a[i] < (u8::MIN as i16) { - u8::MIN - } else { - a[i] as u8 - } - } else { - if b[i - 8] > (u8::MAX as i16) { - u8::MAX - } else if b[i - 8] < (u8::MIN as i16) { - u8::MIN - } else { - b[i - 8] as u8 - } - } - }) - } -} - -use c_extern::*; +use super::sse2_handwritten::*; /// Adds packed 8-bit integers in `a` and `b`. 
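// The shift helpers above (and their replacements in the handwritten modules)
// rebuild the 64-bit shift count from the low lanes of the `count` vector; the
// multipliers 65536, 4294967296 and 281474976710656 are just 2^16, 2^32 and
// 2^48. A minimal standalone sketch of that packing, in plain Rust with an
// illustrative `pack_count` helper that is not part of this crate:
fn pack_count(count: [i16; 8]) -> u64 {
    (count[0] as u16 as u64)
        + (count[1] as u16 as u64) * 65536
        + (count[2] as u16 as u64) * 4294967296
        + (count[3] as u16 as u64) * 281474976710656
}

fn main() {
    // Only the low four 16-bit lanes contribute to the count.
    assert_eq!(pack_count([3, 0, 0, 0, 7, 7, 7, 7]), 3);
    // Any non-zero upper lane pushes the count past 15, so the 16-bit shift
    // models flush to zero (or to the sign bit for arithmetic shifts).
    assert!(pack_count([0, 1, 0, 0, 0, 0, 0, 0]) > 15);
}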
/// diff --git a/testable-simd-models/src/core_arch/x86/models/sse2_handwritten.rs b/testable-simd-models/src/core_arch/x86/models/sse2_handwritten.rs new file mode 100644 index 0000000000000..d0c1308f7ef3c --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/models/sse2_handwritten.rs @@ -0,0 +1,196 @@ +use crate::abstractions::{bit::MachineInteger, simd::*}; + pub fn packsswb(a: i16x8, b: i16x8) -> i8x16 { + i8x16::from_fn(|i| { + if i < 8 { + if a[i] > (i8::MAX as i16) { + i8::MAX + } else if a[i] < (i8::MIN as i16) { + i8::MIN + } else { + a[i] as i8 + } + } else { + if b[i - 8] > (i8::MAX as i16) { + i8::MAX + } else if b[i - 8] < (i8::MIN as i16) { + i8::MIN + } else { + b[i - 8] as i8 + } + } + }) + } + pub fn pmaddwd(a: i16x8, b: i16x8) -> i32x4 { + i32x4::from_fn(|i| { + (a[2 * i] as i32) * (b[2 * i] as i32) + (a[2 * i + 1] as i32) * (b[2 * i + 1] as i32) + }) + } + pub fn psadbw(a: u8x16, b: u8x16) -> u64x2 { + let tmp = u8x16::from_fn(|i| a[i].wrapping_abs_diff(b[i])); + u64x2::from_fn(|i| { + (tmp[i * 8] as u16) + .wrapping_add(tmp[i * 8 + 1] as u16) + .wrapping_add(tmp[i * 8 + 2] as u16) + .wrapping_add(tmp[i * 8 + 3] as u16) + .wrapping_add(tmp[i * 8 + 4] as u16) + .wrapping_add(tmp[i * 8 + 5] as u16) + .wrapping_add(tmp[i * 8 + 6] as u16) + .wrapping_add(tmp[i * 8 + 7] as u16) as u64 + }) + } + pub fn psllw(a: i16x8, count: i16x8) -> i16x8 { + let count4: u64 = (count[0] as u16) as u64; + let count3: u64 = ((count[1] as u16) as u64) * 65536; + let count2: u64 = ((count[2] as u16) as u64) * 4294967296; + let count1: u64 = ((count[3] as u16) as u64) * 281474976710656; + let count = count1 + count2 + count3 + count4; + i16x8::from_fn(|i| { + if count > 15 { + 0 + } else { + ((a[i] as u16) << count) as i16 + } + }) + } + + pub fn pslld(a: i32x4, count: i32x4) -> i32x4 { + let count: u64 = ((count[1] as u32) as u64) * 4294967296 + ((count[0] as u32) as u64); + + i32x4::from_fn(|i| { + if count > 31 { + 0 + } else { + ((a[i] as u32) << count) as i32 + } + }) + } + + pub fn psllq(a: i64x2, count: i64x2) -> i64x2 { + let count: u64 = count[0] as u64; + + i64x2::from_fn(|i| { + if count > 63 { + 0 + } else { + ((a[i] as u64) << count) as i64 + } + }) + } + + pub fn psraw(a: i16x8, count: i16x8) -> i16x8 { + let count: u64 = ((count[3] as u16) as u64) * 281474976710656 + + ((count[2] as u16) as u64) * 4294967296 + + ((count[1] as u16) as u64) * 65536 + + ((count[0] as u16) as u64); + + i16x8::from_fn(|i| { + if count > 15 { + if a[i] < 0 { + -1 + } else { + 0 + } + } else { + a[i] >> count + } + }) + } + + pub fn psrad(a: i32x4, count: i32x4) -> i32x4 { + let count: u64 = ((count[1] as u32) as u64) * 4294967296 + ((count[0] as u32) as u64); + + i32x4::from_fn(|i| { + if count > 31 { + if a[i] < 0 { + -1 + } else { + 0 + } + } else { + a[i] << count + } + }) + } + + pub fn psrlw(a: i16x8, count: i16x8) -> i16x8 { + let count: u64 = (count[3] as u16 as u64) * 281474976710656 + + (count[2] as u16 as u64) * 4294967296 + + (count[1] as u16 as u64) * 65536 + + (count[0] as u16 as u64); + + i16x8::from_fn(|i| { + if count > 15 { + 0 + } else { + ((a[i] as u16) >> count) as i16 + } + }) + } + + pub fn psrld(a: i32x4, count: i32x4) -> i32x4 { + let count: u64 = (count[1] as u32 as u64) * 4294967296 + (count[0] as u32 as u64); + + i32x4::from_fn(|i| { + if count > 31 { + 0 + } else { + ((a[i] as u32) >> count) as i32 + } + }) + } + + pub fn psrlq(a: i64x2, count: i64x2) -> i64x2 { + let count: u64 = count[0] as u64; + + i64x2::from_fn(|i| { + if count > 63 { + 0 + } else { + ((a[i] as 
u64) >> count) as i64 + } + }) + } + + pub fn packssdw(a: i32x4, b: i32x4) -> i16x8 { + i16x8::from_fn(|i| { + if i < 4 { + if a[i] > (i16::MAX as i32) { + i16::MAX + } else if a[i] < (i16::MIN as i32) { + i16::MIN + } else { + a[i] as i16 + } + } else { + if b[i - 4] > (i16::MAX as i32) { + i16::MAX + } else if b[i - 4] < (i16::MIN as i32) { + i16::MIN + } else { + b[i - 4] as i16 + } + } + }) + } + + pub fn packuswb(a: i16x8, b: i16x8) -> u8x16 { + u8x16::from_fn(|i| { + if i < 8 { + if a[i] > (u8::MAX as i16) { + u8::MAX + } else if a[i] < (u8::MIN as i16) { + u8::MIN + } else { + a[i] as u8 + } + } else { + if b[i - 8] > (u8::MAX as i16) { + u8::MAX + } else if b[i - 8] < (u8::MIN as i16) { + u8::MIN + } else { + b[i - 8] as u8 + } + } + }) + } \ No newline at end of file diff --git a/testable-simd-models/src/core_arch/x86/models/ssse3.rs b/testable-simd-models/src/core_arch/x86/models/ssse3.rs index ccb7f2e799362..66e61be3a93e7 100644 --- a/testable-simd-models/src/core_arch/x86/models/ssse3.rs +++ b/testable-simd-models/src/core_arch/x86/models/ssse3.rs @@ -3,139 +3,9 @@ use crate::abstractions::utilities::*; use crate::abstractions::simd::*; use super::types::*; - -mod c_extern { - use crate::abstractions::simd::*; - pub fn pshufb128(a: u8x16, b: u8x16) -> u8x16 { - u8x16::from_fn(|i| if b[i] > 127 { 0 } else { a[(b[i] % 16) as u32] }) - } - - pub fn phaddw128(a: i16x8, b: i16x8) -> i16x8 { - i16x8::from_fn(|i| { - if i < 4 { - a[2 * i].wrapping_add(a[2 * i + 1]) - } else { - b[2 * (i - 4)].wrapping_add(b[2 * (i - 4) + 1]) - } - }) - } - - pub fn phaddsw128(a: i16x8, b: i16x8) -> i16x8 { - i16x8::from_fn(|i| { - if i < 4 { - a[2 * i].saturating_add(a[2 * i + 1]) - } else { - b[2 * (i - 4)].saturating_add(b[2 * (i - 4) + 1]) - } - }) - } - - pub fn phaddd128(a: i32x4, b: i32x4) -> i32x4 { - i32x4::from_fn(|i| { - if i < 2 { - a[2 * i].wrapping_add(a[2 * i + 1]) - } else { - b[2 * (i - 2)].wrapping_add(b[2 * (i - 2) + 1]) - } - }) - } - - pub fn phsubw128(a: i16x8, b: i16x8) -> i16x8 { - i16x8::from_fn(|i| { - if i < 4 { - a[2 * i].wrapping_sub(a[2 * i + 1]) - } else { - b[2 * (i - 4)].wrapping_sub(b[2 * (i - 4) + 1]) - } - }) - } - - pub fn phsubsw128(a: i16x8, b: i16x8) -> i16x8 { - i16x8::from_fn(|i| { - if i < 4 { - a[2 * i].saturating_sub(a[2 * i + 1]) - } else { - b[2 * (i - 4)].saturating_sub(b[2 * (i - 4) + 1]) - } - }) - } - - pub fn phsubd128(a: i32x4, b: i32x4) -> i32x4 { - i32x4::from_fn(|i| { - if i < 2 { - a[2 * i].wrapping_sub(a[2 * i + 1]) - } else { - b[2 * (i - 2)].wrapping_sub(b[2 * (i - 2) + 1]) - } - }) - } - - pub fn pmaddubsw128(a: u8x16, b: i8x16) -> i16x8 { - i16x8::from_fn(|i| { - ((a[2 * i] as u8 as u16 as i16) * (b[2 * i] as i8 as i16)) - .saturating_add((a[2 * i + 1] as u8 as u16 as i16) * (b[2 * i + 1] as i8 as i16)) - }) - } - - pub fn pmulhrsw128(a: i16x8, b: i16x8) -> i16x8 { - i16x8::from_fn(|i| { - let temp = (a[i] as i32) * (b[i] as i32); - let temp = (temp >> 14).wrapping_add(1) >> 1; - temp as i16 - }) - } - - pub fn psignb128(a: i8x16, b: i8x16) -> i8x16 { - i8x16::from_fn(|i| { - if b[i] < 0 { - if a[i] == i8::MIN { - a[i] - } else { - -a[i] - } - } else if b[i] > 0 { - a[i] - } else { - 0 - } - }) - } - - pub fn psignw128(a: i16x8, b: i16x8) -> i16x8 { - i16x8::from_fn(|i| { - if b[i] < 0 { - if a[i] == i16::MIN { - a[i] - } else { - -a[i] - } - } else if b[i] > 0 { - a[i] - } else { - 0 - } - }) - } - - pub fn psignd128(a: i32x4, b: i32x4) -> i32x4 { - i32x4::from_fn(|i| { - if b[i] < 0 { - if a[i] == i32::MIN { - a[i] - } else { - -a[i] - } - 
} else if b[i] > 0 { - a[i] - } else { - 0 - } - }) - } -} - +use super::ssse3_handwritten::*; use super::sse2::*; -use c_extern::*; + /// Computes the absolute value of packed 8-bit signed integers in `a` and /// return the unsigned results. /// diff --git a/testable-simd-models/src/core_arch/x86/models/ssse3_handwritten.rs b/testable-simd-models/src/core_arch/x86/models/ssse3_handwritten.rs new file mode 100644 index 0000000000000..d6368798235e8 --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/models/ssse3_handwritten.rs @@ -0,0 +1,127 @@ +use crate::abstractions::simd::*; + pub fn pshufb128(a: u8x16, b: u8x16) -> u8x16 { + u8x16::from_fn(|i| if b[i] > 127 { 0 } else { a[(b[i] % 16) as u32] }) + } + + pub fn phaddw128(a: i16x8, b: i16x8) -> i16x8 { + i16x8::from_fn(|i| { + if i < 4 { + a[2 * i].wrapping_add(a[2 * i + 1]) + } else { + b[2 * (i - 4)].wrapping_add(b[2 * (i - 4) + 1]) + } + }) + } + + pub fn phaddsw128(a: i16x8, b: i16x8) -> i16x8 { + i16x8::from_fn(|i| { + if i < 4 { + a[2 * i].saturating_add(a[2 * i + 1]) + } else { + b[2 * (i - 4)].saturating_add(b[2 * (i - 4) + 1]) + } + }) + } + + pub fn phaddd128(a: i32x4, b: i32x4) -> i32x4 { + i32x4::from_fn(|i| { + if i < 2 { + a[2 * i].wrapping_add(a[2 * i + 1]) + } else { + b[2 * (i - 2)].wrapping_add(b[2 * (i - 2) + 1]) + } + }) + } + + pub fn phsubw128(a: i16x8, b: i16x8) -> i16x8 { + i16x8::from_fn(|i| { + if i < 4 { + a[2 * i].wrapping_sub(a[2 * i + 1]) + } else { + b[2 * (i - 4)].wrapping_sub(b[2 * (i - 4) + 1]) + } + }) + } + + pub fn phsubsw128(a: i16x8, b: i16x8) -> i16x8 { + i16x8::from_fn(|i| { + if i < 4 { + a[2 * i].saturating_sub(a[2 * i + 1]) + } else { + b[2 * (i - 4)].saturating_sub(b[2 * (i - 4) + 1]) + } + }) + } + + pub fn phsubd128(a: i32x4, b: i32x4) -> i32x4 { + i32x4::from_fn(|i| { + if i < 2 { + a[2 * i].wrapping_sub(a[2 * i + 1]) + } else { + b[2 * (i - 2)].wrapping_sub(b[2 * (i - 2) + 1]) + } + }) + } + + pub fn pmaddubsw128(a: u8x16, b: i8x16) -> i16x8 { + i16x8::from_fn(|i| { + ((a[2 * i] as u8 as u16 as i16) * (b[2 * i] as i8 as i16)) + .saturating_add((a[2 * i + 1] as u8 as u16 as i16) * (b[2 * i + 1] as i8 as i16)) + }) + } + + pub fn pmulhrsw128(a: i16x8, b: i16x8) -> i16x8 { + i16x8::from_fn(|i| { + let temp = (a[i] as i32) * (b[i] as i32); + let temp = (temp >> 14).wrapping_add(1) >> 1; + temp as i16 + }) + } + + pub fn psignb128(a: i8x16, b: i8x16) -> i8x16 { + i8x16::from_fn(|i| { + if b[i] < 0 { + if a[i] == i8::MIN { + a[i] + } else { + -a[i] + } + } else if b[i] > 0 { + a[i] + } else { + 0 + } + }) + } + + pub fn psignw128(a: i16x8, b: i16x8) -> i16x8 { + i16x8::from_fn(|i| { + if b[i] < 0 { + if a[i] == i16::MIN { + a[i] + } else { + -a[i] + } + } else if b[i] > 0 { + a[i] + } else { + 0 + } + }) + } + + pub fn psignd128(a: i32x4, b: i32x4) -> i32x4 { + i32x4::from_fn(|i| { + if b[i] < 0 { + if a[i] == i32::MIN { + a[i] + } else { + -a[i] + } + } else if b[i] > 0 { + a[i] + } else { + 0 + } + }) + } \ No newline at end of file From c87885db86bda7dcca7cceefc957f636812d412f Mon Sep 17 00:00:00 2001 From: karthikbhargavan Date: Wed, 30 Jul 2025 23:47:17 -0400 Subject: [PATCH 26/47] replace with generated avx2 --- .../src/abstractions/funarr.rs | 2 +- testable-simd-models/src/abstractions/simd.rs | 1 - .../src/abstractions/utilities.rs | 4 +- .../src/core_arch/x86/models/avx.rs | 129 +- .../src/core_arch/x86/models/avx2.rs | 1822 ++++++++--------- .../core_arch/x86/models/avx2_handwritten.rs | 4 +- .../core_arch/x86/models/avx_handwritten.rs | 2 +- .../src/core_arch/x86/models/mod.rs | 
8 +- .../src/core_arch/x86/models/sse2.rs | 4 +- .../core_arch/x86/models/sse2_handwritten.rs | 354 ++-- .../src/core_arch/x86/models/ssse3.rs | 6 +- .../core_arch/x86/models/ssse3_handwritten.rs | 212 +- 12 files changed, 1220 insertions(+), 1328 deletions(-) diff --git a/testable-simd-models/src/abstractions/funarr.rs b/testable-simd-models/src/abstractions/funarr.rs index 9d3b6f3de93ba..ef29dec1951e2 100644 --- a/testable-simd-models/src/abstractions/funarr.rs +++ b/testable-simd-models/src/abstractions/funarr.rs @@ -182,4 +182,4 @@ impl FunArray<32, T> { ]; Self::from_fn(|i| v[i as usize]) } -} \ No newline at end of file +} diff --git a/testable-simd-models/src/abstractions/simd.rs b/testable-simd-models/src/abstractions/simd.rs index 962c6022ab917..7b7d8330c222c 100644 --- a/testable-simd-models/src/abstractions/simd.rs +++ b/testable-simd-models/src/abstractions/simd.rs @@ -939,4 +939,3 @@ pub fn simd_select( } }) } - diff --git a/testable-simd-models/src/abstractions/utilities.rs b/testable-simd-models/src/abstractions/utilities.rs index 2e9f31f2ae8dd..86e1c0ba52de1 100644 --- a/testable-simd-models/src/abstractions/utilities.rs +++ b/testable-simd-models/src/abstractions/utilities.rs @@ -1,7 +1,7 @@ /// Converts one type to another pub fn transmute>(a: T) -> U { a.into() -} +} #[allow(unused)] #[macro_export] @@ -55,5 +55,5 @@ macro_rules! static_assert_simm_bits { } pub use static_assert; +pub use static_assert_simm_bits; pub use static_assert_uimm_bits; -pub use static_assert_simm_bits; \ No newline at end of file diff --git a/testable-simd-models/src/core_arch/x86/models/avx.rs b/testable-simd-models/src/core_arch/x86/models/avx.rs index 0f072ffe12c9a..d6fb7a94664d9 100644 --- a/testable-simd-models/src/core_arch/x86/models/avx.rs +++ b/testable-simd-models/src/core_arch/x86/models/avx.rs @@ -13,10 +13,10 @@ //! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf //! [wiki]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions +use super::avx_handwritten::*; use super::types::*; -use crate::abstractions::utilities::*; use crate::abstractions::simd::*; -use super::avx_handwritten::*; +use crate::abstractions::utilities::*; /// Blends packed single-precision (32-bit) floating-point elements from /// `a` and `b` using `c` as a mask. @@ -117,8 +117,9 @@ pub fn _mm256_permute2f128_si256(a: __m256i, b: __m256i) -> __m pub fn _mm256_insertf128_si256(a: __m256i, b: __m128i) -> __m256i { static_assert_uimm_bits!(IMM1, 1); let dst: i64x4 = simd_shuffle( - a.as_i64x4(), _mm256_castsi128_si256(b).as_i64x4(), [[4, 5, 2, 3], [0, 1, 4, - 5]] [IMM1 as usize], + a.as_i64x4(), + _mm256_castsi128_si256(b).as_i64x4(), + [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize], ); transmute(dst) } @@ -224,42 +225,10 @@ pub fn _mm256_setr_epi8( e30: i8, e31: i8, ) -> __m256i { - transmute( - i8x32::new( - e00, - e01, - e02, - e03, - e04, - e05, - e06, - e07, - e08, - e09, - e10, - e11, - e12, - e13, - e14, - e15, - e16, - e17, - e18, - e19, - e20, - e21, - e22, - e23, - e24, - e25, - e26, - e27, - e28, - e29, - e30, - e31, - ), - ) + transmute(i8x32::new( + e00, e01, e02, e03, e04, e05, e06, e07, e08, e09, e10, e11, e12, e13, e14, e15, e16, e17, + e18, e19, e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31, + )) } /// Sets packed 16-bit integers in returned vector with the supplied values in /// reverse order. 
@@ -283,27 +252,9 @@ pub fn _mm256_setr_epi16( e14: i16, e15: i16, ) -> __m256i { - - transmute( - i16x16::new( - e00, - e01, - e02, - e03, - e04, - e05, - e06, - e07, - e08, - e09, - e10, - e11, - e12, - e13, - e14, - e15, - ), - ) + transmute(i16x16::new( + e00, e01, e02, e03, e04, e05, e06, e07, e08, e09, e10, e11, e12, e13, e14, e15, + )) } /// Sets packed 32-bit integers in returned vector with the supplied values in /// reverse order. @@ -319,14 +270,14 @@ pub fn _mm256_setr_epi32( e6: i32, e7: i32, ) -> __m256i { - transmute(i32x8::new(e0, e1, e2, e3, e4, e5, e6, e7)) + transmute(i32x8::new(e0, e1, e2, e3, e4, e5, e6, e7)) } /// Sets packed 64-bit integers in returned vector with the supplied values in /// reverse order. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi64x) pub fn _mm256_setr_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i { - transmute(i64x4::new(a, b, c, d)) + transmute(i64x4::new(a, b, c, d)) } /// Sets packed 8-bit integers in returned vector with the supplied values. @@ -370,42 +321,11 @@ pub fn _mm256_set_epi8( e31: i8, ) -> __m256i { _mm256_setr_epi8( - e31, - e30, - e29, - e28, - e27, - e26, - e25, - e24, - e23, - e22, - e21, - e20, - e19, - e18, - e17, - e16, - e15, - e14, - e13, - e12, - e11, - e10, - e09, - e08, - e07, - e06, - e05, - e04, - e03, - e02, - e01, - e00, + e31, e30, e29, e28, e27, e26, e25, e24, e23, e22, e21, e20, e19, e18, e17, e16, e15, e14, + e13, e12, e11, e10, e09, e08, e07, e06, e05, e04, e03, e02, e01, e00, ) } - /// Sets packed 16-bit integers in returned vector with the supplied values. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi16) @@ -431,22 +351,7 @@ pub fn _mm256_set_epi16( e15: i16, ) -> __m256i { _mm256_setr_epi16( - e15, - e14, - e13, - e12, - e11, - e10, - e09, - e08, - e07, - e06, - e05, - e04, - e03, - e02, - e01, - e00, + e15, e14, e13, e12, e11, e10, e09, e08, e07, e06, e05, e04, e03, e02, e01, e00, ) } diff --git a/testable-simd-models/src/core_arch/x86/models/avx2.rs b/testable-simd-models/src/core_arch/x86/models/avx2.rs index 9bedd3975624d..b731bd4a2022f 100644 --- a/testable-simd-models/src/core_arch/x86/models/avx2.rs +++ b/testable-simd-models/src/core_arch/x86/models/avx2.rs @@ -19,912 +19,852 @@ //! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf //! [wiki_avx]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions //! [wiki_fma]: https://en.wikipedia.org/wiki/Fused_multiply-accumulate -use crate::abstractions::utilities::*; use crate::abstractions::simd::*; +use crate::abstractions::utilities::*; use super::avx::*; -use super::types::*; use super::avx2_handwritten::*; +use super::types::*; /// Computes the absolute values of packed 32-bit integers in `a`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi32) - pub fn _mm256_abs_epi32(a: __m256i) -> __m256i { - let a = a.as_i32x8(); - let r = simd_select(simd_lt(a, i32x8::ZERO()), simd_neg(a), a); - transmute(r) + { + let a = a.as_i32x8(); + let r = simd_select(simd_lt(a, i32x8::ZERO()), simd_neg(a), a); + transmute(r) + } } - /// Computes the absolute values of packed 16-bit integers in `a`. 
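// The absolute-value models select between `a` and its (wrapping) negation, so
// the most negative lane maps to itself, which is also how the hardware VPABS*
// instructions behave. A standalone scalar sketch (plain Rust only; `abs_lane`
// is an illustrative name, not a crate function):
fn abs_lane(x: i16) -> i16 {
    if x < 0 { x.wrapping_neg() } else { x }
}

fn main() {
    assert_eq!(abs_lane(-5), 5);
    // i16::MIN has no positive counterpart, so it wraps back to itself.
    assert_eq!(abs_lane(i16::MIN), i16::MIN);
}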
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi16) - pub fn _mm256_abs_epi16(a: __m256i) -> __m256i { - let a = a.as_i16x16(); - let r = simd_select(simd_lt(a, i16x16::ZERO()), simd_neg(a), a); - transmute(r) + { + let a = a.as_i16x16(); + let r = simd_select(simd_lt(a, i16x16::ZERO()), simd_neg(a), a); + transmute(r) + } } - /// Computes the absolute values of packed 8-bit integers in `a`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi8) - pub fn _mm256_abs_epi8(a: __m256i) -> __m256i { - let a = a.as_i8x32(); - let r = simd_select(simd_lt(a, i8x32::ZERO()), simd_neg(a), a); - transmute(r) + { + let a = a.as_i8x32(); + let r = simd_select(simd_lt(a, i8x32::ZERO()), simd_neg(a), a); + transmute(r) + } } - /// Adds packed 64-bit integers in `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi64) - pub fn _mm256_add_epi64(a: __m256i, b: __m256i) -> __m256i { - transmute(simd_add(a.as_i64x4(), b.as_i64x4())) + { + transmute(simd_add(a.as_i64x4(), b.as_i64x4())) + } } - /// Adds packed 32-bit integers in `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi32) - pub fn _mm256_add_epi32(a: __m256i, b: __m256i) -> __m256i { - transmute(simd_add(a.as_i32x8(), b.as_i32x8())) + { + transmute(simd_add(a.as_i32x8(), b.as_i32x8())) + } } - /// Adds packed 16-bit integers in `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi16) - pub fn _mm256_add_epi16(a: __m256i, b: __m256i) -> __m256i { - transmute(simd_add(a.as_i16x16(), b.as_i16x16())) + { + transmute(simd_add(a.as_i16x16(), b.as_i16x16())) + } } - /// Adds packed 8-bit integers in `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi8) - pub fn _mm256_add_epi8(a: __m256i, b: __m256i) -> __m256i { - transmute(simd_add(a.as_i8x32(), b.as_i8x32())) + { + transmute(simd_add(a.as_i8x32(), b.as_i8x32())) + } } - /// Adds packed 8-bit integers in `a` and `b` using saturation. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epi8) - pub fn _mm256_adds_epi8(a: __m256i, b: __m256i) -> __m256i { - transmute(simd_saturating_add(a.as_i8x32(), b.as_i8x32())) + { + transmute(simd_saturating_add(a.as_i8x32(), b.as_i8x32())) + } } - /// Adds packed 16-bit integers in `a` and `b` using saturation. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epi16) - pub fn _mm256_adds_epi16(a: __m256i, b: __m256i) -> __m256i { - transmute(simd_saturating_add(a.as_i16x16(), b.as_i16x16())) + { + transmute(simd_saturating_add(a.as_i16x16(), b.as_i16x16())) + } } - /// Adds packed unsigned 8-bit integers in `a` and `b` using saturation. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epu8) - pub fn _mm256_adds_epu8(a: __m256i, b: __m256i) -> __m256i { - transmute(simd_saturating_add(a.as_u8x32(), b.as_u8x32())) + { + transmute(simd_saturating_add(a.as_u8x32(), b.as_u8x32())) + } } - /// Adds packed unsigned 16-bit integers in `a` and `b` using saturation. 
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epu16) - pub fn _mm256_adds_epu16(a: __m256i, b: __m256i) -> __m256i { - transmute(simd_saturating_add(a.as_u16x16(), b.as_u16x16())) + { + transmute(simd_saturating_add(a.as_u16x16(), b.as_u16x16())) + } } - /// Concatenates pairs of 16-byte blocks in `a` and `b` into a 32-byte temporary /// result, shifts the result right by `n` bytes, and returns the low 16 bytes. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_alignr_epi8) - pub fn _mm256_alignr_epi8(a: __m256i, b: __m256i) -> __m256i { - // If palignr is shifting the pair of vectors more than the size of two - // lanes, emit zero. + static_assert_uimm_bits!(IMM8, 8); if IMM8 >= 32 { return _mm256_setzero_si256(); } - // If palignr is shifting the pair of input vectors more than one lane, - // but less than two lanes, convert to shifting in zeroes. let (a, b) = if IMM8 > 16 { (_mm256_setzero_si256(), a) } else { (a, b) }; - - let a = a.as_i8x32(); - let b = b.as_i8x32(); - - if IMM8 == 16 { - return transmute(a); + { + if IMM8 == 16 { + return transmute(a); + } } - - let r: i8x32 = match IMM8 % 16 { - 0 => simd_shuffle( - b, - a, - [ - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, - 23, 24, 25, 26, 27, 28, 29, 30, 31, - ], - ), - 1 => simd_shuffle( - b, - a, - [ - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 17, 18, 19, 20, 21, 22, 23, - 24, 25, 26, 27, 28, 29, 30, 31, 48, - ], - ), - 2 => simd_shuffle( - b, - a, - [ - 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 18, 19, 20, 21, 22, 23, 24, - 25, 26, 27, 28, 29, 30, 31, 48, 49, - ], - ), - 3 => simd_shuffle( - b, - a, - [ - 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 19, 20, 21, 22, 23, 24, - 25, 26, 27, 28, 29, 30, 31, 48, 49, 50, - ], - ), - 4 => simd_shuffle( - b, - a, - [ - 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 20, 21, 22, 23, 24, 25, - 26, 27, 28, 29, 30, 31, 48, 49, 50, 51, - ], - ), - 5 => simd_shuffle( - b, - a, - [ - 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 21, 22, 23, 24, 25, 26, - 27, 28, 29, 30, 31, 48, 49, 50, 51, 52, - ], - ), - 6 => simd_shuffle( - b, - a, - [ - 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 22, 23, 24, 25, 26, 27, - 28, 29, 30, 31, 48, 49, 50, 51, 52, 53, - ], - ), - 7 => simd_shuffle( - b, - a, - [ - 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 23, 24, 25, 26, 27, - 28, 29, 30, 31, 48, 49, 50, 51, 52, 53, 54, - ], - ), - 8 => simd_shuffle( - b, - a, - [ - 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 24, 25, 26, 27, 28, - 29, 30, 31, 48, 49, 50, 51, 52, 53, 54, 55, - ], - ), - 9 => simd_shuffle( - b, - a, - [ - 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 25, 26, 27, 28, 29, - 30, 31, 48, 49, 50, 51, 52, 53, 54, 55, 56, - ], - ), - 10 => simd_shuffle( - b, - a, - [ - 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 26, 27, 28, 29, 30, - 31, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, - ], - ), - 11 => simd_shuffle( - b, - a, - [ - 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 27, 28, 29, 30, 31, - 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, - ], - ), - 12 => simd_shuffle( - b, - a, - [ - 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 28, 29, 30, 31, 48, - 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, - ], - ), - 13 => simd_shuffle( - b, - a, - [ - 13, 
14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 29, 30, 31, 48, 49, - 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, - ], - ), - 14 => simd_shuffle( - b, - a, - [ - 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 30, 31, 48, 49, 50, - 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, - ], - ), - 15 => simd_shuffle( - b, - a, + const fn mask(shift: u32, i: u32) -> u32 { + let shift = shift % 16; + let mod_i = i % 16; + if mod_i < (16 - shift) { + i + shift + } else { + i + 16 + shift + } + } + { + let r: i8x32 = simd_shuffle( + b.as_i8x32(), + a.as_i8x32(), [ - 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 31, 48, 49, 50, 51, - 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, + mask(IMM8 as u32, 0), + mask(IMM8 as u32, 1), + mask(IMM8 as u32, 2), + mask(IMM8 as u32, 3), + mask(IMM8 as u32, 4), + mask(IMM8 as u32, 5), + mask(IMM8 as u32, 6), + mask(IMM8 as u32, 7), + mask(IMM8 as u32, 8), + mask(IMM8 as u32, 9), + mask(IMM8 as u32, 10), + mask(IMM8 as u32, 11), + mask(IMM8 as u32, 12), + mask(IMM8 as u32, 13), + mask(IMM8 as u32, 14), + mask(IMM8 as u32, 15), + mask(IMM8 as u32, 16), + mask(IMM8 as u32, 17), + mask(IMM8 as u32, 18), + mask(IMM8 as u32, 19), + mask(IMM8 as u32, 20), + mask(IMM8 as u32, 21), + mask(IMM8 as u32, 22), + mask(IMM8 as u32, 23), + mask(IMM8 as u32, 24), + mask(IMM8 as u32, 25), + mask(IMM8 as u32, 26), + mask(IMM8 as u32, 27), + mask(IMM8 as u32, 28), + mask(IMM8 as u32, 29), + mask(IMM8 as u32, 30), + mask(IMM8 as u32, 31), ], - ), - _ => unreachable!(), - }; - transmute(r) + ); + transmute(r) + } } - /// Computes the bitwise AND of 256 bits (representing integer data) /// in `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_and_si256) - pub fn _mm256_and_si256(a: __m256i, b: __m256i) -> __m256i { - transmute(simd_and(a.as_i64x4(), b.as_i64x4())) + { + transmute(simd_and(a.as_i64x4(), b.as_i64x4())) + } } - /// Computes the bitwise NOT of 256 bits (representing integer data) /// in `a` and then AND with `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_andnot_si256) - pub fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i { - let all_ones = _mm256_set1_epi8(-1); - transmute(simd_and(simd_xor(a.as_i64x4(), all_ones.as_i64x4()), b.as_i64x4())) + { + let all_ones = _mm256_set1_epi8(-1); + transmute(simd_and( + simd_xor(a.as_i64x4(), all_ones.as_i64x4()), + b.as_i64x4(), + )) + } } - /// Averages packed unsigned 16-bit integers in `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_avg_epu16) - pub fn _mm256_avg_epu16(a: __m256i, b: __m256i) -> __m256i { - let a = simd_cast::<16, _, u32>(a.as_u16x16()); - let b = simd_cast::<16, _, u32>(b.as_u16x16()); - let r = simd_shr(simd_add(simd_add(a, b), u32x16::splat(1)), u32x16::splat(1)); - transmute(simd_cast::<16, _, u16>(r)) + { + let a = simd_cast::<16, _, u32>(a.as_u16x16()); + let b = simd_cast::<16, _, u32>(b.as_u16x16()); + let r = simd_shr(simd_add(simd_add(a, b), u32x16::splat(1)), u32x16::splat(1)); + transmute(simd_cast::<16, _, u16>(r)) + } } - /// Averages packed unsigned 8-bit integers in `a` and `b`. 
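// The averaging models widen each lane, add one, and shift right by one, i.e.
// the rounded average (a + b + 1) >> 1 computed without intermediate overflow.
// A scalar sketch of the same computation (plain Rust, illustrative only):
fn avg_u8(a: u8, b: u8) -> u8 {
    ((a as u16 + b as u16 + 1) >> 1) as u8
}

fn main() {
    assert_eq!(avg_u8(0, 1), 1); // ties round up
    assert_eq!(avg_u8(255, 255), 255); // widening avoids overflow
}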
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_avg_epu8) - pub fn _mm256_avg_epu8(a: __m256i, b: __m256i) -> __m256i { - let a = simd_cast::<32, _, u16>(a.as_u8x32()); - let b = simd_cast::<32, _, u16>(b.as_u8x32()); - let r = simd_shr(simd_add(simd_add(a, b), u16x32::splat(1)), u16x32::splat(1)); - transmute(simd_cast::<32, _, u8>(r)) + { + let a = simd_cast::<32, _, u16>(a.as_u8x32()); + let b = simd_cast::<32, _, u16>(b.as_u8x32()); + let r = simd_shr(simd_add(simd_add(a, b), u16x32::splat(1)), u16x32::splat(1)); + transmute(simd_cast::<32, _, u8>(r)) + } } - /// Blends packed 32-bit integers from `a` and `b` using control mask `IMM4`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi32) - pub fn _mm_blend_epi32(a: __m128i, b: __m128i) -> __m128i { - let a = a.as_i32x4(); - let b = b.as_i32x4(); - let r: i32x4 = simd_shuffle( - a, - b, - [ - [0, 4, 0, 4][IMM4 as usize & 0b11], - [1, 1, 5, 5][IMM4 as usize & 0b11], - [2, 6, 2, 6][(IMM4 as usize >> 2) & 0b11], - [3, 3, 7, 7][(IMM4 as usize >> 2) & 0b11], - ], - ); - transmute(r) + static_assert_uimm_bits!(IMM4, 4); + { + let a = a.as_i32x4(); + let b = b.as_i32x4(); + let r: i32x4 = simd_shuffle( + a, + b, + [ + [0, 4, 0, 4][IMM4 as usize & 0b11], + [1, 1, 5, 5][IMM4 as usize & 0b11], + [2, 6, 2, 6][(IMM4 as usize >> 2) & 0b11], + [3, 3, 7, 7][(IMM4 as usize >> 2) & 0b11], + ], + ); + transmute(r) + } } - /// Blends packed 32-bit integers from `a` and `b` using control mask `IMM8`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_epi32) - pub fn _mm256_blend_epi32(a: __m256i, b: __m256i) -> __m256i { - let a = a.as_i32x8(); - let b = b.as_i32x8(); - let r: i32x8 = simd_shuffle( - a, - b, - [ - [0, 8, 0, 8][IMM8 as usize & 0b11], - [1, 1, 9, 9][IMM8 as usize & 0b11], - [2, 10, 2, 10][(IMM8 as usize >> 2) & 0b11], - [3, 3, 11, 11][(IMM8 as usize >> 2) & 0b11], - [4, 12, 4, 12][(IMM8 as usize >> 4) & 0b11], - [5, 5, 13, 13][(IMM8 as usize >> 4) & 0b11], - [6, 14, 6, 14][(IMM8 as usize >> 6) & 0b11], - [7, 7, 15, 15][(IMM8 as usize >> 6) & 0b11], - ], - ); - transmute(r) + static_assert_uimm_bits!(IMM8, 8); + { + let a = a.as_i32x8(); + let b = b.as_i32x8(); + let r: i32x8 = simd_shuffle( + a, + b, + [ + [0, 8, 0, 8][IMM8 as usize & 0b11], + [1, 1, 9, 9][IMM8 as usize & 0b11], + [2, 10, 2, 10][(IMM8 as usize >> 2) & 0b11], + [3, 3, 11, 11][(IMM8 as usize >> 2) & 0b11], + [4, 12, 4, 12][(IMM8 as usize >> 4) & 0b11], + [5, 5, 13, 13][(IMM8 as usize >> 4) & 0b11], + [6, 14, 6, 14][(IMM8 as usize >> 6) & 0b11], + [7, 7, 15, 15][(IMM8 as usize >> 6) & 0b11], + ], + ); + transmute(r) + } } - /// Blends packed 16-bit integers from `a` and `b` using control mask `IMM8`. 
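///
/// An illustrative scalar model (not from the upstream docs): bit `i % 8` of
/// `IMM8` selects between `a` and `b`, so the same 8-bit mask is applied to
/// both 128-bit lanes.
///
/// ```ignore
/// fn blend_epi16(a: [i16; 16], b: [i16; 16], imm8: u8) -> [i16; 16] {
///     core::array::from_fn(|i| if (imm8 >> (i % 8)) & 1 == 1 { b[i] } else { a[i] })
/// }
/// ```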
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_epi16) pub fn _mm256_blend_epi16(a: __m256i, b: __m256i) -> __m256i { - let a = a.as_i16x16(); - let b = b.as_i16x16(); - - let r: i16x16 = simd_shuffle( - a, - b, - [ - [0, 16, 0, 16][IMM8 as usize & 0b11], - [1, 1, 17, 17][IMM8 as usize & 0b11], - [2, 18, 2, 18][(IMM8 as usize >> 2) & 0b11], - [3, 3, 19, 19][(IMM8 as usize >> 2) & 0b11], - [4, 20, 4, 20][(IMM8 as usize >> 4) & 0b11], - [5, 5, 21, 21][(IMM8 as usize >> 4) & 0b11], - [6, 22, 6, 22][(IMM8 as usize >> 6) & 0b11], - [7, 7, 23, 23][(IMM8 as usize >> 6) & 0b11], - [8, 24, 8, 24][IMM8 as usize & 0b11], - [9, 9, 25, 25][IMM8 as usize & 0b11], - [10, 26, 10, 26][(IMM8 as usize >> 2) & 0b11], - [11, 11, 27, 27][(IMM8 as usize >> 2) & 0b11], - [12, 28, 12, 28][(IMM8 as usize >> 4) & 0b11], - [13, 13, 29, 29][(IMM8 as usize >> 4) & 0b11], - [14, 30, 14, 30][(IMM8 as usize >> 6) & 0b11], - [15, 15, 31, 31][(IMM8 as usize >> 6) & 0b11], - ], - ); - transmute(r) + static_assert_uimm_bits!(IMM8, 8); + { + let a = a.as_i16x16(); + let b = b.as_i16x16(); + let r: i16x16 = simd_shuffle( + a, + b, + [ + [0, 16, 0, 16][IMM8 as usize & 0b11], + [1, 1, 17, 17][IMM8 as usize & 0b11], + [2, 18, 2, 18][(IMM8 as usize >> 2) & 0b11], + [3, 3, 19, 19][(IMM8 as usize >> 2) & 0b11], + [4, 20, 4, 20][(IMM8 as usize >> 4) & 0b11], + [5, 5, 21, 21][(IMM8 as usize >> 4) & 0b11], + [6, 22, 6, 22][(IMM8 as usize >> 6) & 0b11], + [7, 7, 23, 23][(IMM8 as usize >> 6) & 0b11], + [8, 24, 8, 24][IMM8 as usize & 0b11], + [9, 9, 25, 25][IMM8 as usize & 0b11], + [10, 26, 10, 26][(IMM8 as usize >> 2) & 0b11], + [11, 11, 27, 27][(IMM8 as usize >> 2) & 0b11], + [12, 28, 12, 28][(IMM8 as usize >> 4) & 0b11], + [13, 13, 29, 29][(IMM8 as usize >> 4) & 0b11], + [14, 30, 14, 30][(IMM8 as usize >> 6) & 0b11], + [15, 15, 31, 31][(IMM8 as usize >> 6) & 0b11], + ], + ); + transmute(r) + } } - /// Blends packed 8-bit integers from `a` and `b` using `mask`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blendv_epi8) pub fn _mm256_blendv_epi8(a: __m256i, b: __m256i, mask: __m256i) -> __m256i { - let mask: i8x32 = simd_lt(mask.as_i8x32(), i8x32::ZERO()); - transmute(simd_select(mask, b.as_i8x32(), a.as_i8x32())) + { + let mask: i8x32 = simd_lt(mask.as_i8x32(), i8x32::ZERO()); + transmute(simd_select(mask, b.as_i8x32(), a.as_i8x32())) + } } - /// Broadcasts the low packed 8-bit integer from `a` to all elements of /// the 128-bit returned value. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastb_epi8) pub fn _mm_broadcastb_epi8(a: __m128i) -> __m128i { - let ret = simd_shuffle(a.as_i8x16(), i8x16::ZERO(), [0_u32; 16]); - transmute(ret) + { + let ret = simd_shuffle(a.as_i8x16(), i8x16::ZERO(), [0_u32; 16]); + transmute::(ret) + } } - /// Broadcasts the low packed 8-bit integer from `a` to all elements of /// the 256-bit returned value. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastb_epi8) pub fn _mm256_broadcastb_epi8(a: __m128i) -> __m256i { - let ret = simd_shuffle(a.as_i8x16(), i8x16::ZERO(), [0_u32; 32]); - transmute(ret) + { + let ret = simd_shuffle(a.as_i8x16(), i8x16::ZERO(), [0_u32; 32]); + transmute::(ret) + } } - -// N.B., `simd_shuffle4` with integer data types for `a` and `b` is -// often compiled to `vbroadcastss`. 
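// Illustrative aside (not part of the patch): the broadcast models above rely on
// `simd_shuffle` with an all-zero index vector, which copies lane 0 of its first
// operand into every output lane. A plain-array sketch of the same behaviour:

fn broadcast_lane0<const N: usize>(a: [i8; 16]) -> [i8; N] {
    core::array::from_fn(|_| a[0]) // every output lane is a copy of lane 0
}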
/// Broadcasts the low packed 32-bit integer from `a` to all elements of /// the 128-bit returned value. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastd_epi32) - pub fn _mm_broadcastd_epi32(a: __m128i) -> __m128i { - let ret = simd_shuffle(a.as_i32x4(), i32x4::ZERO(), [0_u32; 4]); - transmute(ret) + { + let ret = simd_shuffle(a.as_i32x4(), i32x4::ZERO(), [0_u32; 4]); + transmute::(ret) + } } - -// N.B., `simd_shuffle4`` with integer data types for `a` and `b` is -// often compiled to `vbroadcastss`. /// Broadcasts the low packed 32-bit integer from `a` to all elements of /// the 256-bit returned value. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastd_epi32) - pub fn _mm256_broadcastd_epi32(a: __m128i) -> __m256i { - let ret = simd_shuffle(a.as_i32x4(), i32x4::ZERO(), [0_u32; 8]); - transmute(ret) + { + let ret = simd_shuffle(a.as_i32x4(), i32x4::ZERO(), [0_u32; 8]); + transmute::(ret) + } } - /// Broadcasts the low packed 64-bit integer from `a` to all elements of /// the 128-bit returned value. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastq_epi64) - -// Emits `vmovddup` instead of `vpbroadcastq` -// See https://github.com/rust-lang/stdarch/issues/791 - pub fn _mm_broadcastq_epi64(a: __m128i) -> __m128i { - let ret = simd_shuffle(a.as_i64x2(), a.as_i64x2(), [0_u32; 2]); - transmute(ret) + { + let ret = simd_shuffle(a.as_i64x2(), a.as_i64x2(), [0_u32; 2]); + transmute::(ret) + } } - /// Broadcasts the low packed 64-bit integer from `a` to all elements of /// the 256-bit returned value. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastq_epi64) - pub fn _mm256_broadcastq_epi64(a: __m128i) -> __m256i { - let ret = simd_shuffle(a.as_i64x2(), a.as_i64x2(), [0_u32; 4]); - transmute(ret) + { + let ret = simd_shuffle(a.as_i64x2(), a.as_i64x2(), [0_u32; 4]); + transmute::(ret) + } } - +/// Broadcasts the low double-precision (64-bit) floating-point element +/// from `a` to all elements of the 128-bit returned value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastsd_pd) +// pub fn _mm_broadcastsd_pd(a: __m128d) -> __m128d { +// { simd_shuffle(a, _mm_setzero_pd(), [0_u32; 2]) } +// } +/// Broadcasts the low double-precision (64-bit) floating-point element +/// from `a` to all elements of the 256-bit returned value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastsd_pd) +// pub fn _mm256_broadcastsd_pd(a: __m128d) -> __m256d { +// { simd_shuffle(a, _mm_setzero_pd(), [0_u32; 4]) } +// } /// Broadcasts 128 bits of integer data from a to all 128-bit lanes in /// the 256-bit returned value. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastsi128_si256) - pub fn _mm_broadcastsi128_si256(a: __m128i) -> __m256i { - let ret = simd_shuffle(a.as_i64x2(), i64x2::ZERO(), [0, 1, 0, 1]); - transmute(ret) + { + let ret = simd_shuffle(a.as_i64x2(), i64x2::ZERO(), [0, 1, 0, 1]); + transmute::(ret) + } } - -// N.B., `broadcastsi128_si256` is often compiled to `vinsertf128` or -// `vbroadcastf128`. 
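// Illustrative aside (not part of the patch): broadcasting a 128-bit block into
// a 256-bit result is the same as repeating its two 64-bit halves, which is what
// the `[0, 1, 0, 1]` shuffle above expresses. A plain-array sketch:

fn broadcast_si128(a: [i64; 2]) -> [i64; 4] {
    [a[0], a[1], a[0], a[1]] // low and high 128-bit lanes both hold `a`
}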
/// Broadcasts 128 bits of integer data from a to all 128-bit lanes in /// the 256-bit returned value. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastsi128_si256) - pub fn _mm256_broadcastsi128_si256(a: __m128i) -> __m256i { - let ret = simd_shuffle(a.as_i64x2(), i64x2::ZERO(), [0, 1, 0, 1]); - transmute(ret) + { + let ret = simd_shuffle(a.as_i64x2(), i64x2::ZERO(), [0, 1, 0, 1]); + transmute::(ret) + } } - +/// Broadcasts the low single-precision (32-bit) floating-point element +/// from `a` to all elements of the 128-bit returned value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastss_ps) +// pub fn _mm_broadcastss_ps(a: __m128) -> __m128 { +// { simd_shuffle(a, _mm_setzero_ps(), [0_u32; 4]) } +// } +/// Broadcasts the low single-precision (32-bit) floating-point element +/// from `a` to all elements of the 256-bit returned value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastss_ps) +// pub fn _mm256_broadcastss_ps(a: __m128) -> __m256 { +// { simd_shuffle(a, _mm_setzero_ps(), [0_u32; 8]) } +// } /// Broadcasts the low packed 16-bit integer from a to all elements of /// the 128-bit returned value /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastw_epi16) - pub fn _mm_broadcastw_epi16(a: __m128i) -> __m128i { - let ret = simd_shuffle(a.as_i16x8(), i16x8::ZERO(), [0_u32; 8]); - transmute(ret) + { + let ret = simd_shuffle(a.as_i16x8(), i16x8::ZERO(), [0_u32; 8]); + transmute::(ret) + } } - /// Broadcasts the low packed 16-bit integer from a to all elements of /// the 256-bit returned value /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastw_epi16) - pub fn _mm256_broadcastw_epi16(a: __m128i) -> __m256i { - let ret = simd_shuffle(a.as_i16x8(), i16x8::ZERO(), [0_u32; 16]); - transmute(ret) + { + let ret = simd_shuffle(a.as_i16x8(), i16x8::ZERO(), [0_u32; 16]); + transmute::(ret) + } } - /// Compares packed 64-bit integers in `a` and `b` for equality. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi64) - pub fn _mm256_cmpeq_epi64(a: __m256i, b: __m256i) -> __m256i { - transmute(simd_eq(a.as_i64x4(), b.as_i64x4())) + { + transmute::(simd_eq(a.as_i64x4(), b.as_i64x4())) + } } - /// Compares packed 32-bit integers in `a` and `b` for equality. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi32) - pub fn _mm256_cmpeq_epi32(a: __m256i, b: __m256i) -> __m256i { - transmute(simd_eq(a.as_i32x8(), b.as_i32x8())) + { + transmute::(simd_eq(a.as_i32x8(), b.as_i32x8())) + } } - /// Compares packed 16-bit integers in `a` and `b` for equality. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi16) - pub fn _mm256_cmpeq_epi16(a: __m256i, b: __m256i) -> __m256i { - transmute(simd_eq(a.as_i16x16(), b.as_i16x16())) + { + transmute::(simd_eq(a.as_i16x16(), b.as_i16x16())) + } } - /// Compares packed 8-bit integers in `a` and `b` for equality. 
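///
/// An illustrative scalar model shared by all the `cmpeq` variants (not from
/// the upstream docs): equal lanes become all-ones, the rest become zero.
///
/// ```ignore
/// fn cmpeq_epi8(a: i8, b: i8) -> i8 {
///     if a == b { -1 } else { 0 }
/// }
/// ```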
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi8) - pub fn _mm256_cmpeq_epi8(a: __m256i, b: __m256i) -> __m256i { - transmute(simd_eq(a.as_i8x32(), b.as_i8x32())) + { + transmute::(simd_eq(a.as_i8x32(), b.as_i8x32())) + } } - /// Compares packed 64-bit integers in `a` and `b` for greater-than. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi64) - pub fn _mm256_cmpgt_epi64(a: __m256i, b: __m256i) -> __m256i { - transmute(simd_gt(a.as_i64x4(), b.as_i64x4())) + { + transmute::(simd_gt(a.as_i64x4(), b.as_i64x4())) + } } - /// Compares packed 32-bit integers in `a` and `b` for greater-than. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi32) - pub fn _mm256_cmpgt_epi32(a: __m256i, b: __m256i) -> __m256i { - transmute(simd_gt(a.as_i32x8(), b.as_i32x8())) + { + transmute::(simd_gt(a.as_i32x8(), b.as_i32x8())) + } } - /// Compares packed 16-bit integers in `a` and `b` for greater-than. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi16) - pub fn _mm256_cmpgt_epi16(a: __m256i, b: __m256i) -> __m256i { - transmute(simd_gt(a.as_i16x16(), b.as_i16x16())) + { + transmute::(simd_gt(a.as_i16x16(), b.as_i16x16())) + } } - /// Compares packed 8-bit integers in `a` and `b` for greater-than. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi8) - pub fn _mm256_cmpgt_epi8(a: __m256i, b: __m256i) -> __m256i { - transmute(simd_gt(a.as_i8x32(), b.as_i8x32())) + { + transmute::(simd_gt(a.as_i8x32(), b.as_i8x32())) + } } - /// Sign-extend 16-bit integers to 32-bit integers. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi16_epi32) - pub fn _mm256_cvtepi16_epi32(a: __m128i) -> __m256i { - transmute(simd_cast::<8, _, i32>(a.as_i16x8())) + { + transmute::(simd_cast(a.as_i16x8())) + } } - /// Sign-extend 16-bit integers to 64-bit integers. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi16_epi64) - pub fn _mm256_cvtepi16_epi64(a: __m128i) -> __m256i { - let a = a.as_i16x8(); - let v64: i16x4 = simd_shuffle(a, a, [0, 1, 2, 3]); - transmute(simd_cast::<4, i16, i64>(v64)) + { + let a = a.as_i16x8(); + let v64: i16x4 = simd_shuffle(a, a, [0, 1, 2, 3]); + transmute::(simd_cast(v64)) + } } - /// Sign-extend 32-bit integers to 64-bit integers. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi32_epi64) - pub fn _mm256_cvtepi32_epi64(a: __m128i) -> __m256i { - transmute(simd_cast::<4, i32, i64>(a.as_i32x4())) + { + transmute::(simd_cast(a.as_i32x4())) + } } - /// Sign-extend 8-bit integers to 16-bit integers. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi8_epi16) - pub fn _mm256_cvtepi8_epi16(a: __m128i) -> __m256i { - transmute(simd_cast::<16, i8, i16>(a.as_i8x16())) + { + transmute::(simd_cast(a.as_i8x16())) + } } - /// Sign-extend 8-bit integers to 32-bit integers. 
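///
/// An illustrative scalar model (not from the upstream docs): only the low
/// eight bytes of `a` are used, each widened with its sign preserved.
///
/// ```ignore
/// fn cvtepi8_epi32(a: [i8; 16]) -> [i32; 8] {
///     core::array::from_fn(|i| a[i] as i32) // `as` sign-extends i8 -> i32
/// }
/// ```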
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi8_epi32) - pub fn _mm256_cvtepi8_epi32(a: __m128i) -> __m256i { - let a = a.as_i8x16(); - let v64: i8x8 = simd_shuffle(a, a, [0, 1, 2, 3, 4, 5, 6, 7]); - transmute(simd_cast::<8, i8, i32>(v64)) + { + let a = a.as_i8x16(); + let v64: i8x8 = simd_shuffle(a, a, [0, 1, 2, 3, 4, 5, 6, 7]); + transmute::(simd_cast(v64)) + } } - /// Sign-extend 8-bit integers to 64-bit integers. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi8_epi64) pub fn _mm256_cvtepi8_epi64(a: __m128i) -> __m256i { - let a = a.as_i8x16(); - let v32: i8x4 = simd_shuffle(a, a, [0, 1, 2, 3]); - transmute(simd_cast::<4, i8, i64>(v32)) + { + let a = a.as_i8x16(); + let v32: i8x4 = simd_shuffle(a, a, [0, 1, 2, 3]); + transmute::(simd_cast(v32)) + } } - /// Zeroes extend packed unsigned 16-bit integers in `a` to packed 32-bit /// integers, and stores the results in `dst`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu16_epi32) - pub fn _mm256_cvtepu16_epi32(a: __m128i) -> __m256i { - transmute(simd_cast::<8, u16, u32>(a.as_u16x8())) + { + transmute(simd_cast::<8, _, u32>(a.as_u16x8())) + } } - /// Zero-extend the lower four unsigned 16-bit integers in `a` to 64-bit /// integers. The upper four elements of `a` are unused. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu16_epi64) - pub fn _mm256_cvtepu16_epi64(a: __m128i) -> __m256i { - let a = a.as_u16x8(); - let v64: u16x4 = simd_shuffle(a, a, [0, 1, 2, 3]); - transmute(simd_cast::<4, u16, u64>(v64)) + { + let a = a.as_u16x8(); + let v64: u16x4 = simd_shuffle(a, a, [0, 1, 2, 3]); + transmute(simd_cast::<4, _, u64>(v64)) + } } - /// Zero-extend unsigned 32-bit integers in `a` to 64-bit integers. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu32_epi64) - pub fn _mm256_cvtepu32_epi64(a: __m128i) -> __m256i { - transmute(simd_cast::<4, u32, u64>(a.as_u32x4())) + { + transmute(simd_cast::<4, _, u64>(a.as_u32x4())) + } } - /// Zero-extend unsigned 8-bit integers in `a` to 16-bit integers. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu8_epi16) - pub fn _mm256_cvtepu8_epi16(a: __m128i) -> __m256i { - transmute(simd_cast::<16, u8, u16>(a.as_u8x16())) + { + transmute(simd_cast::<16, _, u16>(a.as_u8x16())) + } } - /// Zero-extend the lower eight unsigned 8-bit integers in `a` to 32-bit /// integers. The upper eight elements of `a` are unused. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu8_epi32) - pub fn _mm256_cvtepu8_epi32(a: __m128i) -> __m256i { - let a = a.as_u8x16(); - let v64: u8x8 = simd_shuffle(a, a, [0, 1, 2, 3, 4, 5, 6, 7]); - transmute(simd_cast::<8, u8, u32>(v64)) + { + let a = a.as_u8x16(); + let v64: u8x8 = simd_shuffle(a, a, [0, 1, 2, 3, 4, 5, 6, 7]); + transmute(simd_cast::<8, _, u32>(v64)) + } } - /// Zero-extend the lower four unsigned 8-bit integers in `a` to 64-bit /// integers. The upper twelve elements of `a` are unused. 
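///
/// An illustrative scalar model (not from the upstream docs):
///
/// ```ignore
/// fn cvtepu8_epi64(a: [u8; 16]) -> [u64; 4] {
///     core::array::from_fn(|i| a[i] as u64) // `as` zero-extends u8 -> u64
/// }
/// ```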
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu8_epi64) - pub fn _mm256_cvtepu8_epi64(a: __m128i) -> __m256i { - let a = a.as_u8x16(); - let v32: u8x4 = simd_shuffle(a, a, [0, 1, 2, 3]); - transmute(simd_cast::<4, u8, u64>(v32)) + { + let a = a.as_u8x16(); + let v32: u8x4 = simd_shuffle(a, a, [0, 1, 2, 3]); + transmute(simd_cast::<4, _, u64>(v32)) + } } - /// Extracts 128 bits (of integer data) from `a` selected with `IMM1`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extracti128_si256) - pub fn _mm256_extracti128_si256(a: __m256i) -> __m128i { - let a = a.as_i64x4(); - let b = i64x4::from_fn(|_| 0); - let dst: i64x2 = simd_shuffle(a, b, [[0, 1], [2, 3]][IMM1 as usize]); - transmute(dst) + static_assert_uimm_bits!(IMM1, 1); + { + let a = a.as_i64x4(); + let b = i64x4::ZERO(); + let dst: i64x2 = simd_shuffle(a, b, [[0, 1], [2, 3]][IMM1 as usize]); + transmute(dst) + } } - /// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_epi16) - pub fn _mm256_hadd_epi16(a: __m256i, b: __m256i) -> __m256i { - transmute(phaddw(a.as_i16x16(), b.as_i16x16())) + { + transmute(phaddw(a.as_i16x16(), b.as_i16x16())) + } } - /// Horizontally adds adjacent pairs of 32-bit integers in `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_epi32) - pub fn _mm256_hadd_epi32(a: __m256i, b: __m256i) -> __m256i { - transmute(phaddd(a.as_i32x8(), b.as_i32x8())) + { + transmute(phaddd(a.as_i32x8(), b.as_i32x8())) + } } - /// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b` /// using saturation. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadds_epi16) - pub fn _mm256_hadds_epi16(a: __m256i, b: __m256i) -> __m256i { - transmute(phaddsw(a.as_i16x16(), b.as_i16x16())) + { + transmute(phaddsw(a.as_i16x16(), b.as_i16x16())) + } } - /// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_epi16) - pub fn _mm256_hsub_epi16(a: __m256i, b: __m256i) -> __m256i { - transmute(phsubw(a.as_i16x16(), b.as_i16x16())) + { + transmute(phsubw(a.as_i16x16(), b.as_i16x16())) + } } - /// Horizontally subtract adjacent pairs of 32-bit integers in `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_epi32) - pub fn _mm256_hsub_epi32(a: __m256i, b: __m256i) -> __m256i { - transmute(phsubd(a.as_i32x8(), b.as_i32x8())) + { + transmute(phsubd(a.as_i32x8(), b.as_i32x8())) + } } - /// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b` /// using saturation. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsubs_epi16) - pub fn _mm256_hsubs_epi16(a: __m256i, b: __m256i) -> __m256i { - transmute(phsubsw(a.as_i16x16(), b.as_i16x16())) + { + transmute(phsubsw(a.as_i16x16(), b.as_i16x16())) + } } - /// Copies `a` to `dst`, then insert 128 bits (of integer data) from `b` at the /// location specified by `IMM1`. 
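///
/// An illustrative model in terms of 64-bit lanes (not from the upstream docs;
/// `imm1` stands for the const generic `IMM1`):
///
/// ```ignore
/// fn inserti128(a: [i64; 4], b: [i64; 2], imm1: usize) -> [i64; 4] {
///     let mut r = a;
///     r[2 * imm1] = b[0];
///     r[2 * imm1 + 1] = b[1];
///     r
/// }
/// ```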
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_inserti128_si256) - pub fn _mm256_inserti128_si256(a: __m256i, b: __m128i) -> __m256i { - let a = a.as_i64x4(); - let b = _mm256_castsi128_si256(b).as_i64x4(); - let dst: i64x4 = simd_shuffle(a, b, [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize]); - transmute(dst) + static_assert_uimm_bits!(IMM1, 1); + { + let a = a.as_i64x4(); + let b = _mm256_castsi128_si256(b).as_i64x4(); + let dst: i64x4 = simd_shuffle(a, b, [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize]); + transmute(dst) + } } - /// Multiplies packed signed 16-bit integers in `a` and `b`, producing /// intermediate signed 32-bit integers. Horizontally add adjacent pairs /// of intermediate 32-bit integers. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_madd_epi16) - pub fn _mm256_madd_epi16(a: __m256i, b: __m256i) -> __m256i { - transmute(pmaddwd(a.as_i16x16(), b.as_i16x16())) + { + transmute(pmaddwd(a.as_i16x16(), b.as_i16x16())) + } } - /// Vertically multiplies each unsigned 8-bit integer from `a` with the /// corresponding signed 8-bit integer from `b`, producing intermediate /// signed 16-bit integers. Horizontally add adjacent pairs of intermediate /// signed 16-bit integers /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maddubs_epi16) - pub fn _mm256_maddubs_epi16(a: __m256i, b: __m256i) -> __m256i { - transmute(pmaddubsw(a.as_u8x32(), b.as_u8x32())) + { + transmute(pmaddubsw(a.as_u8x32(), b.as_u8x32())) + } } - /// Compares packed 16-bit integers in `a` and `b`, and returns the packed /// maximum values. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epi16) - pub fn _mm256_max_epi16(a: __m256i, b: __m256i) -> __m256i { - let a = a.as_i16x16(); - let b = b.as_i16x16(); - transmute(simd_select::<16, i16, _>(simd_gt(a, b), a, b)) + { + let a = a.as_i16x16(); + let b = b.as_i16x16(); + transmute(simd_select(simd_gt(a, b), a, b)) + } } - /// Compares packed 32-bit integers in `a` and `b`, and returns the packed /// maximum values. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epi32) - pub fn _mm256_max_epi32(a: __m256i, b: __m256i) -> __m256i { - let a = a.as_i32x8(); - let b = b.as_i32x8(); - transmute(simd_select::<8, i32, _>(simd_gt(a, b), a, b)) + { + let a = a.as_i32x8(); + let b = b.as_i32x8(); + transmute(simd_select(simd_gt(a, b), a, b)) + } } - /// Compares packed 8-bit integers in `a` and `b`, and returns the packed /// maximum values. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epi8) - pub fn _mm256_max_epi8(a: __m256i, b: __m256i) -> __m256i { - let a = a.as_i8x32(); - let b = b.as_i8x32(); - transmute(simd_select::<32, i8, _>(simd_gt(a, b), a, b)) + { + let a = a.as_i8x32(); + let b = b.as_i8x32(); + transmute(simd_select(simd_gt(a, b), a, b)) + } } - /// Compares packed unsigned 16-bit integers in `a` and `b`, and returns /// the packed maximum values. 
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epu16) - pub fn _mm256_max_epu16(a: __m256i, b: __m256i) -> __m256i { - let a = a.as_u16x16(); - let b = b.as_u16x16(); - transmute(simd_select::<16, _, u16>(simd_gt(a, b), a, b)) + { + let a = a.as_u16x16(); + let b = b.as_u16x16(); + transmute(simd_select(simd_gt(a, b), a, b)) + } } - /// Compares packed unsigned 32-bit integers in `a` and `b`, and returns /// the packed maximum values. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epu32) - pub fn _mm256_max_epu32(a: __m256i, b: __m256i) -> __m256i { - let a = a.as_u32x8(); - let b = b.as_u32x8(); - transmute(simd_select::<8, _, u32>(simd_gt(a, b), a, b)) + { + let a = a.as_u32x8(); + let b = b.as_u32x8(); + transmute(simd_select(simd_gt(a, b), a, b)) + } } - /// Compares packed unsigned 8-bit integers in `a` and `b`, and returns /// the packed maximum values. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epu8) - pub fn _mm256_max_epu8(a: __m256i, b: __m256i) -> __m256i { - let a = a.as_u8x32(); - let b = b.as_u8x32(); - transmute(simd_select::<32, _, u8>(simd_gt(a, b), a, b)) + { + let a = a.as_u8x32(); + let b = b.as_u8x32(); + transmute(simd_select(simd_gt(a, b), a, b)) + } } - /// Compares packed 16-bit integers in `a` and `b`, and returns the packed /// minimum values. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epi16) - pub fn _mm256_min_epi16(a: __m256i, b: __m256i) -> __m256i { - let a = a.as_i16x16(); - let b = b.as_i16x16(); - transmute(simd_select::<16, _, i16>(simd_lt(a, b), a, b)) + { + let a = a.as_i16x16(); + let b = b.as_i16x16(); + transmute(simd_select(simd_lt(a, b), a, b)) + } } - /// Compares packed 32-bit integers in `a` and `b`, and returns the packed /// minimum values. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epi32) - pub fn _mm256_min_epi32(a: __m256i, b: __m256i) -> __m256i { - let a = a.as_i32x8(); - let b = b.as_i32x8(); - transmute(simd_select::<8, i32, _>(simd_lt(a, b), a, b)) + { + let a = a.as_i32x8(); + let b = b.as_i32x8(); + transmute(simd_select(simd_lt(a, b), a, b)) + } } - /// Compares packed 8-bit integers in `a` and `b`, and returns the packed /// minimum values. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epi8) - pub fn _mm256_min_epi8(a: __m256i, b: __m256i) -> __m256i { - let a = a.as_i8x32(); - let b = b.as_i8x32(); - transmute(simd_select::<32, i8, _>(simd_lt(a, b), a, b)) + { + let a = a.as_i8x32(); + let b = b.as_i8x32(); + transmute(simd_select(simd_lt(a, b), a, b)) + } } - /// Compares packed unsigned 16-bit integers in `a` and `b`, and returns /// the packed minimum values. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epu16) - pub fn _mm256_min_epu16(a: __m256i, b: __m256i) -> __m256i { - let a = a.as_u16x16(); - let b = b.as_u16x16(); - transmute(simd_select::<16, _, u16>(simd_lt(a, b), a, b)) + { + let a = a.as_u16x16(); + let b = b.as_u16x16(); + transmute(simd_select(simd_lt(a, b), a, b)) + } } - /// Compares packed unsigned 32-bit integers in `a` and `b`, and returns /// the packed minimum values. 
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epu32) - pub fn _mm256_min_epu32(a: __m256i, b: __m256i) -> __m256i { - let a = a.as_u32x8(); - let b = b.as_u32x8(); - transmute(simd_select::<8, _, u32>(simd_lt(a, b), a, b)) + { + let a = a.as_u32x8(); + let b = b.as_u32x8(); + transmute(simd_select(simd_lt(a, b), a, b)) + } } - /// Compares packed unsigned 8-bit integers in `a` and `b`, and returns /// the packed minimum values. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epu8) - pub fn _mm256_min_epu8(a: __m256i, b: __m256i) -> __m256i { - let a = a.as_u8x32(); - let b = b.as_u8x32(); - transmute(simd_select::<32, _, u8>(simd_lt(a, b), a, b)) + { + let a = a.as_u8x32(); + let b = b.as_u8x32(); + transmute(simd_select(simd_lt(a, b), a, b)) + } } - /// Creates mask from the most significant bit of each 8-bit element in `a`, /// return the result. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movemask_epi8) - pub fn _mm256_movemask_epi8(a: __m256i) -> i32 { - let z = i8x32::from_fn(|_| 0); - let m: i8x32 = simd_lt(a.as_i8x32(), z); - let r = simd_bitmask_little!(31, m, u32); - r as i32 + { + let z = i8x32::ZERO(); + let m: i8x32 = simd_lt(a.as_i8x32(), z); + simd_bitmask_little!(31, m, u32) as i32 + } } - /// Computes the sum of absolute differences (SADs) of quadruplets of unsigned /// 8-bit integers in `a` compared to those in `b`, and stores the 16-bit /// results in dst. Eight SADs are performed for each 128-bit lane using one @@ -934,189 +874,212 @@ pub fn _mm256_movemask_epi8(a: __m256i) -> i32 { /// starting at the offset specified in `imm8`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mpsadbw_epu8) - pub fn _mm256_mpsadbw_epu8(a: __m256i, b: __m256i) -> __m256i { - transmute(mpsadbw(a.as_u8x32(), b.as_u8x32(), IMM8)) + static_assert_uimm_bits!(IMM8, 8); + { + transmute(mpsadbw(a.as_u8x32(), b.as_u8x32(), IMM8 as i8)) + } } - /// Multiplies the low 32-bit integers from each packed 64-bit element in /// `a` and `b` /// /// Returns the 64-bit results. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_epi32) - pub fn _mm256_mul_epi32(a: __m256i, b: __m256i) -> __m256i { - let a = simd_cast::<4, _, i64>(simd_cast::<4, _, i32>(a.as_i64x4())); - let b = simd_cast::<4, _, i64>(simd_cast::<4, _, i32>(b.as_i64x4())); - transmute(simd_mul(a, b)) + { + let a = simd_cast::<4, _, i64>(simd_cast::<4, _, i32>(a.as_i64x4())); + let b = simd_cast::<4, _, i64>(simd_cast::<4, _, i32>(b.as_i64x4())); + transmute(simd_mul(a, b)) + } } - /// Multiplies the low unsigned 32-bit integers from each packed 64-bit /// element in `a` and `b` /// /// Returns the unsigned 64-bit results. 
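///
/// An illustrative scalar model (not from the upstream docs): masking keeps the
/// low 32 bits of each 64-bit lane, and the product cannot overflow `u64`.
///
/// ```ignore
/// fn mul_epu32(a: u64, b: u64) -> u64 {
///     (a & 0xFFFF_FFFF) * (b & 0xFFFF_FFFF)
/// }
/// ```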
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_epu32) - pub fn _mm256_mul_epu32(a: __m256i, b: __m256i) -> __m256i { - let a = a.as_u64x4(); - let b = b.as_u64x4(); - let mask = u64x4::splat(u32::MAX.into()); - transmute(simd_mul(simd_and(a, mask), simd_and(b, mask))) + { + let a = a.as_u64x4(); + let b = b.as_u64x4(); + let mask = u64x4::splat(u32::MAX.into()); + transmute(simd_mul(simd_and(a, mask), simd_and(b, mask))) + } } - /// Multiplies the packed 16-bit integers in `a` and `b`, producing /// intermediate 32-bit integers and returning the high 16 bits of the /// intermediate integers. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhi_epi16) - pub fn _mm256_mulhi_epi16(a: __m256i, b: __m256i) -> __m256i { - let a = simd_cast::<16, _, i32>(a.as_i16x16()); - let b = simd_cast::<16, _, i32>(b.as_i16x16()); - let r = simd_shr(simd_mul(a, b), i32x16::splat(16)); - transmute(simd_cast::<16, i32, i16>(r)) + { + let a = simd_cast::<16, _, i32>(a.as_i16x16()); + let b = simd_cast::<16, _, i32>(b.as_i16x16()); + let r = simd_shr(simd_mul(a, b), i32x16::splat(16)); + transmute(simd_cast::<16, i32, i16>(r)) + } } - /// Multiplies the packed unsigned 16-bit integers in `a` and `b`, producing /// intermediate 32-bit integers and returning the high 16 bits of the /// intermediate integers. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhi_epu16) - pub fn _mm256_mulhi_epu16(a: __m256i, b: __m256i) -> __m256i { - let a = simd_cast::<16, _, u32>(a.as_u16x16()); - let b = simd_cast::<16, _, u32>(b.as_u16x16()); - let r = simd_shr(simd_mul(a, b), u32x16::splat(16)); - transmute(simd_cast::<16, u32, u16>(r)) + { + let a = simd_cast::<16, _, u32>(a.as_u16x16()); + let b = simd_cast::<16, _, u32>(b.as_u16x16()); + let r = simd_shr(simd_mul(a, b), u32x16::splat(16)); + transmute(simd_cast::<16, u32, u16>(r)) + } } - /// Multiplies the packed 16-bit integers in `a` and `b`, producing /// intermediate 32-bit integers, and returns the low 16 bits of the /// intermediate integers /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mullo_epi16) - pub fn _mm256_mullo_epi16(a: __m256i, b: __m256i) -> __m256i { - transmute(simd_mul(a.as_i16x16(), b.as_i16x16())) + { + transmute(simd_mul(a.as_i16x16(), b.as_i16x16())) + } } - /// Multiplies the packed 32-bit integers in `a` and `b`, producing /// intermediate 64-bit integers, and returns the low 32 bits of the /// intermediate integers /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mullo_epi32) - pub fn _mm256_mullo_epi32(a: __m256i, b: __m256i) -> __m256i { - transmute(simd_mul(a.as_i32x8(), b.as_i32x8())) + { + transmute(simd_mul(a.as_i32x8(), b.as_i32x8())) + } } - /// Multiplies packed 16-bit integers in `a` and `b`, producing /// intermediate signed 32-bit integers. Truncate each intermediate /// integer to the 18 most significant bits, round by adding 1, and /// return bits `[16:1]`. 
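///
/// An illustrative scalar model (not from the upstream docs):
///
/// ```ignore
/// fn mulhrs_epi16(a: i16, b: i16) -> i16 {
///     let t = (a as i32) * (b as i32); // 32-bit intermediate product
///     (((t >> 14) + 1) >> 1) as i16    // round, then keep bits [16:1]
/// }
/// ```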
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhrs_epi16) - pub fn _mm256_mulhrs_epi16(a: __m256i, b: __m256i) -> __m256i { - transmute(pmulhrsw(a.as_i16x16(), b.as_i16x16())) + { + transmute(pmulhrsw(a.as_i16x16(), b.as_i16x16())) + } } - /// Computes the bitwise OR of 256 bits (representing integer data) in `a` /// and `b` /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_or_si256) - pub fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i { - transmute(simd_or(a.as_i32x8(), b.as_i32x8())) + { + transmute(simd_or(a.as_i32x8(), b.as_i32x8())) + } } - /// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers /// using signed saturation /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi16) - pub fn _mm256_packs_epi16(a: __m256i, b: __m256i) -> __m256i { - transmute(packsswb(a.as_i16x16(), b.as_i16x16())) + { + transmute(packsswb(a.as_i16x16(), b.as_i16x16())) + } } - /// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers /// using signed saturation /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi32) - pub fn _mm256_packs_epi32(a: __m256i, b: __m256i) -> __m256i { - transmute(packssdw(a.as_i32x8(), b.as_i32x8())) + { + transmute(packssdw(a.as_i32x8(), b.as_i32x8())) + } } - /// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers /// using unsigned saturation /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi16) - pub fn _mm256_packus_epi16(a: __m256i, b: __m256i) -> __m256i { - transmute(packuswb(a.as_i16x16(), b.as_i16x16())) + { + transmute(packuswb(a.as_i16x16(), b.as_i16x16())) + } } - /// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers /// using unsigned saturation /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi32) - pub fn _mm256_packus_epi32(a: __m256i, b: __m256i) -> __m256i { - transmute(packusdw(a.as_i32x8(), b.as_i32x8())) + { + transmute(packusdw(a.as_i32x8(), b.as_i32x8())) + } } - /// Permutes packed 32-bit integers from `a` according to the content of `b`. /// /// The last 3 bits of each integer of `b` are used as addresses into the 8 /// integers of `a`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar8x32_epi32) - pub fn _mm256_permutevar8x32_epi32(a: __m256i, b: __m256i) -> __m256i { - transmute(permd(a.as_u32x8(), b.as_u32x8())) + { + transmute(permd(a.as_u32x8(), b.as_u32x8())) + } } - /// Permutes 64-bit integers from `a` using control mask `imm8`. 
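///
/// An illustrative scalar model (not from the upstream docs): each pair of bits
/// in `imm8` selects the source lane for one output lane.
///
/// ```ignore
/// fn permute4x64(a: [i64; 4], imm8: u8) -> [i64; 4] {
///     core::array::from_fn(|i| a[((imm8 >> (2 * i)) & 0b11) as usize])
/// }
/// ```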
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute4x64_epi64) - pub fn _mm256_permute4x64_epi64(a: __m256i) -> __m256i { - let zero = i64x4::from_fn(|_| 0); - let r: i64x4 = simd_shuffle( - a.as_i64x4(), - zero, - [ - IMM8 as u32 & 0b11, - (IMM8 as u32 >> 2) & 0b11, - (IMM8 as u32 >> 4) & 0b11, - (IMM8 as u32 >> 6) & 0b11, - ], - ); - transmute(r) + static_assert_uimm_bits!(IMM8, 8); + { + let zero = i64x4::ZERO(); + let r: i64x4 = simd_shuffle( + a.as_i64x4(), + zero, + [ + IMM8 as u32 & 0b11, + (IMM8 as u32 >> 2) & 0b11, + (IMM8 as u32 >> 4) & 0b11, + (IMM8 as u32 >> 6) & 0b11, + ], + ); + transmute(r) + } } - /// Shuffles 128-bits of integer data selected by `imm8` from `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2x128_si256) - pub fn _mm256_permute2x128_si256(a: __m256i, b: __m256i) -> __m256i { - transmute(vperm2i128(a.as_i64x4(), b.as_i64x4(), IMM8 as i8)) + static_assert_uimm_bits!(IMM8, 8); + { + transmute(vperm2i128(a.as_i64x4(), b.as_i64x4(), IMM8 as i8)) + } } - +/// Shuffles 64-bit floating-point elements in `a` across lanes using the +/// control in `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute4x64_pd) +// pub fn _mm256_permute4x64_pd(a: __m256d) -> __m256d { +// static_assert_uimm_bits!(IMM8, 8); +// { +// transmute(simd_shuffle( +// a, _mm256_undefined_pd(), [IMM8 as u32 & 0b11, (IMM8 as u32 >> 2) & 0b11, +// (IMM8 as u32 >> 4) & 0b11, (IMM8 as u32 >> 6) & 0b11,], +// )) +// } +// } +/// Shuffles eight 32-bit floating-point elements in `a` across lanes using +/// the corresponding 32-bit integer index in `idx`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar8x32_ps) +// pub fn _mm256_permutevar8x32_ps(a: __m256, idx: __m256i) -> __m256 { +// { permps(a, idx.as_i32x8()) } +// } /// Computes the absolute differences of packed unsigned 8-bit integers in `a` /// and `b`, then horizontally sum each consecutive 8 differences to /// produce four unsigned 16-bit integers, and pack these unsigned 16-bit /// integers in the low 16 bits of the 64-bit return value /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sad_epu8) - pub fn _mm256_sad_epu8(a: __m256i, b: __m256i) -> __m256i { - transmute(psadbw(a.as_u8x32(), b.as_u8x32())) + { + transmute(psadbw(a.as_u8x32(), b.as_u8x32())) + } } - /// Shuffles bytes from `a` according to the content of `b`. /// /// For each of the 128-bit low and high halves of the vectors, the last @@ -1133,6 +1096,8 @@ pub fn _mm256_sad_epu8(a: __m256i, b: __m256i) -> __m256i { /// fn mm256_shuffle_epi8(a: [u8; 32], b: [u8; 32]) -> [u8; 32] { /// let mut r = [0; 32]; /// for i in 0..16 { +/// // if the most significant bit of b is set, +/// // then the destination byte is set to 0. 
/// if b[i] & 0x80 == 0u8 { /// r[i] = a[(b[i] % 16) as usize]; /// } @@ -1145,208 +1110,215 @@ pub fn _mm256_sad_epu8(a: __m256i, b: __m256i) -> __m256i { /// ``` /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_epi8) - pub fn _mm256_shuffle_epi8(a: __m256i, b: __m256i) -> __m256i { - transmute(pshufb(a.as_u8x32(), b.as_u8x32())) + { + transmute(pshufb(a.as_u8x32(), b.as_u8x32())) + } } - /// Shuffles 32-bit integers in 128-bit lanes of `a` using the control in /// `imm8`. +/// +/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_epi32) - pub fn _mm256_shuffle_epi32(a: __m256i) -> __m256i { - let r: i32x8 = simd_shuffle( - a.as_i32x8(), - a.as_i32x8(), - [ - MASK as u32 & 0b11, - (MASK as u32 >> 2) & 0b11, - (MASK as u32 >> 4) & 0b11, - (MASK as u32 >> 6) & 0b11, - (MASK as u32 & 0b11) + 4, - ((MASK as u32 >> 2) & 0b11) + 4, - ((MASK as u32 >> 4) & 0b11) + 4, - ((MASK as u32 >> 6) & 0b11) + 4, - ], - ); - transmute(r) + static_assert_uimm_bits!(MASK, 8); + { + let r: i32x8 = simd_shuffle( + a.as_i32x8(), + a.as_i32x8(), + [ + MASK as u32 & 0b11, + (MASK as u32 >> 2) & 0b11, + (MASK as u32 >> 4) & 0b11, + (MASK as u32 >> 6) & 0b11, + (MASK as u32 & 0b11) + 4, + ((MASK as u32 >> 2) & 0b11) + 4, + ((MASK as u32 >> 4) & 0b11) + 4, + ((MASK as u32 >> 6) & 0b11) + 4, + ], + ); + transmute(r) + } } - /// Shuffles 16-bit integers in the high 64 bits of 128-bit lanes of `a` using /// the control in `imm8`. The low 64 bits of 128-bit lanes of `a` are copied /// to the output. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shufflehi_epi16) - pub fn _mm256_shufflehi_epi16(a: __m256i) -> __m256i { - let a = a.as_i16x16(); - let r: i16x16 = simd_shuffle( - a, - a, - [ - 0, - 1, - 2, - 3, - 4 + (IMM8 as u32 & 0b11), - 4 + ((IMM8 as u32 >> 2) & 0b11), - 4 + ((IMM8 as u32 >> 4) & 0b11), - 4 + ((IMM8 as u32 >> 6) & 0b11), - 8, - 9, - 10, - 11, - 12 + (IMM8 as u32 & 0b11), - 12 + ((IMM8 as u32 >> 2) & 0b11), - 12 + ((IMM8 as u32 >> 4) & 0b11), - 12 + ((IMM8 as u32 >> 6) & 0b11), - ], - ); - transmute(r) + static_assert_uimm_bits!(IMM8, 8); + { + let a = a.as_i16x16(); + let r: i16x16 = simd_shuffle( + a, + a, + [ + 0, + 1, + 2, + 3, + 4 + (IMM8 as u32 & 0b11), + 4 + ((IMM8 as u32 >> 2) & 0b11), + 4 + ((IMM8 as u32 >> 4) & 0b11), + 4 + ((IMM8 as u32 >> 6) & 0b11), + 8, + 9, + 10, + 11, + 12 + (IMM8 as u32 & 0b11), + 12 + ((IMM8 as u32 >> 2) & 0b11), + 12 + ((IMM8 as u32 >> 4) & 0b11), + 12 + ((IMM8 as u32 >> 6) & 0b11), + ], + ); + transmute(r) + } } - /// Shuffles 16-bit integers in the low 64 bits of 128-bit lanes of `a` using /// the control in `imm8`. The high 64 bits of 128-bit lanes of `a` are copied /// to the output. 
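///
/// An illustrative scalar model (not from the upstream docs): within each
/// 128-bit lane, the four low words are permuted by `imm8` and the four high
/// words are passed through unchanged.
///
/// ```ignore
/// fn shufflelo_epi16(a: [i16; 16], imm8: u8) -> [i16; 16] {
///     core::array::from_fn(|i| {
///         let lane = i / 8 * 8; // 0 for the low 128-bit lane, 8 for the high one
///         if i % 8 < 4 {
///             a[lane + ((imm8 >> (2 * (i % 8))) & 0b11) as usize]
///         } else {
///             a[i]
///         }
///     })
/// }
/// ```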
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shufflelo_epi16) - pub fn _mm256_shufflelo_epi16(a: __m256i) -> __m256i { - let a = a.as_i16x16(); - let r: i16x16 = simd_shuffle( - a, - a, - [ - 0 + (IMM8 as u32 & 0b11), - 0 + ((IMM8 as u32 >> 2) & 0b11), - 0 + ((IMM8 as u32 >> 4) & 0b11), - 0 + ((IMM8 as u32 >> 6) & 0b11), - 4, - 5, - 6, - 7, - 8 + (IMM8 as u32 & 0b11), - 8 + ((IMM8 as u32 >> 2) & 0b11), - 8 + ((IMM8 as u32 >> 4) & 0b11), - 8 + ((IMM8 as u32 >> 6) & 0b11), - 12, - 13, - 14, - 15, - ], - ); - transmute(r) + static_assert_uimm_bits!(IMM8, 8); + { + let a = a.as_i16x16(); + let r: i16x16 = simd_shuffle( + a, + a, + [ + 0 + (IMM8 as u32 & 0b11), + 0 + ((IMM8 as u32 >> 2) & 0b11), + 0 + ((IMM8 as u32 >> 4) & 0b11), + 0 + ((IMM8 as u32 >> 6) & 0b11), + 4, + 5, + 6, + 7, + 8 + (IMM8 as u32 & 0b11), + 8 + ((IMM8 as u32 >> 2) & 0b11), + 8 + ((IMM8 as u32 >> 4) & 0b11), + 8 + ((IMM8 as u32 >> 6) & 0b11), + 12, + 13, + 14, + 15, + ], + ); + transmute(r) + } } - /// Negates packed 16-bit integers in `a` when the corresponding signed /// 16-bit integer in `b` is negative, and returns the results. /// Results are zeroed out when the corresponding element in `b` is zero. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sign_epi16) - pub fn _mm256_sign_epi16(a: __m256i, b: __m256i) -> __m256i { - transmute(psignw(a.as_i16x16(), b.as_i16x16())) + { + transmute(psignw(a.as_i16x16(), b.as_i16x16())) + } } - /// Negates packed 32-bit integers in `a` when the corresponding signed /// 32-bit integer in `b` is negative, and returns the results. /// Results are zeroed out when the corresponding element in `b` is zero. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sign_epi32) - pub fn _mm256_sign_epi32(a: __m256i, b: __m256i) -> __m256i { - transmute(psignd(a.as_i32x8(), b.as_i32x8())) + { + transmute(psignd(a.as_i32x8(), b.as_i32x8())) + } } - /// Negates packed 8-bit integers in `a` when the corresponding signed /// 8-bit integer in `b` is negative, and returns the results. /// Results are zeroed out when the corresponding element in `b` is zero. 
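///
/// An illustrative scalar model (not from the upstream docs):
///
/// ```ignore
/// fn sign_epi8(a: i8, b: i8) -> i8 {
///     if b < 0 { a.wrapping_neg() } else if b == 0 { 0 } else { a }
/// }
/// ```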
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sign_epi8) - pub fn _mm256_sign_epi8(a: __m256i, b: __m256i) -> __m256i { - transmute(psignb(a.as_i8x32(), b.as_i8x32())) + { + transmute(psignb(a.as_i8x32(), b.as_i8x32())) + } } - /// Shifts packed 16-bit integers in `a` left by `count` while /// shifting in zeros, and returns the result /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sll_epi16) - pub fn _mm256_sll_epi16(a: __m256i, count: __m128i) -> __m256i { - transmute(psllw(a.as_i16x16(), count.as_i16x8())) + { + transmute(psllw(a.as_i16x16(), count.as_i16x8())) + } } - /// Shifts packed 32-bit integers in `a` left by `count` while /// shifting in zeros, and returns the result /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sll_epi32) - pub fn _mm256_sll_epi32(a: __m256i, count: __m128i) -> __m256i { - transmute(pslld(a.as_i32x8(), count.as_i32x4())) + { + transmute(pslld(a.as_i32x8(), count.as_i32x4())) + } } - /// Shifts packed 64-bit integers in `a` left by `count` while /// shifting in zeros, and returns the result /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sll_epi64) - pub fn _mm256_sll_epi64(a: __m256i, count: __m128i) -> __m256i { - transmute(psllq(a.as_i64x4(), count.as_i64x2())) + { + transmute(psllq(a.as_i64x4(), count.as_i64x2())) + } } - /// Shifts packed 16-bit integers in `a` left by `IMM8` while /// shifting in zeros, return the results; /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_epi16) - pub fn _mm256_slli_epi16(a: __m256i) -> __m256i { - if IMM8 >= 16 { - _mm256_setzero_si256() - } else { - transmute(simd_shl(a.as_u16x16(), u16x16::splat(IMM8 as u16))) + static_assert_uimm_bits!(IMM8, 8); + { + if IMM8 >= 16 { + _mm256_setzero_si256() + } else { + transmute(simd_shl(a.as_u16x16(), u16x16::splat(IMM8 as u16))) + } } } - /// Shifts packed 32-bit integers in `a` left by `IMM8` while /// shifting in zeros, return the results; /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_epi32) - pub fn _mm256_slli_epi32(a: __m256i) -> __m256i { - if IMM8 >= 32 { - _mm256_setzero_si256() - } else { - transmute(simd_shl(a.as_u32x8(), u32x8::splat(IMM8 as u32))) + { + static_assert_uimm_bits!(IMM8, 8); + if IMM8 >= 32 { + _mm256_setzero_si256() + } else { + transmute(simd_shl(a.as_u32x8(), u32x8::splat(IMM8 as u32))) + } } } - /// Shifts packed 64-bit integers in `a` left by `IMM8` while /// shifting in zeros, return the results; /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_epi64) - pub fn _mm256_slli_epi64(a: __m256i) -> __m256i { - if IMM8 >= 64 { - _mm256_setzero_si256() - } else { - transmute(simd_shl(a.as_u64x4(), u64x4::splat(IMM8 as u64))) + { + static_assert_uimm_bits!(IMM8, 8); + if IMM8 >= 64 { + _mm256_setzero_si256() + } else { + transmute(simd_shl(a.as_u64x4(), u64x4::splat(IMM8 as u64))) + } } } - /// Shifts 128-bit lanes in `a` left by `imm8` bytes while shifting in zeros. 
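///
/// An illustrative byte-level model (not from the upstream docs; `imm8` stands
/// for the const generic, and each 16-byte lane is shifted independently):
///
/// ```ignore
/// fn slli_si256(a: [u8; 32], imm8: usize) -> [u8; 32] {
///     core::array::from_fn(|i| if i % 16 >= imm8 { a[i - imm8] } else { 0 })
/// }
/// ```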
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_si256) - pub fn _mm256_slli_si256(a: __m256i) -> __m256i { + static_assert_uimm_bits!(IMM8, 8); _mm256_bslli_epi128::(a) } - /// Shifts 128-bit lanes in `a` left by `imm8` bytes while shifting in zeros. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_bslli_epi128) - pub fn _mm256_bslli_epi128(a: __m256i) -> __m256i { + static_assert_uimm_bits!(IMM8, 8); const fn mask(shift: i32, i: u32) -> u32 { let shift = shift as u32 & 0xff; if shift > 15 || i % 16 < shift { @@ -1355,500 +1327,516 @@ pub fn _mm256_bslli_epi128(a: __m256i) -> __m256i { 32 + (i - shift) } } - let a = a.as_i8x32(); - let r: i8x32 = simd_shuffle( - i8x32::from_fn(|_| 0), - a, - [ - mask(IMM8, 0) as u32, - mask(IMM8, 1) as u32, - mask(IMM8, 2) as u32, - mask(IMM8, 3) as u32, - mask(IMM8, 4) as u32, - mask(IMM8, 5) as u32, - mask(IMM8, 6) as u32, - mask(IMM8, 7) as u32, - mask(IMM8, 8) as u32, - mask(IMM8, 9) as u32, - mask(IMM8, 10) as u32, - mask(IMM8, 11) as u32, - mask(IMM8, 12) as u32, - mask(IMM8, 13) as u32, - mask(IMM8, 14) as u32, - mask(IMM8, 15) as u32, - mask(IMM8, 16) as u32, - mask(IMM8, 17) as u32, - mask(IMM8, 18) as u32, - mask(IMM8, 19) as u32, - mask(IMM8, 20) as u32, - mask(IMM8, 21) as u32, - mask(IMM8, 22) as u32, - mask(IMM8, 23) as u32, - mask(IMM8, 24) as u32, - mask(IMM8, 25) as u32, - mask(IMM8, 26) as u32, - mask(IMM8, 27) as u32, - mask(IMM8, 28) as u32, - mask(IMM8, 29) as u32, - mask(IMM8, 30) as u32, - mask(IMM8, 31) as u32, - ], - ); - transmute(r) + { + let a = a.as_i8x32(); + let r: i8x32 = simd_shuffle( + i8x32::ZERO(), + a, + [ + mask(IMM8, 0), + mask(IMM8, 1), + mask(IMM8, 2), + mask(IMM8, 3), + mask(IMM8, 4), + mask(IMM8, 5), + mask(IMM8, 6), + mask(IMM8, 7), + mask(IMM8, 8), + mask(IMM8, 9), + mask(IMM8, 10), + mask(IMM8, 11), + mask(IMM8, 12), + mask(IMM8, 13), + mask(IMM8, 14), + mask(IMM8, 15), + mask(IMM8, 16), + mask(IMM8, 17), + mask(IMM8, 18), + mask(IMM8, 19), + mask(IMM8, 20), + mask(IMM8, 21), + mask(IMM8, 22), + mask(IMM8, 23), + mask(IMM8, 24), + mask(IMM8, 25), + mask(IMM8, 26), + mask(IMM8, 27), + mask(IMM8, 28), + mask(IMM8, 29), + mask(IMM8, 30), + mask(IMM8, 31), + ], + ); + transmute(r) + } } - /// Shifts packed 32-bit integers in `a` left by the amount /// specified by the corresponding element in `count` while /// shifting in zeros, and returns the result. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sllv_epi32) - pub fn _mm_sllv_epi32(a: __m128i, count: __m128i) -> __m128i { - transmute(psllvd(a.as_i32x4(), count.as_i32x4())) + { + transmute(psllvd(a.as_i32x4(), count.as_i32x4())) + } } - /// Shifts packed 32-bit integers in `a` left by the amount /// specified by the corresponding element in `count` while /// shifting in zeros, and returns the result. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sllv_epi32) - pub fn _mm256_sllv_epi32(a: __m256i, count: __m256i) -> __m256i { - transmute(psllvd256(a.as_i32x8(), count.as_i32x8())) + { + transmute(psllvd256(a.as_i32x8(), count.as_i32x8())) + } } - /// Shifts packed 64-bit integers in `a` left by the amount /// specified by the corresponding element in `count` while /// shifting in zeros, and returns the result. 
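///
/// An illustrative scalar model shared by the variable-shift (`sllv`/`srlv`)
/// family (not from the upstream docs): out-of-range counts produce zero rather
/// than wrapping.
///
/// ```ignore
/// fn sllv_epi64(a: u64, count: u64) -> u64 {
///     if count < 64 { a << count } else { 0 }
/// }
/// ```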
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sllv_epi64) - pub fn _mm_sllv_epi64(a: __m128i, count: __m128i) -> __m128i { - transmute(psllvq(a.as_i64x2(), count.as_i64x2())) + { + transmute(psllvq(a.as_i64x2(), count.as_i64x2())) + } } - /// Shifts packed 64-bit integers in `a` left by the amount /// specified by the corresponding element in `count` while /// shifting in zeros, and returns the result. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sllv_epi64) - pub fn _mm256_sllv_epi64(a: __m256i, count: __m256i) -> __m256i { - transmute(psllvq256(a.as_i64x4(), count.as_i64x4())) + { + transmute(psllvq256(a.as_i64x4(), count.as_i64x4())) + } } - /// Shifts packed 16-bit integers in `a` right by `count` while /// shifting in sign bits. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sra_epi16) - pub fn _mm256_sra_epi16(a: __m256i, count: __m128i) -> __m256i { - transmute(psraw(a.as_i16x16(), count.as_i16x8())) + { + transmute(psraw(a.as_i16x16(), count.as_i16x8())) + } } - /// Shifts packed 32-bit integers in `a` right by `count` while /// shifting in sign bits. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sra_epi32) - pub fn _mm256_sra_epi32(a: __m256i, count: __m128i) -> __m256i { - transmute(psrad(a.as_i32x8(), count.as_i32x4())) + { + transmute(psrad(a.as_i32x8(), count.as_i32x4())) + } } - /// Shifts packed 16-bit integers in `a` right by `IMM8` while /// shifting in sign bits. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srai_epi16) - pub fn _mm256_srai_epi16(a: __m256i) -> __m256i { - transmute(simd_shr(a.as_i16x16(), i16x16::splat(IMM8.min(15) as i16))) + static_assert_uimm_bits!(IMM8, 8); + { + transmute(simd_shr(a.as_i16x16(), i16x16::splat(IMM8.min(15) as i16))) + } } - /// Shifts packed 32-bit integers in `a` right by `IMM8` while /// shifting in sign bits. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srai_epi32) - pub fn _mm256_srai_epi32(a: __m256i) -> __m256i { - transmute(simd_shr(a.as_i32x8(), i32x8::splat(IMM8.min(31)))) + static_assert_uimm_bits!(IMM8, 8); + { + transmute(simd_shr(a.as_i32x8(), i32x8::splat(IMM8.min(31)))) + } } - /// Shifts packed 32-bit integers in `a` right by the amount specified by the /// corresponding element in `count` while shifting in sign bits. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srav_epi32) - pub fn _mm_srav_epi32(a: __m128i, count: __m128i) -> __m128i { - transmute(psravd(a.as_i32x4(), count.as_i32x4())) + { + transmute(psravd(a.as_i32x4(), count.as_i32x4())) + } } - /// Shifts packed 32-bit integers in `a` right by the amount specified by the /// corresponding element in `count` while shifting in sign bits. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srav_epi32) - pub fn _mm256_srav_epi32(a: __m256i, count: __m256i) -> __m256i { - transmute(psravd256(a.as_i32x8(), count.as_i32x8())) + { + transmute(psravd256(a.as_i32x8(), count.as_i32x8())) + } } - /// Shifts 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros. 
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_si256) - pub fn _mm256_srli_si256(a: __m256i) -> __m256i { + static_assert_uimm_bits!(IMM8, 8); _mm256_bsrli_epi128::(a) } - /// Shifts 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_bsrli_epi128) - pub fn _mm256_bsrli_epi128(a: __m256i) -> __m256i { + static_assert_uimm_bits!(IMM8, 8); const fn mask(shift: i32, i: u32) -> u32 { let shift = shift as u32 & 0xff; if shift > 15 || (15 - (i % 16)) < shift { - 0 as u32 + 0 } else { - (32 + (i + shift)) as u32 + 32 + (i + shift) } } - - let a = a.as_i8x32(); - let r: i8x32 = simd_shuffle( - i8x32::from_fn(|_| 0), - a, - [ - mask(IMM8, 0), - mask(IMM8, 1), - mask(IMM8, 2), - mask(IMM8, 3), - mask(IMM8, 4), - mask(IMM8, 5), - mask(IMM8, 6), - mask(IMM8, 7), - mask(IMM8, 8), - mask(IMM8, 9), - mask(IMM8, 10), - mask(IMM8, 11), - mask(IMM8, 12), - mask(IMM8, 13), - mask(IMM8, 14), - mask(IMM8, 15), - mask(IMM8, 16), - mask(IMM8, 17), - mask(IMM8, 18), - mask(IMM8, 19), - mask(IMM8, 20), - mask(IMM8, 21), - mask(IMM8, 22), - mask(IMM8, 23), - mask(IMM8, 24), - mask(IMM8, 25), - mask(IMM8, 26), - mask(IMM8, 27), - mask(IMM8, 28), - mask(IMM8, 29), - mask(IMM8, 30), - mask(IMM8, 31), - ], - ); - - transmute(r) + { + let a = a.as_i8x32(); + let r: i8x32 = simd_shuffle( + i8x32::ZERO(), + a, + [ + mask(IMM8, 0), + mask(IMM8, 1), + mask(IMM8, 2), + mask(IMM8, 3), + mask(IMM8, 4), + mask(IMM8, 5), + mask(IMM8, 6), + mask(IMM8, 7), + mask(IMM8, 8), + mask(IMM8, 9), + mask(IMM8, 10), + mask(IMM8, 11), + mask(IMM8, 12), + mask(IMM8, 13), + mask(IMM8, 14), + mask(IMM8, 15), + mask(IMM8, 16), + mask(IMM8, 17), + mask(IMM8, 18), + mask(IMM8, 19), + mask(IMM8, 20), + mask(IMM8, 21), + mask(IMM8, 22), + mask(IMM8, 23), + mask(IMM8, 24), + mask(IMM8, 25), + mask(IMM8, 26), + mask(IMM8, 27), + mask(IMM8, 28), + mask(IMM8, 29), + mask(IMM8, 30), + mask(IMM8, 31), + ], + ); + transmute(r) + } } - /// Shifts packed 16-bit integers in `a` right by `count` while shifting in /// zeros. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srl_epi16) - pub fn _mm256_srl_epi16(a: __m256i, count: __m128i) -> __m256i { - transmute(psrlw(a.as_i16x16(), count.as_i16x8())) + { + transmute(psrlw(a.as_i16x16(), count.as_i16x8())) + } } - /// Shifts packed 32-bit integers in `a` right by `count` while shifting in /// zeros. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srl_epi32) - pub fn _mm256_srl_epi32(a: __m256i, count: __m128i) -> __m256i { - transmute(psrld(a.as_i32x8(), count.as_i32x4())) + { + transmute(psrld(a.as_i32x8(), count.as_i32x4())) + } } - /// Shifts packed 64-bit integers in `a` right by `count` while shifting in /// zeros. 
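For the right-shift counterpart above, the `mask` helper sends destination bytes that would read past the end of a 16-byte lane to the zero operand and maps the rest to `32 + (i + shift)`, i.e. into the second shuffle operand `a`. A standalone check of a few values for `IMM8 = 3`, with the helper restated so the snippet compiles on its own:

```rust
// Same index rule as the right-shift `mask` in the model above.
const fn mask(shift: i32, i: u32) -> u32 {
    let shift = shift as u32 & 0xff;
    if shift > 15 || (15 - (i % 16)) < shift {
        0
    } else {
        32 + (i + shift)
    }
}

fn main() {
    // Byte 0 of the lane comes from a[3] (index 32 + 3 selects the second
    // operand); bytes 13..=15 fall off the end of the lane and read zeros.
    assert_eq!(mask(3, 0), 35);
    assert_eq!(mask(3, 12), 47);
    assert_eq!(mask(3, 13), 0);
    assert_eq!(mask(3, 15), 0);
}
```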
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srl_epi64) - pub fn _mm256_srl_epi64(a: __m256i, count: __m128i) -> __m256i { - transmute(psrlq(a.as_i64x4(), count.as_i64x2())) + { + transmute(psrlq(a.as_i64x4(), count.as_i64x2())) + } } - /// Shifts packed 16-bit integers in `a` right by `IMM8` while shifting in /// zeros /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_epi16) - pub fn _mm256_srli_epi16(a: __m256i) -> __m256i { - if IMM8 >= 16 { - _mm256_setzero_si256() - } else { - transmute(simd_shr(a.as_u16x16(), u16x16::splat(IMM8 as u16))) + static_assert_uimm_bits!(IMM8, 8); + { + if IMM8 >= 16 { + _mm256_setzero_si256() + } else { + transmute(simd_shr(a.as_u16x16(), u16x16::splat(IMM8 as u16))) + } } } - /// Shifts packed 32-bit integers in `a` right by `IMM8` while shifting in /// zeros /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_epi32) - pub fn _mm256_srli_epi32(a: __m256i) -> __m256i { - if IMM8 >= 32 { - _mm256_setzero_si256() - } else { - transmute(simd_shr(a.as_u32x8(), u32x8::splat(IMM8 as u32))) + static_assert_uimm_bits!(IMM8, 8); + { + if IMM8 >= 32 { + _mm256_setzero_si256() + } else { + transmute(simd_shr(a.as_u32x8(), u32x8::splat(IMM8 as u32))) + } } } - /// Shifts packed 64-bit integers in `a` right by `IMM8` while shifting in /// zeros /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_epi64) - pub fn _mm256_srli_epi64(a: __m256i) -> __m256i { - if IMM8 >= 64 { - _mm256_setzero_si256() - } else { - transmute(simd_shr(a.as_u64x4(), u64x4::splat(IMM8 as u64))) + static_assert_uimm_bits!(IMM8, 8); + { + if IMM8 >= 64 { + _mm256_setzero_si256() + } else { + transmute(simd_shr(a.as_u64x4(), u64x4::splat(IMM8 as u64))) + } } } - /// Shifts packed 32-bit integers in `a` right by the amount specified by /// the corresponding element in `count` while shifting in zeros, /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srlv_epi32) - pub fn _mm_srlv_epi32(a: __m128i, count: __m128i) -> __m128i { - transmute(psrlvd(a.as_i32x4(), count.as_i32x4())) + { + transmute(psrlvd(a.as_i32x4(), count.as_i32x4())) + } } - /// Shifts packed 32-bit integers in `a` right by the amount specified by /// the corresponding element in `count` while shifting in zeros, /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srlv_epi32) - pub fn _mm256_srlv_epi32(a: __m256i, count: __m256i) -> __m256i { - transmute(psrlvd256(a.as_i32x8(), count.as_i32x8())) + { + transmute(psrlvd256(a.as_i32x8(), count.as_i32x8())) + } } - /// Shifts packed 64-bit integers in `a` right by the amount specified by /// the corresponding element in `count` while shifting in zeros, /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srlv_epi64) - pub fn _mm_srlv_epi64(a: __m128i, count: __m128i) -> __m128i { - transmute(psrlvq(a.as_i64x2(), count.as_i64x2())) + { + transmute(psrlvq(a.as_i64x2(), count.as_i64x2())) + } } - /// Shifts packed 64-bit integers in `a` right by the amount specified by /// the corresponding element in `count` while shifting in zeros, /// /// [Intel's 
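The `IMM8 >= 16` / `>= 32` / `>= 64` guards above exist because a logical shift by the full lane width (or more) is not a valid Rust shift; the model returns an all-zero vector explicitly in that case. A scalar sketch of the same pattern (`srli_u16` is an illustrative name):

```rust
// Return zero for over-long counts instead of performing an invalid shift.
fn srli_u16(x: u16, imm8: u32) -> u16 {
    if imm8 >= 16 { 0 } else { x >> imm8 }
}

fn main() {
    assert_eq!(srli_u16(0xFFFF, 3), 0x1FFF);
    assert_eq!(srli_u16(0xFFFF, 16), 0); // would overflow the shift otherwise
    // `checked_shr` expresses the same guard in one call:
    assert_eq!(0xFFFFu16.checked_shr(16).unwrap_or(0), 0);
}
```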
documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srlv_epi64) - pub fn _mm256_srlv_epi64(a: __m256i, count: __m256i) -> __m256i { - transmute(psrlvq256(a.as_i64x4(), count.as_i64x4())) + { + transmute(psrlvq256(a.as_i64x4(), count.as_i64x4())) + } } - /// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a` /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi16) - pub fn _mm256_sub_epi16(a: __m256i, b: __m256i) -> __m256i { - transmute(simd_sub(a.as_i16x16(), b.as_i16x16())) + { + transmute(simd_sub(a.as_i16x16(), b.as_i16x16())) + } } - /// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a` /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi32) - pub fn _mm256_sub_epi32(a: __m256i, b: __m256i) -> __m256i { - transmute(simd_sub(a.as_i32x8(), b.as_i32x8())) + { + transmute(simd_sub(a.as_i32x8(), b.as_i32x8())) + } } - /// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a` /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi64) - pub fn _mm256_sub_epi64(a: __m256i, b: __m256i) -> __m256i { - transmute(simd_sub(a.as_i64x4(), b.as_i64x4())) + { + transmute(simd_sub(a.as_i64x4(), b.as_i64x4())) + } } - /// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a` /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi8) - pub fn _mm256_sub_epi8(a: __m256i, b: __m256i) -> __m256i { - transmute(simd_sub(a.as_i8x32(), b.as_i8x32())) + { + transmute(simd_sub(a.as_i8x32(), b.as_i8x32())) + } } - /// Subtract packed 16-bit integers in `b` from packed 16-bit integers in /// `a` using saturation. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epi16) - pub fn _mm256_subs_epi16(a: __m256i, b: __m256i) -> __m256i { - transmute(simd_saturating_sub(a.as_i16x16(), b.as_i16x16())) + { + transmute(simd_saturating_sub(a.as_i16x16(), b.as_i16x16())) + } } - /// Subtract packed 8-bit integers in `b` from packed 8-bit integers in /// `a` using saturation. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epi8) - pub fn _mm256_subs_epi8(a: __m256i, b: __m256i) -> __m256i { - transmute(simd_saturating_sub(a.as_i8x32(), b.as_i8x32())) + { + transmute(simd_saturating_sub(a.as_i8x32(), b.as_i8x32())) + } } - /// Subtract packed unsigned 16-bit integers in `b` from packed 16-bit /// integers in `a` using saturation. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epu16) - pub fn _mm256_subs_epu16(a: __m256i, b: __m256i) -> __m256i { - transmute(simd_saturating_sub(a.as_u16x16(), b.as_u16x16())) + { + transmute(simd_saturating_sub(a.as_u16x16(), b.as_u16x16())) + } } - /// Subtract packed unsigned 8-bit integers in `b` from packed 8-bit /// integers in `a` using saturation. 
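The `subs_*` models above lower to `simd_saturating_sub`, whose per-lane behaviour matches Rust's scalar `saturating_sub`: unsigned lanes clamp at zero, signed lanes clamp at the type bounds. For instance:

```rust
fn main() {
    // Unsigned lanes clamp at zero ...
    assert_eq!(3u16.saturating_sub(7), 0);
    assert_eq!(3u8.saturating_sub(7), 0);
    // ... signed lanes clamp at the type bounds.
    assert_eq!(i16::MIN.saturating_sub(1), i16::MIN);
    assert_eq!(i8::MAX.saturating_sub(-1), i8::MAX);
}
```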
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epu8) - pub fn _mm256_subs_epu8(a: __m256i, b: __m256i) -> __m256i { - transmute(simd_saturating_sub(a.as_u8x32(), b.as_u8x32())) + { + transmute(simd_saturating_sub(a.as_u8x32(), b.as_u8x32())) + } } - /// Unpacks and interleave 8-bit integers from the high half of each /// 128-bit lane in `a` and `b`. +/// +/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi8) - pub fn _mm256_unpackhi_epi8(a: __m256i, b: __m256i) -> __m256i { - #[rustfmt::skip] - let r: i8x32 = simd_shuffle(a.as_i8x32(), b.as_i8x32(), [ - 8, 40, 9, 41, 10, 42, 11, 43, - 12, 44, 13, 45, 14, 46, 15, 47, - 24, 56, 25, 57, 26, 58, 27, 59, - 28, 60, 29, 61, 30, 62, 31, 63, - ]); - transmute(r) + { + #[rustfmt::skip] + let r: i8x32 = simd_shuffle( + a.as_i8x32(), b.as_i8x32(), [8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, + 14, 46, 15, 47, 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, + 63,] + ); + transmute(r) + } } - /// Unpacks and interleave 8-bit integers from the low half of each /// 128-bit lane of `a` and `b`. +/// +/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi8) - pub fn _mm256_unpacklo_epi8(a: __m256i, b: __m256i) -> __m256i { - #[rustfmt::skip] - let r: i8x32 = simd_shuffle(a.as_i8x32(), b.as_i8x32(), [ - 0, 32, 1, 33, 2, 34, 3, 35, - 4, 36, 5, 37, 6, 38, 7, 39, - 16, 48, 17, 49, 18, 50, 19, 51, - 20, 52, 21, 53, 22, 54, 23, 55, - ]); - transmute(r) + { + #[rustfmt::skip] + let r: i8x32 = simd_shuffle( + a.as_i8x32(), b.as_i8x32(), [0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, + 7, 39, 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55,] + ); + transmute(r) + } } - /// Unpacks and interleave 16-bit integers from the high half of each /// 128-bit lane of `a` and `b`. +/// +/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi16) - pub fn _mm256_unpackhi_epi16(a: __m256i, b: __m256i) -> __m256i { - let r: i16x16 = simd_shuffle( - a.as_i16x16(), - b.as_i16x16(), - [4, 20, 5, 21, 6, 22, 7, 23, 12, 28, 13, 29, 14, 30, 15, 31], - ); - transmute(r) + { + let r: i16x16 = simd_shuffle( + a.as_i16x16(), + b.as_i16x16(), + [4, 20, 5, 21, 6, 22, 7, 23, 12, 28, 13, 29, 14, 30, 15, 31], + ); + transmute(r) + } } - /// Unpacks and interleave 16-bit integers from the low half of each /// 128-bit lane of `a` and `b`. +/// +/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi16) - pub fn _mm256_unpacklo_epi16(a: __m256i, b: __m256i) -> __m256i { - let r: i16x16 = simd_shuffle( - a.as_i16x16(), - b.as_i16x16(), - [0, 16, 1, 17, 2, 18, 3, 19, 8, 24, 9, 25, 10, 26, 11, 27], - ); - transmute(r) + { + let r: i16x16 = simd_shuffle( + a.as_i16x16(), + b.as_i16x16(), + [0, 16, 1, 17, 2, 18, 3, 19, 8, 24, 9, 25, 10, 26, 11, 27], + ); + transmute(r) + } } - /// Unpacks and interleave 32-bit integers from the high half of each /// 128-bit lane of `a` and `b`. 
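The long index arrays in the unpack models above interleave elements of `a` and `b` within each 128-bit lane (indices below the element count address `a`, the rest address `b`). A standalone sketch of the 16-bit "unpack low" pattern for one lane (`unpacklo_lane` is an illustrative name, not crate code):

```rust
// Interleave the low four lanes of `a` with the low four lanes of `b`.
fn unpacklo_lane(a: [u16; 8], b: [u16; 8]) -> [u16; 8] {
    core::array::from_fn(|i| if i % 2 == 0 { a[i / 2] } else { b[i / 2] })
}

fn main() {
    let a = [0, 1, 2, 3, 4, 5, 6, 7];
    let b = [10, 11, 12, 13, 14, 15, 16, 17];
    assert_eq!(unpacklo_lane(a, b), [0, 10, 1, 11, 2, 12, 3, 13]);
}
```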
+/// +/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi32) - pub fn _mm256_unpackhi_epi32(a: __m256i, b: __m256i) -> __m256i { - let r: i32x8 = simd_shuffle(a.as_i32x8(), b.as_i32x8(), [2, 10, 3, 11, 6, 14, 7, 15]); - transmute(r) + { + let r: i32x8 = simd_shuffle(a.as_i32x8(), b.as_i32x8(), [2, 10, 3, 11, 6, 14, 7, 15]); + transmute(r) + } } - /// Unpacks and interleave 32-bit integers from the low half of each /// 128-bit lane of `a` and `b`. +/// +/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi32) - pub fn _mm256_unpacklo_epi32(a: __m256i, b: __m256i) -> __m256i { - let r: i32x8 = simd_shuffle(a.as_i32x8(), b.as_i32x8(), [0, 8, 1, 9, 4, 12, 5, 13]); - transmute(r) + { + let r: i32x8 = simd_shuffle(a.as_i32x8(), b.as_i32x8(), [0, 8, 1, 9, 4, 12, 5, 13]); + transmute(r) + } } - /// Unpacks and interleave 64-bit integers from the high half of each /// 128-bit lane of `a` and `b`. +/// +/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi64) - pub fn _mm256_unpackhi_epi64(a: __m256i, b: __m256i) -> __m256i { - let r: i64x4 = simd_shuffle(a.as_i64x4(), b.as_i64x4(), [1, 5, 3, 7]); - transmute(r) + { + let r: i64x4 = simd_shuffle(a.as_i64x4(), b.as_i64x4(), [1, 5, 3, 7]); + transmute(r) + } } - /// Unpacks and interleave 64-bit integers from the low half of each /// 128-bit lane of `a` and `b`. +/// +/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi64) - pub fn _mm256_unpacklo_epi64(a: __m256i, b: __m256i) -> __m256i { - let r: i64x4 = simd_shuffle(a.as_i64x4(), b.as_i64x4(), [0, 4, 2, 6]); - transmute(r) + { + let r: i64x4 = simd_shuffle(a.as_i64x4(), b.as_i64x4(), [0, 4, 2, 6]); + transmute(r) + } } - /// Computes the bitwise XOR of 256 bits (representing integer data) /// in `a` and `b` /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_xor_si256) - pub fn _mm256_xor_si256(a: __m256i, b: __m256i) -> __m256i { - transmute(simd_xor(a.as_i64x4(), b.as_i64x4())) + { + transmute(simd_xor(a.as_i64x4(), b.as_i64x4())) + } } - /// Extracts an 8-bit integer from `a`, selected with `INDEX`. Returns a 32-bit /// integer containing the zero-extended integer data. /// /// See [LLVM commit D20468](https://reviews.llvm.org/D20468). /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extract_epi8) - -// This intrinsic has no corresponding instruction. - pub fn _mm256_extract_epi8(a: __m256i) -> i32 { - simd_extract(a.as_u8x32(), INDEX as u32) as u32 as i32 + static_assert_uimm_bits!(INDEX, 5); + { + simd_extract(a.as_u8x32(), INDEX as u32) as i32 + } } - /// Extracts a 16-bit integer from `a`, selected with `INDEX`. Returns a 32-bit /// integer containing the zero-extended integer data. /// /// See [LLVM commit D20468](https://reviews.llvm.org/D20468). /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extract_epi16) - -// This intrinsic has no corresponding instruction. 
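Regarding the zero-extension mentioned in the `_mm256_extract_epi8` documentation above: the model reads the lane as `u8` before widening, so the high bit is never sign-extended. In scalar terms:

```rust
fn main() {
    let byte: u8 = 0xFF;
    // Widening from u8 zero-extends, matching the documented behaviour.
    assert_eq!(byte as i32, 255);
    // Going through i8 instead would sign-extend, which is not what the
    // extract model does.
    assert_eq!(byte as i8 as i32, -1);
}
```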
- pub fn _mm256_extract_epi16(a: __m256i) -> i32 { - simd_extract(a.as_u16x16(), INDEX as u32) as u32 as i32 + static_assert_uimm_bits!(INDEX, 4); + { + simd_extract(a.as_u16x16(), INDEX as u32) as i32 + } } diff --git a/testable-simd-models/src/core_arch/x86/models/avx2_handwritten.rs b/testable-simd-models/src/core_arch/x86/models/avx2_handwritten.rs index 8053daaee923a..43f0a840b54bd 100644 --- a/testable-simd-models/src/core_arch/x86/models/avx2_handwritten.rs +++ b/testable-simd-models/src/core_arch/x86/models/avx2_handwritten.rs @@ -538,7 +538,7 @@ pub fn permd(a: u32x8, b: u32x8) -> u32x8 { }) } -pub fn mpsadbw(a: u8x32, b: u8x32, imm8: i32) -> u16x16 { +pub fn mpsadbw(a: u8x32, b: u8x32, imm8: i8) -> u16x16 { u16x16::from_fn(|i| { if i < 8 { let a_offset = (((imm8 & 4) >> 2) * 4) as u32; @@ -617,4 +617,4 @@ pub fn psadbw(a: u8x32, b: u8x32) -> u64x4 { .wrapping_add(tmp[i * 8 + 6] as u16) .wrapping_add(tmp[i * 8 + 7] as u16) as u64 }) -} \ No newline at end of file +} diff --git a/testable-simd-models/src/core_arch/x86/models/avx_handwritten.rs b/testable-simd-models/src/core_arch/x86/models/avx_handwritten.rs index 9846020475c65..0fe71a69f3d15 100644 --- a/testable-simd-models/src/core_arch/x86/models/avx_handwritten.rs +++ b/testable-simd-models/src/core_arch/x86/models/avx_handwritten.rs @@ -19,4 +19,4 @@ pub fn ptestz256(a: i64x4, b: i64x4) -> i32 { } else { 0 } -} \ No newline at end of file +} diff --git a/testable-simd-models/src/core_arch/x86/models/mod.rs b/testable-simd-models/src/core_arch/x86/models/mod.rs index baee88b47d671..9ca800aed6722 100644 --- a/testable-simd-models/src/core_arch/x86/models/mod.rs +++ b/testable-simd-models/src/core_arch/x86/models/mod.rs @@ -20,14 +20,14 @@ //! In general, it is best to gain an idea of how an implementation should be written by looking //! at how other functions are implemented. Also see `core::arch::x86` for [reference](https://github.com/rust-lang/stdarch/tree/master/crates/core_arch). -pub mod avx_handwritten; pub mod avx; -pub mod avx2_handwritten; pub mod avx2; -pub mod sse2_handwritten; +pub mod avx2_handwritten; +pub mod avx_handwritten; pub mod sse2; -pub mod ssse3_handwritten; +pub mod sse2_handwritten; pub mod ssse3; +pub mod ssse3_handwritten; pub(crate) mod types { use crate::abstractions::bitvec::*; diff --git a/testable-simd-models/src/core_arch/x86/models/sse2.rs b/testable-simd-models/src/core_arch/x86/models/sse2.rs index 878c4c08ebe28..da533234f142a 100644 --- a/testable-simd-models/src/core_arch/x86/models/sse2.rs +++ b/testable-simd-models/src/core_arch/x86/models/sse2.rs @@ -1,8 +1,8 @@ //! Streaming SIMD Extensions 2 (SSE2) +use super::sse2_handwritten::*; use super::types::*; -use crate::abstractions::utilities::*; use crate::abstractions::simd::*; -use super::sse2_handwritten::*; +use crate::abstractions::utilities::*; /// Adds packed 8-bit integers in `a` and `b`. 
/// diff --git a/testable-simd-models/src/core_arch/x86/models/sse2_handwritten.rs b/testable-simd-models/src/core_arch/x86/models/sse2_handwritten.rs index d0c1308f7ef3c..217298286968c 100644 --- a/testable-simd-models/src/core_arch/x86/models/sse2_handwritten.rs +++ b/testable-simd-models/src/core_arch/x86/models/sse2_handwritten.rs @@ -1,196 +1,196 @@ use crate::abstractions::{bit::MachineInteger, simd::*}; - pub fn packsswb(a: i16x8, b: i16x8) -> i8x16 { - i8x16::from_fn(|i| { - if i < 8 { - if a[i] > (i8::MAX as i16) { - i8::MAX - } else if a[i] < (i8::MIN as i16) { - i8::MIN - } else { - a[i] as i8 - } +pub fn packsswb(a: i16x8, b: i16x8) -> i8x16 { + i8x16::from_fn(|i| { + if i < 8 { + if a[i] > (i8::MAX as i16) { + i8::MAX + } else if a[i] < (i8::MIN as i16) { + i8::MIN } else { - if b[i - 8] > (i8::MAX as i16) { - i8::MAX - } else if b[i - 8] < (i8::MIN as i16) { - i8::MIN - } else { - b[i - 8] as i8 - } + a[i] as i8 } - }) - } - pub fn pmaddwd(a: i16x8, b: i16x8) -> i32x4 { - i32x4::from_fn(|i| { - (a[2 * i] as i32) * (b[2 * i] as i32) + (a[2 * i + 1] as i32) * (b[2 * i + 1] as i32) - }) - } - pub fn psadbw(a: u8x16, b: u8x16) -> u64x2 { - let tmp = u8x16::from_fn(|i| a[i].wrapping_abs_diff(b[i])); - u64x2::from_fn(|i| { - (tmp[i * 8] as u16) - .wrapping_add(tmp[i * 8 + 1] as u16) - .wrapping_add(tmp[i * 8 + 2] as u16) - .wrapping_add(tmp[i * 8 + 3] as u16) - .wrapping_add(tmp[i * 8 + 4] as u16) - .wrapping_add(tmp[i * 8 + 5] as u16) - .wrapping_add(tmp[i * 8 + 6] as u16) - .wrapping_add(tmp[i * 8 + 7] as u16) as u64 - }) - } - pub fn psllw(a: i16x8, count: i16x8) -> i16x8 { - let count4: u64 = (count[0] as u16) as u64; - let count3: u64 = ((count[1] as u16) as u64) * 65536; - let count2: u64 = ((count[2] as u16) as u64) * 4294967296; - let count1: u64 = ((count[3] as u16) as u64) * 281474976710656; - let count = count1 + count2 + count3 + count4; - i16x8::from_fn(|i| { - if count > 15 { - 0 + } else { + if b[i - 8] > (i8::MAX as i16) { + i8::MAX + } else if b[i - 8] < (i8::MIN as i16) { + i8::MIN } else { - ((a[i] as u16) << count) as i16 + b[i - 8] as i8 } - }) - } - - pub fn pslld(a: i32x4, count: i32x4) -> i32x4 { - let count: u64 = ((count[1] as u32) as u64) * 4294967296 + ((count[0] as u32) as u64); - - i32x4::from_fn(|i| { - if count > 31 { - 0 + } + }) +} +pub fn pmaddwd(a: i16x8, b: i16x8) -> i32x4 { + i32x4::from_fn(|i| { + (a[2 * i] as i32) * (b[2 * i] as i32) + (a[2 * i + 1] as i32) * (b[2 * i + 1] as i32) + }) +} +pub fn psadbw(a: u8x16, b: u8x16) -> u64x2 { + let tmp = u8x16::from_fn(|i| a[i].wrapping_abs_diff(b[i])); + u64x2::from_fn(|i| { + (tmp[i * 8] as u16) + .wrapping_add(tmp[i * 8 + 1] as u16) + .wrapping_add(tmp[i * 8 + 2] as u16) + .wrapping_add(tmp[i * 8 + 3] as u16) + .wrapping_add(tmp[i * 8 + 4] as u16) + .wrapping_add(tmp[i * 8 + 5] as u16) + .wrapping_add(tmp[i * 8 + 6] as u16) + .wrapping_add(tmp[i * 8 + 7] as u16) as u64 + }) +} +pub fn psllw(a: i16x8, count: i16x8) -> i16x8 { + let count4: u64 = (count[0] as u16) as u64; + let count3: u64 = ((count[1] as u16) as u64) * 65536; + let count2: u64 = ((count[2] as u16) as u64) * 4294967296; + let count1: u64 = ((count[3] as u16) as u64) * 281474976710656; + let count = count1 + count2 + count3 + count4; + i16x8::from_fn(|i| { + if count > 15 { + 0 + } else { + ((a[i] as u16) << count) as i16 + } + }) +} + +pub fn pslld(a: i32x4, count: i32x4) -> i32x4 { + let count: u64 = ((count[1] as u32) as u64) * 4294967296 + ((count[0] as u32) as u64); + + i32x4::from_fn(|i| { + if count > 31 { + 0 + } else { + 
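The magic constants in the `psllw`/`pslld` models above are positional weights (65536 = 2^16, 4294967296 = 2^32, 281474976710656 = 2^48): the helper reassembles the low 64 bits of the count register from its low 16- or 32-bit lanes. A standalone check of that reconstruction (`low_u64_from_lanes` is a made-up name):

```rust
// Rebuild the low 64 bits of a count vector from its four low 16-bit lanes,
// using the same weights as the model above.
fn low_u64_from_lanes(lanes: [i16; 4]) -> u64 {
    (lanes[0] as u16 as u64)
        + (lanes[1] as u16 as u64) * 65536
        + (lanes[2] as u16 as u64) * 4294967296
        + (lanes[3] as u16 as u64) * 281474976710656
}

fn main() {
    let lanes: [i16; 4] = [0x1234, -1, 0x0002, 0x7FFF];
    // The same 64-bit value, written out as its little-endian bytes.
    let expected = u64::from_le_bytes([0x34, 0x12, 0xFF, 0xFF, 0x02, 0x00, 0xFF, 0x7F]);
    assert_eq!(low_u64_from_lanes(lanes), expected);
}
```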
((a[i] as u32) << count) as i32 + } + }) +} + +pub fn psllq(a: i64x2, count: i64x2) -> i64x2 { + let count: u64 = count[0] as u64; + + i64x2::from_fn(|i| { + if count > 63 { + 0 + } else { + ((a[i] as u64) << count) as i64 + } + }) +} + +pub fn psraw(a: i16x8, count: i16x8) -> i16x8 { + let count: u64 = ((count[3] as u16) as u64) * 281474976710656 + + ((count[2] as u16) as u64) * 4294967296 + + ((count[1] as u16) as u64) * 65536 + + ((count[0] as u16) as u64); + + i16x8::from_fn(|i| { + if count > 15 { + if a[i] < 0 { + -1 } else { - ((a[i] as u32) << count) as i32 - } - }) - } - - pub fn psllq(a: i64x2, count: i64x2) -> i64x2 { - let count: u64 = count[0] as u64; - - i64x2::from_fn(|i| { - if count > 63 { 0 - } else { - ((a[i] as u64) << count) as i64 - } - }) - } - - pub fn psraw(a: i16x8, count: i16x8) -> i16x8 { - let count: u64 = ((count[3] as u16) as u64) * 281474976710656 - + ((count[2] as u16) as u64) * 4294967296 - + ((count[1] as u16) as u64) * 65536 - + ((count[0] as u16) as u64); - - i16x8::from_fn(|i| { - if count > 15 { - if a[i] < 0 { - -1 - } else { - 0 - } - } else { - a[i] >> count } - }) - } - - pub fn psrad(a: i32x4, count: i32x4) -> i32x4 { - let count: u64 = ((count[1] as u32) as u64) * 4294967296 + ((count[0] as u32) as u64); - - i32x4::from_fn(|i| { - if count > 31 { - if a[i] < 0 { - -1 - } else { - 0 - } + } else { + a[i] >> count + } + }) +} + +pub fn psrad(a: i32x4, count: i32x4) -> i32x4 { + let count: u64 = ((count[1] as u32) as u64) * 4294967296 + ((count[0] as u32) as u64); + + i32x4::from_fn(|i| { + if count > 31 { + if a[i] < 0 { + -1 } else { - a[i] << count - } - }) - } - - pub fn psrlw(a: i16x8, count: i16x8) -> i16x8 { - let count: u64 = (count[3] as u16 as u64) * 281474976710656 - + (count[2] as u16 as u64) * 4294967296 - + (count[1] as u16 as u64) * 65536 - + (count[0] as u16 as u64); - - i16x8::from_fn(|i| { - if count > 15 { 0 - } else { - ((a[i] as u16) >> count) as i16 } - }) - } - - pub fn psrld(a: i32x4, count: i32x4) -> i32x4 { - let count: u64 = (count[1] as u32 as u64) * 4294967296 + (count[0] as u32 as u64); - - i32x4::from_fn(|i| { - if count > 31 { - 0 + } else { + a[i] << count + } + }) +} + +pub fn psrlw(a: i16x8, count: i16x8) -> i16x8 { + let count: u64 = (count[3] as u16 as u64) * 281474976710656 + + (count[2] as u16 as u64) * 4294967296 + + (count[1] as u16 as u64) * 65536 + + (count[0] as u16 as u64); + + i16x8::from_fn(|i| { + if count > 15 { + 0 + } else { + ((a[i] as u16) >> count) as i16 + } + }) +} + +pub fn psrld(a: i32x4, count: i32x4) -> i32x4 { + let count: u64 = (count[1] as u32 as u64) * 4294967296 + (count[0] as u32 as u64); + + i32x4::from_fn(|i| { + if count > 31 { + 0 + } else { + ((a[i] as u32) >> count) as i32 + } + }) +} + +pub fn psrlq(a: i64x2, count: i64x2) -> i64x2 { + let count: u64 = count[0] as u64; + + i64x2::from_fn(|i| { + if count > 63 { + 0 + } else { + ((a[i] as u64) >> count) as i64 + } + }) +} + +pub fn packssdw(a: i32x4, b: i32x4) -> i16x8 { + i16x8::from_fn(|i| { + if i < 4 { + if a[i] > (i16::MAX as i32) { + i16::MAX + } else if a[i] < (i16::MIN as i32) { + i16::MIN } else { - ((a[i] as u32) >> count) as i32 + a[i] as i16 } - }) - } - - pub fn psrlq(a: i64x2, count: i64x2) -> i64x2 { - let count: u64 = count[0] as u64; - - i64x2::from_fn(|i| { - if count > 63 { - 0 + } else { + if b[i - 4] > (i16::MAX as i32) { + i16::MAX + } else if b[i - 4] < (i16::MIN as i32) { + i16::MIN } else { - ((a[i] as u64) >> count) as i64 + b[i - 4] as i16 } - }) - } - - pub fn packssdw(a: i32x4, b: i32x4) -> i16x8 { 
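The shift-count handling in the arithmetic-shift models above follows the usual SSE2 rule: a count above the lane width yields -1 for negative lanes and 0 otherwise, and smaller counts shift in sign bits. A scalar sketch of the 16-bit case (`sra16` is an illustrative name):

```rust
// Arithmetic right shift of one lane, with the same over-long-count branch
// as the 16-bit model above.
fn sra16(x: i16, count: u64) -> i16 {
    if count > 15 {
        if x < 0 { -1 } else { 0 }
    } else {
        x >> count // `>>` on a signed integer already shifts in sign bits
    }
}

fn main() {
    assert_eq!(sra16(-0x4000, 3), -0x0800);
    assert_eq!(sra16(-0x4000, 99), -1);
    assert_eq!(sra16(0x4000, 99), 0);
}
```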
- i16x8::from_fn(|i| { - if i < 4 { - if a[i] > (i16::MAX as i32) { - i16::MAX - } else if a[i] < (i16::MIN as i32) { - i16::MIN - } else { - a[i] as i16 - } + } + }) +} + +pub fn packuswb(a: i16x8, b: i16x8) -> u8x16 { + u8x16::from_fn(|i| { + if i < 8 { + if a[i] > (u8::MAX as i16) { + u8::MAX + } else if a[i] < (u8::MIN as i16) { + u8::MIN } else { - if b[i - 4] > (i16::MAX as i32) { - i16::MAX - } else if b[i - 4] < (i16::MIN as i32) { - i16::MIN - } else { - b[i - 4] as i16 - } + a[i] as u8 } - }) - } - - pub fn packuswb(a: i16x8, b: i16x8) -> u8x16 { - u8x16::from_fn(|i| { - if i < 8 { - if a[i] > (u8::MAX as i16) { - u8::MAX - } else if a[i] < (u8::MIN as i16) { - u8::MIN - } else { - a[i] as u8 - } + } else { + if b[i - 8] > (u8::MAX as i16) { + u8::MAX + } else if b[i - 8] < (u8::MIN as i16) { + u8::MIN } else { - if b[i - 8] > (u8::MAX as i16) { - u8::MAX - } else if b[i - 8] < (u8::MIN as i16) { - u8::MIN - } else { - b[i - 8] as u8 - } + b[i - 8] as u8 } - }) - } \ No newline at end of file + } + }) +} diff --git a/testable-simd-models/src/core_arch/x86/models/ssse3.rs b/testable-simd-models/src/core_arch/x86/models/ssse3.rs index 66e61be3a93e7..900c32e5e5293 100644 --- a/testable-simd-models/src/core_arch/x86/models/ssse3.rs +++ b/testable-simd-models/src/core_arch/x86/models/ssse3.rs @@ -1,10 +1,10 @@ //! Supplemental Streaming SIMD Extensions 3 (SSSE3) -use crate::abstractions::utilities::*; use crate::abstractions::simd::*; +use crate::abstractions::utilities::*; -use super::types::*; -use super::ssse3_handwritten::*; use super::sse2::*; +use super::ssse3_handwritten::*; +use super::types::*; /// Computes the absolute value of packed 8-bit signed integers in `a` and /// return the unsigned results. diff --git a/testable-simd-models/src/core_arch/x86/models/ssse3_handwritten.rs b/testable-simd-models/src/core_arch/x86/models/ssse3_handwritten.rs index d6368798235e8..4e911a83fb457 100644 --- a/testable-simd-models/src/core_arch/x86/models/ssse3_handwritten.rs +++ b/testable-simd-models/src/core_arch/x86/models/ssse3_handwritten.rs @@ -1,127 +1,127 @@ use crate::abstractions::simd::*; - pub fn pshufb128(a: u8x16, b: u8x16) -> u8x16 { - u8x16::from_fn(|i| if b[i] > 127 { 0 } else { a[(b[i] % 16) as u32] }) - } +pub fn pshufb128(a: u8x16, b: u8x16) -> u8x16 { + u8x16::from_fn(|i| if b[i] > 127 { 0 } else { a[(b[i] % 16) as u32] }) +} - pub fn phaddw128(a: i16x8, b: i16x8) -> i16x8 { - i16x8::from_fn(|i| { - if i < 4 { - a[2 * i].wrapping_add(a[2 * i + 1]) - } else { - b[2 * (i - 4)].wrapping_add(b[2 * (i - 4) + 1]) - } - }) - } +pub fn phaddw128(a: i16x8, b: i16x8) -> i16x8 { + i16x8::from_fn(|i| { + if i < 4 { + a[2 * i].wrapping_add(a[2 * i + 1]) + } else { + b[2 * (i - 4)].wrapping_add(b[2 * (i - 4) + 1]) + } + }) +} - pub fn phaddsw128(a: i16x8, b: i16x8) -> i16x8 { - i16x8::from_fn(|i| { - if i < 4 { - a[2 * i].saturating_add(a[2 * i + 1]) - } else { - b[2 * (i - 4)].saturating_add(b[2 * (i - 4) + 1]) - } - }) - } +pub fn phaddsw128(a: i16x8, b: i16x8) -> i16x8 { + i16x8::from_fn(|i| { + if i < 4 { + a[2 * i].saturating_add(a[2 * i + 1]) + } else { + b[2 * (i - 4)].saturating_add(b[2 * (i - 4) + 1]) + } + }) +} - pub fn phaddd128(a: i32x4, b: i32x4) -> i32x4 { - i32x4::from_fn(|i| { - if i < 2 { - a[2 * i].wrapping_add(a[2 * i + 1]) - } else { - b[2 * (i - 2)].wrapping_add(b[2 * (i - 2) + 1]) - } - }) - } +pub fn phaddd128(a: i32x4, b: i32x4) -> i32x4 { + i32x4::from_fn(|i| { + if i < 2 { + a[2 * i].wrapping_add(a[2 * i + 1]) + } else { + b[2 * (i - 2)].wrapping_add(b[2 * 
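For `pshufb128` above, each control byte either zeroes the lane (top bit set) or selects a source byte by its low four bits. A standalone per-byte sketch (`pshufb_byte` is an illustrative name):

```rust
// One lane of a byte shuffle: zero on a set top bit, otherwise index by the
// low four bits of the control byte, as in the model above.
fn pshufb_byte(a: &[u8; 16], ctrl: u8) -> u8 {
    if ctrl > 127 { 0 } else { a[(ctrl % 16) as usize] }
}

fn main() {
    let a: [u8; 16] = core::array::from_fn(|i| (i * 10) as u8);
    assert_eq!(pshufb_byte(&a, 0x03), 30); // index 3
    assert_eq!(pshufb_byte(&a, 0x13), 30); // only the low 4 bits matter
    assert_eq!(pshufb_byte(&a, 0x80), 0);  // high bit set -> zero
}
```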
(i - 2) + 1]) + } + }) +} - pub fn phsubw128(a: i16x8, b: i16x8) -> i16x8 { - i16x8::from_fn(|i| { - if i < 4 { - a[2 * i].wrapping_sub(a[2 * i + 1]) - } else { - b[2 * (i - 4)].wrapping_sub(b[2 * (i - 4) + 1]) - } - }) - } +pub fn phsubw128(a: i16x8, b: i16x8) -> i16x8 { + i16x8::from_fn(|i| { + if i < 4 { + a[2 * i].wrapping_sub(a[2 * i + 1]) + } else { + b[2 * (i - 4)].wrapping_sub(b[2 * (i - 4) + 1]) + } + }) +} - pub fn phsubsw128(a: i16x8, b: i16x8) -> i16x8 { - i16x8::from_fn(|i| { - if i < 4 { - a[2 * i].saturating_sub(a[2 * i + 1]) - } else { - b[2 * (i - 4)].saturating_sub(b[2 * (i - 4) + 1]) - } - }) - } +pub fn phsubsw128(a: i16x8, b: i16x8) -> i16x8 { + i16x8::from_fn(|i| { + if i < 4 { + a[2 * i].saturating_sub(a[2 * i + 1]) + } else { + b[2 * (i - 4)].saturating_sub(b[2 * (i - 4) + 1]) + } + }) +} - pub fn phsubd128(a: i32x4, b: i32x4) -> i32x4 { - i32x4::from_fn(|i| { - if i < 2 { - a[2 * i].wrapping_sub(a[2 * i + 1]) - } else { - b[2 * (i - 2)].wrapping_sub(b[2 * (i - 2) + 1]) - } - }) - } +pub fn phsubd128(a: i32x4, b: i32x4) -> i32x4 { + i32x4::from_fn(|i| { + if i < 2 { + a[2 * i].wrapping_sub(a[2 * i + 1]) + } else { + b[2 * (i - 2)].wrapping_sub(b[2 * (i - 2) + 1]) + } + }) +} - pub fn pmaddubsw128(a: u8x16, b: i8x16) -> i16x8 { - i16x8::from_fn(|i| { - ((a[2 * i] as u8 as u16 as i16) * (b[2 * i] as i8 as i16)) - .saturating_add((a[2 * i + 1] as u8 as u16 as i16) * (b[2 * i + 1] as i8 as i16)) - }) - } +pub fn pmaddubsw128(a: u8x16, b: i8x16) -> i16x8 { + i16x8::from_fn(|i| { + ((a[2 * i] as u8 as u16 as i16) * (b[2 * i] as i8 as i16)) + .saturating_add((a[2 * i + 1] as u8 as u16 as i16) * (b[2 * i + 1] as i8 as i16)) + }) +} - pub fn pmulhrsw128(a: i16x8, b: i16x8) -> i16x8 { - i16x8::from_fn(|i| { - let temp = (a[i] as i32) * (b[i] as i32); - let temp = (temp >> 14).wrapping_add(1) >> 1; - temp as i16 - }) - } +pub fn pmulhrsw128(a: i16x8, b: i16x8) -> i16x8 { + i16x8::from_fn(|i| { + let temp = (a[i] as i32) * (b[i] as i32); + let temp = (temp >> 14).wrapping_add(1) >> 1; + temp as i16 + }) +} - pub fn psignb128(a: i8x16, b: i8x16) -> i8x16 { - i8x16::from_fn(|i| { - if b[i] < 0 { - if a[i] == i8::MIN { - a[i] - } else { - -a[i] - } - } else if b[i] > 0 { +pub fn psignb128(a: i8x16, b: i8x16) -> i8x16 { + i8x16::from_fn(|i| { + if b[i] < 0 { + if a[i] == i8::MIN { a[i] } else { - 0 + -a[i] } - }) - } + } else if b[i] > 0 { + a[i] + } else { + 0 + } + }) +} - pub fn psignw128(a: i16x8, b: i16x8) -> i16x8 { - i16x8::from_fn(|i| { - if b[i] < 0 { - if a[i] == i16::MIN { - a[i] - } else { - -a[i] - } - } else if b[i] > 0 { +pub fn psignw128(a: i16x8, b: i16x8) -> i16x8 { + i16x8::from_fn(|i| { + if b[i] < 0 { + if a[i] == i16::MIN { a[i] } else { - 0 + -a[i] } - }) - } + } else if b[i] > 0 { + a[i] + } else { + 0 + } + }) +} - pub fn psignd128(a: i32x4, b: i32x4) -> i32x4 { - i32x4::from_fn(|i| { - if b[i] < 0 { - if a[i] == i32::MIN { - a[i] - } else { - -a[i] - } - } else if b[i] > 0 { +pub fn psignd128(a: i32x4, b: i32x4) -> i32x4 { + i32x4::from_fn(|i| { + if b[i] < 0 { + if a[i] == i32::MIN { a[i] } else { - 0 + -a[i] } - }) - } \ No newline at end of file + } else if b[i] > 0 { + a[i] + } else { + 0 + } + }) +} From 947f83a95d109cce4e8f807dfa5000f1a8fe4a91 Mon Sep 17 00:00:00 2001 From: karthikbhargavan Date: Thu, 31 Jul 2025 01:18:51 -0400 Subject: [PATCH 27/47] sse2 and avx generated --- .../src/abstractions/funarr.rs | 4 +- testable-simd-models/src/abstractions/simd.rs | 10 +- .../src/core_arch/x86/models/avx.rs | 1596 +++++++++++++++-- 
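The `pmulhrsw128` model above computes a Q15-style rounded high product: multiply into 32 bits, drop 14 bits, add the rounding bit, then drop one more. A scalar check of that arithmetic (`mulhrs` is a made-up helper):

```rust
// Rounded "multiply high" on Q15 fixed-point values, mirroring the lane-wise
// formula used in the model above.
fn mulhrs(a: i16, b: i16) -> i16 {
    let t = (a as i32) * (b as i32);
    (((t >> 14).wrapping_add(1)) >> 1) as i16
}

fn main() {
    // 0x4000 is 0.5 in Q15, so multiplying by it halves the input (with rounding).
    assert_eq!(mulhrs(0x4000, 0x4000), 0x2000);
    assert_eq!(mulhrs(1001, 0x4000), 501); // 500.5 rounds up to 501
}
```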
.../core_arch/x86/models/avx_handwritten.rs | 9 + .../src/core_arch/x86/models/mod.rs | 4 + .../src/core_arch/x86/models/sse2.rs | 1395 +++++++++----- 6 files changed, 2374 insertions(+), 644 deletions(-) diff --git a/testable-simd-models/src/abstractions/funarr.rs b/testable-simd-models/src/abstractions/funarr.rs index ef29dec1951e2..4026efb66c1f5 100644 --- a/testable-simd-models/src/abstractions/funarr.rs +++ b/testable-simd-models/src/abstractions/funarr.rs @@ -1,7 +1,7 @@ //! This module implements a fixed-size array wrapper with functional semantics //! which are used in formulating abstractions. -use crate::abstractions::bit::MachineInteger; +use crate::abstractions::bit::MachineNumeric; /// `FunArray` represents an array of `T` values of length `N`, where `N` is a compile-time constant. /// Internally, it uses a fixed-length array of `Option` with a maximum capacity of 512 elements. @@ -55,7 +55,7 @@ impl FunArray { } } -impl FunArray { +impl FunArray { #[allow(non_snake_case)] pub fn ZERO() -> Self { Self::from_fn(|_| T::ZEROS) diff --git a/testable-simd-models/src/abstractions/simd.rs b/testable-simd-models/src/abstractions/simd.rs index 7b7d8330c222c..70e0556618288 100644 --- a/testable-simd-models/src/abstractions/simd.rs +++ b/testable-simd-models/src/abstractions/simd.rs @@ -196,7 +196,7 @@ macro_rules! from_impls{ $( impl CastsFrom<$ty2> for $ty1 { fn cast(a: $ty2) -> $ty1 { - <$ty1>::from(a) + a as $ty1 } } )* @@ -278,7 +278,13 @@ from_impls!( [i128, i8], [i128, i16], [i128, i32], - [i128, i64] + [i128, i64], + [f64, u32], + [f64, i32], + [f32, u32], + [f32, i32], + [f32, f64], + [f64, f32] ); truncate_from_impls!( [u8, u16], diff --git a/testable-simd-models/src/core_arch/x86/models/avx.rs b/testable-simd-models/src/core_arch/x86/models/avx.rs index d6fb7a94664d9..840058adfe362 100644 --- a/testable-simd-models/src/core_arch/x86/models/avx.rs +++ b/testable-simd-models/src/core_arch/x86/models/avx.rs @@ -14,28 +14,410 @@ //! [wiki]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions use super::avx_handwritten::*; +use super::sse2::*; use super::types::*; use crate::abstractions::simd::*; use crate::abstractions::utilities::*; +/// Adds packed double-precision (64-bit) floating-point elements +/// in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_add_pd) +// pub fn _mm256_add_pd(a: __m256d, b: __m256d) -> __m256d { +// { transmute(simd_add(a.as_f64x4(), b.as_f64x4())) } +// } +/// Adds packed single-precision (32-bit) floating-point elements in `a` and +/// `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_add_ps) +// pub fn _mm256_add_ps(a: __m256, b: __m256) -> __m256 { +// { transmute(simd_add(a.as_f32x8(), b.as_f32x8())) } +// } +/// Computes the bitwise AND of a packed double-precision (64-bit) +/// floating-point elements in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_and_pd) +pub fn _mm256_and_pd(a: __m256d, b: __m256d) -> __m256d { + { + let a: u64x4 = transmute(a); + let b: u64x4 = transmute(b); + transmute(simd_and(a, b)) + } +} +/// Computes the bitwise AND of packed single-precision (32-bit) floating-point +/// elements in `a` and `b`. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_and_ps) +pub fn _mm256_and_ps(a: __m256, b: __m256) -> __m256 { + { + let a: u32x8 = transmute(a); + let b: u32x8 = transmute(b); + transmute(simd_and(a, b)) + } +} +/// Computes the bitwise OR packed double-precision (64-bit) floating-point +/// elements in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_or_pd) +pub fn _mm256_or_pd(a: __m256d, b: __m256d) -> __m256d { + { + let a: u64x4 = transmute(a); + let b: u64x4 = transmute(b); + transmute(simd_or(a, b)) + } +} +/// Computes the bitwise OR packed single-precision (32-bit) floating-point +/// elements in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_or_ps) +pub fn _mm256_or_ps(a: __m256, b: __m256) -> __m256 { + { + let a: u32x8 = transmute(a); + let b: u32x8 = transmute(b); + transmute(simd_or(a, b)) + } +} +/// Shuffles double-precision (64-bit) floating-point elements within 128-bit +/// lanes using the control in `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_shuffle_pd) +pub fn _mm256_shuffle_pd(a: __m256d, b: __m256d) -> __m256d { + static_assert_uimm_bits!(MASK, 8); + { + transmute(simd_shuffle( + a.as_f64x4(), b.as_f64x4(), [MASK as u32 & 0b1, ((MASK as u32 >> 1) & 0b1) + 4, ((MASK as u32 >> 2) + & 0b1) + 2, ((MASK as u32 >> 3) & 0b1) + 6,], + )) + } +} +/// Shuffles single-precision (32-bit) floating-point elements in `a` within +/// 128-bit lanes using the control in `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_shuffle_ps) +pub fn _mm256_shuffle_ps(a: __m256, b: __m256) -> __m256 { + static_assert_uimm_bits!(MASK, 8); + { + transmute(simd_shuffle( + a.as_f32x8(), b.as_f32x8(), [MASK as u32 & 0b11, (MASK as u32 >> 2) & 0b11, ((MASK as u32 >> 4) & + 0b11) + 8, ((MASK as u32 >> 6) & 0b11) + 8, (MASK as u32 & 0b11) + 4, ((MASK + as u32 >> 2) & 0b11) + 4, ((MASK as u32 >> 4) & 0b11) + 12, ((MASK as u32 >> + 6) & 0b11) + 12,], + )) + } +} +/// Computes the bitwise NOT of packed double-precision (64-bit) floating-point +/// elements in `a`, and then AND with `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_andnot_pd) +pub fn _mm256_andnot_pd(a: __m256d, b: __m256d) -> __m256d { + { + let a: u64x4 = transmute(a); + let b: u64x4 = transmute(b); + transmute(simd_and(simd_xor(u64x4::splat(!(0_u64)), a), b)) + } +} +/// Computes the bitwise NOT of packed single-precision (32-bit) floating-point +/// elements in `a` +/// and then AND with `b`. 
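The index arithmetic in `_mm256_shuffle_pd` above can be read as: shuffle indices 0..=3 address `a`, 4..=7 address `b`, and each bit of `MASK` picks the low or high element of one 128-bit lane. A standalone decode of those indices (illustrative helper name):

```rust
// Expand a 4-bit MASK into the shuffle indices used by the model above.
fn shuffle_pd_indices(mask: u32) -> [u32; 4] {
    [
        mask & 0b1,
        ((mask >> 1) & 0b1) + 4,
        ((mask >> 2) & 0b1) + 2,
        ((mask >> 3) & 0b1) + 6,
    ]
}

fn main() {
    // MASK = 0b0101: lane 0 takes the high element of `a`'s low half, lane 1
    // the low element of `b`'s low half, and likewise for the high 128 bits.
    assert_eq!(shuffle_pd_indices(0b0101), [1, 4, 3, 6]);
}
```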
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_andnot_ps) +pub fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 { + { + let a: u32x8 = transmute(a); + let b: u32x8 = transmute(b); + transmute(simd_and(simd_xor(u32x8::splat(!(0_u32)), a), b)) + } +} +/// Compares packed double-precision (64-bit) floating-point elements +/// in `a` and `b`, and returns packed maximum values +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_max_pd) +// pub fn _mm256_max_pd(a: __m256d, b: __m256d) -> __m256d { +// { vmaxpd(a, b) } +// } +/// Compares packed single-precision (32-bit) floating-point elements in `a` +/// and `b`, and returns packed maximum values +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_max_ps) +// pub fn _mm256_max_ps(a: __m256, b: __m256) -> __m256 { +// { vmaxps(a, b) } +// } +/// Compares packed double-precision (64-bit) floating-point elements +/// in `a` and `b`, and returns packed minimum values +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_min_pd) +// pub fn _mm256_min_pd(a: __m256d, b: __m256d) -> __m256d { +// { vminpd(a, b) } +// } +/// Compares packed single-precision (32-bit) floating-point elements in `a` +/// and `b`, and returns packed minimum values +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_min_ps) +// pub fn _mm256_min_ps(a: __m256, b: __m256) -> __m256 { +// { vminps(a, b) } +// } +/// Multiplies packed double-precision (64-bit) floating-point elements +/// in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_mul_pd) +// pub fn _mm256_mul_pd(a: __m256d, b: __m256d) -> __m256d { +// { transmute(simd_mul(a.as_f64x4(), b.as_f64x4())) } +// } +/// Multiplies packed single-precision (32-bit) floating-point elements in `a` and +/// `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_mul_ps) +// pub fn _mm256_mul_ps(a: __m256, b: __m256) -> __m256 { +// { transmute(simd_mul(a.as_f32x8(), b.as_f32x8())) } +// } +/// Alternatively adds and subtracts packed double-precision (64-bit) +/// floating-point elements in `a` to/from packed elements in `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_addsub_pd) +// pub fn _mm256_addsub_pd(a: __m256d, b: __m256d) -> __m256d { +// { +// let a = a.as_f64x4(); +// let b = b.as_f64x4(); +// let add = simd_add(a, b); +// let sub = simd_sub(a, b); +// simd_shuffle(add, sub, [4, 1, 6, 3]) +// } +// } +/// Alternatively adds and subtracts packed single-precision (32-bit) +/// floating-point elements in `a` to/from packed elements in `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_addsub_ps) +// pub fn _mm256_addsub_ps(a: __m256, b: __m256) -> __m256 { +// { +// let a = a.as_f32x8(); +// let b = b.as_f32x8(); +// let add = simd_add(a, b); +// let sub = simd_sub(a, b); +// simd_shuffle(add, sub, [8, 1, 10, 3, 12, 5, 14, 7]) +// } +// } +/// Subtracts packed double-precision (64-bit) floating-point elements in `b` +/// from packed elements in `a`. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_sub_pd) +// pub fn _mm256_sub_pd(a: __m256d, b: __m256d) -> __m256d { +// { simd_sub(a, b) } +// } +/// Subtracts packed single-precision (32-bit) floating-point elements in `b` +/// from packed elements in `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_sub_ps) +// pub fn _mm256_sub_ps(a: __m256, b: __m256) -> __m256 { +// { simd_sub(a, b) } +// } +/// Computes the division of each of the 8 packed 32-bit floating-point elements +/// in `a` by the corresponding packed elements in `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_div_ps) +// pub fn _mm256_div_ps(a: __m256, b: __m256) -> __m256 { +// { simd_div(a, b) } +// } +/// Computes the division of each of the 4 packed 64-bit floating-point elements +/// in `a` by the corresponding packed elements in `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_div_pd) +// pub fn _mm256_div_pd(a: __m256d, b: __m256d) -> __m256d { +// { simd_div(a, b) } +// } +/// Rounds packed double-precision (64-bit) floating point elements in `a` +/// according to the flag `ROUNDING`. The value of `ROUNDING` may be as follows: +/// +/// - `0x00`: Round to the nearest whole number. +/// - `0x01`: Round down, toward negative infinity. +/// - `0x02`: Round up, toward positive infinity. +/// - `0x03`: Truncate the values. +/// +/// For a complete list of options, check [the LLVM docs][llvm_docs]. +/// +/// [llvm_docs]: https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382 +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_round_pd) +// pub fn _mm256_round_pd(a: __m256d) -> __m256d { +// static_assert_uimm_bits!(ROUNDING, 4); +// { roundpd256(a, ROUNDING) } +// } +/// Rounds packed double-precision (64-bit) floating point elements in `a` +/// toward positive infinity. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_ceil_pd) +// pub fn _mm256_ceil_pd(a: __m256d) -> __m256d { +// { simd_ceil(a) } +// } +/// Rounds packed double-precision (64-bit) floating point elements in `a` +/// toward negative infinity. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_floor_pd) +// pub fn _mm256_floor_pd(a: __m256d) -> __m256d { +// { simd_floor(a) } +// } +/// Rounds packed single-precision (32-bit) floating point elements in `a` +/// according to the flag `ROUNDING`. The value of `ROUNDING` may be as follows: +/// +/// - `0x00`: Round to the nearest whole number. +/// - `0x01`: Round down, toward negative infinity. +/// - `0x02`: Round up, toward positive infinity. +/// - `0x03`: Truncate the values. +/// +/// For a complete list of options, check [the LLVM docs][llvm_docs]. 
+/// +/// [llvm_docs]: https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382 +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_round_ps) +// pub fn _mm256_round_ps(a: __m256) -> __m256 { +// static_assert_uimm_bits!(ROUNDING, 4); +// { roundps256(a, ROUNDING) } +// } +/// Rounds packed single-precision (32-bit) floating point elements in `a` +/// toward positive infinity. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_ceil_ps) +// pub fn _mm256_ceil_ps(a: __m256) -> __m256 { +// { simd_ceil(a) } +// } +/// Rounds packed single-precision (32-bit) floating point elements in `a` +/// toward negative infinity. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_floor_ps) +// pub fn _mm256_floor_ps(a: __m256) -> __m256 { +// { simd_floor(a) } +// } +/// Returns the square root of packed single-precision (32-bit) floating point +/// elements in `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_sqrt_ps) +// pub fn _mm256_sqrt_ps(a: __m256) -> __m256 { +// { simd_fsqrt(a) } +// } +/// Returns the square root of packed double-precision (64-bit) floating point +/// elements in `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_sqrt_pd) +// pub fn _mm256_sqrt_pd(a: __m256d) -> __m256d { +// { simd_fsqrt(a) } +// } +/// Blends packed double-precision (64-bit) floating-point elements from +/// `a` and `b` using control mask `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_blend_pd) +pub fn _mm256_blend_pd(a: __m256d, b: __m256d) -> __m256d { + static_assert_uimm_bits!(IMM4, 4); + { + transmute(simd_shuffle( + a.as_f64x4(), b.as_f64x4(), [((IMM4 as u32 >> 0) & 1) * 4 + 0, ((IMM4 as u32 >> 1) & 1) * 4 + 1, + ((IMM4 as u32 >> 2) & 1) * 4 + 2, ((IMM4 as u32 >> 3) & 1) * 4 + 3,], + )) + } +} /// Blends packed single-precision (32-bit) floating-point elements from -/// `a` and `b` using `c` as a mask. +/// `a` and `b` using control mask `imm8`. /// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blendv_ps) -pub fn _mm256_blendv_ps(a: __m256, b: __m256, c: __m256) -> __m256 { - let mask: i32x8 = simd_lt(c.as_i32x8(), i32x8::ZERO()); - transmute(simd_select(mask, b.as_i32x8(), a.as_i32x8())) +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_blend_ps) +pub fn _mm256_blend_ps(a: __m256, b: __m256) -> __m256 { + static_assert_uimm_bits!(IMM8, 8); + { + transmute(simd_shuffle( + a.as_f32x8(), b.as_f32x8(), [((IMM8 as u32 >> 0) & 1) * 8 + 0, ((IMM8 as u32 >> 1) & 1) * 8 + 1, + ((IMM8 as u32 >> 2) & 1) * 8 + 2, ((IMM8 as u32 >> 3) & 1) * 8 + 3, ((IMM8 as + u32 >> 4) & 1) * 8 + 4, ((IMM8 as u32 >> 5) & 1) * 8 + 5, ((IMM8 as u32 >> 6) + & 1) * 8 + 6, ((IMM8 as u32 >> 7) & 1) * 8 + 7,], + )) + } } - /// Blends packed double-precision (64-bit) floating-point elements from /// `a` and `b` using `c` as a mask. 
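In `_mm256_blend_pd` above, bit `i` of `IMM4` chooses between `a` (index `i`) and `b` (index `4 + i`) for result lane `i`; `_mm256_blend_ps` applies the same rule with eight bits. A scalar sketch of the selection (`blend4` is an illustrative name):

```rust
// Select each result lane from `a` or `b` according to one bit of the mask.
fn blend4(a: [f64; 4], b: [f64; 4], imm4: u32) -> [f64; 4] {
    core::array::from_fn(|i| if (imm4 >> i) & 1 == 0 { a[i] } else { b[i] })
}

fn main() {
    let a = [1.0, 2.0, 3.0, 4.0];
    let b = [10.0, 20.0, 30.0, 40.0];
    assert_eq!(blend4(a, b, 0b0110), [1.0, 20.0, 30.0, 4.0]);
}
```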
/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blendv_pd) +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_blendv_pd) pub fn _mm256_blendv_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d { - let mask: i64x4 = simd_lt(c.as_i64x4(), i64x4::ZERO()); - transmute(simd_select(mask, b.as_f64x4(), a.as_f64x4())) + { + let mask: i64x4 = simd_lt(transmute::<_, i64x4>(c), i64x4::ZERO()); + transmute(simd_select(mask, b.as_f64x4(), a.as_f64x4())) + } +} +/// Blends packed single-precision (32-bit) floating-point elements from +/// `a` and `b` using `c` as a mask. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_blendv_ps) +pub fn _mm256_blendv_ps(a: __m256, b: __m256, c: __m256) -> __m256 { + { + let mask: i32x8 = simd_lt(transmute::<_, i32x8>(c), i32x8::ZERO()); + transmute(simd_select(mask, b.as_f32x8(), a.as_f32x8())) + } +} +/// Conditionally multiplies the packed single-precision (32-bit) floating-point +/// elements in `a` and `b` using the high 4 bits in `imm8`, +/// sum the four products, and conditionally return the sum +/// using the low 4 bits of `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_dp_ps) +// pub fn _mm256_dp_ps(a: __m256, b: __m256) -> __m256 { +// static_assert_uimm_bits!(IMM8, 8); +// { vdpps(a, b, IMM8 as i8) } +// } +/// Horizontal addition of adjacent pairs in the two packed vectors +/// of 4 64-bit floating points `a` and `b`. +/// In the result, sums of elements from `a` are returned in even locations, +/// while sums of elements from `b` are returned in odd locations. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_hadd_pd) +// pub fn _mm256_hadd_pd(a: __m256d, b: __m256d) -> __m256d { +// { vhaddpd(a, b) } +// } +/// Horizontal addition of adjacent pairs in the two packed vectors +/// of 8 32-bit floating points `a` and `b`. +/// In the result, sums of elements from `a` are returned in locations of +/// indices 0, 1, 4, 5; while sums of elements from `b` are locations +/// 2, 3, 6, 7. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_hadd_ps) +// pub fn _mm256_hadd_ps(a: __m256, b: __m256) -> __m256 { +// { vhaddps(a, b) } +// } +/// Horizontal subtraction of adjacent pairs in the two packed vectors +/// of 4 64-bit floating points `a` and `b`. +/// In the result, sums of elements from `a` are returned in even locations, +/// while sums of elements from `b` are returned in odd locations. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_hsub_pd) +// pub fn _mm256_hsub_pd(a: __m256d, b: __m256d) -> __m256d { +// { vhsubpd(a, b) } +// } +/// Horizontal subtraction of adjacent pairs in the two packed vectors +/// of 8 32-bit floating points `a` and `b`. +/// In the result, sums of elements from `a` are returned in locations of +/// indices 0, 1, 4, 5; while sums of elements from `b` are locations +/// 2, 3, 6, 7. 
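The `blendv` models above build their mask with `simd_lt(..., 0)` on the integer reinterpretation of `c`, so only the sign bit of each element of `c` matters. A per-lane scalar sketch (`blendv_lane` is a made-up helper):

```rust
// Pick `b` when the sign bit of the control element is set, else `a`.
fn blendv_lane(a: f64, b: f64, c: f64) -> f64 {
    if (c.to_bits() >> 63) & 1 == 1 { b } else { a }
}

fn main() {
    assert_eq!(blendv_lane(1.0, 2.0, -0.5), 2.0);
    assert_eq!(blendv_lane(1.0, 2.0, 0.5), 1.0);
    // -0.0 has its sign bit set, so it also selects `b`.
    assert_eq!(blendv_lane(1.0, 2.0, -0.0), 2.0);
}
```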
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_hsub_ps) +// pub fn _mm256_hsub_ps(a: __m256, b: __m256) -> __m256 { +// { vhsubps(a, b) } +// } +/// Computes the bitwise XOR of packed double-precision (64-bit) floating-point +/// elements in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_xor_pd) +pub fn _mm256_xor_pd(a: __m256d, b: __m256d) -> __m256d { + { + let a: u64x4 = transmute(a); + let b: u64x4 = transmute(b); + transmute(simd_xor(a, b)) + } +} +/// Computes the bitwise XOR of packed single-precision (32-bit) floating-point +/// elements in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_xor_ps) +pub fn _mm256_xor_ps(a: __m256, b: __m256) -> __m256 { + { + let a: u32x8 = transmute(a); + let b: u32x8 = transmute(b); + transmute(simd_xor(a, b)) + } } - /// Equal (ordered, non-signaling) pub const _CMP_EQ_OQ: i32 = 0x00; /// Less-than (ordered, signaling) @@ -100,98 +482,697 @@ pub const _CMP_GE_OQ: i32 = 0x1d; pub const _CMP_GT_OQ: i32 = 0x1e; /// True (unordered, signaling) pub const _CMP_TRUE_US: i32 = 0x1f; - +/// Compares packed double-precision (64-bit) floating-point +/// elements in `a` and `b` based on the comparison operand +/// specified by `IMM5`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm_cmp_pd) +// pub fn _mm_cmp_pd(a: __m128d, b: __m128d) -> __m128d { +// static_assert_uimm_bits!(IMM5, 5); +// { vcmppd(a, b, const { IMM5 as i8 }) } +// } +/// Compares packed double-precision (64-bit) floating-point +/// elements in `a` and `b` based on the comparison operand +/// specified by `IMM5`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_cmp_pd) +// pub fn _mm256_cmp_pd(a: __m256d, b: __m256d) -> __m256d { +// static_assert_uimm_bits!(IMM5, 5); +// { vcmppd256(a, b, IMM5 as u8) } +// } +/// Compares packed single-precision (32-bit) floating-point +/// elements in `a` and `b` based on the comparison operand +/// specified by `IMM5`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm_cmp_ps) +// pub fn _mm_cmp_ps(a: __m128, b: __m128) -> __m128 { +// static_assert_uimm_bits!(IMM5, 5); +// { vcmpps(a, b, const { IMM5 as i8 }) } +// } +/// Compares packed single-precision (32-bit) floating-point +/// elements in `a` and `b` based on the comparison operand +/// specified by `IMM5`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_cmp_ps) +// pub fn _mm256_cmp_ps(a: __m256, b: __m256) -> __m256 { +// static_assert_uimm_bits!(IMM5, 5); +// { vcmpps256(a, b, const { IMM5 as u8 }) } +// } +/// Compares the lower double-precision (64-bit) floating-point element in +/// `a` and `b` based on the comparison operand specified by `IMM5`, +/// store the result in the lower element of returned vector, +/// and copies the upper element from `a` to the upper element of returned +/// vector. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm_cmp_sd) +// pub fn _mm_cmp_sd(a: __m128d, b: __m128d) -> __m128d { +// static_assert_uimm_bits!(IMM5, 5); +// { vcmpsd(a, b, IMM5 as i8) } +// } +/// Compares the lower single-precision (32-bit) floating-point element in +/// `a` and `b` based on the comparison operand specified by `IMM5`, +/// store the result in the lower element of returned vector, +/// and copies the upper 3 packed elements from `a` to the upper elements of +/// returned vector. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm_cmp_ss) +// pub fn _mm_cmp_ss(a: __m128, b: __m128) -> __m128 { +// static_assert_uimm_bits!(IMM5, 5); +// { vcmpss(a, b, IMM5 as i8) } +// } +/// Converts packed 32-bit integers in `a` to packed double-precision (64-bit) +/// floating-point elements. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_cvtepi32_pd) +pub fn _mm256_cvtepi32_pd(a: __m128i) -> __m256d { + { transmute(simd_cast::<4,i32,f64>(a.as_i32x4())) } +} +/// Converts packed 32-bit integers in `a` to packed single-precision (32-bit) +/// floating-point elements. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_cvtepi32_ps) +pub fn _mm256_cvtepi32_ps(a: __m256i) -> __m256 { + { transmute(simd_cast::<8,_,f32>(a.as_i32x8())) } +} +/// Converts packed double-precision (64-bit) floating-point elements in `a` +/// to packed single-precision (32-bit) floating-point elements. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_cvtpd_ps) +pub fn _mm256_cvtpd_ps(a: __m256d) -> __m128 { + { transmute(simd_cast::<4,_,f32>(a.as_f64x4())) } +} +/// Converts packed single-precision (32-bit) floating-point elements in `a` +/// to packed 32-bit integers. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_cvtps_epi32) +// pub fn _mm256_cvtps_epi32(a: __m256) -> __m256i { +// { transmute(vcvtps2dq(a)) } +// } +/// Converts packed single-precision (32-bit) floating-point elements in `a` +/// to packed double-precision (64-bit) floating-point elements. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_cvtps_pd) +pub fn _mm256_cvtps_pd(a: __m128) -> __m256d { + { transmute(simd_cast::<4,_,f64>(a.as_f32x4())) } +} +/// Returns the first element of the input vector of `[4 x double]`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_cvtsd_f64) +pub fn _mm256_cvtsd_f64(a: __m256d) -> f64 { + { simd_extract(a.as_f64x4(), 0) } +} +/// Converts packed double-precision (64-bit) floating-point elements in `a` +/// to packed 32-bit integers with truncation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_cvttpd_epi32) +// pub fn _mm256_cvttpd_epi32(a: __m256d) -> __m128i { +// { transmute(vcvttpd2dq(a)) } +// } +/// Converts packed double-precision (64-bit) floating-point elements in `a` +/// to packed 32-bit integers. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_cvtpd_epi32) +// pub fn _mm256_cvtpd_epi32(a: __m256d) -> __m128i { +// { transmute(vcvtpd2dq(a)) } +// } +/// Converts packed single-precision (32-bit) floating-point elements in `a` +/// to packed 32-bit integers with truncation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_cvttps_epi32) +// pub fn _mm256_cvttps_epi32(a: __m256) -> __m256i { +// { transmute(vcvttps2dq(a)) } +// } +/// Extracts 128 bits (composed of 4 packed single-precision (32-bit) +/// floating-point elements) from `a`, selected with `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_extractf128_ps) +pub fn _mm256_extractf128_ps(a: __m256) -> __m128 { + static_assert_uimm_bits!(IMM1, 1); + { + transmute(simd_shuffle( + a.as_f32x8(), _mm256_undefined_ps().as_f32x8(), [[0, 1, 2, 3], [4, 5, 6, 7]] [IMM1 as usize], + )) + } +} +/// Extracts 128 bits (composed of 2 packed double-precision (64-bit) +/// floating-point elements) from `a`, selected with `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_extractf128_pd) +pub fn _mm256_extractf128_pd(a: __m256d) -> __m128d { + static_assert_uimm_bits!(IMM1, 1); + { transmute(simd_shuffle(a.as_f64x4(), _mm256_undefined_pd().as_f64x4(), [[0, 1], [2, 3]] [IMM1 as usize])) } +} +/// Extracts 128 bits (composed of integer data) from `a`, selected with `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_extractf128_si256) +pub fn _mm256_extractf128_si256(a: __m256i) -> __m128i { + static_assert_uimm_bits!(IMM1, 1); + { + let dst: i64x2 = simd_shuffle( + a.as_i64x4(), i64x4::ZERO(), [[0, 1], [2, 3]] [IMM1 as usize], + ); + transmute(dst) + } +} +/// Extracts a 32-bit integer from `a`, selected with `INDEX`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_extract_epi32) +pub fn _mm256_extract_epi32(a: __m256i) -> i32 { + static_assert_uimm_bits!(INDEX, 3); + { simd_extract(a.as_i32x8(), INDEX as u32) } +} +/// Returns the first element of the input vector of `[8 x i32]`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_cvtsi256_si32) +pub fn _mm256_cvtsi256_si32(a: __m256i) -> i32 { + { simd_extract(a.as_i32x8(), 0) } +} +/// Zeroes the contents of all XMM or YMM registers. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_zeroall) +// pub fn _mm256_zeroall() { +// { vzeroall() } +// } +/// Zeroes the upper 128 bits of all YMM registers; +/// the lower 128-bits of the registers are unmodified. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_zeroupper) +// pub fn _mm256_zeroupper() { +// { vzeroupper() } +// } +/// Shuffles single-precision (32-bit) floating-point elements in `a` +/// within 128-bit lanes using the control in `b`. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_permutevar_ps) +// pub fn _mm256_permutevar_ps(a: __m256, b: __m256i) -> __m256 { +// { vpermilps256(a, b.as_i32x8()) } +// } +/// Shuffles single-precision (32-bit) floating-point elements in `a` +/// using the control in `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm_permutevar_ps) +// pub fn _mm_permutevar_ps(a: __m128, b: __m128i) -> __m128 { +// { vpermilps(a, b.as_i32x4()) } +// } +/// Shuffles single-precision (32-bit) floating-point elements in `a` +/// within 128-bit lanes using the control in `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_permute_ps) +pub fn _mm256_permute_ps(a: __m256) -> __m256 { + static_assert_uimm_bits!(IMM8, 8); + { + transmute(simd_shuffle( + a.as_f32x8(), _mm256_undefined_ps().as_f32x8(), [(IMM8 as u32 >> 0) & 0b11, (IMM8 as u32 >> 2) & + 0b11, (IMM8 as u32 >> 4) & 0b11, (IMM8 as u32 >> 6) & 0b11, ((IMM8 as u32 >> + 0) & 0b11) + 4, ((IMM8 as u32 >> 2) & 0b11) + 4, ((IMM8 as u32 >> 4) & 0b11) + + 4, ((IMM8 as u32 >> 6) & 0b11) + 4,], + )) + } +} +/// Shuffles single-precision (32-bit) floating-point elements in `a` +/// using the control in `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm_permute_ps) +// pub fn _mm_permute_ps(a: __m128) -> __m128 { +// static_assert_uimm_bits!(IMM8, 8); +// { +// transmute(simd_shuffle( +// a.as_f32x4(), _mm_undefined_ps(), [(IMM8 as u32 >> 0) & 0b11, (IMM8 as u32 >> 2) & 0b11, +// (IMM8 as u32 >> 4) & 0b11, (IMM8 as u32 >> 6) & 0b11,], +// )) +// } +// } +/// Shuffles double-precision (64-bit) floating-point elements in `a` +/// within 256-bit lanes using the control in `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_permutevar_pd) +// pub fn _mm256_permutevar_pd(a: __m256d, b: __m256i) -> __m256d { +// { vpermilpd256(a, b.as_i64x4()) } +// } +/// Shuffles double-precision (64-bit) floating-point elements in `a` +/// using the control in `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm_permutevar_pd) +// pub fn _mm_permutevar_pd(a: __m128d, b: __m128i) -> __m128d { +// { vpermilpd(a, b.as_i64x2()) } +// } +/// Shuffles double-precision (64-bit) floating-point elements in `a` +/// within 128-bit lanes using the control in `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_permute_pd) +pub fn _mm256_permute_pd(a: __m256d) -> __m256d { + static_assert_uimm_bits!(IMM4, 4); + { + transmute(simd_shuffle( + a.as_f64x4(), _mm256_undefined_pd().as_f64x4(), [((IMM4 as u32 >> 0) & 1), ((IMM4 as u32 >> 1) & + 1), ((IMM4 as u32 >> 2) & 1) + 2, ((IMM4 as u32 >> 3) & 1) + 2,], + )) + } +} +/// Shuffles double-precision (64-bit) floating-point elements in `a` +/// using the control in `imm8`. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm_permute_pd) +pub fn _mm_permute_pd(a: __m128d) -> __m128d { + static_assert_uimm_bits!(IMM2, 2); + { + transmute(simd_shuffle( + a.as_f64x2(), _mm_undefined_pd().as_f64x2(), [(IMM2 as u32) & 1, (IMM2 as u32 >> 1) & 1], + )) + } +} +/// Shuffles 256 bits (composed of 8 packed single-precision (32-bit) +/// floating-point elements) selected by `imm8` from `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_permute2f128_ps) +// pub fn _mm256_permute2f128_ps(a: __m256, b: __m256) -> __m256 { +// static_assert_uimm_bits!(IMM8, 8); +// { vperm2f128ps256(a, b, IMM8 as i8) } +// } +/// Shuffles 256 bits (composed of 4 packed double-precision (64-bit) +/// floating-point elements) selected by `imm8` from `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_permute2f128_pd) +// pub fn _mm256_permute2f128_pd(a: __m256d, b: __m256d) -> __m256d { +// static_assert_uimm_bits!(IMM8, 8); +// { vperm2f128pd256(a, b, IMM8 as i8) } +// } /// Shuffles 128-bits (composed of integer data) selected by `imm8` /// from `a` and `b`. /// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_si256) +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_permute2f128_si256) pub fn _mm256_permute2f128_si256(a: __m256i, b: __m256i) -> __m256i { static_assert_uimm_bits!(IMM8, 8); - transmute(vperm2f128si256(a.as_i32x8(), b.as_i32x8(), IMM8 as i8)) + { transmute(vperm2f128si256(a.as_i32x8(), b.as_i32x8(), IMM8 as i8)) } } - +/// Broadcasts a single-precision (32-bit) floating-point element from memory +/// to all elements of the returned vector. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_broadcast_ss) +pub fn _mm256_broadcast_ss(f: &f32) -> __m256 { + _mm256_set1_ps(*f) +} +/// Broadcasts a single-precision (32-bit) floating-point element from memory +/// to all elements of the returned vector. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm_broadcast_ss) +// pub fn _mm_broadcast_ss(f: &f32) -> __m128 { +// _mm_set1_ps(*f) +// } +/// Broadcasts a double-precision (64-bit) floating-point element from memory +/// to all elements of the returned vector. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_broadcast_sd) +// pub fn _mm256_broadcast_sd(f: &f64) -> __m256d { +// _mm256_set1_pd(*f) +// } +/// Broadcasts 128 bits from memory (composed of 4 packed single-precision +/// (32-bit) floating-point elements) to all elements of the returned vector. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_broadcast_ps) +// pub fn _mm256_broadcast_ps(a: &__m128) -> __m256 { +// { transmute(simd_shuffle((*a).as_f32x4(), _mm_setzero_ps(), [0, 1, 2, 3, 0, 1, 2, 3])) } +// } +/// Broadcasts 128 bits from memory (composed of 2 packed double-precision +/// (64-bit) floating-point elements) to all elements of the returned vector. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_broadcast_pd) +pub fn _mm256_broadcast_pd(a: &__m128d) -> __m256d { + { transmute(simd_shuffle((*a).as_f64x2(), _mm_setzero_pd().as_f64x2(), [0, 1, 0, 1])) } +} +/// Copies `a` to result, then inserts 128 bits (composed of 4 packed +/// single-precision (32-bit) floating-point elements) from `b` into result +/// at the location specified by `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_insertf128_ps) +// pub fn _mm256_insertf128_ps(a: __m256, b: __m128) -> __m256 { +// static_assert_uimm_bits!(IMM1, 1); +// { +// transmute(simd_shuffle( +// a.as_f32x8(), _mm256_castps128_ps256(b), [[8, 9, 10, 11, 4, 5, 6, 7], [0, 1, 2, 3, 8, 9, +// 10, 11]] [IMM1 as usize], +// )) +// } +// } +/// Copies `a` to result, then inserts 128 bits (composed of 2 packed +/// double-precision (64-bit) floating-point elements) from `b` into result +/// at the location specified by `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_insertf128_pd) +// pub fn _mm256_insertf128_pd(a: __m256d, b: __m128d) -> __m256d { +// static_assert_uimm_bits!(IMM1, 1); +// { +// simd_shuffle( +// a, _mm256_castpd128_pd256(b), [[4, 5, 2, 3], [0, 1, 4, 5]] [IMM1 as usize], +// ) +// } +// } /// Copies `a` to result, then inserts 128 bits from `b` into result /// at the location specified by `imm8`. /// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insertf128_si256) +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_insertf128_si256) pub fn _mm256_insertf128_si256(a: __m256i, b: __m128i) -> __m256i { static_assert_uimm_bits!(IMM1, 1); - let dst: i64x4 = simd_shuffle( - a.as_i64x4(), - _mm256_castsi128_si256(b).as_i64x4(), - [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize], - ); - transmute(dst) + { + let dst: i64x4 = simd_shuffle( + a.as_i64x4(), _mm256_castsi128_si256(b).as_i64x4(), [[4, 5, 2, 3], [0, 1, 4, + 5]] [IMM1 as usize], + ); + transmute(dst) + } } - /// Copies `a` to result, and inserts the 8-bit integer `i` into result /// at the location specified by `index`. /// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insert_epi8) - -// This intrinsic has no corresponding instruction. - +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_insert_epi8) pub fn _mm256_insert_epi8(a: __m256i, i: i8) -> __m256i { static_assert_uimm_bits!(INDEX, 5); - simd_insert(a.as_i8x32(), INDEX as u32, i).into() + { transmute(simd_insert(a.as_i8x32(), INDEX as u32, i)) } } - /// Copies `a` to result, and inserts the 16-bit integer `i` into result /// at the location specified by `index`. /// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insert_epi16) - -// This intrinsic has no corresponding instruction. 
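// A minimal sketch, not one of the model functions above, of the shuffle-index
// trick used by `_mm256_insertf128_si256`: in its `simd_shuffle` call, indices
// 0..=3 name the i64 lanes of `a`, while indices 4..=5 name the two i64 lanes
// of `b` after it has been widened by `_mm256_castsi128_si256`. Selecting one
// of the two index arrays with `IMM1` therefore decides which 128-bit half of
// `a` is overwritten by `b`.
fn insertf128_si256_indices(imm1: usize) -> [u32; 4] {
    [[4, 5, 2, 3], [0, 1, 4, 5]][imm1]
}
// insertf128_si256_indices(0) == [4, 5, 2, 3]: `b` lands in the low half.
// insertf128_si256_indices(1) == [0, 1, 4, 5]: `b` lands in the high half.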
- +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_insert_epi16) pub fn _mm256_insert_epi16(a: __m256i, i: i16) -> __m256i { static_assert_uimm_bits!(INDEX, 4); - simd_insert(a.as_i16x16(), INDEX as u32, i).into() + { transmute(simd_insert(a.as_i16x16(), INDEX as u32, i)) } +} +/// Copies `a` to result, and inserts the 32-bit integer `i` into result +/// at the location specified by `index`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_insert_epi32) +pub fn _mm256_insert_epi32(a: __m256i, i: i32) -> __m256i { + static_assert_uimm_bits!(INDEX, 3); + { transmute(simd_insert(a.as_i32x8(), INDEX as u32, i)) } +} +/// Duplicate odd-indexed single-precision (32-bit) floating-point elements +/// from `a`, and returns the results. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_movehdup_ps) +pub fn _mm256_movehdup_ps(a: __m256) -> __m256 { + { transmute(simd_shuffle(a.as_f32x8(), a.as_f32x8(), [1, 1, 3, 3, 5, 5, 7, 7])) } +} +/// Duplicate even-indexed single-precision (32-bit) floating-point elements +/// from `a`, and returns the results. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_moveldup_ps) +pub fn _mm256_moveldup_ps(a: __m256) -> __m256 { + { transmute(simd_shuffle(a.as_f32x8(), a.as_f32x8(), [0, 0, 2, 2, 4, 4, 6, 6])) } +} +/// Duplicate even-indexed double-precision (64-bit) floating-point elements +/// from `a`, and returns the results. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_movedup_pd) +pub fn _mm256_movedup_pd(a: __m256d) -> __m256d { + { transmute(simd_shuffle(a.as_f64x4(), a.as_f64x4(), [0, 0, 2, 2])) } +} +/// Computes the approximate reciprocal of packed single-precision (32-bit) +/// floating-point elements in `a`, and returns the results. The maximum +/// relative error for this approximation is less than 1.5*2^-12. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_rcp_ps) +// pub fn _mm256_rcp_ps(a: __m256) -> __m256 { +// { vrcpps(a) } +// } +/// Computes the approximate reciprocal square root of packed single-precision +/// (32-bit) floating-point elements in `a`, and returns the results. +/// The maximum relative error for this approximation is less than 1.5*2^-12. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_rsqrt_ps) +// pub fn _mm256_rsqrt_ps(a: __m256) -> __m256 { +// { vrsqrtps(a) } +// } +/// Unpacks and interleave double-precision (64-bit) floating-point elements +/// from the high half of each 128-bit lane in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_unpackhi_pd) +pub fn _mm256_unpackhi_pd(a: __m256d, b: __m256d) -> __m256d { + { transmute(simd_shuffle(a.as_f64x4(), b.as_f64x4(), [1, 5, 3, 7])) } +} +/// Unpacks and interleave single-precision (32-bit) floating-point elements +/// from the high half of each 128-bit lane in `a` and `b`. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_unpackhi_ps) +pub fn _mm256_unpackhi_ps(a: __m256, b: __m256) -> __m256 { + { transmute(simd_shuffle(a.as_f32x8(), b.as_f32x8(), [2, 10, 3, 11, 6, 14, 7, 15])) } +} +/// Unpacks and interleave double-precision (64-bit) floating-point elements +/// from the low half of each 128-bit lane in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_unpacklo_pd) +pub fn _mm256_unpacklo_pd(a: __m256d, b: __m256d) -> __m256d { + { transmute(simd_shuffle(a.as_f64x4(), b.as_f64x4(), [0, 4, 2, 6])) } +} +/// Unpacks and interleave single-precision (32-bit) floating-point elements +/// from the low half of each 128-bit lane in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_unpacklo_ps) +pub fn _mm256_unpacklo_ps(a: __m256, b: __m256) -> __m256 { + { transmute(simd_shuffle(a.as_f32x8(), b.as_f32x8(), [0, 8, 1, 9, 4, 12, 5, 13])) } } - /// Computes the bitwise AND of 256 bits (representing integer data) in `a` and /// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0. /// Computes the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if /// the result is zero, otherwise set `CF` to 0. Return the `ZF` value. /// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_si256) +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_testz_si256) pub fn _mm256_testz_si256(a: __m256i, b: __m256i) -> i32 { - ptestz256(a.as_i64x4(), b.as_i64x4()) + { ptestz256(a.as_i64x4(), b.as_i64x4()) } +} +/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and +/// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0. +/// Computes the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if +/// the result is zero, otherwise set `CF` to 0. Return the `CF` value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_testc_si256) +pub fn _mm256_testc_si256(a: __m256i, b: __m256i) -> i32 { + { ptestc256(a.as_i64x4(), b.as_i64x4()) } +} +/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and +/// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0. +/// Computes the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if +/// the result is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and +/// `CF` values are zero, otherwise return 0. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_testnzc_si256) +// pub fn _mm256_testnzc_si256(a: __m256i, b: __m256i) -> i32 { +// { ptestnzc256(a.as_i64x4(), b.as_i64x4()) } +// } +/// Computes the bitwise AND of 256 bits (representing double-precision (64-bit) +/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit +/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the +/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise +/// NOT of `a` and then AND with `b`, producing an intermediate value, and set +/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value +/// is zero, otherwise set `CF` to 0. Return the `ZF` value. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_testz_pd) +// pub fn _mm256_testz_pd(a: __m256d, b: __m256d) -> i32 { +// { vtestzpd256(a, b) } +// } +/// Computes the bitwise AND of 256 bits (representing double-precision (64-bit) +/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit +/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the +/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise +/// NOT of `a` and then AND with `b`, producing an intermediate value, and set +/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value +/// is zero, otherwise set `CF` to 0. Return the `CF` value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_testc_pd) +// pub fn _mm256_testc_pd(a: __m256d, b: __m256d) -> i32 { +// { vtestcpd256(a, b) } +// } +/// Computes the bitwise AND of 256 bits (representing double-precision (64-bit) +/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit +/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the +/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise +/// NOT of `a` and then AND with `b`, producing an intermediate value, and set +/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value +/// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values +/// are zero, otherwise return 0. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_testnzc_pd) +// pub fn _mm256_testnzc_pd(a: __m256d, b: __m256d) -> i32 { +// { vtestnzcpd256(a, b) } +// } +/// Computes the bitwise AND of 128 bits (representing double-precision (64-bit) +/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit +/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the +/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise +/// NOT of `a` and then AND with `b`, producing an intermediate value, and set +/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value +/// is zero, otherwise set `CF` to 0. Return the `ZF` value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm_testz_pd) +// pub fn _mm_testz_pd(a: __m128d, b: __m128d) -> i32 { +// { vtestzpd(a, b) } +// } +/// Computes the bitwise AND of 128 bits (representing double-precision (64-bit) +/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit +/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the +/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise +/// NOT of `a` and then AND with `b`, producing an intermediate value, and set +/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value +/// is zero, otherwise set `CF` to 0. Return the `CF` value. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm_testc_pd) +// pub fn _mm_testc_pd(a: __m128d, b: __m128d) -> i32 { +// { vtestcpd(a, b) } +// } +/// Computes the bitwise AND of 128 bits (representing double-precision (64-bit) +/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit +/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the +/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise +/// NOT of `a` and then AND with `b`, producing an intermediate value, and set +/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value +/// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values +/// are zero, otherwise return 0. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm_testnzc_pd) +// pub fn _mm_testnzc_pd(a: __m128d, b: __m128d) -> i32 { +// { vtestnzcpd(a, b) } +// } +/// Computes the bitwise AND of 256 bits (representing single-precision (32-bit) +/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit +/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the +/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise +/// NOT of `a` and then AND with `b`, producing an intermediate value, and set +/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value +/// is zero, otherwise set `CF` to 0. Return the `ZF` value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_testz_ps) +// pub fn _mm256_testz_ps(a: __m256, b: __m256) -> i32 { +// { vtestzps256(a, b) } +// } +/// Computes the bitwise AND of 256 bits (representing single-precision (32-bit) +/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit +/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the +/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise +/// NOT of `a` and then AND with `b`, producing an intermediate value, and set +/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value +/// is zero, otherwise set `CF` to 0. Return the `CF` value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_testc_ps) +// pub fn _mm256_testc_ps(a: __m256, b: __m256) -> i32 { +// { vtestcps256(a, b) } +// } +/// Computes the bitwise AND of 256 bits (representing single-precision (32-bit) +/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit +/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the +/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise +/// NOT of `a` and then AND with `b`, producing an intermediate value, and set +/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value +/// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values +/// are zero, otherwise return 0. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_testnzc_ps) +// pub fn _mm256_testnzc_ps(a: __m256, b: __m256) -> i32 { +// { vtestnzcps256(a, b) } +// } +/// Computes the bitwise AND of 128 bits (representing single-precision (32-bit) +/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit +/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the +/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise +/// NOT of `a` and then AND with `b`, producing an intermediate value, and set +/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value +/// is zero, otherwise set `CF` to 0. Return the `ZF` value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm_testz_ps) +// pub fn _mm_testz_ps(a: __m128, b: __m128) -> i32 { +// { vtestzps(a, b) } +// } +/// Computes the bitwise AND of 128 bits (representing single-precision (32-bit) +/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit +/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the +/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise +/// NOT of `a` and then AND with `b`, producing an intermediate value, and set +/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value +/// is zero, otherwise set `CF` to 0. Return the `CF` value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm_testc_ps) +// pub fn _mm_testc_ps(a: __m128, b: __m128) -> i32 { +// { vtestcps(a, b) } +// } +/// Computes the bitwise AND of 128 bits (representing single-precision (32-bit) +/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit +/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the +/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise +/// NOT of `a` and then AND with `b`, producing an intermediate value, and set +/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value +/// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values +/// are zero, otherwise return 0. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm_testnzc_ps) +// pub fn _mm_testnzc_ps(a: __m128, b: __m128) -> i32 { +// { vtestnzcps(a, b) } +// } +/// Sets each bit of the returned mask based on the most significant bit of the +/// corresponding packed double-precision (64-bit) floating-point element in +/// `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_movemask_pd) +pub fn _mm256_movemask_pd(a: __m256d) -> i32 { + { + let mask: i64x4 = simd_lt(a.as_i64x4(), i64x4::ZERO()); + simd_bitmask_little!(3, mask, u8) as i32 + } } - /// Sets each bit of the returned mask based on the most significant bit of the /// corresponding packed single-precision (32-bit) floating-point element in /// `a`. /// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movemask_ps) +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_movemask_ps) pub fn _mm256_movemask_ps(a: __m256) -> i32 { - // Propagate the highest bit to the rest, because simd_bitmask - // requires all-1 or all-0. 
- let mask: i32x8 = simd_lt(a.as_i32x8(), i32x8::ZERO()); - let r = simd_bitmask_little!(7, mask, u8); - r as u32 as i32 + { + let mask: i32x8 = simd_lt(transmute(a), i32x8::ZERO()); + simd_bitmask_little!(7, mask, u8) as i32 + } +} +/// Returns vector of type __m256d with all elements set to zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_setzero_pd) +pub fn _mm256_setzero_pd() -> __m256d { + transmute(f64x4::ZERO()) } - /// Returns vector of type __m256 with all elements set to zero. /// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setzero_ps) - +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_setzero_ps) pub fn _mm256_setzero_ps() -> __m256 { - __m256::ZERO() + transmute(f32x8::ZERO()) } - /// Returns vector of type __m256i with all elements set to zero. /// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setzero_si256) - +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_setzero_si256) pub fn _mm256_setzero_si256() -> __m256i { - __m256i::ZERO() + transmute(i64x4::ZERO()) } - -/// Sets packed 8-bit integers in returned vector with the supplied values in -/// reverse order. +/// Sets packed double-precision (64-bit) floating-point elements in returned +/// vector with the supplied values. /// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi8) -pub fn _mm256_setr_epi8( +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_set_pd) +pub fn _mm256_set_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d { + _mm256_setr_pd(d, c, b, a) +} +/// Sets packed single-precision (32-bit) floating-point elements in returned +/// vector with the supplied values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_set_ps) +pub fn _mm256_set_ps( + a: f32, + b: f32, + c: f32, + d: f32, + e: f32, + f: f32, + g: f32, + h: f32, +) -> __m256 { + _mm256_setr_ps(h, g, f, e, d, c, b, a) +} +/// Sets packed 8-bit integers in returned vector with the supplied values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_set_epi8) +pub fn _mm256_set_epi8( e00: i8, e01: i8, e02: i8, @@ -225,16 +1206,45 @@ pub fn _mm256_setr_epi8( e30: i8, e31: i8, ) -> __m256i { - transmute(i8x32::new( - e00, e01, e02, e03, e04, e05, e06, e07, e08, e09, e10, e11, e12, e13, e14, e15, e16, e17, - e18, e19, e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31, - )) + _mm256_setr_epi8( + e31, + e30, + e29, + e28, + e27, + e26, + e25, + e24, + e23, + e22, + e21, + e20, + e19, + e18, + e17, + e16, + e15, + e14, + e13, + e12, + e11, + e10, + e09, + e08, + e07, + e06, + e05, + e04, + e03, + e02, + e01, + e00, + ) } -/// Sets packed 16-bit integers in returned vector with the supplied values in -/// reverse order. +/// Sets packed 16-bit integers in returned vector with the supplied values. 
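// A hypothetical sanity test, not part of the models themselves, illustrating
// the argument-order convention shared by the `set`/`setr` pairs above: `setr`
// takes lanes in index order (lane 0 first), while `set` takes them highest
// lane first and simply forwards to `setr` with its arguments reversed.
#[test]
fn set_vs_setr_lane_order() {
    // Both calls put 0.0 in lane 0 and 3.0 in lane 3, so the bit vectors agree.
    let hi_first = _mm256_set_pd(3.0, 2.0, 1.0, 0.0);
    let lane_order = _mm256_setr_pd(0.0, 1.0, 2.0, 3.0);
    assert_eq!(hi_first, lane_order);
}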
/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi16) -pub fn _mm256_setr_epi16( +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_set_epi16) +pub fn _mm256_set_epi16( e00: i16, e01: i16, e02: i16, @@ -252,15 +1262,29 @@ pub fn _mm256_setr_epi16( e14: i16, e15: i16, ) -> __m256i { - transmute(i16x16::new( - e00, e01, e02, e03, e04, e05, e06, e07, e08, e09, e10, e11, e12, e13, e14, e15, - )) + _mm256_setr_epi16( + e15, + e14, + e13, + e12, + e11, + e10, + e09, + e08, + e07, + e06, + e05, + e04, + e03, + e02, + e01, + e00, + ) } -/// Sets packed 32-bit integers in returned vector with the supplied values in -/// reverse order. +/// Sets packed 32-bit integers in returned vector with the supplied values. /// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi32) -pub fn _mm256_setr_epi32( +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_set_epi32) +pub fn _mm256_set_epi32( e0: i32, e1: i32, e2: i32, @@ -270,23 +1294,42 @@ pub fn _mm256_setr_epi32( e6: i32, e7: i32, ) -> __m256i { - transmute(i32x8::new(e0, e1, e2, e3, e4, e5, e6, e7)) + _mm256_setr_epi32(e7, e6, e5, e4, e3, e2, e1, e0) } -/// Sets packed 64-bit integers in returned vector with the supplied values in -/// reverse order. +/// Sets packed 64-bit integers in returned vector with the supplied values. /// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi64x) -pub fn _mm256_setr_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i { - transmute(i64x4::new(a, b, c, d)) +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_set_epi64x) +pub fn _mm256_set_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i { + _mm256_setr_epi64x(d, c, b, a) } - -/// Sets packed 8-bit integers in returned vector with the supplied values. +/// Sets packed double-precision (64-bit) floating-point elements in returned +/// vector with the supplied values in reverse order. /// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi8) - -// This intrinsic has no corresponding instruction. - -pub fn _mm256_set_epi8( +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_setr_pd) +pub fn _mm256_setr_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d { + transmute(f64x4::new(a, b, c, d)) +} +/// Sets packed single-precision (32-bit) floating-point elements in returned +/// vector with the supplied values in reverse order. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_setr_ps) +pub fn _mm256_setr_ps( + a: f32, + b: f32, + c: f32, + d: f32, + e: f32, + f: f32, + g: f32, + h: f32, +) -> __m256 { + transmute(f32x8::new(a, b, c, d, e, f, g, h)) +} +/// Sets packed 8-bit integers in returned vector with the supplied values in +/// reverse order. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_setr_epi8) +pub fn _mm256_setr_epi8( e00: i8, e01: i8, e02: i8, @@ -320,19 +1363,50 @@ pub fn _mm256_set_epi8( e30: i8, e31: i8, ) -> __m256i { - _mm256_setr_epi8( - e31, e30, e29, e28, e27, e26, e25, e24, e23, e22, e21, e20, e19, e18, e17, e16, e15, e14, - e13, e12, e11, e10, e09, e08, e07, e06, e05, e04, e03, e02, e01, e00, - ) + { + transmute( + i8x32::new( + e00, + e01, + e02, + e03, + e04, + e05, + e06, + e07, + e08, + e09, + e10, + e11, + e12, + e13, + e14, + e15, + e16, + e17, + e18, + e19, + e20, + e21, + e22, + e23, + e24, + e25, + e26, + e27, + e28, + e29, + e30, + e31, + ), + ) + } } - -/// Sets packed 16-bit integers in returned vector with the supplied values. +/// Sets packed 16-bit integers in returned vector with the supplied values in +/// reverse order. /// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi16) - -// This intrinsic has no corresponding instruction. - -pub fn _mm256_set_epi16( +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_setr_epi16) +pub fn _mm256_setr_epi16( e00: i16, e01: i16, e02: i16, @@ -350,18 +1424,34 @@ pub fn _mm256_set_epi16( e14: i16, e15: i16, ) -> __m256i { - _mm256_setr_epi16( - e15, e14, e13, e12, e11, e10, e09, e08, e07, e06, e05, e04, e03, e02, e01, e00, - ) + { + transmute( + i16x16::new( + e00, + e01, + e02, + e03, + e04, + e05, + e06, + e07, + e08, + e09, + e10, + e11, + e12, + e13, + e14, + e15, + ), + ) + } } - -/// Sets packed 32-bit integers in returned vector with the supplied values. +/// Sets packed 32-bit integers in returned vector with the supplied values in +/// reverse order. /// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi32) - -// This intrinsic has no corresponding instruction. - -pub fn _mm256_set_epi32( +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_setr_epi32) +pub fn _mm256_setr_epi32( e0: i32, e1: i32, e2: i32, @@ -371,106 +1461,278 @@ pub fn _mm256_set_epi32( e6: i32, e7: i32, ) -> __m256i { - _mm256_setr_epi32(e7, e6, e5, e4, e3, e2, e1, e0) + { transmute(i32x8::new(e0, e1, e2, e3, e4, e5, e6, e7)) } } - -/// Sets packed 64-bit integers in returned vector with the supplied values. +/// Sets packed 64-bit integers in returned vector with the supplied values in +/// reverse order. /// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi64x) -// This intrinsic has no corresponding instruction. -pub fn _mm256_set_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i { - _mm256_setr_epi64x(d, c, b, a) +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_setr_epi64x) +pub fn _mm256_setr_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i { + { transmute(i64x4::new(a, b, c, d)) } +} +/// Broadcasts double-precision (64-bit) floating-point value `a` to all +/// elements of returned vector. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_set1_pd) +pub fn _mm256_set1_pd(a: f64) -> __m256d { + _mm256_setr_pd(a, a, a, a) +} +/// Broadcasts single-precision (32-bit) floating-point value `a` to all +/// elements of returned vector. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_set1_ps) +pub fn _mm256_set1_ps(a: f32) -> __m256 { + _mm256_setr_ps(a, a, a, a, a, a, a, a) } - /// Broadcasts 8-bit integer `a` to all elements of returned vector. -/// This intrinsic may generate the `vpbroadcastw`. +/// This intrinsic may generate the `vpbroadcastb`. /// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi16) - -// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_set1_epi8) +pub fn _mm256_set1_epi8(a: i8) -> __m256i { -// This intrinsic has no corresponding instruction. - -pub fn _mm256_set1_epi8(val: i8) -> __m256i { - transmute(i8x32::from_fn(|_| val)) + _mm256_setr_epi8( + a, + a, + a, + a, + a, + a, + a, + a, + a, + a, + a, + a, + a, + a, + a, + a, + a, + a, + a, + a, + a, + a, + a, + a, + a, + a, + a, + a, + a, + a, + a, + a, + ) } - /// Broadcasts 16-bit integer `a` to all elements of returned vector. /// This intrinsic may generate the `vpbroadcastw`. /// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi16) - -// - -// This intrinsic has no corresponding instruction. - +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_set1_epi16) pub fn _mm256_set1_epi16(a: i16) -> __m256i { - transmute(i16x16::from_fn(|_| a)) + _mm256_setr_epi16(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a) } - /// Broadcasts 32-bit integer `a` to all elements of returned vector. /// This intrinsic may generate the `vpbroadcastd`. /// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi32) - -// This intrinsic has no corresponding instruction. - +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_set1_epi32) pub fn _mm256_set1_epi32(a: i32) -> __m256i { - transmute(i32x8::from_fn(|_| a)) + _mm256_setr_epi32(a, a, a, a, a, a, a, a) } - /// Broadcasts 64-bit integer `a` to all elements of returned vector. /// This intrinsic may generate the `vpbroadcastq`. /// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi64x) -// This intrinsic has no corresponding instruction. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_set1_epi64x) pub fn _mm256_set1_epi64x(a: i64) -> __m256i { - transmute(i64x4::from_fn(|_| a)) + _mm256_setr_epi64x(a, a, a, a) } - +/// Cast vector of type __m256d to type __m256. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_castpd_ps) +pub fn _mm256_castpd_ps(a: __m256d) -> __m256 { + { transmute(a) } +} +/// Cast vector of type __m256 to type __m256d. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_castps_pd) +pub fn _mm256_castps_pd(a: __m256) -> __m256d { + { transmute(a) } +} +/// Casts vector of type __m256 to type __m256i. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_castps_si256) pub fn _mm256_castps_si256(a: __m256) -> __m256i { - a + { transmute(a) } } - /// Casts vector of type __m256i to type __m256. 
/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi256_ps) -// This intrinsic is only used for compilation and does not generate any -// instructions, thus it has zero latency. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_castsi256_ps) pub fn _mm256_castsi256_ps(a: __m256i) -> __m256 { - a + { transmute(a) } +} +/// Casts vector of type __m256d to type __m256i. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_castpd_si256) +pub fn _mm256_castpd_si256(a: __m256d) -> __m256i { + { transmute(a) } +} +/// Casts vector of type __m256i to type __m256d. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_castsi256_pd) +pub fn _mm256_castsi256_pd(a: __m256i) -> __m256d { + { transmute(a) } +} +/// Casts vector of type __m256 to type __m128. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_castps256_ps128) +pub fn _mm256_castps256_ps128(a: __m256) -> __m128 { + { transmute(simd_shuffle(a.as_f32x8(), a.as_f32x8(), [0, 1, 2, 3])) } +} +/// Casts vector of type __m256d to type __m128d. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_castpd256_pd128) +pub fn _mm256_castpd256_pd128(a: __m256d) -> __m128d { + { transmute(simd_shuffle(a.as_f64x4(), a.as_f64x4(), [0, 1])) } } - /// Casts vector of type __m256i to type __m128i. /// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi256_si128) - -// This intrinsic is only used for compilation and does not generate any -// instructions, thus it has zero latency. - +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_castsi256_si128) pub fn _mm256_castsi256_si128(a: __m256i) -> __m128i { - __m128i::from_fn(|i| a[i]) + { + let a = a.as_i64x4(); + let dst: i64x2 = simd_shuffle(a, a, [0, 1]); + transmute(dst) + } +} +/// Casts vector of type __m128 to type __m256; +/// the upper 128 bits of the result are undefined. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_castps128_ps256) +// pub fn _mm256_castps128_ps256(a: __m128) -> __m256 { +// { simd_shuffle(a, _mm_undefined_ps(), [0, 1, 2, 3, 4, 4, 4, 4]) } +// } +/// Casts vector of type __m128d to type __m256d; +/// the upper 128 bits of the result are undefined. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_castpd128_pd256) +pub fn _mm256_castpd128_pd256(a: __m128d) -> __m256d { + { transmute(simd_shuffle(a.as_f64x2(), _mm_undefined_pd().as_f64x2(), [0, 1, 2, 2])) } } - /// Casts vector of type __m128i to type __m256i; /// the upper 128 bits of the result are undefined. /// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi128_si256) - -// This intrinsic is only used for compilation and does not generate any -// instructions, thus it has zero latency. 
- +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_castsi128_si256) pub fn _mm256_castsi128_si256(a: __m128i) -> __m256i { - let a = a.as_i64x2(); - let undefined = i64x2::from_fn(|_| 0); - let dst: i64x4 = simd_shuffle(a, undefined, [0, 1, 2, 2]); - transmute(dst) + { + let a = a.as_i64x2(); + let undefined = i64x2::ZERO(); + let dst: i64x4 = simd_shuffle(a, undefined, [0, 1, 2, 2]); + transmute(dst) + } +} +/// Constructs a 256-bit floating-point vector of `[8 x float]` from a +/// 128-bit floating-point vector of `[4 x float]`. The lower 128 bits contain +/// the value of the source vector. The upper 128 bits are set to zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_zextps128_ps256) +// pub fn _mm256_zextps128_ps256(a: __m128) -> __m256 { +// { simd_shuffle(a, _mm_setzero_ps(), [0, 1, 2, 3, 4, 5, 6, 7]) } +// } +/// Constructs a 256-bit integer vector from a 128-bit integer vector. +/// The lower 128 bits contain the value of the source vector. The upper +/// 128 bits are set to zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_zextsi128_si256) +pub fn _mm256_zextsi128_si256(a: __m128i) -> __m256i { + { + let b = i64x2::ZERO(); + let dst: i64x4 = simd_shuffle(a.as_i64x2(), b, [0, 1, 2, 3]); + transmute(dst) + } +} +/// Constructs a 256-bit floating-point vector of `[4 x double]` from a +/// 128-bit floating-point vector of `[2 x double]`. The lower 128 bits +/// contain the value of the source vector. The upper 128 bits are set +/// to zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_zextpd128_pd256) +// pub fn _mm256_zextpd128_pd256(a: __m128d) -> __m256d { +// { simd_shuffle(a, _mm_setzero_pd(), [0, 1, 2, 3]) } +// } +/// Returns vector of type `__m256` with indeterminate elements. +/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically +/// picks some valid value and is not equivalent to [`mem::MaybeUninit`]. +/// In practice, this is typically equivalent to [`mem::zeroed`]. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_undefined_ps) +pub fn _mm256_undefined_ps() -> __m256 { + transmute(f32x8::ZERO()) +} +/// Returns vector of type `__m256d` with indeterminate elements. +/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically +/// picks some valid value and is not equivalent to [`mem::MaybeUninit`]. +/// In practice, this is typically equivalent to [`mem::zeroed`]. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_undefined_pd) +pub fn _mm256_undefined_pd() -> __m256d { + transmute(f32x8::ZERO()) +} +/// Returns vector of type __m256i with with indeterminate elements. +/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically +/// picks some valid value and is not equivalent to [`mem::MaybeUninit`]. +/// In practice, this is typically equivalent to [`mem::zeroed`]. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_undefined_si256) +pub fn _mm256_undefined_si256() -> __m256i { + transmute(i32x8::ZERO()) +} +/// Sets packed __m256 returned vector with the supplied values. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_set_m128) +pub fn _mm256_set_m128(hi: __m128, lo: __m128) -> __m256 { + { transmute(simd_shuffle(lo.as_i32x4(), hi.as_i32x4(), [0, 1, 2, 3, 4, 5, 6, 7])) } +} +/// Sets packed __m256d returned vector with the supplied values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_set_m128d) +pub fn _mm256_set_m128d(hi: __m128d, lo: __m128d) -> __m256d { + { + let hi: __m128 = transmute(hi); + let lo: __m128 = transmute(lo); + transmute(_mm256_set_m128(hi, lo)) + } } - /// Sets packed __m256i returned vector with the supplied values. /// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_m128i) - +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_set_m128i) pub fn _mm256_set_m128i(hi: __m128i, lo: __m128i) -> __m256i { - __m256i::from_fn(|i| if i < 128 { lo[i] } else { hi[i - 128] }) + { + let hi: __m128 = transmute(hi); + let lo: __m128 = transmute(lo); + transmute(_mm256_set_m128(hi, lo)) + } +} +/// Sets packed __m256 returned vector with the supplied values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_setr_m128) +pub fn _mm256_setr_m128(lo: __m128, hi: __m128) -> __m256 { + _mm256_set_m128(hi, lo) +} +/// Sets packed __m256d returned vector with the supplied values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_setr_m128d) +pub fn _mm256_setr_m128d(lo: __m128d, hi: __m128d) -> __m256d { + _mm256_set_m128d(hi, lo) +} +/// Sets packed __m256i returned vector with the supplied values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_setr_m128i) +pub fn _mm256_setr_m128i(lo: __m128i, hi: __m128i) -> __m256i { + _mm256_set_m128i(hi, lo) +} +/// Returns the first element of the input vector of `[8 x float]`. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_cvtss_f32) +pub fn _mm256_cvtss_f32(a: __m256) -> f32 { + { simd_extract(a.as_f32x8(), 0) } } diff --git a/testable-simd-models/src/core_arch/x86/models/avx_handwritten.rs b/testable-simd-models/src/core_arch/x86/models/avx_handwritten.rs index 0fe71a69f3d15..ba61996851392 100644 --- a/testable-simd-models/src/core_arch/x86/models/avx_handwritten.rs +++ b/testable-simd-models/src/core_arch/x86/models/avx_handwritten.rs @@ -20,3 +20,12 @@ pub fn ptestz256(a: i64x4, b: i64x4) -> i32 { 0 } } + +pub fn ptestc256(a: i64x4, b: i64x4) -> i32 { + let c = i64x4::from_fn(|i| !a[i] & b[i]); + if c == i64x4::ZERO() { + 1 + } else { + 0 + } +} diff --git a/testable-simd-models/src/core_arch/x86/models/mod.rs b/testable-simd-models/src/core_arch/x86/models/mod.rs index 9ca800aed6722..3efc7f6791f03 100644 --- a/testable-simd-models/src/core_arch/x86/models/mod.rs +++ b/testable-simd-models/src/core_arch/x86/models/mod.rs @@ -39,5 +39,9 @@ pub(crate) mod types { #[allow(non_camel_case_types)] pub type __m256d = BitVec<256>; #[allow(non_camel_case_types)] + pub type __m128 = BitVec<128>; + #[allow(non_camel_case_types)] pub type __m128i = BitVec<128>; + #[allow(non_camel_case_types)] + pub type __m128d = BitVec<128>; } diff --git a/testable-simd-models/src/core_arch/x86/models/sse2.rs b/testable-simd-models/src/core_arch/x86/models/sse2.rs index da533234f142a..70964df12ce95 100644 --- a/testable-simd-models/src/core_arch/x86/models/sse2.rs +++ b/testable-simd-models/src/core_arch/x86/models/sse2.rs @@ -7,89 +7,73 @@ use crate::abstractions::utilities::*; /// Adds packed 8-bit integers in `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi8) - pub fn _mm_add_epi8(a: __m128i, b: __m128i) -> __m128i { - simd_add(a.as_i8x16(), b.as_i8x16()).into() + { transmute(simd_add(a.as_i8x16(), b.as_i8x16())) } } - /// Adds packed 16-bit integers in `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi16) - pub fn _mm_add_epi16(a: __m128i, b: __m128i) -> __m128i { - transmute(simd_add(a.as_i16x8(), b.as_i16x8())) + { transmute(simd_add(a.as_i16x8(), b.as_i16x8())) } } - /// Adds packed 32-bit integers in `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi32) - pub fn _mm_add_epi32(a: __m128i, b: __m128i) -> __m128i { - simd_add(a.as_i32x4(), b.as_i32x4()).into() + { transmute(simd_add(a.as_i32x4(), b.as_i32x4())) } } - /// Adds packed 64-bit integers in `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi64) - pub fn _mm_add_epi64(a: __m128i, b: __m128i) -> __m128i { - simd_add(a.as_i64x2(), b.as_i64x2()).into() + { transmute(simd_add(a.as_i64x2(), b.as_i64x2())) } } - /// Adds packed 8-bit integers in `a` and `b` using saturation. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi8) - pub fn _mm_adds_epi8(a: __m128i, b: __m128i) -> __m128i { - simd_saturating_add(a.as_i8x16(), b.as_i8x16()).into() + { transmute(simd_saturating_add(a.as_i8x16(), b.as_i8x16())) } } - /// Adds packed 16-bit integers in `a` and `b` using saturation. 
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi16) - pub fn _mm_adds_epi16(a: __m128i, b: __m128i) -> __m128i { - simd_saturating_add(a.as_i16x8(), b.as_i16x8()).into() + { transmute(simd_saturating_add(a.as_i16x8(), b.as_i16x8())) } } - /// Adds packed unsigned 8-bit integers in `a` and `b` using saturation. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu8) - pub fn _mm_adds_epu8(a: __m128i, b: __m128i) -> __m128i { - simd_saturating_add(a.as_u8x16(), b.as_u8x16()).into() + { transmute(simd_saturating_add(a.as_u8x16(), b.as_u8x16())) } } - /// Adds packed unsigned 16-bit integers in `a` and `b` using saturation. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu16) - pub fn _mm_adds_epu16(a: __m128i, b: __m128i) -> __m128i { - simd_saturating_add(a.as_u16x8(), b.as_u16x8()).into() + { transmute(simd_saturating_add(a.as_u16x8(), b.as_u16x8())) } } - /// Averages packed unsigned 8-bit integers in `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu8) - pub fn _mm_avg_epu8(a: __m128i, b: __m128i) -> __m128i { - let a = simd_cast::<16, _, u16>(a.as_u8x16()); - let b = simd_cast::<16, _, u16>(b.as_u8x16()); - let r = simd_shr(simd_add(simd_add(a, b), u16x16::splat(1)), u16x16::splat(1)); - simd_cast::<16, _, u8>(r).into() + { + let a = simd_cast::<16, _, u16>(a.as_u8x16()); + let b = simd_cast::<16, _, u16>(b.as_u8x16()); + let r = simd_shr(simd_add(simd_add(a, b), u16x16::splat(1)), u16x16::splat(1)); + transmute(simd_cast::<16, _, u8>(r)) + } } - /// Averages packed unsigned 16-bit integers in `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu16) - pub fn _mm_avg_epu16(a: __m128i, b: __m128i) -> __m128i { - let a = simd_cast::<8, _, u32>(a.as_u16x8()); - let b = simd_cast::<8, _, u32>(b.as_u16x8()); - let r = simd_shr(simd_add(simd_add(a, b), u32x8::splat(1)), u32x8::splat(1)); - simd_cast::<8, _, u16>(r).into() + { + let a = simd_cast::<8, _, u32>(a.as_u16x8()); + let b = simd_cast::<8, _, u32>(b.as_u16x8()); + let r = simd_shr(simd_add(simd_add(a, b), u32x8::splat(1)), u32x8::splat(1)); + transmute(simd_cast::<8, _, u16>(r)) + } } - /// Multiplies and then horizontally add signed 16 bit integers in `a` and `b`. /// /// Multiplies packed signed 16-bit integers in `a` and `b`, producing @@ -97,108 +81,104 @@ pub fn _mm_avg_epu16(a: __m128i, b: __m128i) -> __m128i { /// intermediate 32-bit integers. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd_epi16) - pub fn _mm_madd_epi16(a: __m128i, b: __m128i) -> __m128i { - pmaddwd(a.as_i16x8(), b.as_i16x8()).into() + { transmute(pmaddwd(a.as_i16x8(), b.as_i16x8())) } } - /// Compares packed 16-bit integers in `a` and `b`, and returns the packed /// maximum values. 
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi16) - pub fn _mm_max_epi16(a: __m128i, b: __m128i) -> __m128i { - let a = a.as_i16x8(); - let b = b.as_i16x8(); - simd_select(simd_gt(a, b), a, b).into() + { + let a = a.as_i16x8(); + let b = b.as_i16x8(); + transmute(simd_select(simd_gt(a, b), a, b)) + } } - /// Compares packed unsigned 8-bit integers in `a` and `b`, and returns the /// packed maximum values. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu8) - pub fn _mm_max_epu8(a: __m128i, b: __m128i) -> __m128i { - let a = a.as_u8x16(); - let b = b.as_u8x16(); - simd_select(simd_gt(a, b), a, b).into() + { + let a = a.as_u8x16(); + let b = b.as_u8x16(); + transmute(simd_select(simd_gt(a, b), a, b)) + } } - /// Compares packed 16-bit integers in `a` and `b`, and returns the packed /// minimum values. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi16) - pub fn _mm_min_epi16(a: __m128i, b: __m128i) -> __m128i { - let a = a.as_i16x8(); - let b = b.as_i16x8(); - simd_select(simd_lt(a, b), a, b).into() + { + let a = a.as_i16x8(); + let b = b.as_i16x8(); + transmute(simd_select(simd_lt(a, b), a, b)) + } } - /// Compares packed unsigned 8-bit integers in `a` and `b`, and returns the /// packed minimum values. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu8) - pub fn _mm_min_epu8(a: __m128i, b: __m128i) -> __m128i { - let a = a.as_u8x16(); - let b = b.as_u8x16(); - simd_select(simd_lt(a, b), a, b).into() + { + let a = a.as_u8x16(); + let b = b.as_u8x16(); + transmute(simd_select(simd_lt(a, b), a, b)) + } } - /// Multiplies the packed 16-bit integers in `a` and `b`. /// /// The multiplication produces intermediate 32-bit integers, and returns the /// high 16 bits of the intermediate integers. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epi16) - pub fn _mm_mulhi_epi16(a: __m128i, b: __m128i) -> __m128i { - let a = simd_cast::<8, i16, i32>(a.as_i16x8()); - let b = simd_cast::<8, i16, i32>(b.as_i16x8()); - let r = simd_shr(simd_mul(a, b), i32x8::splat(16)); - transmute(simd_cast::<8, i32, i16>(r)) + { + let a = simd_cast::<8, _, i32>(a.as_i16x8()); + let b = simd_cast::<8, _, i32>(b.as_i16x8()); + let r = simd_shr(simd_mul(a, b), i32x8::splat(16)); + transmute(simd_cast::<8, i32, i16>(r)) + } } - /// Multiplies the packed unsigned 16-bit integers in `a` and `b`. /// /// The multiplication produces intermediate 32-bit integers, and returns the /// high 16 bits of the intermediate integers. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epu16) - pub fn _mm_mulhi_epu16(a: __m128i, b: __m128i) -> __m128i { - let a = simd_cast::<8, _, u32>(a.as_u16x8()); - let b = simd_cast::<8, _, u32>(b.as_u16x8()); - let r = simd_shr(simd_mul(a, b), u32x8::splat(16)); - simd_cast::<8, u32, u16>(r).into() + { + let a = simd_cast::<8, _, u32>(a.as_u16x8()); + let b = simd_cast::<8, _, u32>(b.as_u16x8()); + let r = simd_shr(simd_mul(a, b), u32x8::splat(16)); + transmute(simd_cast::<8, u32, u16>(r)) + } } - /// Multiplies the packed 16-bit integers in `a` and `b`. 
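// Illustrative scalar sketch of the high-half multiply modelled by
// `_mm_mulhi_epi16` above: multiply in a wider type and keep the top 16 bits.
fn mulhi_i16(a: i16, b: i16) -> i16 {
    (((a as i32) * (b as i32)) >> 16) as i16
}
// e.g. mulhi_i16(i16::MAX, i16::MAX) == 16383, whereas the truncating low half
// (as in `_mm_mullo_epi16`) would give 1.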
/// /// The multiplication produces intermediate 32-bit integers, and returns the /// low 16 bits of the intermediate integers. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi16) - pub fn _mm_mullo_epi16(a: __m128i, b: __m128i) -> __m128i { - transmute(simd_mul(a.as_i16x8(), b.as_i16x8())) + { transmute(simd_mul(a.as_i16x8(), b.as_i16x8())) } } - /// Multiplies the low unsigned 32-bit integers from each packed 64-bit element /// in `a` and `b`. /// /// Returns the unsigned 64-bit results. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epu32) - pub fn _mm_mul_epu32(a: __m128i, b: __m128i) -> __m128i { - let a = a.as_u64x2(); - let b = b.as_u64x2(); - let mask = u64x2::splat(u32::MAX.into()); - simd_mul(simd_and(a, mask), simd_and(b, mask)).into() + { + let a = a.as_u64x2(); + let b = b.as_u64x2(); + let mask = u64x2::splat(u32::MAX.into()); + transmute(simd_mul(simd_and(a, mask), simd_and(b, mask))) + } } - /// Sum the absolute differences of packed unsigned 8-bit integers. /// /// Computes the absolute differences of packed unsigned 8-bit integers in `a` @@ -207,102 +187,76 @@ pub fn _mm_mul_epu32(a: __m128i, b: __m128i) -> __m128i { /// the low 16 bits of 64-bit elements returned. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_epu8) - pub fn _mm_sad_epu8(a: __m128i, b: __m128i) -> __m128i { - psadbw(a.as_u8x16(), b.as_u8x16()).into() + { transmute(psadbw(a.as_u8x16(), b.as_u8x16())) } } - /// Subtracts packed 8-bit integers in `b` from packed 8-bit integers in `a`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi8) - pub fn _mm_sub_epi8(a: __m128i, b: __m128i) -> __m128i { - transmute(simd_sub(a.as_i8x16(), b.as_i8x16())) + { transmute(simd_sub(a.as_i8x16(), b.as_i8x16())) } } - /// Subtracts packed 16-bit integers in `b` from packed 16-bit integers in `a`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi16) - pub fn _mm_sub_epi16(a: __m128i, b: __m128i) -> __m128i { - transmute(simd_sub(a.as_i16x8(), b.as_i16x8())) + { transmute(simd_sub(a.as_i16x8(), b.as_i16x8())) } } - /// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi32) - pub fn _mm_sub_epi32(a: __m128i, b: __m128i) -> __m128i { - simd_sub(a.as_i32x4(), b.as_i32x4()).into() + { transmute(simd_sub(a.as_i32x4(), b.as_i32x4())) } } - /// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi64) - pub fn _mm_sub_epi64(a: __m128i, b: __m128i) -> __m128i { - simd_sub(a.as_i64x2(), b.as_i64x2()).into() + { transmute(simd_sub(a.as_i64x2(), b.as_i64x2())) } } - /// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a` /// using saturation. 
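// Illustrative scalar sketch of one 64-bit lane of `_mm_mul_epu32` above:
// the `u32::MAX` mask keeps only the low 32 bits of each lane before the
// full-width multiply, so the product always fits in 64 bits.
fn mul_epu32_lane(a: u64, b: u64) -> u64 {
    (a & u32::MAX as u64) * (b & u32::MAX as u64)
}
// e.g. mul_epu32_lane(0xFFFF_FFFF_0000_0002, 3) == 6: the high half of `a` is ignored.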
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi8) - pub fn _mm_subs_epi8(a: __m128i, b: __m128i) -> __m128i { - simd_saturating_sub(a.as_i8x16(), b.as_i8x16()).into() + { transmute(simd_saturating_sub(a.as_i8x16(), b.as_i8x16())) } } - /// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a` /// using saturation. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi16) - pub fn _mm_subs_epi16(a: __m128i, b: __m128i) -> __m128i { - simd_saturating_sub(a.as_i16x8(), b.as_i16x8()).into() + { transmute(simd_saturating_sub(a.as_i16x8(), b.as_i16x8())) } } - /// Subtract packed unsigned 8-bit integers in `b` from packed unsigned 8-bit /// integers in `a` using saturation. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu8) - pub fn _mm_subs_epu8(a: __m128i, b: __m128i) -> __m128i { - simd_saturating_sub(a.as_u8x16(), b.as_u8x16()).into() + { transmute(simd_saturating_sub(a.as_u8x16(), b.as_u8x16())) } } - /// Subtract packed unsigned 16-bit integers in `b` from packed unsigned 16-bit /// integers in `a` using saturation. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu16) - pub fn _mm_subs_epu16(a: __m128i, b: __m128i) -> __m128i { - simd_saturating_sub(a.as_u16x8(), b.as_u16x8()).into() + { transmute(simd_saturating_sub(a.as_u16x8(), b.as_u16x8())) } } - /// Shifts `a` left by `IMM8` bytes while shifting in zeros. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_si128) - pub fn _mm_slli_si128(a: __m128i) -> __m128i { - // static_assert_uimm_bits!(IMM8, 8); - _mm_slli_si128_impl::(a) + static_assert_uimm_bits!(IMM8, 8); + { _mm_slli_si128_impl::(a) } } -/// Implementation detail: converts the immediate argument of the -/// `_mm_slli_si128` intrinsic into a compile-time constant. - fn _mm_slli_si128_impl(a: __m128i) -> __m128i { const fn mask(shift: i32, i: u32) -> u32 { let shift = shift as u32 & 0xff; - if shift > 15 { - i as u32 - } else { - (16 - shift + i) as u32 - } + if shift > 15 { i } else { 16 - shift + i } } - (simd_shuffle( - i8x16::from_fn(|_| 0), + transmute::(simd_shuffle( + i8x16::ZERO(), a.as_i8x16(), [ mask(IMM8, 0), @@ -323,397 +277,361 @@ fn _mm_slli_si128_impl(a: __m128i) -> __m128i { mask(IMM8, 15), ], )) - .into() } /// Shifts `a` left by `IMM8` bytes while shifting in zeros. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bslli_si128) - pub fn _mm_bslli_si128(a: __m128i) -> __m128i { - // static_assert_uimm_bits!(IMM8, 8); - _mm_slli_si128_impl::(a) + { + static_assert_uimm_bits!(IMM8, 8); + _mm_slli_si128_impl::(a) + } } - /// Shifts `a` right by `IMM8` bytes while shifting in zeros. 
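// Illustrative sketch of the shuffle-index trick in `_mm_slli_si128_impl`
// above: the first shuffle operand is an all-zero vector, so any index below
// 16 selects a zero byte, while `16 - shift + i` selects byte `i - shift` of
// `a` (indices 16..31 address the second operand).
fn slli_si128_index(shift: u32, i: u32) -> u32 {
    if shift > 15 { i } else { 16 - shift + i }
}
// e.g. with shift = 3: indices for i = 0..2 stay below 16 (zero bytes) and
// i = 3 maps to 16, i.e. byte 0 of `a`, reproducing a 3-byte left shift.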
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bsrli_si128) - pub fn _mm_bsrli_si128(a: __m128i) -> __m128i { - // static_assert_uimm_bits!(IMM8, 8); - _mm_srli_si128_impl::(a) + { + static_assert_uimm_bits!(IMM8, 8); + _mm_srli_si128_impl::(a) + } } +fn _mm_srli_si128_impl(a: __m128i) -> __m128i { + const fn mask(shift: i32, i: u32) -> u32 { + if (shift as u32) > 15 { + i + 16 + } else { + i + (shift as u32) + } + } + let x: i8x16 = simd_shuffle( + a.as_i8x16(), + i8x16::ZERO(), + [ + mask(IMM8, 0), + mask(IMM8, 1), + mask(IMM8, 2), + mask(IMM8, 3), + mask(IMM8, 4), + mask(IMM8, 5), + mask(IMM8, 6), + mask(IMM8, 7), + mask(IMM8, 8), + mask(IMM8, 9), + mask(IMM8, 10), + mask(IMM8, 11), + mask(IMM8, 12), + mask(IMM8, 13), + mask(IMM8, 14), + mask(IMM8, 15), + ], + ); + transmute(x) +} /// Shifts packed 16-bit integers in `a` left by `IMM8` while shifting in zeros. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi16) - pub fn _mm_slli_epi16(a: __m128i) -> __m128i { - // static_assert_uimm_bits!(IMM8, 8); - - if IMM8 >= 16 { - _mm_setzero_si128() - } else { - simd_shl(a.as_u16x8(), u16x8::splat(IMM8 as u16)).into() + static_assert_uimm_bits!(IMM8, 8); + { + if IMM8 >= 16 { + _mm_setzero_si128() + } else { + transmute(simd_shl(a.as_u16x8(), u16x8::splat(IMM8 as u16))) + } } } - /// Shifts packed 16-bit integers in `a` left by `count` while shifting in /// zeros. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi16) - pub fn _mm_sll_epi16(a: __m128i, count: __m128i) -> __m128i { - psllw(count.as_i16x8(), a.as_i16x8()).into() + { transmute(psllw(a.as_i16x8(), count.as_i16x8())) } } - /// Shifts packed 32-bit integers in `a` left by `IMM8` while shifting in zeros. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi32) - pub fn _mm_slli_epi32(a: __m128i) -> __m128i { - // static_assert_uimm_bits!(IMM8, 8); - - if IMM8 >= 32 { - _mm_setzero_si128() - } else { - simd_shl(a.as_u32x4(), u32x4::splat(IMM8 as u32)).into() + static_assert_uimm_bits!(IMM8, 8); + { + if IMM8 >= 32 { + _mm_setzero_si128() + } else { + transmute(simd_shl(a.as_u32x4(), u32x4::splat(IMM8 as u32))) + } } } - /// Shifts packed 32-bit integers in `a` left by `count` while shifting in /// zeros. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi32) - pub fn _mm_sll_epi32(a: __m128i, count: __m128i) -> __m128i { - pslld(count.as_i32x4(), a.as_i32x4()).into() + { transmute(pslld(a.as_i32x4(), count.as_i32x4())) } } - /// Shifts packed 64-bit integers in `a` left by `IMM8` while shifting in zeros. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi64) - pub fn _mm_slli_epi64(a: __m128i) -> __m128i { - // static_assert_uimm_bits!(IMM8, 8); - - if IMM8 >= 64 { - _mm_setzero_si128() - } else { - simd_shl(a.as_u64x2(), u64x2::splat(IMM8 as u64)).into() + static_assert_uimm_bits!(IMM8, 8); + { + if IMM8 >= 64 { + _mm_setzero_si128() + } else { + transmute(simd_shl(a.as_u64x2(), u64x2::splat(IMM8 as u64))) + } } } - /// Shifts packed 64-bit integers in `a` left by `count` while shifting in /// zeros. 
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi64) - pub fn _mm_sll_epi64(a: __m128i, count: __m128i) -> __m128i { - psllq(count.as_i64x2(), a.as_i64x2()).into() + { transmute(psllq(a.as_i64x2(), count.as_i64x2())) } } - /// Shifts packed 16-bit integers in `a` right by `IMM8` while shifting in sign /// bits. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi16) - pub fn _mm_srai_epi16(a: __m128i) -> __m128i { - // static_assert_uimm_bits!(IMM8, 8); - simd_shr(a.as_i16x8(), i16x8::splat(IMM8.min(15) as i16)).into() + static_assert_uimm_bits!(IMM8, 8); + { transmute(simd_shr(a.as_i16x8(), i16x8::splat(IMM8.min(15) as i16))) } } - /// Shifts packed 16-bit integers in `a` right by `count` while shifting in sign /// bits. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi16) - pub fn _mm_sra_epi16(a: __m128i, count: __m128i) -> __m128i { - psraw(a.as_i16x8(), count.as_i16x8()).into() + { transmute(psraw(a.as_i16x8(), count.as_i16x8())) } } - /// Shifts packed 32-bit integers in `a` right by `IMM8` while shifting in sign /// bits. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi32) - pub fn _mm_srai_epi32(a: __m128i) -> __m128i { - // static_assert_uimm_bits!(IMM8, 8); - simd_shr(a.as_i32x4(), i32x4::splat(IMM8.min(31))).into() + static_assert_uimm_bits!(IMM8, 8); + { transmute(simd_shr(a.as_i32x4(), i32x4::splat(IMM8.min(31)))) } } - /// Shifts packed 32-bit integers in `a` right by `count` while shifting in sign /// bits. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi32) - pub fn _mm_sra_epi32(a: __m128i, count: __m128i) -> __m128i { - psrad(a.as_i32x4(), count.as_i32x4()).into() + { transmute(psrad(a.as_i32x4(), count.as_i32x4())) } } - /// Shifts `a` right by `IMM8` bytes while shifting in zeros. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_si128) - pub fn _mm_srli_si128(a: __m128i) -> __m128i { - // static_assert_uimm_bits!(IMM8, 8); - _mm_srli_si128_impl::(a) -} - -/// Implementation detail: converts the immediate argument of the -/// `_mm_srli_si128` intrinsic into a compile-time constant. - -fn _mm_srli_si128_impl(a: __m128i) -> __m128i { - const fn mask(shift: i32, i: u32) -> u32 { - if (shift as u32) > 15 { - (i + 16) as u32 - } else { - (i + (shift as u32)) as u32 - } - } - let x: i8x16 = simd_shuffle( - a.as_i8x16(), - i8x16::from_fn(|_| 0), - [ - mask(IMM8, 0), - mask(IMM8, 1), - mask(IMM8, 2), - mask(IMM8, 3), - mask(IMM8, 4), - mask(IMM8, 5), - mask(IMM8, 6), - mask(IMM8, 7), - mask(IMM8, 8), - mask(IMM8, 9), - mask(IMM8, 10), - mask(IMM8, 11), - mask(IMM8, 12), - mask(IMM8, 13), - mask(IMM8, 14), - mask(IMM8, 15), - ], - ); - x.into() + static_assert_uimm_bits!(IMM8, 8); + { _mm_srli_si128_impl::(a) } } - /// Shifts packed 16-bit integers in `a` right by `IMM8` while shifting in /// zeros. 
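// Illustrative scalar sketch of why `_mm_srai_epi16` above clamps the count
// with `IMM8.min(15)`: an arithmetic right shift by 15 already fills a 16-bit
// lane with copies of the sign bit, so any larger count behaves the same.
fn srai_i16(a: i16, imm8: u32) -> i16 {
    a >> imm8.min(15)
}
// e.g. srai_i16(-1024, 40) == -1 and srai_i16(1024, 40) == 0.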
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi16) - pub fn _mm_srli_epi16(a: __m128i) -> __m128i { - // static_assert_uimm_bits!(IMM8, 8); - - if IMM8 >= 16 { - _mm_setzero_si128() - } else { - simd_shr(a.as_u16x8(), u16x8::splat(IMM8 as u16)).into() + static_assert_uimm_bits!(IMM8, 8); + { + if IMM8 >= 16 { + _mm_setzero_si128() + } else { + transmute(simd_shr(a.as_u16x8(), u16x8::splat(IMM8 as u16))) + } } } - /// Shifts packed 16-bit integers in `a` right by `count` while shifting in /// zeros. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi16) - pub fn _mm_srl_epi16(a: __m128i, count: __m128i) -> __m128i { - psrlw(a.as_i16x8(), count.as_i16x8()).into() + { transmute(psrlw(a.as_i16x8(), count.as_i16x8())) } } - /// Shifts packed 32-bit integers in `a` right by `IMM8` while shifting in /// zeros. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi32) - pub fn _mm_srli_epi32(a: __m128i) -> __m128i { - // static_assert_uimm_bits!(IMM8, 8); - - if IMM8 >= 32 { - _mm_setzero_si128() - } else { - simd_shr(a.as_u32x4(), u32x4::splat(IMM8 as u32)).into() + static_assert_uimm_bits!(IMM8, 8); + { + if IMM8 >= 32 { + _mm_setzero_si128() + } else { + transmute(simd_shr(a.as_u32x4(), u32x4::splat(IMM8 as u32))) + } } } - /// Shifts packed 32-bit integers in `a` right by `count` while shifting in /// zeros. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi32) - pub fn _mm_srl_epi32(a: __m128i, count: __m128i) -> __m128i { - psrld(count.as_i32x4(), a.as_i32x4()).into() + { transmute(psrld(a.as_i32x4(), count.as_i32x4())) } } - /// Shifts packed 64-bit integers in `a` right by `IMM8` while shifting in /// zeros. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi64) - pub fn _mm_srli_epi64(a: __m128i) -> __m128i { - // TODO // static_assert_uimm_bits!(IMM8, 8); - - if IMM8 >= 64 { - __m128i::ZERO() - } else { - __m128i::from_u64x2(simd_shr(a.as_u64x2(), u64x2::splat(IMM8 as u64))) + static_assert_uimm_bits!(IMM8, 8); + { + if IMM8 >= 64 { + _mm_setzero_si128() + } else { + transmute(simd_shr(a.as_u64x2(), u64x2::splat(IMM8 as u64))) + } } } - /// Shifts packed 64-bit integers in `a` right by `count` while shifting in /// zeros. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi64) - pub fn _mm_srl_epi64(a: __m128i, count: __m128i) -> __m128i { - psrlq(count.as_i64x2(), a.as_i64x2()).into() + { transmute(psrlq(a.as_i64x2(), count.as_i64x2())) } } - /// Computes the bitwise AND of 128 bits (representing integer data) in `a` and /// `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_si128) - pub fn _mm_and_si128(a: __m128i, b: __m128i) -> __m128i { - __m128i::from_fn(|i| a[i] & b[i]) + { transmute(simd_and(a.as_i32x4(), b.as_i32x4())) } } - /// Computes the bitwise NOT of 128 bits (representing integer data) in `a` and /// then AND with `b`. 
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_si128) - pub fn _mm_andnot_si128(a: __m128i, b: __m128i) -> __m128i { - __m128i::from_fn(|i| (!a[i]) & b[i]) + { transmute(simd_and(simd_xor(_mm_set1_epi8(-1).as_i32x4(), a.as_i32x4()), b.as_i32x4())) } } - /// Computes the bitwise OR of 128 bits (representing integer data) in `a` and /// `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_si128) - pub fn _mm_or_si128(a: __m128i, b: __m128i) -> __m128i { - __m128i::from_fn(|i| a[i] | b[i]) + { transmute(simd_or(a.as_i32x4(), b.as_i32x4())) } } - /// Computes the bitwise XOR of 128 bits (representing integer data) in `a` and /// `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_si128) - pub fn _mm_xor_si128(a: __m128i, b: __m128i) -> __m128i { - __m128i::from_fn(|i| a[i] ^ b[i]) + { transmute(simd_xor(a.as_i32x4(), b.as_i32x4())) } } - /// Compares packed 8-bit integers in `a` and `b` for equality. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi8) - pub fn _mm_cmpeq_epi8(a: __m128i, b: __m128i) -> __m128i { - transmute(simd_eq(a.as_i8x16(), b.as_i8x16())) + { transmute::(simd_eq(a.as_i8x16(), b.as_i8x16())) } } - /// Compares packed 16-bit integers in `a` and `b` for equality. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi16) - pub fn _mm_cmpeq_epi16(a: __m128i, b: __m128i) -> __m128i { - transmute(simd_eq(a.as_i16x8(), b.as_i16x8())) + { transmute::(simd_eq(a.as_i16x8(), b.as_i16x8())) } } - /// Compares packed 32-bit integers in `a` and `b` for equality. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi32) - pub fn _mm_cmpeq_epi32(a: __m128i, b: __m128i) -> __m128i { - transmute(simd_eq(a.as_i32x4(), b.as_i32x4())) + { transmute::(simd_eq(a.as_i32x4(), b.as_i32x4())) } } - /// Compares packed 8-bit integers in `a` and `b` for greater-than. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi8) - pub fn _mm_cmpgt_epi8(a: __m128i, b: __m128i) -> __m128i { - transmute(simd_gt(a.as_i8x16(), b.as_i8x16())) + { transmute::(simd_gt(a.as_i8x16(), b.as_i8x16())) } } - /// Compares packed 16-bit integers in `a` and `b` for greater-than. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi16) - pub fn _mm_cmpgt_epi16(a: __m128i, b: __m128i) -> __m128i { - transmute(simd_gt(a.as_i16x8(), b.as_i16x8())) + { transmute::(simd_gt(a.as_i16x8(), b.as_i16x8())) } } - /// Compares packed 32-bit integers in `a` and `b` for greater-than. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi32) - pub fn _mm_cmpgt_epi32(a: __m128i, b: __m128i) -> __m128i { - transmute(simd_gt(a.as_i32x4(), b.as_i32x4())) + { transmute::(simd_gt(a.as_i32x4(), b.as_i32x4())) } } - /// Compares packed 8-bit integers in `a` and `b` for less-than. 
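// Illustrative scalar sketch of the `_mm_andnot_si128` encoding above:
// XOR-ing with an all-ones value (here `_mm_set1_epi8(-1)`) is bitwise NOT,
// so each lane computes `(!a) & b`.
fn andnot_i32(a: i32, b: i32) -> i32 {
    (a ^ -1) & b
}
// e.g. andnot_i32(0b1100, 0b1010) == 0b0010, the bits of `b` that are not set in `a`.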
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi8) - pub fn _mm_cmplt_epi8(a: __m128i, b: __m128i) -> __m128i { - transmute(simd_lt(a.as_i8x16(), b.as_i8x16())) + { transmute::(simd_lt(a.as_i8x16(), b.as_i8x16())) } } - /// Compares packed 16-bit integers in `a` and `b` for less-than. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi16) - pub fn _mm_cmplt_epi16(a: __m128i, b: __m128i) -> __m128i { - transmute(simd_lt(a.as_i16x8(), b.as_i16x8())) + { transmute::(simd_lt(a.as_i16x8(), b.as_i16x8())) } } - /// Compares packed 32-bit integers in `a` and `b` for less-than. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi32) - pub fn _mm_cmplt_epi32(a: __m128i, b: __m128i) -> __m128i { - transmute(simd_lt(a.as_i32x4(), b.as_i32x4())) + { transmute::(simd_lt(a.as_i32x4(), b.as_i32x4())) } } - +/// Converts the lower two packed 32-bit integers in `a` to packed +/// double-precision (64-bit) floating-point elements. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_pd) +pub fn _mm_cvtepi32_pd(a: __m128i) -> __m128d { + { + let a = a.as_i32x4(); + transmute(simd_cast::<2, i32, f64>(simd_shuffle(a, a, [0, 1]))) + } +} +/// Returns `a` with its lower element replaced by `b` after converting it to +/// an `f64`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_sd) +pub fn _mm_cvtsi32_sd(a: __m128d, b: i32) -> __m128d { + { transmute(simd_insert(a.as_f64x2(), 0, b as f64)) } +} +/// Converts packed 32-bit integers in `a` to packed single-precision (32-bit) +/// floating-point elements. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_ps) +pub fn _mm_cvtepi32_ps(a: __m128i) -> __m128 { + { transmute(simd_cast::<4, _, f32>(a.as_i32x4())) } +} +/// Converts packed single-precision (32-bit) floating-point elements in `a` +/// to packed 32-bit integers. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_epi32) +// pub fn _mm_cvtps_epi32(a: __m128) -> __m128i { +// { transmute(cvtps2dq(a)) } +// } +/// Returns a vector whose lowest element is `a` and all higher elements are +/// `0`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_si128) pub fn _mm_cvtsi32_si128(a: i32) -> __m128i { - i32x4::from_fn(|i| if i == 0 { a } else { 0 }).into() + { transmute(i32x4::new(a, 0, 0, 0)) } } - /// Returns the lowest element of `a`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si32) - pub fn _mm_cvtsi128_si32(a: __m128i) -> i32 { - simd_extract(a.as_i32x4(), 0) + { simd_extract(a.as_i32x4(), 0) } } - /// Sets packed 64-bit integers with the supplied values, from highest to /// lowest. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64x) - -// no particular instruction to test - pub fn _mm_set_epi64x(e1: i64, e0: i64) -> __m128i { - i64x2::from_fn(|i| if i == 0 { e0 } else { e1 }).into() + { transmute(i64x2::new(e0, e1)) } } - /// Sets packed 32-bit integers with the supplied values. 
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi32) -// no particular instruction to test pub fn _mm_set_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i { - let vec = [e0, e1, e2, e3]; - transmute(i32x4::from_fn(|i| vec[i as usize])) + { transmute(i32x4::new(e0, e1, e2, e3)) } } - /// Sets packed 16-bit integers with the supplied values. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi16) - -// no particular instruction to test - pub fn _mm_set_epi16( e7: i16, e6: i16, @@ -724,14 +642,11 @@ pub fn _mm_set_epi16( e1: i16, e0: i16, ) -> __m128i { - let vec = [e0, e1, e2, e3, e4, e5, e6, e7]; - transmute(i16x8::from_fn(|i| vec[i as usize])) + { transmute(i16x8::new(e0, e1, e2, e3, e4, e5, e6, e7)) } } - /// Sets packed 8-bit integers with the supplied values. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi8) -// no particular instruction to test pub fn _mm_set_epi8( e15: i8, e14: i8, @@ -750,68 +665,62 @@ pub fn _mm_set_epi8( e1: i8, e0: i8, ) -> __m128i { - let vec = [ - e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, - ]; - transmute(i8x16::from_fn(|i| vec[i as usize])) + { + transmute( + i8x16::new( + e0, + e1, + e2, + e3, + e4, + e5, + e6, + e7, + e8, + e9, + e10, + e11, + e12, + e13, + e14, + e15, + ), + ) + } } - /// Broadcasts 64-bit integer `a` to all elements. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64x) - -// no particular instruction to test - pub fn _mm_set1_epi64x(a: i64) -> __m128i { _mm_set_epi64x(a, a) } - /// Broadcasts 32-bit integer `a` to all elements. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi32) - -// no particular instruction to test - pub fn _mm_set1_epi32(a: i32) -> __m128i { _mm_set_epi32(a, a, a, a) } - /// Broadcasts 16-bit integer `a` to all elements. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi16) - -// no particular instruction to test - pub fn _mm_set1_epi16(a: i16) -> __m128i { - transmute(i16x8::from_fn(|_| a)) + _mm_set_epi16(a, a, a, a, a, a, a, a) } - /// Broadcasts 8-bit integer `a` to all elements. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi8) - -// no particular instruction to test - pub fn _mm_set1_epi8(a: i8) -> __m128i { _mm_set_epi8(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a) } - /// Sets packed 32-bit integers with the supplied values in reverse order. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi32) - -// no particular instruction to test - pub fn _mm_setr_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i { _mm_set_epi32(e0, e1, e2, e3) } - /// Sets packed 16-bit integers with the supplied values in reverse order. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi16) - -// no particular instruction to test - pub fn _mm_setr_epi16( e7: i16, e6: i16, @@ -824,13 +733,9 @@ pub fn _mm_setr_epi16( ) -> __m128i { _mm_set_epi16(e0, e1, e2, e3, e4, e5, e6, e7) } - /// Sets packed 8-bit integers with the supplied values in reverse order. 
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi8) - -// no particular instruction to test - pub fn _mm_setr_epi8( e15: i8, e14: i8, @@ -849,108 +754,83 @@ pub fn _mm_setr_epi8( e1: i8, e0: i8, ) -> __m128i { - _mm_set_epi8( - e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, - ) + _mm_set_epi8(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15) } - /// Returns a vector with all elements set to zero. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_si128) - pub fn _mm_setzero_si128() -> __m128i { - __m128i::ZERO() + transmute(i32x4::ZERO()) } - /// Returns a vector where the low element is extracted from `a` and its upper /// element is zero. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_epi64) - -// FIXME movd on msvc, movd on i686 - pub fn _mm_move_epi64(a: __m128i) -> __m128i { - let r: i64x2 = simd_shuffle(a.as_i64x2(), i64x2::from_fn(|_| 0), [0, 2]); - r.into() + { + let r: i64x2 = simd_shuffle(a.as_i64x2(), i64x2::ZERO(), [0, 2]); + transmute(r) + } } - /// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers /// using signed saturation. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi16) - pub fn _mm_packs_epi16(a: __m128i, b: __m128i) -> __m128i { - packsswb(a.as_i16x8(), b.as_i16x8()).into() + { transmute(packsswb(a.as_i16x8(), b.as_i16x8())) } } - /// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers /// using signed saturation. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32) - pub fn _mm_packs_epi32(a: __m128i, b: __m128i) -> __m128i { - packssdw(a.as_i32x4(), b.as_i32x4()).into() + { transmute(packssdw(a.as_i32x4(), b.as_i32x4())) } } - /// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers /// using unsigned saturation. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi16) - pub fn _mm_packus_epi16(a: __m128i, b: __m128i) -> __m128i { - packuswb(a.as_i16x8(), b.as_i16x8()).into() + { transmute(packuswb(a.as_i16x8(), b.as_i16x8())) } } - /// Returns the `imm8` element of `a`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi16) - pub fn _mm_extract_epi16(a: __m128i) -> i32 { - // static_assert_uimm_bits!(IMM8, 3); - simd_extract(a.as_u16x8(), IMM8 as u32) as i32 + static_assert_uimm_bits!(IMM8, 3); + { simd_extract(a.as_u16x8(), IMM8 as u32) as i32 } } - /// Returns a new vector where the `imm8` element of `a` is replaced with `i`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi16) - pub fn _mm_insert_epi16(a: __m128i, i: i32) -> __m128i { - // static_assert_uimm_bits!(IMM8, 3); - simd_insert(a.as_i16x8(), IMM8 as u32, i as i16).into() + static_assert_uimm_bits!(IMM8, 3); + { transmute(simd_insert(a.as_i16x8(), IMM8 as u32, i as i16)) } } - /// Returns a mask of the most significant bit of each element in `a`. 
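// Illustrative scalar sketch (not the model's `packsswb` helper) of the
// signed-saturating narrowing described for `_mm_packs_epi16` above: each
// 16-bit lane is clamped into the i8 range before truncation.
fn pack_i16_to_i8(x: i16) -> i8 {
    x.clamp(i8::MIN as i16, i8::MAX as i16) as i8
}
// e.g. pack_i16_to_i8(300) == 127 and pack_i16_to_i8(-300) == -128.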
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_epi8) - pub fn _mm_movemask_epi8(a: __m128i) -> i32 { - let z = i8x16::from_fn(|_| 0); - let m: i8x16 = simd_lt(a.as_i8x16(), z); - let r = simd_bitmask_little!(15, m, u16); - r as u32 as i32 + { + let z = i8x16::ZERO(); + let m: i8x16 = simd_lt(a.as_i8x16(), z); + simd_bitmask_little!(15,m,u16) as u32 as i32 + } } - /// Shuffles 32-bit integers in `a` using the control in `IMM8`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi32) - pub fn _mm_shuffle_epi32(a: __m128i) -> __m128i { - // static_assert_uimm_bits!(IMM8, 8); - - let a = a.as_i32x4(); - let x: i32x4 = simd_shuffle( - a, - a, - [ - IMM8 as u32 & 0b11, - (IMM8 as u32 >> 2) & 0b11, - (IMM8 as u32 >> 4) & 0b11, - (IMM8 as u32 >> 6) & 0b11, - ], - ); - x.into() + static_assert_uimm_bits!(IMM8, 8); + { + let a = a.as_i32x4(); + let x: i32x4 = simd_shuffle( + a, a, [IMM8 as u32 & 0b11, (IMM8 as u32 >> 2) & 0b11, (IMM8 as u32 >> 4) & + 0b11, (IMM8 as u32 >> 6) & 0b11,], + ); + transmute(x) + } } - /// Shuffles 16-bit integers in the high 64 bits of `a` using the control in /// `IMM8`. /// @@ -958,28 +838,17 @@ pub fn _mm_shuffle_epi32(a: __m128i) -> __m128i { /// bits being copied from `a`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflehi_epi16) - pub fn _mm_shufflehi_epi16(a: __m128i) -> __m128i { - // static_assert_uimm_bits!(IMM8, 8); - - let a = a.as_i16x8(); - let x: i16x8 = simd_shuffle( - a, - a, - [ - 0, - 1, - 2, - 3, - (IMM8 as u32 & 0b11) + 4, - ((IMM8 as u32 >> 2) & 0b11) + 4, - ((IMM8 as u32 >> 4) & 0b11) + 4, - ((IMM8 as u32 >> 6) & 0b11) + 4, - ], - ); - x.into() + static_assert_uimm_bits!(IMM8, 8); + { + let a = a.as_i16x8(); + let x: i16x8 = simd_shuffle( + a, a, [0, 1, 2, 3, (IMM8 as u32 & 0b11) + 4, ((IMM8 as u32 >> 2) & 0b11) + 4, + ((IMM8 as u32 >> 4) & 0b11) + 4, ((IMM8 as u32 >> 6) & 0b11) + 4,], + ); + transmute(x) + } } - /// Shuffles 16-bit integers in the low 64 bits of `a` using the control in /// `IMM8`. /// @@ -987,111 +856,691 @@ pub fn _mm_shufflehi_epi16(a: __m128i) -> __m128i { /// bits being copied from `a`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflelo_epi16) - pub fn _mm_shufflelo_epi16(a: __m128i) -> __m128i { - // static_assert_uimm_bits!(IMM8, 8); - - let a = a.as_i16x8(); - let x: i16x8 = simd_shuffle( - a, - a, - [ - IMM8 as u32 & 0b11, - (IMM8 as u32 >> 2) & 0b11, - (IMM8 as u32 >> 4) & 0b11, - (IMM8 as u32 >> 6) & 0b11, - 4, - 5, - 6, - 7, - ], - ); - x.into() + static_assert_uimm_bits!(IMM8, 8); + { + let a = a.as_i16x8(); + let x: i16x8 = simd_shuffle( + a, a, [IMM8 as u32 & 0b11, (IMM8 as u32 >> 2) & 0b11, (IMM8 as u32 >> 4) & + 0b11, (IMM8 as u32 >> 6) & 0b11, 4, 5, 6, 7,], + ); + transmute(x) + } } - /// Unpacks and interleave 8-bit integers from the high half of `a` and `b`. 
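// Illustrative sketch of how `_mm_shuffle_epi32` above decodes its immediate:
// each output lane takes two bits of IMM8, read from the low bits upward.
fn shuffle_epi32_indices(imm8: u32) -> [u32; 4] {
    [imm8 & 0b11, (imm8 >> 2) & 0b11, (imm8 >> 4) & 0b11, (imm8 >> 6) & 0b11]
}
// e.g. shuffle_epi32_indices(0b00_01_10_11) == [3, 2, 1, 0], i.e. lane reversal.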
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi8) - pub fn _mm_unpackhi_epi8(a: __m128i, b: __m128i) -> __m128i { - (simd_shuffle( - a.as_i8x16(), - b.as_i8x16(), - [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31], - )) - .into() + { + transmute::< + i8x16, + _, + >( + simd_shuffle( + a.as_i8x16(), b.as_i8x16(), [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, + 29, 14, 30, 15, 31], + ), + ) + } } - /// Unpacks and interleave 16-bit integers from the high half of `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi16) - pub fn _mm_unpackhi_epi16(a: __m128i, b: __m128i) -> __m128i { - let x = simd_shuffle(a.as_i16x8(), b.as_i16x8(), [4, 12, 5, 13, 6, 14, 7, 15]); - (x).into() + { + let x = simd_shuffle(a.as_i16x8(), b.as_i16x8(), [4, 12, 5, 13, 6, 14, 7, 15]); + transmute::(x) + } } - /// Unpacks and interleave 32-bit integers from the high half of `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi32) - pub fn _mm_unpackhi_epi32(a: __m128i, b: __m128i) -> __m128i { - (simd_shuffle(a.as_i32x4(), b.as_i32x4(), [2, 6, 3, 7])).into() + { transmute::(simd_shuffle(a.as_i32x4(), b.as_i32x4(), [2, 6, 3, 7])) } } - /// Unpacks and interleave 64-bit integers from the high half of `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi64) - pub fn _mm_unpackhi_epi64(a: __m128i, b: __m128i) -> __m128i { - (simd_shuffle(a.as_i64x2(), b.as_i64x2(), [1, 3])).into() + { transmute::(simd_shuffle(a.as_i64x2(), b.as_i64x2(), [1, 3])) } } - /// Unpacks and interleave 8-bit integers from the low half of `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi8) - pub fn _mm_unpacklo_epi8(a: __m128i, b: __m128i) -> __m128i { - (simd_shuffle( - a.as_i8x16(), - b.as_i8x16(), - [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23], - )) - .into() + { + transmute::< + i8x16, + _, + >( + simd_shuffle( + a.as_i8x16(), b.as_i8x16(), [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, + 22, 7, 23], + ), + ) + } } - /// Unpacks and interleave 16-bit integers from the low half of `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi16) - pub fn _mm_unpacklo_epi16(a: __m128i, b: __m128i) -> __m128i { - let x = simd_shuffle(a.as_i16x8(), b.as_i16x8(), [0, 8, 1, 9, 2, 10, 3, 11]); - x.into() + { + let x = simd_shuffle(a.as_i16x8(), b.as_i16x8(), [0, 8, 1, 9, 2, 10, 3, 11]); + transmute::(x) + } } - /// Unpacks and interleave 32-bit integers from the low half of `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi32) - pub fn _mm_unpacklo_epi32(a: __m128i, b: __m128i) -> __m128i { - simd_shuffle(a.as_i32x4(), b.as_i32x4(), [0, 4, 1, 5]).into() + { transmute::(simd_shuffle(a.as_i32x4(), b.as_i32x4(), [0, 4, 1, 5])) } } - /// Unpacks and interleave 64-bit integers from the low half of `a` and `b`. 
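// Illustrative note on the unpack family above: `_mm_unpackhi_epi8` interleaves
// the upper eight bytes of `a` and `b`, which is what the alternating shuffle
// indices [8, 24, 9, 25, ...] express (indices >= 16 address the second operand).
fn unpackhi_u8_indices() -> [u32; 16] {
    core::array::from_fn(|i| {
        if i % 2 == 0 { 8 + (i as u32) / 2 } else { 24 + (i as u32) / 2 }
    })
}
// e.g. the first four indices are [8, 24, 9, 25].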
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi64) - pub fn _mm_unpacklo_epi64(a: __m128i, b: __m128i) -> __m128i { - simd_shuffle(a.as_i64x2(), b.as_i64x2(), [0, 2]).into() + { transmute::(simd_shuffle(a.as_i64x2(), b.as_i64x2(), [0, 2])) } +} +/// Returns a new vector with the low element of `a` replaced by the sum of the +/// low elements of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_sd) +// pub fn _mm_add_sd(a: __m128d, b: __m128d) -> __m128d { +// { transmute(simd_insert(a.as_f64x2(), 0, _mm_cvtsd_f64(a) + _mm_cvtsd_f64(b))) } +// } +/// Adds packed double-precision (64-bit) floating-point elements in `a` and +/// `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_pd) +// pub fn _mm_add_pd(a: __m128d, b: __m128d) -> __m128d { +// { simd_add(a, b) } +// } +/// Returns a new vector with the low element of `a` replaced by the result of +/// diving the lower element of `a` by the lower element of `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_sd) +// pub fn _mm_div_sd(a: __m128d, b: __m128d) -> __m128d { +// { transmute(simd_insert(a.as_f64x2(), 0, _mm_cvtsd_f64(a) / _mm_cvtsd_f64(b))) } +// } +/// Divide packed double-precision (64-bit) floating-point elements in `a` by +/// packed elements in `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_pd) +// pub fn _mm_div_pd(a: __m128d, b: __m128d) -> __m128d { +// { simd_div(a, b) } +// } +/// Returns a new vector with the low element of `a` replaced by the maximum +/// of the lower elements of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_sd) +// pub fn _mm_max_sd(a: __m128d, b: __m128d) -> __m128d { +// { maxsd(a, b) } +// } +/// Returns a new vector with the maximum values from corresponding elements in +/// `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pd) +// pub fn _mm_max_pd(a: __m128d, b: __m128d) -> __m128d { +// { maxpd(a, b) } +// } +/// Returns a new vector with the low element of `a` replaced by the minimum +/// of the lower elements of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_sd) +// pub fn _mm_min_sd(a: __m128d, b: __m128d) -> __m128d { +// { minsd(a, b) } +// } +/// Returns a new vector with the minimum values from corresponding elements in +/// `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pd) +// pub fn _mm_min_pd(a: __m128d, b: __m128d) -> __m128d { +// { minpd(a, b) } +// } +/// Returns a new vector with the low element of `a` replaced by multiplying the +/// low elements of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_sd) +// pub fn _mm_mul_sd(a: __m128d, b: __m128d) -> __m128d { +// { transmute(simd_insert(a.as_f64x2(), 0, _mm_cvtsd_f64(a) * _mm_cvtsd_f64(b))) } +// } +/// Multiplies packed double-precision (64-bit) floating-point elements in `a` +/// and `b`. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_pd) +// pub fn _mm_mul_pd(a: __m128d, b: __m128d) -> __m128d { +// { transmute(simd_mul(a.as_f64x2(), b.as_f64x2())) } +// } +/// Returns a new vector with the low element of `a` replaced by the square +/// root of the lower element `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_sd) +// pub fn _mm_sqrt_sd(a: __m128d, b: __m128d) -> __m128d { +// { simd_insert(a, 0, sqrtf64(_mm_cvtsd_f64(b))) } +// } +/// Returns a new vector with the square root of each of the values in `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_pd) +// pub fn _mm_sqrt_pd(a: __m128d) -> __m128d { +// { simd_fsqrt(a) } +// } +/// Returns a new vector with the low element of `a` replaced by subtracting the +/// low element by `b` from the low element of `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_sd) +// pub fn _mm_sub_sd(a: __m128d, b: __m128d) -> __m128d { +// { transmute(simd_insert(a.as_f64x2(), 0, _mm_cvtsd_f64(a) - _mm_cvtsd_f64(b))) } +// } +/// Subtract packed double-precision (64-bit) floating-point elements in `b` +/// from `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_pd) +// pub fn _mm_sub_pd(a: __m128d, b: __m128d) -> __m128d { +// { simd_sub(a, b) } +// } +/// Computes the bitwise AND of packed double-precision (64-bit) floating-point +/// elements in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_pd) +pub fn _mm_and_pd(a: __m128d, b: __m128d) -> __m128d { + { + let a: __m128i = transmute(a); + let b: __m128i = transmute(b); + transmute(_mm_and_si128(a, b)) + } +} +/// Computes the bitwise NOT of `a` and then AND with `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_pd) +pub fn _mm_andnot_pd(a: __m128d, b: __m128d) -> __m128d { + { + let a: __m128i = transmute(a); + let b: __m128i = transmute(b); + transmute(_mm_andnot_si128(a, b)) + } +} +/// Computes the bitwise OR of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_pd) +pub fn _mm_or_pd(a: __m128d, b: __m128d) -> __m128d { + { + let a: __m128i = transmute(a); + let b: __m128i = transmute(b); + transmute(_mm_or_si128(a, b)) + } +} +/// Computes the bitwise XOR of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_pd) +pub fn _mm_xor_pd(a: __m128d, b: __m128d) -> __m128d { + { + let a: __m128i = transmute(a); + let b: __m128i = transmute(b); + transmute(_mm_xor_si128(a, b)) + } +} +/// Returns a new vector with the low element of `a` replaced by the equality +/// comparison of the lower elements of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_sd) +// pub fn _mm_cmpeq_sd(a: __m128d, b: __m128d) -> __m128d { +// { cmpsd(a, b, 0) } +// } +/// Returns a new vector with the low element of `a` replaced by the less-than +/// comparison of the lower elements of `a` and `b`. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_sd) +// pub fn _mm_cmplt_sd(a: __m128d, b: __m128d) -> __m128d { +// { cmpsd(a, b, 1) } +// } +/// Returns a new vector with the low element of `a` replaced by the +/// less-than-or-equal comparison of the lower elements of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_sd) +// pub fn _mm_cmple_sd(a: __m128d, b: __m128d) -> __m128d { +// { cmpsd(a, b, 2) } +// } +/// Returns a new vector with the low element of `a` replaced by the +/// greater-than comparison of the lower elements of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_sd) +// pub fn _mm_cmpgt_sd(a: __m128d, b: __m128d) -> __m128d { +// { transmute(simd_insert(_mm_cmplt_sd(b, a), 1, simd_extract(a, 1))) } +// } +/// Returns a new vector with the low element of `a` replaced by the +/// greater-than-or-equal comparison of the lower elements of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_sd) +// pub fn _mm_cmpge_sd(a: __m128d, b: __m128d) -> __m128d { +// { simd_insert(_mm_cmple_sd(b, a), 1, simd_extract(a, 1)) } +// } +/// Returns a new vector with the low element of `a` replaced by the result +/// of comparing both of the lower elements of `a` and `b` to `NaN`. If +/// neither are equal to `NaN` then `0xFFFFFFFFFFFFFFFF` is used and `0` +/// otherwise. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_sd) +// pub fn _mm_cmpord_sd(a: __m128d, b: __m128d) -> __m128d { +// { cmpsd(a, b, 7) } +// } +/// Returns a new vector with the low element of `a` replaced by the result of +/// comparing both of the lower elements of `a` and `b` to `NaN`. If either is +/// equal to `NaN` then `0xFFFFFFFFFFFFFFFF` is used and `0` otherwise. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_sd) +// pub fn _mm_cmpunord_sd(a: __m128d, b: __m128d) -> __m128d { +// { cmpsd(a, b, 3) } +// } +/// Returns a new vector with the low element of `a` replaced by the not-equal +/// comparison of the lower elements of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_sd) +// pub fn _mm_cmpneq_sd(a: __m128d, b: __m128d) -> __m128d { +// { cmpsd(a, b, 4) } +// } +/// Returns a new vector with the low element of `a` replaced by the +/// not-less-than comparison of the lower elements of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_sd) +// pub fn _mm_cmpnlt_sd(a: __m128d, b: __m128d) -> __m128d { +// { cmpsd(a, b, 5) } +// } +/// Returns a new vector with the low element of `a` replaced by the +/// not-less-than-or-equal comparison of the lower elements of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_sd) +// pub fn _mm_cmpnle_sd(a: __m128d, b: __m128d) -> __m128d { +// { cmpsd(a, b, 6) } +// } +/// Returns a new vector with the low element of `a` replaced by the +/// not-greater-than comparison of the lower elements of `a` and `b`. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_sd) +// pub fn _mm_cmpngt_sd(a: __m128d, b: __m128d) -> __m128d { +// { simd_insert(_mm_cmpnlt_sd(b, a), 1, simd_extract(a, 1)) } +// } +/// Returns a new vector with the low element of `a` replaced by the +/// not-greater-than-or-equal comparison of the lower elements of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_sd) +// pub fn _mm_cmpnge_sd(a: __m128d, b: __m128d) -> __m128d { +// { simd_insert(_mm_cmpnle_sd(b, a), 1, simd_extract(a, 1)) } +// } +/// Compares corresponding elements in `a` and `b` for equality. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_pd) +// pub fn _mm_cmpeq_pd(a: __m128d, b: __m128d) -> __m128d { +// { cmppd(a, b, 0) } +// } +/// Compares corresponding elements in `a` and `b` for less-than. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_pd) +// pub fn _mm_cmplt_pd(a: __m128d, b: __m128d) -> __m128d { +// { cmppd(a, b, 1) } +// } +/// Compares corresponding elements in `a` and `b` for less-than-or-equal +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_pd) +// pub fn _mm_cmple_pd(a: __m128d, b: __m128d) -> __m128d { +// { cmppd(a, b, 2) } +// } +/// Compares corresponding elements in `a` and `b` for greater-than. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_pd) +// pub fn _mm_cmpgt_pd(a: __m128d, b: __m128d) -> __m128d { +// _mm_cmplt_pd(b, a) +// } +/// Compares corresponding elements in `a` and `b` for greater-than-or-equal. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_pd) +// pub fn _mm_cmpge_pd(a: __m128d, b: __m128d) -> __m128d { +// _mm_cmple_pd(b, a) +// } +/// Compares corresponding elements in `a` and `b` to see if neither is `NaN`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_pd) +// pub fn _mm_cmpord_pd(a: __m128d, b: __m128d) -> __m128d { +// { cmppd(a, b, 7) } +// } +/// Compares corresponding elements in `a` and `b` to see if either is `NaN`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_pd) +// pub fn _mm_cmpunord_pd(a: __m128d, b: __m128d) -> __m128d { +// { cmppd(a, b, 3) } +// } +/// Compares corresponding elements in `a` and `b` for not-equal. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_pd) +// pub fn _mm_cmpneq_pd(a: __m128d, b: __m128d) -> __m128d { +// { cmppd(a, b, 4) } +// } +/// Compares corresponding elements in `a` and `b` for not-less-than. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_pd) +// pub fn _mm_cmpnlt_pd(a: __m128d, b: __m128d) -> __m128d { +// { cmppd(a, b, 5) } +// } +/// Compares corresponding elements in `a` and `b` for not-less-than-or-equal. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_pd) +// pub fn _mm_cmpnle_pd(a: __m128d, b: __m128d) -> __m128d { +// { cmppd(a, b, 6) } +// } +/// Compares corresponding elements in `a` and `b` for not-greater-than. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_pd) +// pub fn _mm_cmpngt_pd(a: __m128d, b: __m128d) -> __m128d { +// _mm_cmpnlt_pd(b, a) +// } +/// Compares corresponding elements in `a` and `b` for +/// not-greater-than-or-equal. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_pd) +// pub fn _mm_cmpnge_pd(a: __m128d, b: __m128d) -> __m128d { +// _mm_cmpnle_pd(b, a) +// } +/// Compares the lower element of `a` and `b` for equality. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_sd) +// pub fn _mm_comieq_sd(a: __m128d, b: __m128d) -> i32 { +// { comieqsd(a, b) } +// } +/// Compares the lower element of `a` and `b` for less-than. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_sd) +// pub fn _mm_comilt_sd(a: __m128d, b: __m128d) -> i32 { +// { comiltsd(a, b) } +// } +/// Compares the lower element of `a` and `b` for less-than-or-equal. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_sd) +// pub fn _mm_comile_sd(a: __m128d, b: __m128d) -> i32 { +// { comilesd(a, b) } +// } +/// Compares the lower element of `a` and `b` for greater-than. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_sd) +// pub fn _mm_comigt_sd(a: __m128d, b: __m128d) -> i32 { +// { comigtsd(a, b) } +// } +/// Compares the lower element of `a` and `b` for greater-than-or-equal. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_sd) +// pub fn _mm_comige_sd(a: __m128d, b: __m128d) -> i32 { +// { comigesd(a, b) } +// } +/// Compares the lower element of `a` and `b` for not-equal. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_sd) +// pub fn _mm_comineq_sd(a: __m128d, b: __m128d) -> i32 { +// { comineqsd(a, b) } +// } +/// Compares the lower element of `a` and `b` for equality. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomieq_sd) +// pub fn _mm_ucomieq_sd(a: __m128d, b: __m128d) -> i32 { +// { ucomieqsd(a, b) } +// } +/// Compares the lower element of `a` and `b` for less-than. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomilt_sd) +// pub fn _mm_ucomilt_sd(a: __m128d, b: __m128d) -> i32 { +// { ucomiltsd(a, b) } +// } +/// Compares the lower element of `a` and `b` for less-than-or-equal. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomile_sd) +// pub fn _mm_ucomile_sd(a: __m128d, b: __m128d) -> i32 { +// { ucomilesd(a, b) } +// } +/// Compares the lower element of `a` and `b` for greater-than. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomigt_sd) +// pub fn _mm_ucomigt_sd(a: __m128d, b: __m128d) -> i32 { +// { ucomigtsd(a, b) } +// } +/// Compares the lower element of `a` and `b` for greater-than-or-equal. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomige_sd) +// pub fn _mm_ucomige_sd(a: __m128d, b: __m128d) -> i32 { +// { ucomigesd(a, b) } +// } +/// Compares the lower element of `a` and `b` for not-equal. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomineq_sd) +// pub fn _mm_ucomineq_sd(a: __m128d, b: __m128d) -> i32 { +// { ucomineqsd(a, b) } +// } +/// Converts packed double-precision (64-bit) floating-point elements in `a` to +/// packed single-precision (32-bit) floating-point elements +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_ps) +pub fn _mm_cvtpd_ps(a: __m128d) -> __m128 { + { + let r = simd_cast::<2, _, f32>(a.as_f64x2()); + let zero = f32x2::ZERO(); + transmute::(simd_shuffle(r, zero, [0, 1, 2, 3])) + } +} +/// Converts packed single-precision (32-bit) floating-point elements in `a` to +/// packed +/// double-precision (64-bit) floating-point elements. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pd) +pub fn _mm_cvtps_pd(a: __m128) -> __m128d { + { + let a = a.as_f32x4(); + transmute(simd_cast::<2, f32, f64>(simd_shuffle(a, a, [0, 1]))) + } +} +/// Converts packed double-precision (64-bit) floating-point elements in `a` to +/// packed 32-bit integers. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_epi32) +// pub fn _mm_cvtpd_epi32(a: __m128d) -> __m128i { +// { transmute(cvtpd2dq(a)) } +// } +/// Converts the lower double-precision (64-bit) floating-point element in a to +/// a 32-bit integer. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si32) +// pub fn _mm_cvtsd_si32(a: __m128d) -> i32 { +// { cvtsd2si(a) } +// } +/// Converts the lower double-precision (64-bit) floating-point element in `b` +/// to a single-precision (32-bit) floating-point element, store the result in +/// the lower element of the return value, and copies the upper element from `a` +/// to the upper element the return value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_ss) +// pub fn _mm_cvtsd_ss(a: __m128, b: __m128d) -> __m128 { +// { cvtsd2ss(a, b) } +// } +/// Returns the lower double-precision (64-bit) floating-point element of `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_f64) +// pub fn _mm_cvtsd_f64(a: __m128d) -> f64 { +// { simd_extract(a, 0) } +// } +/// Converts the lower single-precision (32-bit) floating-point element in `b` +/// to a double-precision (64-bit) floating-point element, store the result in +/// the lower element of the return value, and copies the upper element from `a` +/// to the upper element the return value. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_sd) +// pub fn _mm_cvtss_sd(a: __m128d, b: __m128) -> __m128d { +// { cvtss2sd(a, b) } +// } +/// Converts packed double-precision (64-bit) floating-point elements in `a` to +/// packed 32-bit integers with truncation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_epi32) +// pub fn _mm_cvttpd_epi32(a: __m128d) -> __m128i { +// { transmute(cvttpd2dq(a)) } +// } +/// Converts the lower double-precision (64-bit) floating-point element in `a` +/// to a 32-bit integer with truncation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si32) +// pub fn _mm_cvttsd_si32(a: __m128d) -> i32 { +// { cvttsd2si(a) } +// } +/// Converts packed single-precision (32-bit) floating-point elements in `a` to +/// packed 32-bit integers with truncation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_epi32) +// pub fn _mm_cvttps_epi32(a: __m128) -> __m128i { +// { transmute(cvttps2dq(a)) } +// } +/// Copies double-precision (64-bit) floating-point element `a` to the lower +/// element of the packed 64-bit return value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_sd) +pub fn _mm_set_sd(a: f64) -> __m128d { + _mm_set_pd(0.0, a) +} +/// Broadcasts double-precision (64-bit) floating-point value a to all elements +/// of the return value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_pd) +pub fn _mm_set1_pd(a: f64) -> __m128d { + _mm_set_pd(a, a) +} +/// Broadcasts double-precision (64-bit) floating-point value a to all elements +/// of the return value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd1) +pub fn _mm_set_pd1(a: f64) -> __m128d { + _mm_set_pd(a, a) +} +/// Sets packed double-precision (64-bit) floating-point elements in the return +/// value with the supplied values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd) +pub fn _mm_set_pd(a: f64, b: f64) -> __m128d { + transmute(f64x2::new(b, a)) +} +/// Sets packed double-precision (64-bit) floating-point elements in the return +/// value with the supplied values in reverse order. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_pd) +pub fn _mm_setr_pd(a: f64, b: f64) -> __m128d { + _mm_set_pd(b, a) +} +/// Returns packed double-precision (64-bit) floating-point elements with all +/// zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_pd) +pub fn _mm_setzero_pd() -> __m128d { + transmute(f64x2::ZERO()) +} +/// Returns a mask of the most significant bit of each element in `a`. +/// +/// The mask is stored in the 2 least significant bits of the return value. +/// All other bits are set to `0`. 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pd)
+pub fn _mm_movemask_pd(a: __m128d) -> i32 {
+    {
+        let mask: i64x2 = simd_lt(transmute(a), i64x2::ZERO());
+        simd_bitmask_little!(1,mask,u8) as i32
+    }
+}
+/// Constructs a 128-bit floating-point vector of `[2 x double]` from two
+/// 128-bit vector parameters of `[2 x double]`, using the immediate-value
+/// parameter as a specifier.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pd)
+pub fn _mm_shuffle_pd<const MASK: i32>(a: __m128d, b: __m128d) -> __m128d {
+    static_assert_uimm_bits!(MASK, 8);
+    { transmute(simd_shuffle(a.as_f64x2(), b.as_f64x2(), [MASK as u32 & 0b1, ((MASK as u32 >> 1) & 0b1) + 2])) }
+}
+/// Constructs a 128-bit floating-point vector of `[2 x double]`. The lower
+/// 64 bits are set to the lower 64 bits of the second parameter. The upper
+/// 64 bits are set to the upper 64 bits of the first parameter.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_sd)
+pub fn _mm_move_sd(a: __m128d, b: __m128d) -> __m128d {
+    { _mm_setr_pd(simd_extract(b.as_f64x2(), 0), simd_extract(a.as_f64x2(), 1)) }
+}
+/// Casts a 128-bit floating-point vector of `[2 x double]` into a 128-bit
+/// floating-point vector of `[4 x float]`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_ps)
+pub fn _mm_castpd_ps(a: __m128d) -> __m128 {
+    { transmute(a) }
+}
+/// Casts a 128-bit floating-point vector of `[2 x double]` into a 128-bit
+/// integer vector.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_si128)
+pub fn _mm_castpd_si128(a: __m128d) -> __m128i {
+    { transmute(a) }
+}
+/// Casts a 128-bit floating-point vector of `[4 x float]` into a 128-bit
+/// floating-point vector of `[2 x double]`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_pd)
+pub fn _mm_castps_pd(a: __m128) -> __m128d {
+    { transmute(a) }
+}
+/// Casts a 128-bit floating-point vector of `[4 x float]` into a 128-bit
+/// integer vector.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_si128)
+pub fn _mm_castps_si128(a: __m128) -> __m128i {
+    { transmute(a) }
+}
+/// Casts a 128-bit integer vector into a 128-bit floating-point vector
+/// of `[2 x double]`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_pd)
+pub fn _mm_castsi128_pd(a: __m128i) -> __m128d {
+    { transmute(a) }
+}
+/// Casts a 128-bit integer vector into a 128-bit floating-point vector
+/// of `[4 x float]`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_ps)
+pub fn _mm_castsi128_ps(a: __m128i) -> __m128 {
+    { transmute(a) }
+}
+/// Returns vector of type __m128d with indeterminate elements.
+/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
+/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
+/// In practice, this is typically equivalent to [`mem::zeroed`].
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_pd)
+pub fn _mm_undefined_pd() -> __m128d {
+    transmute(f32x4::ZERO())
 }
-
 /// Returns vector of type __m128i with indeterminate elements.
 /// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
-/// picks some valid value and is not equivalent to [`core::mem::MaybeUninit`].
-/// In practice, this is typically equivalent to [`core::mem::zeroed`].
+/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
+/// In practice, this is typically equivalent to [`mem::zeroed`].
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_si128)
 pub fn _mm_undefined_si128() -> __m128i {
-    __m128i::ZERO()
+    transmute(u32x4::ZERO())
 }
+/// The resulting `__m128d` element is composed by the high-order values of
+/// the two `__m128d` interleaved input elements, i.e.:
+///
+/// * The `[127:64]` bits are copied from the `[127:64]` bits of the second input
+/// * The `[63:0]` bits are copied from the `[127:64]` bits of the first input
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_pd)
+pub fn _mm_unpackhi_pd(a: __m128d, b: __m128d) -> __m128d {
+    { transmute(simd_shuffle(a.as_f64x2(), b.as_f64x2(), [1, 3])) }
+}
+/// The resulting `__m128d` element is composed by the low-order values of
+/// the two `__m128d` interleaved input elements, i.e.:
+///
+/// * The `[127:64]` bits are copied from the `[63:0]` bits of the second input
+/// * The `[63:0]` bits are copied from the `[63:0]` bits of the first input
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_pd)
+pub fn _mm_unpacklo_pd(a: __m128d, b: __m128d) -> __m128d {
+    { transmute(simd_shuffle(a.as_f64x2(), b.as_f64x2(), [0, 2])) }
+}
+

From 534a9863cfbc30df42f230c15624271e2a999dfe Mon Sep 17 00:00:00 2001
From: karthikbhargavan
Date: Thu, 31 Jul 2025 01:22:25 -0400
Subject: [PATCH 28/47] fixed warnings

---
 .../src/core_arch/x86/models/avx.rs          | 370 +++++++-----------
 .../src/core_arch/x86/models/sse2.rs         | 251 ++++++------
 2 files changed, 284 insertions(+), 337 deletions(-)

diff --git a/testable-simd-models/src/core_arch/x86/models/avx.rs b/testable-simd-models/src/core_arch/x86/models/avx.rs
index 840058adfe362..91b168acf3f53 100644
--- a/testable-simd-models/src/core_arch/x86/models/avx.rs
+++ b/testable-simd-models/src/core_arch/x86/models/avx.rs
@@ -85,8 +85,14 @@ pub fn _mm256_shuffle_pd(a: __m256d, b: __m256d) -> __m256d {
     static_assert_uimm_bits!(MASK, 8);
     {
         transmute(simd_shuffle(
-            a.as_f64x4(), b.as_f64x4(), [MASK as u32 & 0b1, ((MASK as u32 >> 1) & 0b1) + 4, ((MASK as u32 >> 2)
-            & 0b1) + 2, ((MASK as u32 >> 3) & 0b1) + 6,],
+            a.as_f64x4(),
+            b.as_f64x4(),
+            [
+                MASK as u32 & 0b1,
+                ((MASK as u32 >> 1) & 0b1) + 4,
+                ((MASK as u32 >> 2) & 0b1) + 2,
+                ((MASK as u32 >> 3) & 0b1) + 6,
+            ],
         ))
     }
 }
@@ -98,10 +104,18 @@ pub fn _mm256_shuffle_ps(a: __m256, b: __m256) -> __m256 {
     static_assert_uimm_bits!(MASK, 8);
    {
         transmute(simd_shuffle(
-            a.as_f32x8(), b.as_f32x8(), [MASK as u32 & 0b11, (MASK as u32 >> 2) & 0b11, ((MASK as u32 >> 4) &
-            0b11) + 8, ((MASK as u32 >> 6) & 0b11) + 8, (MASK as u32 & 0b11) + 4, ((MASK
-            as u32 >> 2) & 0b11) + 4, ((MASK as u32 >> 4) & 0b11) + 12, ((MASK as u32 >>
-            6) & 0b11) + 12,],
+            
a.as_f32x8(), + b.as_f32x8(), + [ + MASK as u32 & 0b11, + (MASK as u32 >> 2) & 0b11, + ((MASK as u32 >> 4) & 0b11) + 8, + ((MASK as u32 >> 6) & 0b11) + 8, + (MASK as u32 & 0b11) + 4, + ((MASK as u32 >> 2) & 0b11) + 4, + ((MASK as u32 >> 4) & 0b11) + 12, + ((MASK as u32 >> 6) & 0b11) + 12, + ], )) } } @@ -308,8 +322,14 @@ pub fn _mm256_blend_pd(a: __m256d, b: __m256d) -> __m256d { static_assert_uimm_bits!(IMM4, 4); { transmute(simd_shuffle( - a.as_f64x4(), b.as_f64x4(), [((IMM4 as u32 >> 0) & 1) * 4 + 0, ((IMM4 as u32 >> 1) & 1) * 4 + 1, - ((IMM4 as u32 >> 2) & 1) * 4 + 2, ((IMM4 as u32 >> 3) & 1) * 4 + 3,], + a.as_f64x4(), + b.as_f64x4(), + [ + ((IMM4 as u32 >> 0) & 1) * 4 + 0, + ((IMM4 as u32 >> 1) & 1) * 4 + 1, + ((IMM4 as u32 >> 2) & 1) * 4 + 2, + ((IMM4 as u32 >> 3) & 1) * 4 + 3, + ], )) } } @@ -321,10 +341,18 @@ pub fn _mm256_blend_ps(a: __m256, b: __m256) -> __m256 { static_assert_uimm_bits!(IMM8, 8); { transmute(simd_shuffle( - a.as_f32x8(), b.as_f32x8(), [((IMM8 as u32 >> 0) & 1) * 8 + 0, ((IMM8 as u32 >> 1) & 1) * 8 + 1, - ((IMM8 as u32 >> 2) & 1) * 8 + 2, ((IMM8 as u32 >> 3) & 1) * 8 + 3, ((IMM8 as - u32 >> 4) & 1) * 8 + 4, ((IMM8 as u32 >> 5) & 1) * 8 + 5, ((IMM8 as u32 >> 6) - & 1) * 8 + 6, ((IMM8 as u32 >> 7) & 1) * 8 + 7,], + a.as_f32x8(), + b.as_f32x8(), + [ + ((IMM8 as u32 >> 0) & 1) * 8 + 0, + ((IMM8 as u32 >> 1) & 1) * 8 + 1, + ((IMM8 as u32 >> 2) & 1) * 8 + 2, + ((IMM8 as u32 >> 3) & 1) * 8 + 3, + ((IMM8 as u32 >> 4) & 1) * 8 + 4, + ((IMM8 as u32 >> 5) & 1) * 8 + 5, + ((IMM8 as u32 >> 6) & 1) * 8 + 6, + ((IMM8 as u32 >> 7) & 1) * 8 + 7, + ], )) } } @@ -545,21 +573,21 @@ pub const _CMP_TRUE_US: i32 = 0x1f; /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_cvtepi32_pd) pub fn _mm256_cvtepi32_pd(a: __m128i) -> __m256d { - { transmute(simd_cast::<4,i32,f64>(a.as_i32x4())) } + transmute(simd_cast::<4, i32, f64>(a.as_i32x4())) } /// Converts packed 32-bit integers in `a` to packed single-precision (32-bit) /// floating-point elements. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_cvtepi32_ps) pub fn _mm256_cvtepi32_ps(a: __m256i) -> __m256 { - { transmute(simd_cast::<8,_,f32>(a.as_i32x8())) } + transmute(simd_cast::<8, _, f32>(a.as_i32x8())) } /// Converts packed double-precision (64-bit) floating-point elements in `a` /// to packed single-precision (32-bit) floating-point elements. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_cvtpd_ps) pub fn _mm256_cvtpd_ps(a: __m256d) -> __m128 { - { transmute(simd_cast::<4,_,f32>(a.as_f64x4())) } + transmute(simd_cast::<4, _, f32>(a.as_f64x4())) } /// Converts packed single-precision (32-bit) floating-point elements in `a` /// to packed 32-bit integers. @@ -573,13 +601,13 @@ pub fn _mm256_cvtpd_ps(a: __m256d) -> __m128 { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_cvtps_pd) pub fn _mm256_cvtps_pd(a: __m128) -> __m256d { - { transmute(simd_cast::<4,_,f64>(a.as_f32x4())) } + transmute(simd_cast::<4, _, f64>(a.as_f32x4())) } /// Returns the first element of the input vector of `[4 x double]`. 
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_cvtsd_f64) pub fn _mm256_cvtsd_f64(a: __m256d) -> f64 { - { simd_extract(a.as_f64x4(), 0) } + simd_extract(a.as_f64x4(), 0) } /// Converts packed double-precision (64-bit) floating-point elements in `a` /// to packed 32-bit integers with truncation. @@ -610,7 +638,9 @@ pub fn _mm256_extractf128_ps(a: __m256) -> __m128 { static_assert_uimm_bits!(IMM1, 1); { transmute(simd_shuffle( - a.as_f32x8(), _mm256_undefined_ps().as_f32x8(), [[0, 1, 2, 3], [4, 5, 6, 7]] [IMM1 as usize], + a.as_f32x8(), + _mm256_undefined_ps().as_f32x8(), + [[0, 1, 2, 3], [4, 5, 6, 7]][IMM1 as usize], )) } } @@ -620,7 +650,11 @@ pub fn _mm256_extractf128_ps(a: __m256) -> __m128 { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_extractf128_pd) pub fn _mm256_extractf128_pd(a: __m256d) -> __m128d { static_assert_uimm_bits!(IMM1, 1); - { transmute(simd_shuffle(a.as_f64x4(), _mm256_undefined_pd().as_f64x4(), [[0, 1], [2, 3]] [IMM1 as usize])) } + transmute(simd_shuffle( + a.as_f64x4(), + _mm256_undefined_pd().as_f64x4(), + [[0, 1], [2, 3]][IMM1 as usize], + )) } /// Extracts 128 bits (composed of integer data) from `a`, selected with `imm8`. /// @@ -628,9 +662,7 @@ pub fn _mm256_extractf128_pd(a: __m256d) -> __m128d { pub fn _mm256_extractf128_si256(a: __m256i) -> __m128i { static_assert_uimm_bits!(IMM1, 1); { - let dst: i64x2 = simd_shuffle( - a.as_i64x4(), i64x4::ZERO(), [[0, 1], [2, 3]] [IMM1 as usize], - ); + let dst: i64x2 = simd_shuffle(a.as_i64x4(), i64x4::ZERO(), [[0, 1], [2, 3]][IMM1 as usize]); transmute(dst) } } @@ -639,13 +671,13 @@ pub fn _mm256_extractf128_si256(a: __m256i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_extract_epi32) pub fn _mm256_extract_epi32(a: __m256i) -> i32 { static_assert_uimm_bits!(INDEX, 3); - { simd_extract(a.as_i32x8(), INDEX as u32) } + simd_extract(a.as_i32x8(), INDEX as u32) } /// Returns the first element of the input vector of `[8 x i32]`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_cvtsi256_si32) pub fn _mm256_cvtsi256_si32(a: __m256i) -> i32 { - { simd_extract(a.as_i32x8(), 0) } + simd_extract(a.as_i32x8(), 0) } /// Zeroes the contents of all XMM or YMM registers. 
/// @@ -682,10 +714,18 @@ pub fn _mm256_permute_ps(a: __m256) -> __m256 { static_assert_uimm_bits!(IMM8, 8); { transmute(simd_shuffle( - a.as_f32x8(), _mm256_undefined_ps().as_f32x8(), [(IMM8 as u32 >> 0) & 0b11, (IMM8 as u32 >> 2) & - 0b11, (IMM8 as u32 >> 4) & 0b11, (IMM8 as u32 >> 6) & 0b11, ((IMM8 as u32 >> - 0) & 0b11) + 4, ((IMM8 as u32 >> 2) & 0b11) + 4, ((IMM8 as u32 >> 4) & 0b11) - + 4, ((IMM8 as u32 >> 6) & 0b11) + 4,], + a.as_f32x8(), + _mm256_undefined_ps().as_f32x8(), + [ + (IMM8 as u32 >> 0) & 0b11, + (IMM8 as u32 >> 2) & 0b11, + (IMM8 as u32 >> 4) & 0b11, + (IMM8 as u32 >> 6) & 0b11, + ((IMM8 as u32 >> 0) & 0b11) + 4, + ((IMM8 as u32 >> 2) & 0b11) + 4, + ((IMM8 as u32 >> 4) & 0b11) + 4, + ((IMM8 as u32 >> 6) & 0b11) + 4, + ], )) } } @@ -724,8 +764,14 @@ pub fn _mm256_permute_pd(a: __m256d) -> __m256d { static_assert_uimm_bits!(IMM4, 4); { transmute(simd_shuffle( - a.as_f64x4(), _mm256_undefined_pd().as_f64x4(), [((IMM4 as u32 >> 0) & 1), ((IMM4 as u32 >> 1) & - 1), ((IMM4 as u32 >> 2) & 1) + 2, ((IMM4 as u32 >> 3) & 1) + 2,], + a.as_f64x4(), + _mm256_undefined_pd().as_f64x4(), + [ + ((IMM4 as u32 >> 0) & 1), + ((IMM4 as u32 >> 1) & 1), + ((IMM4 as u32 >> 2) & 1) + 2, + ((IMM4 as u32 >> 3) & 1) + 2, + ], )) } } @@ -736,8 +782,10 @@ pub fn _mm256_permute_pd(a: __m256d) -> __m256d { pub fn _mm_permute_pd(a: __m128d) -> __m128d { static_assert_uimm_bits!(IMM2, 2); { - transmute(simd_shuffle( - a.as_f64x2(), _mm_undefined_pd().as_f64x2(), [(IMM2 as u32) & 1, (IMM2 as u32 >> 1) & 1], + transmute(simd_shuffle( + a.as_f64x2(), + _mm_undefined_pd().as_f64x2(), + [(IMM2 as u32) & 1, (IMM2 as u32 >> 1) & 1], )) } } @@ -763,7 +811,7 @@ pub fn _mm_permute_pd(a: __m128d) -> __m128d { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_permute2f128_si256) pub fn _mm256_permute2f128_si256(a: __m256i, b: __m256i) -> __m256i { static_assert_uimm_bits!(IMM8, 8); - { transmute(vperm2f128si256(a.as_i32x8(), b.as_i32x8(), IMM8 as i8)) } + transmute(vperm2f128si256(a.as_i32x8(), b.as_i32x8(), IMM8 as i8)) } /// Broadcasts a single-precision (32-bit) floating-point element from memory /// to all elements of the returned vector. 
@@ -798,7 +846,11 @@ pub fn _mm256_broadcast_ss(f: &f32) -> __m256 { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_broadcast_pd) pub fn _mm256_broadcast_pd(a: &__m128d) -> __m256d { - { transmute(simd_shuffle((*a).as_f64x2(), _mm_setzero_pd().as_f64x2(), [0, 1, 0, 1])) } + transmute(simd_shuffle( + (*a).as_f64x2(), + _mm_setzero_pd().as_f64x2(), + [0, 1, 0, 1], + )) } /// Copies `a` to result, then inserts 128 bits (composed of 4 packed /// single-precision (32-bit) floating-point elements) from `b` into result @@ -835,8 +887,9 @@ pub fn _mm256_insertf128_si256(a: __m256i, b: __m128i) -> __m25 static_assert_uimm_bits!(IMM1, 1); { let dst: i64x4 = simd_shuffle( - a.as_i64x4(), _mm256_castsi128_si256(b).as_i64x4(), [[4, 5, 2, 3], [0, 1, 4, - 5]] [IMM1 as usize], + a.as_i64x4(), + _mm256_castsi128_si256(b).as_i64x4(), + [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize], ); transmute(dst) } @@ -847,7 +900,7 @@ pub fn _mm256_insertf128_si256(a: __m256i, b: __m128i) -> __m25 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_insert_epi8) pub fn _mm256_insert_epi8(a: __m256i, i: i8) -> __m256i { static_assert_uimm_bits!(INDEX, 5); - { transmute(simd_insert(a.as_i8x32(), INDEX as u32, i)) } + transmute(simd_insert(a.as_i8x32(), INDEX as u32, i)) } /// Copies `a` to result, and inserts the 16-bit integer `i` into result /// at the location specified by `index`. @@ -855,7 +908,7 @@ pub fn _mm256_insert_epi8(a: __m256i, i: i8) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_insert_epi16) pub fn _mm256_insert_epi16(a: __m256i, i: i16) -> __m256i { static_assert_uimm_bits!(INDEX, 4); - { transmute(simd_insert(a.as_i16x16(), INDEX as u32, i)) } + transmute(simd_insert(a.as_i16x16(), INDEX as u32, i)) } /// Copies `a` to result, and inserts the 32-bit integer `i` into result /// at the location specified by `index`. @@ -863,28 +916,36 @@ pub fn _mm256_insert_epi16(a: __m256i, i: i16) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_insert_epi32) pub fn _mm256_insert_epi32(a: __m256i, i: i32) -> __m256i { static_assert_uimm_bits!(INDEX, 3); - { transmute(simd_insert(a.as_i32x8(), INDEX as u32, i)) } + transmute(simd_insert(a.as_i32x8(), INDEX as u32, i)) } /// Duplicate odd-indexed single-precision (32-bit) floating-point elements /// from `a`, and returns the results. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_movehdup_ps) pub fn _mm256_movehdup_ps(a: __m256) -> __m256 { - { transmute(simd_shuffle(a.as_f32x8(), a.as_f32x8(), [1, 1, 3, 3, 5, 5, 7, 7])) } + transmute(simd_shuffle( + a.as_f32x8(), + a.as_f32x8(), + [1, 1, 3, 3, 5, 5, 7, 7], + )) } /// Duplicate even-indexed single-precision (32-bit) floating-point elements /// from `a`, and returns the results. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_moveldup_ps) pub fn _mm256_moveldup_ps(a: __m256) -> __m256 { - { transmute(simd_shuffle(a.as_f32x8(), a.as_f32x8(), [0, 0, 2, 2, 4, 4, 6, 6])) } + transmute(simd_shuffle( + a.as_f32x8(), + a.as_f32x8(), + [0, 0, 2, 2, 4, 4, 6, 6], + )) } /// Duplicate even-indexed double-precision (64-bit) floating-point elements /// from `a`, and returns the results. 
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_movedup_pd) pub fn _mm256_movedup_pd(a: __m256d) -> __m256d { - { transmute(simd_shuffle(a.as_f64x4(), a.as_f64x4(), [0, 0, 2, 2])) } + transmute(simd_shuffle(a.as_f64x4(), a.as_f64x4(), [0, 0, 2, 2])) } /// Computes the approximate reciprocal of packed single-precision (32-bit) /// floating-point elements in `a`, and returns the results. The maximum @@ -907,28 +968,36 @@ pub fn _mm256_movedup_pd(a: __m256d) -> __m256d { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_unpackhi_pd) pub fn _mm256_unpackhi_pd(a: __m256d, b: __m256d) -> __m256d { - { transmute(simd_shuffle(a.as_f64x4(), b.as_f64x4(), [1, 5, 3, 7])) } + transmute(simd_shuffle(a.as_f64x4(), b.as_f64x4(), [1, 5, 3, 7])) } /// Unpacks and interleave single-precision (32-bit) floating-point elements /// from the high half of each 128-bit lane in `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_unpackhi_ps) pub fn _mm256_unpackhi_ps(a: __m256, b: __m256) -> __m256 { - { transmute(simd_shuffle(a.as_f32x8(), b.as_f32x8(), [2, 10, 3, 11, 6, 14, 7, 15])) } + transmute(simd_shuffle( + a.as_f32x8(), + b.as_f32x8(), + [2, 10, 3, 11, 6, 14, 7, 15], + )) } /// Unpacks and interleave double-precision (64-bit) floating-point elements /// from the low half of each 128-bit lane in `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_unpacklo_pd) pub fn _mm256_unpacklo_pd(a: __m256d, b: __m256d) -> __m256d { - { transmute(simd_shuffle(a.as_f64x4(), b.as_f64x4(), [0, 4, 2, 6])) } + transmute(simd_shuffle(a.as_f64x4(), b.as_f64x4(), [0, 4, 2, 6])) } /// Unpacks and interleave single-precision (32-bit) floating-point elements /// from the low half of each 128-bit lane in `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_unpacklo_ps) pub fn _mm256_unpacklo_ps(a: __m256, b: __m256) -> __m256 { - { transmute(simd_shuffle(a.as_f32x8(), b.as_f32x8(), [0, 8, 1, 9, 4, 12, 5, 13])) } + transmute(simd_shuffle( + a.as_f32x8(), + b.as_f32x8(), + [0, 8, 1, 9, 4, 12, 5, 13], + )) } /// Computes the bitwise AND of 256 bits (representing integer data) in `a` and /// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0. @@ -937,7 +1006,7 @@ pub fn _mm256_unpacklo_ps(a: __m256, b: __m256) -> __m256 { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_testz_si256) pub fn _mm256_testz_si256(a: __m256i, b: __m256i) -> i32 { - { ptestz256(a.as_i64x4(), b.as_i64x4()) } + ptestz256(a.as_i64x4(), b.as_i64x4()) } /// Computes the bitwise AND of 256 bits (representing integer data) in `a` and /// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0. @@ -946,7 +1015,7 @@ pub fn _mm256_testz_si256(a: __m256i, b: __m256i) -> i32 { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_testc_si256) pub fn _mm256_testc_si256(a: __m256i, b: __m256i) -> i32 { - { ptestc256(a.as_i64x4(), b.as_i64x4()) } + ptestc256(a.as_i64x4(), b.as_i64x4()) } /// Computes the bitwise AND of 256 bits (representing integer data) in `a` and /// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0. 
@@ -1157,16 +1226,7 @@ pub fn _mm256_set_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d { /// vector with the supplied values. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_set_ps) -pub fn _mm256_set_ps( - a: f32, - b: f32, - c: f32, - d: f32, - e: f32, - f: f32, - g: f32, - h: f32, -) -> __m256 { +pub fn _mm256_set_ps(a: f32, b: f32, c: f32, d: f32, e: f32, f: f32, g: f32, h: f32) -> __m256 { _mm256_setr_ps(h, g, f, e, d, c, b, a) } /// Sets packed 8-bit integers in returned vector with the supplied values. @@ -1207,38 +1267,8 @@ pub fn _mm256_set_epi8( e31: i8, ) -> __m256i { _mm256_setr_epi8( - e31, - e30, - e29, - e28, - e27, - e26, - e25, - e24, - e23, - e22, - e21, - e20, - e19, - e18, - e17, - e16, - e15, - e14, - e13, - e12, - e11, - e10, - e09, - e08, - e07, - e06, - e05, - e04, - e03, - e02, - e01, - e00, + e31, e30, e29, e28, e27, e26, e25, e24, e23, e22, e21, e20, e19, e18, e17, e16, e15, e14, + e13, e12, e11, e10, e09, e08, e07, e06, e05, e04, e03, e02, e01, e00, ) } /// Sets packed 16-bit integers in returned vector with the supplied values. @@ -1263,22 +1293,7 @@ pub fn _mm256_set_epi16( e15: i16, ) -> __m256i { _mm256_setr_epi16( - e15, - e14, - e13, - e12, - e11, - e10, - e09, - e08, - e07, - e06, - e05, - e04, - e03, - e02, - e01, - e00, + e15, e14, e13, e12, e11, e10, e09, e08, e07, e06, e05, e04, e03, e02, e01, e00, ) } /// Sets packed 32-bit integers in returned vector with the supplied values. @@ -1313,16 +1328,7 @@ pub fn _mm256_setr_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d { /// vector with the supplied values in reverse order. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_setr_ps) -pub fn _mm256_setr_ps( - a: f32, - b: f32, - c: f32, - d: f32, - e: f32, - f: f32, - g: f32, - h: f32, -) -> __m256 { +pub fn _mm256_setr_ps(a: f32, b: f32, c: f32, d: f32, e: f32, f: f32, g: f32, h: f32) -> __m256 { transmute(f32x8::new(a, b, c, d, e, f, g, h)) } /// Sets packed 8-bit integers in returned vector with the supplied values in @@ -1364,42 +1370,10 @@ pub fn _mm256_setr_epi8( e31: i8, ) -> __m256i { { - transmute( - i8x32::new( - e00, - e01, - e02, - e03, - e04, - e05, - e06, - e07, - e08, - e09, - e10, - e11, - e12, - e13, - e14, - e15, - e16, - e17, - e18, - e19, - e20, - e21, - e22, - e23, - e24, - e25, - e26, - e27, - e28, - e29, - e30, - e31, - ), - ) + transmute(i8x32::new( + e00, e01, e02, e03, e04, e05, e06, e07, e08, e09, e10, e11, e12, e13, e14, e15, e16, + e17, e18, e19, e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31, + )) } } /// Sets packed 16-bit integers in returned vector with the supplied values in @@ -1425,26 +1399,9 @@ pub fn _mm256_setr_epi16( e15: i16, ) -> __m256i { { - transmute( - i16x16::new( - e00, - e01, - e02, - e03, - e04, - e05, - e06, - e07, - e08, - e09, - e10, - e11, - e12, - e13, - e14, - e15, - ), - ) + transmute(i16x16::new( + e00, e01, e02, e03, e04, e05, e06, e07, e08, e09, e10, e11, e12, e13, e14, e15, + )) } } /// Sets packed 32-bit integers in returned vector with the supplied values in @@ -1461,14 +1418,14 @@ pub fn _mm256_setr_epi32( e6: i32, e7: i32, ) -> __m256i { - { transmute(i32x8::new(e0, e1, e2, e3, e4, e5, e6, e7)) } + transmute(i32x8::new(e0, e1, e2, e3, e4, e5, e6, e7)) } /// Sets packed 64-bit integers in returned vector with the supplied values in /// reverse order. 
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_setr_epi64x) pub fn _mm256_setr_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i { - { transmute(i64x4::new(a, b, c, d)) } + transmute(i64x4::new(a, b, c, d)) } /// Broadcasts double-precision (64-bit) floating-point value `a` to all /// elements of returned vector. @@ -1489,40 +1446,9 @@ pub fn _mm256_set1_ps(a: f32) -> __m256 { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_set1_epi8) pub fn _mm256_set1_epi8(a: i8) -> __m256i { - _mm256_setr_epi8( - a, - a, - a, - a, - a, - a, - a, - a, - a, - a, - a, - a, - a, - a, - a, - a, - a, - a, - a, - a, - a, - a, - a, - a, - a, - a, - a, - a, - a, - a, - a, - a, + a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, + a, a, ) } /// Broadcasts 16-bit integer `a` to all elements of returned vector. @@ -1550,49 +1476,49 @@ pub fn _mm256_set1_epi64x(a: i64) -> __m256i { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_castpd_ps) pub fn _mm256_castpd_ps(a: __m256d) -> __m256 { - { transmute(a) } + transmute(a) } /// Cast vector of type __m256 to type __m256d. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_castps_pd) pub fn _mm256_castps_pd(a: __m256) -> __m256d { - { transmute(a) } + transmute(a) } /// Casts vector of type __m256 to type __m256i. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_castps_si256) pub fn _mm256_castps_si256(a: __m256) -> __m256i { - { transmute(a) } + transmute(a) } /// Casts vector of type __m256i to type __m256. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_castsi256_ps) pub fn _mm256_castsi256_ps(a: __m256i) -> __m256 { - { transmute(a) } + transmute(a) } /// Casts vector of type __m256d to type __m256i. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_castpd_si256) pub fn _mm256_castpd_si256(a: __m256d) -> __m256i { - { transmute(a) } + transmute(a) } /// Casts vector of type __m256i to type __m256d. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_castsi256_pd) pub fn _mm256_castsi256_pd(a: __m256i) -> __m256d { - { transmute(a) } + transmute(a) } /// Casts vector of type __m256 to type __m128. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_castps256_ps128) pub fn _mm256_castps256_ps128(a: __m256) -> __m128 { - { transmute(simd_shuffle(a.as_f32x8(), a.as_f32x8(), [0, 1, 2, 3])) } + transmute(simd_shuffle(a.as_f32x8(), a.as_f32x8(), [0, 1, 2, 3])) } /// Casts vector of type __m256d to type __m128d. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_castpd256_pd128) pub fn _mm256_castpd256_pd128(a: __m256d) -> __m128d { - { transmute(simd_shuffle(a.as_f64x4(), a.as_f64x4(), [0, 1])) } + transmute(simd_shuffle(a.as_f64x4(), a.as_f64x4(), [0, 1])) } /// Casts vector of type __m256i to type __m128i. 
/// @@ -1616,7 +1542,11 @@ pub fn _mm256_castsi256_si128(a: __m256i) -> __m128i { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_castpd128_pd256) pub fn _mm256_castpd128_pd256(a: __m128d) -> __m256d { - { transmute(simd_shuffle(a.as_f64x2(), _mm_undefined_pd().as_f64x2(), [0, 1, 2, 2])) } + transmute(simd_shuffle( + a.as_f64x2(), + _mm_undefined_pd().as_f64x2(), + [0, 1, 2, 2], + )) } /// Casts vector of type __m128i to type __m256i; /// the upper 128 bits of the result are undefined. @@ -1690,7 +1620,11 @@ pub fn _mm256_undefined_si256() -> __m256i { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_set_m128) pub fn _mm256_set_m128(hi: __m128, lo: __m128) -> __m256 { - { transmute(simd_shuffle(lo.as_i32x4(), hi.as_i32x4(), [0, 1, 2, 3, 4, 5, 6, 7])) } + transmute(simd_shuffle( + lo.as_i32x4(), + hi.as_i32x4(), + [0, 1, 2, 3, 4, 5, 6, 7], + )) } /// Sets packed __m256d returned vector with the supplied values. /// @@ -1734,5 +1668,5 @@ pub fn _mm256_setr_m128i(lo: __m128i, hi: __m128i) -> __m256i { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_cvtss_f32) pub fn _mm256_cvtss_f32(a: __m256) -> f32 { - { simd_extract(a.as_f32x8(), 0) } + simd_extract(a.as_f32x8(), 0) } diff --git a/testable-simd-models/src/core_arch/x86/models/sse2.rs b/testable-simd-models/src/core_arch/x86/models/sse2.rs index 70964df12ce95..66a99aadedc5f 100644 --- a/testable-simd-models/src/core_arch/x86/models/sse2.rs +++ b/testable-simd-models/src/core_arch/x86/models/sse2.rs @@ -8,49 +8,49 @@ use crate::abstractions::utilities::*; /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi8) pub fn _mm_add_epi8(a: __m128i, b: __m128i) -> __m128i { - { transmute(simd_add(a.as_i8x16(), b.as_i8x16())) } + transmute(simd_add(a.as_i8x16(), b.as_i8x16())) } /// Adds packed 16-bit integers in `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi16) pub fn _mm_add_epi16(a: __m128i, b: __m128i) -> __m128i { - { transmute(simd_add(a.as_i16x8(), b.as_i16x8())) } + transmute(simd_add(a.as_i16x8(), b.as_i16x8())) } /// Adds packed 32-bit integers in `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi32) pub fn _mm_add_epi32(a: __m128i, b: __m128i) -> __m128i { - { transmute(simd_add(a.as_i32x4(), b.as_i32x4())) } + transmute(simd_add(a.as_i32x4(), b.as_i32x4())) } /// Adds packed 64-bit integers in `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi64) pub fn _mm_add_epi64(a: __m128i, b: __m128i) -> __m128i { - { transmute(simd_add(a.as_i64x2(), b.as_i64x2())) } + transmute(simd_add(a.as_i64x2(), b.as_i64x2())) } /// Adds packed 8-bit integers in `a` and `b` using saturation. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi8) pub fn _mm_adds_epi8(a: __m128i, b: __m128i) -> __m128i { - { transmute(simd_saturating_add(a.as_i8x16(), b.as_i8x16())) } + transmute(simd_saturating_add(a.as_i8x16(), b.as_i8x16())) } /// Adds packed 16-bit integers in `a` and `b` using saturation. 
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi16) pub fn _mm_adds_epi16(a: __m128i, b: __m128i) -> __m128i { - { transmute(simd_saturating_add(a.as_i16x8(), b.as_i16x8())) } + transmute(simd_saturating_add(a.as_i16x8(), b.as_i16x8())) } /// Adds packed unsigned 8-bit integers in `a` and `b` using saturation. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu8) pub fn _mm_adds_epu8(a: __m128i, b: __m128i) -> __m128i { - { transmute(simd_saturating_add(a.as_u8x16(), b.as_u8x16())) } + transmute(simd_saturating_add(a.as_u8x16(), b.as_u8x16())) } /// Adds packed unsigned 16-bit integers in `a` and `b` using saturation. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu16) pub fn _mm_adds_epu16(a: __m128i, b: __m128i) -> __m128i { - { transmute(simd_saturating_add(a.as_u16x8(), b.as_u16x8())) } + transmute(simd_saturating_add(a.as_u16x8(), b.as_u16x8())) } /// Averages packed unsigned 8-bit integers in `a` and `b`. /// @@ -82,7 +82,7 @@ pub fn _mm_avg_epu16(a: __m128i, b: __m128i) -> __m128i { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd_epi16) pub fn _mm_madd_epi16(a: __m128i, b: __m128i) -> __m128i { - { transmute(pmaddwd(a.as_i16x8(), b.as_i16x8())) } + transmute(pmaddwd(a.as_i16x8(), b.as_i16x8())) } /// Compares packed 16-bit integers in `a` and `b`, and returns the packed /// maximum values. @@ -163,7 +163,7 @@ pub fn _mm_mulhi_epu16(a: __m128i, b: __m128i) -> __m128i { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi16) pub fn _mm_mullo_epi16(a: __m128i, b: __m128i) -> __m128i { - { transmute(simd_mul(a.as_i16x8(), b.as_i16x8())) } + transmute(simd_mul(a.as_i16x8(), b.as_i16x8())) } /// Multiplies the low unsigned 32-bit integers from each packed 64-bit element /// in `a` and `b`. @@ -188,72 +188,76 @@ pub fn _mm_mul_epu32(a: __m128i, b: __m128i) -> __m128i { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_epu8) pub fn _mm_sad_epu8(a: __m128i, b: __m128i) -> __m128i { - { transmute(psadbw(a.as_u8x16(), b.as_u8x16())) } + transmute(psadbw(a.as_u8x16(), b.as_u8x16())) } /// Subtracts packed 8-bit integers in `b` from packed 8-bit integers in `a`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi8) pub fn _mm_sub_epi8(a: __m128i, b: __m128i) -> __m128i { - { transmute(simd_sub(a.as_i8x16(), b.as_i8x16())) } + transmute(simd_sub(a.as_i8x16(), b.as_i8x16())) } /// Subtracts packed 16-bit integers in `b` from packed 16-bit integers in `a`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi16) pub fn _mm_sub_epi16(a: __m128i, b: __m128i) -> __m128i { - { transmute(simd_sub(a.as_i16x8(), b.as_i16x8())) } + transmute(simd_sub(a.as_i16x8(), b.as_i16x8())) } /// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`. 
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi32) pub fn _mm_sub_epi32(a: __m128i, b: __m128i) -> __m128i { - { transmute(simd_sub(a.as_i32x4(), b.as_i32x4())) } + transmute(simd_sub(a.as_i32x4(), b.as_i32x4())) } /// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi64) pub fn _mm_sub_epi64(a: __m128i, b: __m128i) -> __m128i { - { transmute(simd_sub(a.as_i64x2(), b.as_i64x2())) } + transmute(simd_sub(a.as_i64x2(), b.as_i64x2())) } /// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a` /// using saturation. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi8) pub fn _mm_subs_epi8(a: __m128i, b: __m128i) -> __m128i { - { transmute(simd_saturating_sub(a.as_i8x16(), b.as_i8x16())) } + transmute(simd_saturating_sub(a.as_i8x16(), b.as_i8x16())) } /// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a` /// using saturation. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi16) pub fn _mm_subs_epi16(a: __m128i, b: __m128i) -> __m128i { - { transmute(simd_saturating_sub(a.as_i16x8(), b.as_i16x8())) } + transmute(simd_saturating_sub(a.as_i16x8(), b.as_i16x8())) } /// Subtract packed unsigned 8-bit integers in `b` from packed unsigned 8-bit /// integers in `a` using saturation. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu8) pub fn _mm_subs_epu8(a: __m128i, b: __m128i) -> __m128i { - { transmute(simd_saturating_sub(a.as_u8x16(), b.as_u8x16())) } + transmute(simd_saturating_sub(a.as_u8x16(), b.as_u8x16())) } /// Subtract packed unsigned 16-bit integers in `b` from packed unsigned 16-bit /// integers in `a` using saturation. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu16) pub fn _mm_subs_epu16(a: __m128i, b: __m128i) -> __m128i { - { transmute(simd_saturating_sub(a.as_u16x8(), b.as_u16x8())) } + transmute(simd_saturating_sub(a.as_u16x8(), b.as_u16x8())) } /// Shifts `a` left by `IMM8` bytes while shifting in zeros. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_si128) pub fn _mm_slli_si128(a: __m128i) -> __m128i { static_assert_uimm_bits!(IMM8, 8); - { _mm_slli_si128_impl::(a) } + _mm_slli_si128_impl::(a) } fn _mm_slli_si128_impl(a: __m128i) -> __m128i { const fn mask(shift: i32, i: u32) -> u32 { let shift = shift as u32 & 0xff; - if shift > 15 { i } else { 16 - shift + i } + if shift > 15 { + i + } else { + 16 - shift + i + } } transmute::(simd_shuffle( i8x16::ZERO(), @@ -348,7 +352,7 @@ pub fn _mm_slli_epi16(a: __m128i) -> __m128i { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi16) pub fn _mm_sll_epi16(a: __m128i, count: __m128i) -> __m128i { - { transmute(psllw(a.as_i16x8(), count.as_i16x8())) } + transmute(psllw(a.as_i16x8(), count.as_i16x8())) } /// Shifts packed 32-bit integers in `a` left by `IMM8` while shifting in zeros. 
/// @@ -368,7 +372,7 @@ pub fn _mm_slli_epi32(a: __m128i) -> __m128i { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi32) pub fn _mm_sll_epi32(a: __m128i, count: __m128i) -> __m128i { - { transmute(pslld(a.as_i32x4(), count.as_i32x4())) } + transmute(pslld(a.as_i32x4(), count.as_i32x4())) } /// Shifts packed 64-bit integers in `a` left by `IMM8` while shifting in zeros. /// @@ -388,7 +392,7 @@ pub fn _mm_slli_epi64(a: __m128i) -> __m128i { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi64) pub fn _mm_sll_epi64(a: __m128i, count: __m128i) -> __m128i { - { transmute(psllq(a.as_i64x2(), count.as_i64x2())) } + transmute(psllq(a.as_i64x2(), count.as_i64x2())) } /// Shifts packed 16-bit integers in `a` right by `IMM8` while shifting in sign /// bits. @@ -396,14 +400,14 @@ pub fn _mm_sll_epi64(a: __m128i, count: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi16) pub fn _mm_srai_epi16(a: __m128i) -> __m128i { static_assert_uimm_bits!(IMM8, 8); - { transmute(simd_shr(a.as_i16x8(), i16x8::splat(IMM8.min(15) as i16))) } + transmute(simd_shr(a.as_i16x8(), i16x8::splat(IMM8.min(15) as i16))) } /// Shifts packed 16-bit integers in `a` right by `count` while shifting in sign /// bits. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi16) pub fn _mm_sra_epi16(a: __m128i, count: __m128i) -> __m128i { - { transmute(psraw(a.as_i16x8(), count.as_i16x8())) } + transmute(psraw(a.as_i16x8(), count.as_i16x8())) } /// Shifts packed 32-bit integers in `a` right by `IMM8` while shifting in sign /// bits. @@ -411,21 +415,21 @@ pub fn _mm_sra_epi16(a: __m128i, count: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi32) pub fn _mm_srai_epi32(a: __m128i) -> __m128i { static_assert_uimm_bits!(IMM8, 8); - { transmute(simd_shr(a.as_i32x4(), i32x4::splat(IMM8.min(31)))) } + transmute(simd_shr(a.as_i32x4(), i32x4::splat(IMM8.min(31)))) } /// Shifts packed 32-bit integers in `a` right by `count` while shifting in sign /// bits. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi32) pub fn _mm_sra_epi32(a: __m128i, count: __m128i) -> __m128i { - { transmute(psrad(a.as_i32x4(), count.as_i32x4())) } + transmute(psrad(a.as_i32x4(), count.as_i32x4())) } /// Shifts `a` right by `IMM8` bytes while shifting in zeros. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_si128) pub fn _mm_srli_si128(a: __m128i) -> __m128i { static_assert_uimm_bits!(IMM8, 8); - { _mm_srli_si128_impl::(a) } + _mm_srli_si128_impl::(a) } /// Shifts packed 16-bit integers in `a` right by `IMM8` while shifting in /// zeros. @@ -446,7 +450,7 @@ pub fn _mm_srli_epi16(a: __m128i) -> __m128i { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi16) pub fn _mm_srl_epi16(a: __m128i, count: __m128i) -> __m128i { - { transmute(psrlw(a.as_i16x8(), count.as_i16x8())) } + transmute(psrlw(a.as_i16x8(), count.as_i16x8())) } /// Shifts packed 32-bit integers in `a` right by `IMM8` while shifting in /// zeros. 
@@ -467,7 +471,7 @@ pub fn _mm_srli_epi32(a: __m128i) -> __m128i { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi32) pub fn _mm_srl_epi32(a: __m128i, count: __m128i) -> __m128i { - { transmute(psrld(a.as_i32x4(), count.as_i32x4())) } + transmute(psrld(a.as_i32x4(), count.as_i32x4())) } /// Shifts packed 64-bit integers in `a` right by `IMM8` while shifting in /// zeros. @@ -488,89 +492,92 @@ pub fn _mm_srli_epi64(a: __m128i) -> __m128i { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi64) pub fn _mm_srl_epi64(a: __m128i, count: __m128i) -> __m128i { - { transmute(psrlq(a.as_i64x2(), count.as_i64x2())) } + transmute(psrlq(a.as_i64x2(), count.as_i64x2())) } /// Computes the bitwise AND of 128 bits (representing integer data) in `a` and /// `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_si128) pub fn _mm_and_si128(a: __m128i, b: __m128i) -> __m128i { - { transmute(simd_and(a.as_i32x4(), b.as_i32x4())) } + transmute(simd_and(a.as_i32x4(), b.as_i32x4())) } /// Computes the bitwise NOT of 128 bits (representing integer data) in `a` and /// then AND with `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_si128) pub fn _mm_andnot_si128(a: __m128i, b: __m128i) -> __m128i { - { transmute(simd_and(simd_xor(_mm_set1_epi8(-1).as_i32x4(), a.as_i32x4()), b.as_i32x4())) } + transmute(simd_and( + simd_xor(_mm_set1_epi8(-1).as_i32x4(), a.as_i32x4()), + b.as_i32x4(), + )) } /// Computes the bitwise OR of 128 bits (representing integer data) in `a` and /// `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_si128) pub fn _mm_or_si128(a: __m128i, b: __m128i) -> __m128i { - { transmute(simd_or(a.as_i32x4(), b.as_i32x4())) } + transmute(simd_or(a.as_i32x4(), b.as_i32x4())) } /// Computes the bitwise XOR of 128 bits (representing integer data) in `a` and /// `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_si128) pub fn _mm_xor_si128(a: __m128i, b: __m128i) -> __m128i { - { transmute(simd_xor(a.as_i32x4(), b.as_i32x4())) } + transmute(simd_xor(a.as_i32x4(), b.as_i32x4())) } /// Compares packed 8-bit integers in `a` and `b` for equality. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi8) pub fn _mm_cmpeq_epi8(a: __m128i, b: __m128i) -> __m128i { - { transmute::(simd_eq(a.as_i8x16(), b.as_i8x16())) } + transmute::(simd_eq(a.as_i8x16(), b.as_i8x16())) } /// Compares packed 16-bit integers in `a` and `b` for equality. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi16) pub fn _mm_cmpeq_epi16(a: __m128i, b: __m128i) -> __m128i { - { transmute::(simd_eq(a.as_i16x8(), b.as_i16x8())) } + transmute::(simd_eq(a.as_i16x8(), b.as_i16x8())) } /// Compares packed 32-bit integers in `a` and `b` for equality. 
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi32) pub fn _mm_cmpeq_epi32(a: __m128i, b: __m128i) -> __m128i { - { transmute::(simd_eq(a.as_i32x4(), b.as_i32x4())) } + transmute::(simd_eq(a.as_i32x4(), b.as_i32x4())) } /// Compares packed 8-bit integers in `a` and `b` for greater-than. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi8) pub fn _mm_cmpgt_epi8(a: __m128i, b: __m128i) -> __m128i { - { transmute::(simd_gt(a.as_i8x16(), b.as_i8x16())) } + transmute::(simd_gt(a.as_i8x16(), b.as_i8x16())) } /// Compares packed 16-bit integers in `a` and `b` for greater-than. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi16) pub fn _mm_cmpgt_epi16(a: __m128i, b: __m128i) -> __m128i { - { transmute::(simd_gt(a.as_i16x8(), b.as_i16x8())) } + transmute::(simd_gt(a.as_i16x8(), b.as_i16x8())) } /// Compares packed 32-bit integers in `a` and `b` for greater-than. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi32) pub fn _mm_cmpgt_epi32(a: __m128i, b: __m128i) -> __m128i { - { transmute::(simd_gt(a.as_i32x4(), b.as_i32x4())) } + transmute::(simd_gt(a.as_i32x4(), b.as_i32x4())) } /// Compares packed 8-bit integers in `a` and `b` for less-than. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi8) pub fn _mm_cmplt_epi8(a: __m128i, b: __m128i) -> __m128i { - { transmute::(simd_lt(a.as_i8x16(), b.as_i8x16())) } + transmute::(simd_lt(a.as_i8x16(), b.as_i8x16())) } /// Compares packed 16-bit integers in `a` and `b` for less-than. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi16) pub fn _mm_cmplt_epi16(a: __m128i, b: __m128i) -> __m128i { - { transmute::(simd_lt(a.as_i16x8(), b.as_i16x8())) } + transmute::(simd_lt(a.as_i16x8(), b.as_i16x8())) } /// Compares packed 32-bit integers in `a` and `b` for less-than. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi32) pub fn _mm_cmplt_epi32(a: __m128i, b: __m128i) -> __m128i { - { transmute::(simd_lt(a.as_i32x4(), b.as_i32x4())) } + transmute::(simd_lt(a.as_i32x4(), b.as_i32x4())) } /// Converts the lower two packed 32-bit integers in `a` to packed /// double-precision (64-bit) floating-point elements. @@ -587,14 +594,14 @@ pub fn _mm_cvtepi32_pd(a: __m128i) -> __m128d { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_sd) pub fn _mm_cvtsi32_sd(a: __m128d, b: i32) -> __m128d { - { transmute(simd_insert(a.as_f64x2(), 0, b as f64)) } + transmute(simd_insert(a.as_f64x2(), 0, b as f64)) } /// Converts packed 32-bit integers in `a` to packed single-precision (32-bit) /// floating-point elements. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_ps) pub fn _mm_cvtepi32_ps(a: __m128i) -> __m128 { - { transmute(simd_cast::<4, _, f32>(a.as_i32x4())) } + transmute(simd_cast::<4, _, f32>(a.as_i32x4())) } /// Converts packed single-precision (32-bit) floating-point elements in `a` /// to packed 32-bit integers. 
@@ -608,26 +615,26 @@ pub fn _mm_cvtepi32_ps(a: __m128i) -> __m128 { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_si128) pub fn _mm_cvtsi32_si128(a: i32) -> __m128i { - { transmute(i32x4::new(a, 0, 0, 0)) } + transmute(i32x4::new(a, 0, 0, 0)) } /// Returns the lowest element of `a`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si32) pub fn _mm_cvtsi128_si32(a: __m128i) -> i32 { - { simd_extract(a.as_i32x4(), 0) } + simd_extract(a.as_i32x4(), 0) } /// Sets packed 64-bit integers with the supplied values, from highest to /// lowest. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64x) pub fn _mm_set_epi64x(e1: i64, e0: i64) -> __m128i { - { transmute(i64x2::new(e0, e1)) } + transmute(i64x2::new(e0, e1)) } /// Sets packed 32-bit integers with the supplied values. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi32) pub fn _mm_set_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i { - { transmute(i32x4::new(e0, e1, e2, e3)) } + transmute(i32x4::new(e0, e1, e2, e3)) } /// Sets packed 16-bit integers with the supplied values. /// @@ -642,7 +649,7 @@ pub fn _mm_set_epi16( e1: i16, e0: i16, ) -> __m128i { - { transmute(i16x8::new(e0, e1, e2, e3, e4, e5, e6, e7)) } + transmute(i16x8::new(e0, e1, e2, e3, e4, e5, e6, e7)) } /// Sets packed 8-bit integers with the supplied values. /// @@ -666,26 +673,9 @@ pub fn _mm_set_epi8( e0: i8, ) -> __m128i { { - transmute( - i8x16::new( - e0, - e1, - e2, - e3, - e4, - e5, - e6, - e7, - e8, - e9, - e10, - e11, - e12, - e13, - e14, - e15, - ), - ) + transmute(i8x16::new( + e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, + )) } } /// Broadcasts 64-bit integer `a` to all elements. @@ -754,7 +744,9 @@ pub fn _mm_setr_epi8( e1: i8, e0: i8, ) -> __m128i { - _mm_set_epi8(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15) + _mm_set_epi8( + e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, + ) } /// Returns a vector with all elements set to zero. /// @@ -777,35 +769,35 @@ pub fn _mm_move_epi64(a: __m128i) -> __m128i { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi16) pub fn _mm_packs_epi16(a: __m128i, b: __m128i) -> __m128i { - { transmute(packsswb(a.as_i16x8(), b.as_i16x8())) } + transmute(packsswb(a.as_i16x8(), b.as_i16x8())) } /// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers /// using signed saturation. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32) pub fn _mm_packs_epi32(a: __m128i, b: __m128i) -> __m128i { - { transmute(packssdw(a.as_i32x4(), b.as_i32x4())) } + transmute(packssdw(a.as_i32x4(), b.as_i32x4())) } /// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers /// using unsigned saturation. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi16) pub fn _mm_packus_epi16(a: __m128i, b: __m128i) -> __m128i { - { transmute(packuswb(a.as_i16x8(), b.as_i16x8())) } + transmute(packuswb(a.as_i16x8(), b.as_i16x8())) } /// Returns the `imm8` element of `a`. 
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi16) pub fn _mm_extract_epi16(a: __m128i) -> i32 { static_assert_uimm_bits!(IMM8, 3); - { simd_extract(a.as_u16x8(), IMM8 as u32) as i32 } + simd_extract(a.as_u16x8(), IMM8 as u32) as i32 } /// Returns a new vector where the `imm8` element of `a` is replaced with `i`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi16) pub fn _mm_insert_epi16(a: __m128i, i: i32) -> __m128i { static_assert_uimm_bits!(IMM8, 3); - { transmute(simd_insert(a.as_i16x8(), IMM8 as u32, i as i16)) } + transmute(simd_insert(a.as_i16x8(), IMM8 as u32, i as i16)) } /// Returns a mask of the most significant bit of each element in `a`. /// @@ -814,7 +806,7 @@ pub fn _mm_movemask_epi8(a: __m128i) -> i32 { { let z = i8x16::ZERO(); let m: i8x16 = simd_lt(a.as_i8x16(), z); - simd_bitmask_little!(15,m,u16) as u32 as i32 + simd_bitmask_little!(15, m, u16) as u32 as i32 } } /// Shuffles 32-bit integers in `a` using the control in `IMM8`. @@ -825,8 +817,14 @@ pub fn _mm_shuffle_epi32(a: __m128i) -> __m128i { { let a = a.as_i32x4(); let x: i32x4 = simd_shuffle( - a, a, [IMM8 as u32 & 0b11, (IMM8 as u32 >> 2) & 0b11, (IMM8 as u32 >> 4) & - 0b11, (IMM8 as u32 >> 6) & 0b11,], + a, + a, + [ + IMM8 as u32 & 0b11, + (IMM8 as u32 >> 2) & 0b11, + (IMM8 as u32 >> 4) & 0b11, + (IMM8 as u32 >> 6) & 0b11, + ], ); transmute(x) } @@ -843,8 +841,18 @@ pub fn _mm_shufflehi_epi16(a: __m128i) -> __m128i { { let a = a.as_i16x8(); let x: i16x8 = simd_shuffle( - a, a, [0, 1, 2, 3, (IMM8 as u32 & 0b11) + 4, ((IMM8 as u32 >> 2) & 0b11) + 4, - ((IMM8 as u32 >> 4) & 0b11) + 4, ((IMM8 as u32 >> 6) & 0b11) + 4,], + a, + a, + [ + 0, + 1, + 2, + 3, + (IMM8 as u32 & 0b11) + 4, + ((IMM8 as u32 >> 2) & 0b11) + 4, + ((IMM8 as u32 >> 4) & 0b11) + 4, + ((IMM8 as u32 >> 6) & 0b11) + 4, + ], ); transmute(x) } @@ -861,8 +869,18 @@ pub fn _mm_shufflelo_epi16(a: __m128i) -> __m128i { { let a = a.as_i16x8(); let x: i16x8 = simd_shuffle( - a, a, [IMM8 as u32 & 0b11, (IMM8 as u32 >> 2) & 0b11, (IMM8 as u32 >> 4) & - 0b11, (IMM8 as u32 >> 6) & 0b11, 4, 5, 6, 7,], + a, + a, + [ + IMM8 as u32 & 0b11, + (IMM8 as u32 >> 2) & 0b11, + (IMM8 as u32 >> 4) & 0b11, + (IMM8 as u32 >> 6) & 0b11, + 4, + 5, + 6, + 7, + ], ); transmute(x) } @@ -872,15 +890,11 @@ pub fn _mm_shufflelo_epi16(a: __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi8) pub fn _mm_unpackhi_epi8(a: __m128i, b: __m128i) -> __m128i { { - transmute::< - i8x16, - _, - >( - simd_shuffle( - a.as_i8x16(), b.as_i8x16(), [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, - 29, 14, 30, 15, 31], - ), - ) + transmute::(simd_shuffle( + a.as_i8x16(), + b.as_i8x16(), + [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31], + )) } } /// Unpacks and interleave 16-bit integers from the high half of `a` and `b`. @@ -896,28 +910,24 @@ pub fn _mm_unpackhi_epi16(a: __m128i, b: __m128i) -> __m128i { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi32) pub fn _mm_unpackhi_epi32(a: __m128i, b: __m128i) -> __m128i { - { transmute::(simd_shuffle(a.as_i32x4(), b.as_i32x4(), [2, 6, 3, 7])) } + transmute::(simd_shuffle(a.as_i32x4(), b.as_i32x4(), [2, 6, 3, 7])) } /// Unpacks and interleave 64-bit integers from the high half of `a` and `b`. 
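// A worked example of the `IMM8` decoding used by the shuffle models above (illustrative
// sketch): `_mm_shuffle_epi32` reads `IMM8` as four 2-bit source-lane indices,
// least-significant pair first, so `IMM8 = 0b00_01_10_11` decodes to the index array
// `[3, 2, 1, 0]` and reverses the 32-bit lanes:
//
//     let a = _mm_set_epi32(3, 2, 1, 0);              // lanes, low to high: 0, 1, 2, 3
//     let r = _mm_shuffle_epi32::<0b00_01_10_11>(a);  // lanes, low to high: 3, 2, 1, 0
//
// `_mm_shufflehi_epi16` and `_mm_shufflelo_epi16` decode `IMM8` the same way but apply
// it only to the upper or lower four 16-bit lanes, copying the other half through.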
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi64) pub fn _mm_unpackhi_epi64(a: __m128i, b: __m128i) -> __m128i { - { transmute::(simd_shuffle(a.as_i64x2(), b.as_i64x2(), [1, 3])) } + transmute::(simd_shuffle(a.as_i64x2(), b.as_i64x2(), [1, 3])) } /// Unpacks and interleave 8-bit integers from the low half of `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi8) pub fn _mm_unpacklo_epi8(a: __m128i, b: __m128i) -> __m128i { { - transmute::< - i8x16, - _, - >( - simd_shuffle( - a.as_i8x16(), b.as_i8x16(), [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, - 22, 7, 23], - ), - ) + transmute::(simd_shuffle( + a.as_i8x16(), + b.as_i8x16(), + [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23], + )) } } /// Unpacks and interleave 16-bit integers from the low half of `a` and `b`. @@ -933,13 +943,13 @@ pub fn _mm_unpacklo_epi16(a: __m128i, b: __m128i) -> __m128i { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi32) pub fn _mm_unpacklo_epi32(a: __m128i, b: __m128i) -> __m128i { - { transmute::(simd_shuffle(a.as_i32x4(), b.as_i32x4(), [0, 4, 1, 5])) } + transmute::(simd_shuffle(a.as_i32x4(), b.as_i32x4(), [0, 4, 1, 5])) } /// Unpacks and interleave 64-bit integers from the low half of `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi64) pub fn _mm_unpacklo_epi64(a: __m128i, b: __m128i) -> __m128i { - { transmute::(simd_shuffle(a.as_i64x2(), b.as_i64x2(), [0, 2])) } + transmute::(simd_shuffle(a.as_i64x2(), b.as_i64x2(), [0, 2])) } /// Returns a new vector with the low element of `a` replaced by the sum of the /// low elements of `a` and `b`. @@ -1443,7 +1453,7 @@ pub fn _mm_setzero_pd() -> __m128d { pub fn _mm_movemask_pd(a: __m128d) -> i32 { { let mask: i64x2 = simd_lt(transmute(a), i64x2::ZERO()); - simd_bitmask_little!(1,mask,u8) as i32 + simd_bitmask_little!(1, mask, u8) as i32 } } /// Constructs a 128-bit floating-point vector of `[2 x double]` from two @@ -1453,7 +1463,11 @@ pub fn _mm_movemask_pd(a: __m128d) -> i32 { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pd) pub fn _mm_shuffle_pd(a: __m128d, b: __m128d) -> __m128d { static_assert_uimm_bits!(MASK, 8); - { transmute(simd_shuffle(a.as_f64x2(), b.as_f64x2(), [MASK as u32 & 0b1, ((MASK as u32 >> 1) & 0b1) + 2])) } + transmute(simd_shuffle( + a.as_f64x2(), + b.as_f64x2(), + [MASK as u32 & 0b1, ((MASK as u32 >> 1) & 0b1) + 2], + )) } /// Constructs a 128-bit floating-point vector of `[2 x double]`. The lower /// 64 bits are set to the lower 64 bits of the second parameter. The upper @@ -1461,49 +1475,49 @@ pub fn _mm_shuffle_pd(a: __m128d, b: __m128d) -> __m128d { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_sd) pub fn _mm_move_sd(a: __m128d, b: __m128d) -> __m128d { - { _mm_setr_pd(simd_extract(b.as_f64x2(), 0), simd_extract(a.as_f64x2(), 1)) } + _mm_setr_pd(simd_extract(b.as_f64x2(), 0), simd_extract(a.as_f64x2(), 1)) } /// Casts a 128-bit floating-point vector of `[2 x double]` into a 128-bit /// floating-point vector of `[4 x float]`. 
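// How the unpack and movemask models above behave, in brief (illustrative sketch):
//
//     // _mm_unpacklo_epi8(a, b) interleaves the low halves: [a0, b0, a1, b1, ..., a7, b7],
//     // matching the simd_shuffle index array [0, 16, 1, 17, ...] in its body, and
//     // _mm_unpackhi_epi8(a, b) does the same for lanes 8..=15.
//     // _mm_movemask_pd packs the sign bit of each f64 lane into the low two bits of the
//     // result, so a vector whose lanes are [-1.0, 2.0] yields 0b01.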
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_ps) pub fn _mm_castpd_ps(a: __m128d) -> __m128 { - { transmute(a) } + transmute(a) } /// Casts a 128-bit floating-point vector of `[2 x double]` into a 128-bit /// integer vector. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_si128) pub fn _mm_castpd_si128(a: __m128d) -> __m128i { - { transmute(a) } + transmute(a) } /// Casts a 128-bit floating-point vector of `[4 x float]` into a 128-bit /// floating-point vector of `[2 x double]`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_pd) pub fn _mm_castps_pd(a: __m128) -> __m128d { - { transmute(a) } + transmute(a) } /// Casts a 128-bit floating-point vector of `[4 x float]` into a 128-bit /// integer vector. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_si128) pub fn _mm_castps_si128(a: __m128) -> __m128i { - { transmute(a) } + transmute(a) } /// Casts a 128-bit integer vector into a 128-bit floating-point vector /// of `[2 x double]`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_pd) pub fn _mm_castsi128_pd(a: __m128i) -> __m128d { - { transmute(a) } + transmute(a) } /// Casts a 128-bit integer vector into a 128-bit floating-point vector /// of `[4 x float]`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_ps) pub fn _mm_castsi128_ps(a: __m128i) -> __m128 { - { transmute(a) } + transmute(a) } /// Returns vector of type __m128d with indeterminate elements.
/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically @@ -1531,7 +1545,7 @@ pub fn _mm_undefined_si128() -> __m128i { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_pd) pub fn _mm_unpackhi_pd(a: __m128d, b: __m128d) -> __m128d { - { transmute(simd_shuffle(a.as_f64x2(), b.as_f64x2(), [1, 3])) } + transmute(simd_shuffle(a.as_f64x2(), b.as_f64x2(), [1, 3])) } /// The resulting `__m128d` element is composed by the high-order values of /// the two `__m128d` interleaved input elements, i.e.: @@ -1541,6 +1555,5 @@ pub fn _mm_unpackhi_pd(a: __m128d, b: __m128d) -> __m128d { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_pd) pub fn _mm_unpacklo_pd(a: __m128d, b: __m128d) -> __m128d { - { transmute(simd_shuffle(a.as_f64x2(), b.as_f64x2(), [0, 2])) } + transmute(simd_shuffle(a.as_f64x2(), b.as_f64x2(), [0, 2])) } - From ff2f829ae14dbd6b58db06df82c936756c50899b Mon Sep 17 00:00:00 2001 From: karthikbhargavan Date: Thu, 31 Jul 2025 01:26:48 -0400 Subject: [PATCH 29/47] sse3 models --- .../src/core_arch/x86/models/ssse3.rs | 161 +++++++++--------- 1 file changed, 80 insertions(+), 81 deletions(-) diff --git a/testable-simd-models/src/core_arch/x86/models/ssse3.rs b/testable-simd-models/src/core_arch/x86/models/ssse3.rs index 900c32e5e5293..665e83460fca6 100644 --- a/testable-simd-models/src/core_arch/x86/models/ssse3.rs +++ b/testable-simd-models/src/core_arch/x86/models/ssse3.rs @@ -11,36 +11,39 @@ use super::types::*; /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi8) pub fn _mm_abs_epi8(a: __m128i) -> __m128i { - let a = a.as_i8x16(); - let zero = i8x16::from_fn(|_| 0); - let r = simd_select(simd_lt(a, zero), simd_neg(a), a); - transmute(r) + { + let a = a.as_i8x16(); + let zero = i8x16::ZERO(); + let r = simd_select(simd_lt(a, zero), simd_neg(a), a); + transmute(r) + } } - /// Computes the absolute value of each of the packed 16-bit signed integers in /// `a` and /// return the 16-bit unsigned integer /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi16) pub fn _mm_abs_epi16(a: __m128i) -> __m128i { - let a = a.as_i16x8(); - let zero = i16x8::from_fn(|_| 0); - let r = simd_select(simd_lt(a, zero), simd_neg(a), a); - transmute(r) + { + let a = a.as_i16x8(); + let zero = i16x8::ZERO(); + let r = simd_select(simd_lt(a, zero), simd_neg(a), a); + transmute(r) + } } - /// Computes the absolute value of each of the packed 32-bit signed integers in /// `a` and /// return the 32-bit unsigned integer /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi32) pub fn _mm_abs_epi32(a: __m128i) -> __m128i { - let a = a.as_i32x4(); - let zero = i32x4::from_fn(|_| 0); - let r = simd_select(simd_lt(a, zero), simd_neg(a), a); - transmute(r) + { + let a = a.as_i32x4(); + let zero = i32x4::ZERO(); + let r = simd_select(simd_lt(a, zero), simd_neg(a), a); + transmute(r) + } } - /// Shuffles bytes from `a` according to the content of `b`. 
/// /// The last 4 bits of each byte of `b` are used as addresses @@ -68,23 +71,19 @@ pub fn _mm_abs_epi32(a: __m128i) -> __m128i { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi8) pub fn _mm_shuffle_epi8(a: __m128i, b: __m128i) -> __m128i { - transmute(pshufb128(a.as_u8x16(), b.as_u8x16())) + { + transmute(pshufb128(a.as_u8x16(), b.as_u8x16())) + } } - /// Concatenate 16-byte blocks in `a` and `b` into a 32-byte temporary result, /// shift the result right by `n` bytes, and returns the low 16 bytes. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_epi8) - pub fn _mm_alignr_epi8(a: __m128i, b: __m128i) -> __m128i { - // TODO static_assert_uimm_bits!(IMM8, 8); - // If palignr is shifting the pair of vectors more than the size of two - // lanes, emit zero. + static_assert_uimm_bits!(IMM8, 8); if IMM8 > 32 { return _mm_setzero_si128(); } - // If palignr is shifting the pair of input vectors more than one lane, - // but less than two lanes, convert to shifting in zeroes. let (a, b) = if IMM8 > 16 { (_mm_setzero_si128(), a) } else { @@ -92,7 +91,6 @@ pub fn _mm_alignr_epi8(a: __m128i, b: __m128i) -> __m128i { }; const fn mask(shift: u32, i: u32) -> u32 { if shift > 32 { - // Unused, but needs to be a valid index. i } else if shift > 16 { shift - 16 + i @@ -100,89 +98,89 @@ pub fn _mm_alignr_epi8(a: __m128i, b: __m128i) -> __m128i { shift + i } } - - let r: i8x16 = simd_shuffle( - b.as_i8x16(), - a.as_i8x16(), - [ - mask(IMM8 as u32, 0), - mask(IMM8 as u32, 1), - mask(IMM8 as u32, 2), - mask(IMM8 as u32, 3), - mask(IMM8 as u32, 4), - mask(IMM8 as u32, 5), - mask(IMM8 as u32, 6), - mask(IMM8 as u32, 7), - mask(IMM8 as u32, 8), - mask(IMM8 as u32, 9), - mask(IMM8 as u32, 10), - mask(IMM8 as u32, 11), - mask(IMM8 as u32, 12), - mask(IMM8 as u32, 13), - mask(IMM8 as u32, 14), - mask(IMM8 as u32, 15), - ], - ); - r.into() + { + let r: i8x16 = simd_shuffle( + b.as_i8x16(), + a.as_i8x16(), + [ + mask(IMM8 as u32, 0), + mask(IMM8 as u32, 1), + mask(IMM8 as u32, 2), + mask(IMM8 as u32, 3), + mask(IMM8 as u32, 4), + mask(IMM8 as u32, 5), + mask(IMM8 as u32, 6), + mask(IMM8 as u32, 7), + mask(IMM8 as u32, 8), + mask(IMM8 as u32, 9), + mask(IMM8 as u32, 10), + mask(IMM8 as u32, 11), + mask(IMM8 as u32, 12), + mask(IMM8 as u32, 13), + mask(IMM8 as u32, 14), + mask(IMM8 as u32, 15), + ], + ); + transmute(r) + } } - /// Horizontally adds the adjacent pairs of values contained in 2 packed /// 128-bit vectors of `[8 x i16]`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi16) - pub fn _mm_hadd_epi16(a: __m128i, b: __m128i) -> __m128i { - phaddw128(a.as_i16x8(), b.as_i16x8()).into() + { + transmute(phaddw128(a.as_i16x8(), b.as_i16x8())) + } } - /// Horizontally adds the adjacent pairs of values contained in 2 packed /// 128-bit vectors of `[8 x i16]`. Positive sums greater than 7FFFh are /// saturated to 7FFFh. Negative sums less than 8000h are saturated to 8000h. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_epi16) - pub fn _mm_hadds_epi16(a: __m128i, b: __m128i) -> __m128i { - phaddsw128(a.as_i16x8(), b.as_i16x8()).into() + { + transmute(phaddsw128(a.as_i16x8(), b.as_i16x8())) + } } - /// Horizontally adds the adjacent pairs of values contained in 2 packed /// 128-bit vectors of `[4 x i32]`. 
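// Worked examples for the SSSE3 models above and below (illustrative sketch only):
//
// * `_mm_alignr_epi8::<4>(a, b)`: `mask(4, i) = 4 + i`, so the shuffle takes bytes
//   4..=15 of `b` followed by bytes 0..=3 of `a` — the low 16 bytes of the 32-byte
//   concatenation (`a` high, `b` low) shifted right by 4 bytes. For `IMM8 > 16` the
//   pair is first rewritten to `(zero, a)`, so the shift pulls in zero bytes.
// * `_mm_hadd_epi16(a, b)` sums adjacent lanes: `[a0+a1, a2+a3, a4+a5, a6+a7,
//   b0+b1, b2+b3, b4+b5, b6+b7]` (low to high); `_mm_hadds_epi16` saturates each sum.
// * `_mm_mulhrs_epi16` computes, per 16-bit lane, `(((a * b) >> 14) + 1) >> 1`, e.g.
//   16384 * 8192 = 2^27, so (2^27 >> 14) + 1 = 8193, then >> 1 gives 4096.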
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi32) - pub fn _mm_hadd_epi32(a: __m128i, b: __m128i) -> __m128i { - phaddd128(a.as_i32x4(), b.as_i32x4()).into() + { + transmute(phaddd128(a.as_i32x4(), b.as_i32x4())) + } } - /// Horizontally subtract the adjacent pairs of values contained in 2 /// packed 128-bit vectors of `[8 x i16]`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi16) - pub fn _mm_hsub_epi16(a: __m128i, b: __m128i) -> __m128i { - phsubw128(a.as_i16x8(), b.as_i16x8()).into() + { + transmute(phsubw128(a.as_i16x8(), b.as_i16x8())) + } } - /// Horizontally subtract the adjacent pairs of values contained in 2 /// packed 128-bit vectors of `[8 x i16]`. Positive differences greater than /// 7FFFh are saturated to 7FFFh. Negative differences less than 8000h are /// saturated to 8000h. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_epi16) - pub fn _mm_hsubs_epi16(a: __m128i, b: __m128i) -> __m128i { - phsubsw128(a.as_i16x8(), b.as_i16x8()).into() + { + transmute(phsubsw128(a.as_i16x8(), b.as_i16x8())) + } } - /// Horizontally subtract the adjacent pairs of values contained in 2 /// packed 128-bit vectors of `[4 x i32]`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi32) - pub fn _mm_hsub_epi32(a: __m128i, b: __m128i) -> __m128i { - phsubd128(a.as_i32x4(), b.as_i32x4()).into() + { + transmute(phsubd128(a.as_i32x4(), b.as_i32x4())) + } } - /// Multiplies corresponding pairs of packed 8-bit unsigned integer /// values contained in the first source operand and packed 8-bit signed /// integer values contained in the second source operand, add pairs of @@ -190,50 +188,51 @@ pub fn _mm_hsub_epi32(a: __m128i, b: __m128i) -> __m128i { /// the corresponding bits in the destination. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_epi16) - pub fn _mm_maddubs_epi16(a: __m128i, b: __m128i) -> __m128i { - pmaddubsw128(a.as_u8x16(), b.as_i8x16()).into() + { + transmute(pmaddubsw128(a.as_u8x16(), b.as_i8x16())) + } } - /// Multiplies packed 16-bit signed integer values, truncate the 32-bit /// product to the 18 most significant bits by right-shifting, round the /// truncated value by adding 1, and write bits `[16:1]` to the destination. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_epi16) - pub fn _mm_mulhrs_epi16(a: __m128i, b: __m128i) -> __m128i { - pmulhrsw128(a.as_i16x8(), b.as_i16x8()).into() + { + transmute(pmulhrsw128(a.as_i16x8(), b.as_i16x8())) + } } - /// Negates packed 8-bit integers in `a` when the corresponding signed 8-bit /// integer in `b` is negative, and returns the result. /// Elements in result are zeroed out when the corresponding element in `b` /// is zero. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi8) - pub fn _mm_sign_epi8(a: __m128i, b: __m128i) -> __m128i { - psignb128(a.as_i8x16(), b.as_i8x16()).into() + { + transmute(psignb128(a.as_i8x16(), b.as_i8x16())) + } } - /// Negates packed 16-bit integers in `a` when the corresponding signed 16-bit /// integer in `b` is negative, and returns the results. 
/// Elements in result are zeroed out when the corresponding element in `b` /// is zero. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi16) - pub fn _mm_sign_epi16(a: __m128i, b: __m128i) -> __m128i { - psignw128(a.as_i16x8(), b.as_i16x8()).into() + { + transmute(psignw128(a.as_i16x8(), b.as_i16x8())) + } } - /// Negates packed 32-bit integers in `a` when the corresponding signed 32-bit /// integer in `b` is negative, and returns the results. /// Element in result are zeroed out when the corresponding element in `b` /// is zero. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi32) - pub fn _mm_sign_epi32(a: __m128i, b: __m128i) -> __m128i { - psignd128(a.as_i32x4(), b.as_i32x4()).into() + { + transmute(psignd128(a.as_i32x4(), b.as_i32x4())) + } } From 09a0cbd4fcc337ad60803e430de9808f66aa92cb Mon Sep 17 00:00:00 2001 From: karthikbhargavan Date: Thu, 31 Jul 2025 01:31:01 -0400 Subject: [PATCH 30/47] added unmodeled functions --- .../src/core_arch/x86/models/avx.rs | 69 +++++++++++++++++++ .../src/core_arch/x86/models/avx2.rs | 6 ++ .../src/core_arch/x86/models/sse2.rs | 59 ++++++++++++++++ 3 files changed, 134 insertions(+) diff --git a/testable-simd-models/src/core_arch/x86/models/avx.rs b/testable-simd-models/src/core_arch/x86/models/avx.rs index 91b168acf3f53..1789a2efa6d2b 100644 --- a/testable-simd-models/src/core_arch/x86/models/avx.rs +++ b/testable-simd-models/src/core_arch/x86/models/avx.rs @@ -23,6 +23,7 @@ use crate::abstractions::utilities::*; /// in `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_add_pd) +// NOTE: Not modeled yet // pub fn _mm256_add_pd(a: __m256d, b: __m256d) -> __m256d { // { transmute(simd_add(a.as_f64x4(), b.as_f64x4())) } // } @@ -30,6 +31,7 @@ use crate::abstractions::utilities::*; /// `b`. 
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_add_ps) +// NOTE: Not modeled yet // pub fn _mm256_add_ps(a: __m256, b: __m256) -> __m256 { // { transmute(simd_add(a.as_f32x8(), b.as_f32x8())) } // } @@ -146,6 +148,7 @@ pub fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 { /// in `a` and `b`, and returns packed maximum values /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_max_pd) +// NOTE: Not modeled yet // pub fn _mm256_max_pd(a: __m256d, b: __m256d) -> __m256d { // { vmaxpd(a, b) } // } @@ -153,6 +156,7 @@ pub fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 { /// and `b`, and returns packed maximum values /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_max_ps) +// NOTE: Not modeled yet // pub fn _mm256_max_ps(a: __m256, b: __m256) -> __m256 { // { vmaxps(a, b) } // } @@ -160,6 +164,7 @@ pub fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 { /// in `a` and `b`, and returns packed minimum values /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_min_pd) +// NOTE: Not modeled yet // pub fn _mm256_min_pd(a: __m256d, b: __m256d) -> __m256d { // { vminpd(a, b) } // } @@ -167,6 +172,7 @@ pub fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 { /// and `b`, and returns packed minimum values /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_min_ps) +// NOTE: Not modeled yet // pub fn _mm256_min_ps(a: __m256, b: __m256) -> __m256 { // { vminps(a, b) } // } @@ -174,6 +180,7 @@ pub fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 { /// in `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_mul_pd) +// NOTE: Not modeled yet // pub fn _mm256_mul_pd(a: __m256d, b: __m256d) -> __m256d { // { transmute(simd_mul(a.as_f64x4(), b.as_f64x4())) } // } @@ -181,6 +188,7 @@ pub fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 { /// `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_mul_ps) +// NOTE: Not modeled yet // pub fn _mm256_mul_ps(a: __m256, b: __m256) -> __m256 { // { transmute(simd_mul(a.as_f32x8(), b.as_f32x8())) } // } @@ -188,6 +196,7 @@ pub fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 { /// floating-point elements in `a` to/from packed elements in `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_addsub_pd) +// NOTE: Not modeled yet // pub fn _mm256_addsub_pd(a: __m256d, b: __m256d) -> __m256d { // { // let a = a.as_f64x4(); @@ -201,6 +210,7 @@ pub fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 { /// floating-point elements in `a` to/from packed elements in `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_addsub_ps) +// NOTE: Not modeled yet // pub fn _mm256_addsub_ps(a: __m256, b: __m256) -> __m256 { // { // let a = a.as_f32x8(); @@ -214,6 +224,7 @@ pub fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 { /// from packed elements in `a`. 
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_sub_pd) +// NOTE: Not modeled yet // pub fn _mm256_sub_pd(a: __m256d, b: __m256d) -> __m256d { // { simd_sub(a, b) } // } @@ -221,6 +232,7 @@ pub fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 { /// from packed elements in `a`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_sub_ps) +// NOTE: Not modeled yet // pub fn _mm256_sub_ps(a: __m256, b: __m256) -> __m256 { // { simd_sub(a, b) } // } @@ -228,6 +240,7 @@ pub fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 { /// in `a` by the corresponding packed elements in `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_div_ps) +// NOTE: Not modeled yet // pub fn _mm256_div_ps(a: __m256, b: __m256) -> __m256 { // { simd_div(a, b) } // } @@ -235,6 +248,7 @@ pub fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 { /// in `a` by the corresponding packed elements in `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_div_pd) +// NOTE: Not modeled yet // pub fn _mm256_div_pd(a: __m256d, b: __m256d) -> __m256d { // { simd_div(a, b) } // } @@ -251,6 +265,7 @@ pub fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 { /// [llvm_docs]: https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382 /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_round_pd) +// NOTE: Not modeled yet // pub fn _mm256_round_pd(a: __m256d) -> __m256d { // static_assert_uimm_bits!(ROUNDING, 4); // { roundpd256(a, ROUNDING) } @@ -259,6 +274,7 @@ pub fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 { /// toward positive infinity. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_ceil_pd) +// NOTE: Not modeled yet // pub fn _mm256_ceil_pd(a: __m256d) -> __m256d { // { simd_ceil(a) } // } @@ -266,6 +282,7 @@ pub fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 { /// toward negative infinity. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_floor_pd) +// NOTE: Not modeled yet // pub fn _mm256_floor_pd(a: __m256d) -> __m256d { // { simd_floor(a) } // } @@ -282,6 +299,7 @@ pub fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 { /// [llvm_docs]: https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382 /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_round_ps) +// NOTE: Not modeled yet // pub fn _mm256_round_ps(a: __m256) -> __m256 { // static_assert_uimm_bits!(ROUNDING, 4); // { roundps256(a, ROUNDING) } @@ -290,6 +308,7 @@ pub fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 { /// toward positive infinity. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_ceil_ps) +// NOTE: Not modeled yet // pub fn _mm256_ceil_ps(a: __m256) -> __m256 { // { simd_ceil(a) } // } @@ -297,6 +316,7 @@ pub fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 { /// toward negative infinity. 
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_floor_ps) +// NOTE: Not modeled yet // pub fn _mm256_floor_ps(a: __m256) -> __m256 { // { simd_floor(a) } // } @@ -304,6 +324,7 @@ pub fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 { /// elements in `a`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_sqrt_ps) +// NOTE: Not modeled yet // pub fn _mm256_sqrt_ps(a: __m256) -> __m256 { // { simd_fsqrt(a) } // } @@ -311,6 +332,7 @@ pub fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 { /// elements in `a`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_sqrt_pd) +// NOTE: Not modeled yet // pub fn _mm256_sqrt_pd(a: __m256d) -> __m256d { // { simd_fsqrt(a) } // } @@ -382,6 +404,7 @@ pub fn _mm256_blendv_ps(a: __m256, b: __m256, c: __m256) -> __m256 { /// using the low 4 bits of `imm8`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_dp_ps) +// NOTE: Not modeled yet // pub fn _mm256_dp_ps(a: __m256, b: __m256) -> __m256 { // static_assert_uimm_bits!(IMM8, 8); // { vdpps(a, b, IMM8 as i8) } @@ -392,6 +415,7 @@ pub fn _mm256_blendv_ps(a: __m256, b: __m256, c: __m256) -> __m256 { /// while sums of elements from `b` are returned in odd locations. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_hadd_pd) +// NOTE: Not modeled yet // pub fn _mm256_hadd_pd(a: __m256d, b: __m256d) -> __m256d { // { vhaddpd(a, b) } // } @@ -402,6 +426,7 @@ pub fn _mm256_blendv_ps(a: __m256, b: __m256, c: __m256) -> __m256 { /// 2, 3, 6, 7. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_hadd_ps) +// NOTE: Not modeled yet // pub fn _mm256_hadd_ps(a: __m256, b: __m256) -> __m256 { // { vhaddps(a, b) } // } @@ -411,6 +436,7 @@ pub fn _mm256_blendv_ps(a: __m256, b: __m256, c: __m256) -> __m256 { /// while sums of elements from `b` are returned in odd locations. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_hsub_pd) +// NOTE: Not modeled yet // pub fn _mm256_hsub_pd(a: __m256d, b: __m256d) -> __m256d { // { vhsubpd(a, b) } // } @@ -421,6 +447,7 @@ pub fn _mm256_blendv_ps(a: __m256, b: __m256, c: __m256) -> __m256 { /// 2, 3, 6, 7. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_hsub_ps) +// NOTE: Not modeled yet // pub fn _mm256_hsub_ps(a: __m256, b: __m256) -> __m256 { // { vhsubps(a, b) } // } @@ -515,6 +542,7 @@ pub const _CMP_TRUE_US: i32 = 0x1f; /// specified by `IMM5`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm_cmp_pd) +// NOTE: Not modeled yet // pub fn _mm_cmp_pd(a: __m128d, b: __m128d) -> __m128d { // static_assert_uimm_bits!(IMM5, 5); // { vcmppd(a, b, const { IMM5 as i8 }) } @@ -524,6 +552,7 @@ pub const _CMP_TRUE_US: i32 = 0x1f; /// specified by `IMM5`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_cmp_pd) +// NOTE: Not modeled yet // pub fn _mm256_cmp_pd(a: __m256d, b: __m256d) -> __m256d { // static_assert_uimm_bits!(IMM5, 5); // { vcmppd256(a, b, IMM5 as u8) } @@ -533,6 +562,7 @@ pub const _CMP_TRUE_US: i32 = 0x1f; /// specified by `IMM5`. 
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm_cmp_ps) +// NOTE: Not modeled yet // pub fn _mm_cmp_ps(a: __m128, b: __m128) -> __m128 { // static_assert_uimm_bits!(IMM5, 5); // { vcmpps(a, b, const { IMM5 as i8 }) } @@ -542,6 +572,7 @@ pub const _CMP_TRUE_US: i32 = 0x1f; /// specified by `IMM5`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_cmp_ps) +// NOTE: Not modeled yet // pub fn _mm256_cmp_ps(a: __m256, b: __m256) -> __m256 { // static_assert_uimm_bits!(IMM5, 5); // { vcmpps256(a, b, const { IMM5 as u8 }) } @@ -553,6 +584,7 @@ pub const _CMP_TRUE_US: i32 = 0x1f; /// vector. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm_cmp_sd) +// NOTE: Not modeled yet // pub fn _mm_cmp_sd(a: __m128d, b: __m128d) -> __m128d { // static_assert_uimm_bits!(IMM5, 5); // { vcmpsd(a, b, IMM5 as i8) } @@ -564,6 +596,7 @@ pub const _CMP_TRUE_US: i32 = 0x1f; /// returned vector. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm_cmp_ss) +// NOTE: Not modeled yet // pub fn _mm_cmp_ss(a: __m128, b: __m128) -> __m128 { // static_assert_uimm_bits!(IMM5, 5); // { vcmpss(a, b, IMM5 as i8) } @@ -593,6 +626,7 @@ pub fn _mm256_cvtpd_ps(a: __m256d) -> __m128 { /// to packed 32-bit integers. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_cvtps_epi32) +// NOTE: Not modeled yet // pub fn _mm256_cvtps_epi32(a: __m256) -> __m256i { // { transmute(vcvtps2dq(a)) } // } @@ -613,6 +647,7 @@ pub fn _mm256_cvtsd_f64(a: __m256d) -> f64 { /// to packed 32-bit integers with truncation. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_cvttpd_epi32) +// NOTE: Not modeled yet // pub fn _mm256_cvttpd_epi32(a: __m256d) -> __m128i { // { transmute(vcvttpd2dq(a)) } // } @@ -620,6 +655,7 @@ pub fn _mm256_cvtsd_f64(a: __m256d) -> f64 { /// to packed 32-bit integers. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_cvtpd_epi32) +// NOTE: Not modeled yet // pub fn _mm256_cvtpd_epi32(a: __m256d) -> __m128i { // { transmute(vcvtpd2dq(a)) } // } @@ -627,6 +663,7 @@ pub fn _mm256_cvtsd_f64(a: __m256d) -> f64 { /// to packed 32-bit integers with truncation. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_cvttps_epi32) +// NOTE: Not modeled yet // pub fn _mm256_cvttps_epi32(a: __m256) -> __m256i { // { transmute(vcvttps2dq(a)) } // } @@ -682,6 +719,7 @@ pub fn _mm256_cvtsi256_si32(a: __m256i) -> i32 { /// Zeroes the contents of all XMM or YMM registers. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_zeroall) +// NOTE: Not modeled yet // pub fn _mm256_zeroall() { // { vzeroall() } // } @@ -689,6 +727,7 @@ pub fn _mm256_cvtsi256_si32(a: __m256i) -> i32 { /// the lower 128-bits of the registers are unmodified. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_zeroupper) +// NOTE: Not modeled yet // pub fn _mm256_zeroupper() { // { vzeroupper() } // } @@ -696,6 +735,7 @@ pub fn _mm256_cvtsi256_si32(a: __m256i) -> i32 { /// within 128-bit lanes using the control in `b`. 
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_permutevar_ps) +// NOTE: Not modeled yet // pub fn _mm256_permutevar_ps(a: __m256, b: __m256i) -> __m256 { // { vpermilps256(a, b.as_i32x8()) } // } @@ -703,6 +743,7 @@ pub fn _mm256_cvtsi256_si32(a: __m256i) -> i32 { /// using the control in `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm_permutevar_ps) +// NOTE: Not modeled yet // pub fn _mm_permutevar_ps(a: __m128, b: __m128i) -> __m128 { // { vpermilps(a, b.as_i32x4()) } // } @@ -733,6 +774,7 @@ pub fn _mm256_permute_ps(a: __m256) -> __m256 { /// using the control in `imm8`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm_permute_ps) +// NOTE: Not modeled yet // pub fn _mm_permute_ps(a: __m128) -> __m128 { // static_assert_uimm_bits!(IMM8, 8); // { @@ -746,6 +788,7 @@ pub fn _mm256_permute_ps(a: __m256) -> __m256 { /// within 256-bit lanes using the control in `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_permutevar_pd) +// NOTE: Not modeled yet // pub fn _mm256_permutevar_pd(a: __m256d, b: __m256i) -> __m256d { // { vpermilpd256(a, b.as_i64x4()) } // } @@ -753,6 +796,7 @@ pub fn _mm256_permute_ps(a: __m256) -> __m256 { /// using the control in `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm_permutevar_pd) +// NOTE: Not modeled yet // pub fn _mm_permutevar_pd(a: __m128d, b: __m128i) -> __m128d { // { vpermilpd(a, b.as_i64x2()) } // } @@ -793,6 +837,7 @@ pub fn _mm_permute_pd(a: __m128d) -> __m128d { /// floating-point elements) selected by `imm8` from `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_permute2f128_ps) +// NOTE: Not modeled yet // pub fn _mm256_permute2f128_ps(a: __m256, b: __m256) -> __m256 { // static_assert_uimm_bits!(IMM8, 8); // { vperm2f128ps256(a, b, IMM8 as i8) } @@ -801,6 +846,7 @@ pub fn _mm_permute_pd(a: __m128d) -> __m128d { /// floating-point elements) selected by `imm8` from `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_permute2f128_pd) +// NOTE: Not modeled yet // pub fn _mm256_permute2f128_pd(a: __m256d, b: __m256d) -> __m256d { // static_assert_uimm_bits!(IMM8, 8); // { vperm2f128pd256(a, b, IMM8 as i8) } @@ -824,6 +870,7 @@ pub fn _mm256_broadcast_ss(f: &f32) -> __m256 { /// to all elements of the returned vector. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm_broadcast_ss) +// NOTE: Not modeled yet // pub fn _mm_broadcast_ss(f: &f32) -> __m128 { // _mm_set1_ps(*f) // } @@ -831,6 +878,7 @@ pub fn _mm256_broadcast_ss(f: &f32) -> __m256 { /// to all elements of the returned vector. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_broadcast_sd) +// NOTE: Not modeled yet // pub fn _mm256_broadcast_sd(f: &f64) -> __m256d { // _mm256_set1_pd(*f) // } @@ -838,6 +886,7 @@ pub fn _mm256_broadcast_ss(f: &f32) -> __m256 { /// (32-bit) floating-point elements) to all elements of the returned vector. 
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_broadcast_ps) +// NOTE: Not modeled yet // pub fn _mm256_broadcast_ps(a: &__m128) -> __m256 { // { transmute(simd_shuffle((*a).as_f32x4(), _mm_setzero_ps(), [0, 1, 2, 3, 0, 1, 2, 3])) } // } @@ -857,6 +906,7 @@ pub fn _mm256_broadcast_pd(a: &__m128d) -> __m256d { /// at the location specified by `imm8`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_insertf128_ps) +// NOTE: Not modeled yet // pub fn _mm256_insertf128_ps(a: __m256, b: __m128) -> __m256 { // static_assert_uimm_bits!(IMM1, 1); // { @@ -871,6 +921,7 @@ pub fn _mm256_broadcast_pd(a: &__m128d) -> __m256d { /// at the location specified by `imm8`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_insertf128_pd) +// NOTE: Not modeled yet // pub fn _mm256_insertf128_pd(a: __m256d, b: __m128d) -> __m256d { // static_assert_uimm_bits!(IMM1, 1); // { @@ -952,6 +1003,7 @@ pub fn _mm256_movedup_pd(a: __m256d) -> __m256d { /// relative error for this approximation is less than 1.5*2^-12. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_rcp_ps) +// NOTE: Not modeled yet // pub fn _mm256_rcp_ps(a: __m256) -> __m256 { // { vrcpps(a) } // } @@ -960,6 +1012,7 @@ pub fn _mm256_movedup_pd(a: __m256d) -> __m256d { /// The maximum relative error for this approximation is less than 1.5*2^-12. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_rsqrt_ps) +// NOTE: Not modeled yet // pub fn _mm256_rsqrt_ps(a: __m256) -> __m256 { // { vrsqrtps(a) } // } @@ -1024,6 +1077,7 @@ pub fn _mm256_testc_si256(a: __m256i, b: __m256i) -> i32 { /// `CF` values are zero, otherwise return 0. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_testnzc_si256) +// NOTE: Not modeled yet // pub fn _mm256_testnzc_si256(a: __m256i, b: __m256i) -> i32 { // { ptestnzc256(a.as_i64x4(), b.as_i64x4()) } // } @@ -1036,6 +1090,7 @@ pub fn _mm256_testc_si256(a: __m256i, b: __m256i) -> i32 { /// is zero, otherwise set `CF` to 0. Return the `ZF` value. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_testz_pd) +// NOTE: Not modeled yet // pub fn _mm256_testz_pd(a: __m256d, b: __m256d) -> i32 { // { vtestzpd256(a, b) } // } @@ -1048,6 +1103,7 @@ pub fn _mm256_testc_si256(a: __m256i, b: __m256i) -> i32 { /// is zero, otherwise set `CF` to 0. Return the `CF` value. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_testc_pd) +// NOTE: Not modeled yet // pub fn _mm256_testc_pd(a: __m256d, b: __m256d) -> i32 { // { vtestcpd256(a, b) } // } @@ -1061,6 +1117,7 @@ pub fn _mm256_testc_si256(a: __m256i, b: __m256i) -> i32 { /// are zero, otherwise return 0. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_testnzc_pd) +// NOTE: Not modeled yet // pub fn _mm256_testnzc_pd(a: __m256d, b: __m256d) -> i32 { // { vtestnzcpd256(a, b) } // } @@ -1073,6 +1130,7 @@ pub fn _mm256_testc_si256(a: __m256i, b: __m256i) -> i32 { /// is zero, otherwise set `CF` to 0. Return the `ZF` value. 
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm_testz_pd) +// NOTE: Not modeled yet // pub fn _mm_testz_pd(a: __m128d, b: __m128d) -> i32 { // { vtestzpd(a, b) } // } @@ -1085,6 +1143,7 @@ pub fn _mm256_testc_si256(a: __m256i, b: __m256i) -> i32 { /// is zero, otherwise set `CF` to 0. Return the `CF` value. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm_testc_pd) +// NOTE: Not modeled yet // pub fn _mm_testc_pd(a: __m128d, b: __m128d) -> i32 { // { vtestcpd(a, b) } // } @@ -1098,6 +1157,7 @@ pub fn _mm256_testc_si256(a: __m256i, b: __m256i) -> i32 { /// are zero, otherwise return 0. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm_testnzc_pd) +// NOTE: Not modeled yet // pub fn _mm_testnzc_pd(a: __m128d, b: __m128d) -> i32 { // { vtestnzcpd(a, b) } // } @@ -1110,6 +1170,7 @@ pub fn _mm256_testc_si256(a: __m256i, b: __m256i) -> i32 { /// is zero, otherwise set `CF` to 0. Return the `ZF` value. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_testz_ps) +// NOTE: Not modeled yet // pub fn _mm256_testz_ps(a: __m256, b: __m256) -> i32 { // { vtestzps256(a, b) } // } @@ -1122,6 +1183,7 @@ pub fn _mm256_testc_si256(a: __m256i, b: __m256i) -> i32 { /// is zero, otherwise set `CF` to 0. Return the `CF` value. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_testc_ps) +// NOTE: Not modeled yet // pub fn _mm256_testc_ps(a: __m256, b: __m256) -> i32 { // { vtestcps256(a, b) } // } @@ -1135,6 +1197,7 @@ pub fn _mm256_testc_si256(a: __m256i, b: __m256i) -> i32 { /// are zero, otherwise return 0. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_testnzc_ps) +// NOTE: Not modeled yet // pub fn _mm256_testnzc_ps(a: __m256, b: __m256) -> i32 { // { vtestnzcps256(a, b) } // } @@ -1147,6 +1210,7 @@ pub fn _mm256_testc_si256(a: __m256i, b: __m256i) -> i32 { /// is zero, otherwise set `CF` to 0. Return the `ZF` value. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm_testz_ps) +// NOTE: Not modeled yet // pub fn _mm_testz_ps(a: __m128, b: __m128) -> i32 { // { vtestzps(a, b) } // } @@ -1159,6 +1223,7 @@ pub fn _mm256_testc_si256(a: __m256i, b: __m256i) -> i32 { /// is zero, otherwise set `CF` to 0. Return the `CF` value. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm_testc_ps) +// NOTE: Not modeled yet // pub fn _mm_testc_ps(a: __m128, b: __m128) -> i32 { // { vtestcps(a, b) } // } @@ -1172,6 +1237,7 @@ pub fn _mm256_testc_si256(a: __m256i, b: __m256i) -> i32 { /// are zero, otherwise return 0. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm_testnzc_ps) +// NOTE: Not modeled yet // pub fn _mm_testnzc_ps(a: __m128, b: __m128) -> i32 { // { vtestnzcps(a, b) } // } @@ -1534,6 +1600,7 @@ pub fn _mm256_castsi256_si128(a: __m256i) -> __m128i { /// the upper 128 bits of the result are undefined. 
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_castps128_ps256) +// NOTE: Not modeled yet // pub fn _mm256_castps128_ps256(a: __m128) -> __m256 { // { simd_shuffle(a, _mm_undefined_ps(), [0, 1, 2, 3, 4, 4, 4, 4]) } // } @@ -1565,6 +1632,7 @@ pub fn _mm256_castsi128_si256(a: __m128i) -> __m256i { /// the value of the source vector. The upper 128 bits are set to zero. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_zextps128_ps256) +// NOTE: Not modeled yet // pub fn _mm256_zextps128_ps256(a: __m128) -> __m256 { // { simd_shuffle(a, _mm_setzero_ps(), [0, 1, 2, 3, 4, 5, 6, 7]) } // } @@ -1586,6 +1654,7 @@ pub fn _mm256_zextsi128_si256(a: __m128i) -> __m256i { /// to zero. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_zextpd128_pd256) +// NOTE: Not modeled yet // pub fn _mm256_zextpd128_pd256(a: __m128d) -> __m256d { // { simd_shuffle(a, _mm_setzero_pd(), [0, 1, 2, 3]) } // } diff --git a/testable-simd-models/src/core_arch/x86/models/avx2.rs b/testable-simd-models/src/core_arch/x86/models/avx2.rs index b731bd4a2022f..eb7cebe85abaf 100644 --- a/testable-simd-models/src/core_arch/x86/models/avx2.rs +++ b/testable-simd-models/src/core_arch/x86/models/avx2.rs @@ -386,6 +386,7 @@ pub fn _mm256_broadcastq_epi64(a: __m128i) -> __m256i { /// from `a` to all elements of the 128-bit returned value. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastsd_pd) +// NOTE: Not modeled yet // pub fn _mm_broadcastsd_pd(a: __m128d) -> __m128d { // { simd_shuffle(a, _mm_setzero_pd(), [0_u32; 2]) } // } @@ -393,6 +394,7 @@ pub fn _mm256_broadcastq_epi64(a: __m128i) -> __m256i { /// from `a` to all elements of the 256-bit returned value. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastsd_pd) +// NOTE: Not modeled yet // pub fn _mm256_broadcastsd_pd(a: __m128d) -> __m256d { // { simd_shuffle(a, _mm_setzero_pd(), [0_u32; 4]) } // } @@ -420,6 +422,7 @@ pub fn _mm256_broadcastsi128_si256(a: __m128i) -> __m256i { /// from `a` to all elements of the 128-bit returned value. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastss_ps) +// NOTE: Not modeled yet // pub fn _mm_broadcastss_ps(a: __m128) -> __m128 { // { simd_shuffle(a, _mm_setzero_ps(), [0_u32; 4]) } // } @@ -427,6 +430,7 @@ pub fn _mm256_broadcastsi128_si256(a: __m128i) -> __m256i { /// from `a` to all elements of the 256-bit returned value. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastss_ps) +// NOTE: Not modeled yet // pub fn _mm256_broadcastss_ps(a: __m128) -> __m256 { // { simd_shuffle(a, _mm_setzero_ps(), [0_u32; 8]) } // } @@ -1053,6 +1057,7 @@ pub fn _mm256_permute2x128_si256(a: __m256i, b: __m256i) -> __m /// control in `imm8`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute4x64_pd) +// NOTE: Not modeled yet // pub fn _mm256_permute4x64_pd(a: __m256d) -> __m256d { // static_assert_uimm_bits!(IMM8, 8); // { @@ -1066,6 +1071,7 @@ pub fn _mm256_permute2x128_si256(a: __m256i, b: __m256i) -> __m /// the corresponding 32-bit integer index in `idx`. 
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar8x32_ps) +// NOTE: Not modeled yet // pub fn _mm256_permutevar8x32_ps(a: __m256, idx: __m256i) -> __m256 { // { permps(a, idx.as_i32x8()) } // } diff --git a/testable-simd-models/src/core_arch/x86/models/sse2.rs b/testable-simd-models/src/core_arch/x86/models/sse2.rs index 66a99aadedc5f..c9c90e3e9e267 100644 --- a/testable-simd-models/src/core_arch/x86/models/sse2.rs +++ b/testable-simd-models/src/core_arch/x86/models/sse2.rs @@ -607,6 +607,7 @@ pub fn _mm_cvtepi32_ps(a: __m128i) -> __m128 { /// to packed 32-bit integers. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_epi32) +// NOTE: Not modeled yet // pub fn _mm_cvtps_epi32(a: __m128) -> __m128i { // { transmute(cvtps2dq(a)) } // } @@ -955,6 +956,7 @@ pub fn _mm_unpacklo_epi64(a: __m128i, b: __m128i) -> __m128i { /// low elements of `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_sd) +// NOTE: Not modeled yet // pub fn _mm_add_sd(a: __m128d, b: __m128d) -> __m128d { // { transmute(simd_insert(a.as_f64x2(), 0, _mm_cvtsd_f64(a) + _mm_cvtsd_f64(b))) } // } @@ -962,6 +964,7 @@ pub fn _mm_unpacklo_epi64(a: __m128i, b: __m128i) -> __m128i { /// `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_pd) +// NOTE: Not modeled yet // pub fn _mm_add_pd(a: __m128d, b: __m128d) -> __m128d { // { simd_add(a, b) } // } @@ -969,6 +972,7 @@ pub fn _mm_unpacklo_epi64(a: __m128i, b: __m128i) -> __m128i { /// diving the lower element of `a` by the lower element of `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_sd) +// NOTE: Not modeled yet // pub fn _mm_div_sd(a: __m128d, b: __m128d) -> __m128d { // { transmute(simd_insert(a.as_f64x2(), 0, _mm_cvtsd_f64(a) / _mm_cvtsd_f64(b))) } // } @@ -976,6 +980,7 @@ pub fn _mm_unpacklo_epi64(a: __m128i, b: __m128i) -> __m128i { /// packed elements in `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_pd) +// NOTE: Not modeled yet // pub fn _mm_div_pd(a: __m128d, b: __m128d) -> __m128d { // { simd_div(a, b) } // } @@ -983,6 +988,7 @@ pub fn _mm_unpacklo_epi64(a: __m128i, b: __m128i) -> __m128i { /// of the lower elements of `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_sd) +// NOTE: Not modeled yet // pub fn _mm_max_sd(a: __m128d, b: __m128d) -> __m128d { // { maxsd(a, b) } // } @@ -990,6 +996,7 @@ pub fn _mm_unpacklo_epi64(a: __m128i, b: __m128i) -> __m128i { /// `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pd) +// NOTE: Not modeled yet // pub fn _mm_max_pd(a: __m128d, b: __m128d) -> __m128d { // { maxpd(a, b) } // } @@ -997,6 +1004,7 @@ pub fn _mm_unpacklo_epi64(a: __m128i, b: __m128i) -> __m128i { /// of the lower elements of `a` and `b`. 
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_sd) +// NOTE: Not modeled yet // pub fn _mm_min_sd(a: __m128d, b: __m128d) -> __m128d { // { minsd(a, b) } // } @@ -1004,6 +1012,7 @@ pub fn _mm_unpacklo_epi64(a: __m128i, b: __m128i) -> __m128i { /// `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pd) +// NOTE: Not modeled yet // pub fn _mm_min_pd(a: __m128d, b: __m128d) -> __m128d { // { minpd(a, b) } // } @@ -1011,6 +1020,7 @@ pub fn _mm_unpacklo_epi64(a: __m128i, b: __m128i) -> __m128i { /// low elements of `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_sd) +// NOTE: Not modeled yet // pub fn _mm_mul_sd(a: __m128d, b: __m128d) -> __m128d { // { transmute(simd_insert(a.as_f64x2(), 0, _mm_cvtsd_f64(a) * _mm_cvtsd_f64(b))) } // } @@ -1018,6 +1028,7 @@ pub fn _mm_unpacklo_epi64(a: __m128i, b: __m128i) -> __m128i { /// and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_pd) +// NOTE: Not modeled yet // pub fn _mm_mul_pd(a: __m128d, b: __m128d) -> __m128d { // { transmute(simd_mul(a.as_f64x2(), b.as_f64x2())) } // } @@ -1025,12 +1036,14 @@ pub fn _mm_unpacklo_epi64(a: __m128i, b: __m128i) -> __m128i { /// root of the lower element `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_sd) +// NOTE: Not modeled yet // pub fn _mm_sqrt_sd(a: __m128d, b: __m128d) -> __m128d { // { simd_insert(a, 0, sqrtf64(_mm_cvtsd_f64(b))) } // } /// Returns a new vector with the square root of each of the values in `a`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_pd) +// NOTE: Not modeled yet // pub fn _mm_sqrt_pd(a: __m128d) -> __m128d { // { simd_fsqrt(a) } // } @@ -1038,6 +1051,7 @@ pub fn _mm_unpacklo_epi64(a: __m128i, b: __m128i) -> __m128i { /// low element by `b` from the low element of `a`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_sd) +// NOTE: Not modeled yet // pub fn _mm_sub_sd(a: __m128d, b: __m128d) -> __m128d { // { transmute(simd_insert(a.as_f64x2(), 0, _mm_cvtsd_f64(a) - _mm_cvtsd_f64(b))) } // } @@ -1045,6 +1059,7 @@ pub fn _mm_unpacklo_epi64(a: __m128i, b: __m128i) -> __m128i { /// from `a`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_pd) +// NOTE: Not modeled yet // pub fn _mm_sub_pd(a: __m128d, b: __m128d) -> __m128d { // { simd_sub(a, b) } // } @@ -1093,6 +1108,7 @@ pub fn _mm_xor_pd(a: __m128d, b: __m128d) -> __m128d { /// comparison of the lower elements of `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_sd) +// NOTE: Not modeled yet // pub fn _mm_cmpeq_sd(a: __m128d, b: __m128d) -> __m128d { // { cmpsd(a, b, 0) } // } @@ -1100,6 +1116,7 @@ pub fn _mm_xor_pd(a: __m128d, b: __m128d) -> __m128d { /// comparison of the lower elements of `a` and `b`. 
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_sd) +// NOTE: Not modeled yet // pub fn _mm_cmplt_sd(a: __m128d, b: __m128d) -> __m128d { // { cmpsd(a, b, 1) } // } @@ -1107,6 +1124,7 @@ pub fn _mm_xor_pd(a: __m128d, b: __m128d) -> __m128d { /// less-than-or-equal comparison of the lower elements of `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_sd) +// NOTE: Not modeled yet // pub fn _mm_cmple_sd(a: __m128d, b: __m128d) -> __m128d { // { cmpsd(a, b, 2) } // } @@ -1114,6 +1132,7 @@ pub fn _mm_xor_pd(a: __m128d, b: __m128d) -> __m128d { /// greater-than comparison of the lower elements of `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_sd) +// NOTE: Not modeled yet // pub fn _mm_cmpgt_sd(a: __m128d, b: __m128d) -> __m128d { // { transmute(simd_insert(_mm_cmplt_sd(b, a), 1, simd_extract(a, 1))) } // } @@ -1121,6 +1140,7 @@ pub fn _mm_xor_pd(a: __m128d, b: __m128d) -> __m128d { /// greater-than-or-equal comparison of the lower elements of `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_sd) +// NOTE: Not modeled yet // pub fn _mm_cmpge_sd(a: __m128d, b: __m128d) -> __m128d { // { simd_insert(_mm_cmple_sd(b, a), 1, simd_extract(a, 1)) } // } @@ -1130,6 +1150,7 @@ pub fn _mm_xor_pd(a: __m128d, b: __m128d) -> __m128d { /// otherwise. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_sd) +// NOTE: Not modeled yet // pub fn _mm_cmpord_sd(a: __m128d, b: __m128d) -> __m128d { // { cmpsd(a, b, 7) } // } @@ -1138,6 +1159,7 @@ pub fn _mm_xor_pd(a: __m128d, b: __m128d) -> __m128d { /// equal to `NaN` then `0xFFFFFFFFFFFFFFFF` is used and `0` otherwise. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_sd) +// NOTE: Not modeled yet // pub fn _mm_cmpunord_sd(a: __m128d, b: __m128d) -> __m128d { // { cmpsd(a, b, 3) } // } @@ -1145,6 +1167,7 @@ pub fn _mm_xor_pd(a: __m128d, b: __m128d) -> __m128d { /// comparison of the lower elements of `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_sd) +// NOTE: Not modeled yet // pub fn _mm_cmpneq_sd(a: __m128d, b: __m128d) -> __m128d { // { cmpsd(a, b, 4) } // } @@ -1152,6 +1175,7 @@ pub fn _mm_xor_pd(a: __m128d, b: __m128d) -> __m128d { /// not-less-than comparison of the lower elements of `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_sd) +// NOTE: Not modeled yet // pub fn _mm_cmpnlt_sd(a: __m128d, b: __m128d) -> __m128d { // { cmpsd(a, b, 5) } // } @@ -1159,6 +1183,7 @@ pub fn _mm_xor_pd(a: __m128d, b: __m128d) -> __m128d { /// not-less-than-or-equal comparison of the lower elements of `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_sd) +// NOTE: Not modeled yet // pub fn _mm_cmpnle_sd(a: __m128d, b: __m128d) -> __m128d { // { cmpsd(a, b, 6) } // } @@ -1166,6 +1191,7 @@ pub fn _mm_xor_pd(a: __m128d, b: __m128d) -> __m128d { /// not-greater-than comparison of the lower elements of `a` and `b`. 
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_sd) +// NOTE: Not modeled yet // pub fn _mm_cmpngt_sd(a: __m128d, b: __m128d) -> __m128d { // { simd_insert(_mm_cmpnlt_sd(b, a), 1, simd_extract(a, 1)) } // } @@ -1173,72 +1199,84 @@ pub fn _mm_xor_pd(a: __m128d, b: __m128d) -> __m128d { /// not-greater-than-or-equal comparison of the lower elements of `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_sd) +// NOTE: Not modeled yet // pub fn _mm_cmpnge_sd(a: __m128d, b: __m128d) -> __m128d { // { simd_insert(_mm_cmpnle_sd(b, a), 1, simd_extract(a, 1)) } // } /// Compares corresponding elements in `a` and `b` for equality. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_pd) +// NOTE: Not modeled yet // pub fn _mm_cmpeq_pd(a: __m128d, b: __m128d) -> __m128d { // { cmppd(a, b, 0) } // } /// Compares corresponding elements in `a` and `b` for less-than. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_pd) +// NOTE: Not modeled yet // pub fn _mm_cmplt_pd(a: __m128d, b: __m128d) -> __m128d { // { cmppd(a, b, 1) } // } /// Compares corresponding elements in `a` and `b` for less-than-or-equal /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_pd) +// NOTE: Not modeled yet // pub fn _mm_cmple_pd(a: __m128d, b: __m128d) -> __m128d { // { cmppd(a, b, 2) } // } /// Compares corresponding elements in `a` and `b` for greater-than. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_pd) +// NOTE: Not modeled yet // pub fn _mm_cmpgt_pd(a: __m128d, b: __m128d) -> __m128d { // _mm_cmplt_pd(b, a) // } /// Compares corresponding elements in `a` and `b` for greater-than-or-equal. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_pd) +// NOTE: Not modeled yet // pub fn _mm_cmpge_pd(a: __m128d, b: __m128d) -> __m128d { // _mm_cmple_pd(b, a) // } /// Compares corresponding elements in `a` and `b` to see if neither is `NaN`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_pd) +// NOTE: Not modeled yet // pub fn _mm_cmpord_pd(a: __m128d, b: __m128d) -> __m128d { // { cmppd(a, b, 7) } // } /// Compares corresponding elements in `a` and `b` to see if either is `NaN`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_pd) +// NOTE: Not modeled yet // pub fn _mm_cmpunord_pd(a: __m128d, b: __m128d) -> __m128d { // { cmppd(a, b, 3) } // } /// Compares corresponding elements in `a` and `b` for not-equal. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_pd) +// NOTE: Not modeled yet // pub fn _mm_cmpneq_pd(a: __m128d, b: __m128d) -> __m128d { // { cmppd(a, b, 4) } // } /// Compares corresponding elements in `a` and `b` for not-less-than. 
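// The third argument to the `cmpsd`/`cmppd` extern intrinsics in this family is the
// comparison predicate. The encoding visible in the calls above (and in Intel's
// CMPSD/CMPPD documentation) is: 0 = EQ, 1 = LT, 2 = LE, 3 = UNORD, 4 = NEQ,
// 5 = NLT, 6 = NLE, 7 = ORD; a handwritten model would have to implement this
// predicate table over the f64 lanes, including its NaN behaviour.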
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_pd) +// NOTE: Not modeled yet // pub fn _mm_cmpnlt_pd(a: __m128d, b: __m128d) -> __m128d { // { cmppd(a, b, 5) } // } /// Compares corresponding elements in `a` and `b` for not-less-than-or-equal. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_pd) +// NOTE: Not modeled yet // pub fn _mm_cmpnle_pd(a: __m128d, b: __m128d) -> __m128d { // { cmppd(a, b, 6) } // } /// Compares corresponding elements in `a` and `b` for not-greater-than. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_pd) +// NOTE: Not modeled yet // pub fn _mm_cmpngt_pd(a: __m128d, b: __m128d) -> __m128d { // _mm_cmpnlt_pd(b, a) // } @@ -1246,78 +1284,91 @@ pub fn _mm_xor_pd(a: __m128d, b: __m128d) -> __m128d { /// not-greater-than-or-equal. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_pd) +// NOTE: Not modeled yet // pub fn _mm_cmpnge_pd(a: __m128d, b: __m128d) -> __m128d { // _mm_cmpnle_pd(b, a) // } /// Compares the lower element of `a` and `b` for equality. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_sd) +// NOTE: Not modeled yet // pub fn _mm_comieq_sd(a: __m128d, b: __m128d) -> i32 { // { comieqsd(a, b) } // } /// Compares the lower element of `a` and `b` for less-than. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_sd) +// NOTE: Not modeled yet // pub fn _mm_comilt_sd(a: __m128d, b: __m128d) -> i32 { // { comiltsd(a, b) } // } /// Compares the lower element of `a` and `b` for less-than-or-equal. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_sd) +// NOTE: Not modeled yet // pub fn _mm_comile_sd(a: __m128d, b: __m128d) -> i32 { // { comilesd(a, b) } // } /// Compares the lower element of `a` and `b` for greater-than. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_sd) +// NOTE: Not modeled yet // pub fn _mm_comigt_sd(a: __m128d, b: __m128d) -> i32 { // { comigtsd(a, b) } // } /// Compares the lower element of `a` and `b` for greater-than-or-equal. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_sd) +// NOTE: Not modeled yet // pub fn _mm_comige_sd(a: __m128d, b: __m128d) -> i32 { // { comigesd(a, b) } // } /// Compares the lower element of `a` and `b` for not-equal. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_sd) +// NOTE: Not modeled yet // pub fn _mm_comineq_sd(a: __m128d, b: __m128d) -> i32 { // { comineqsd(a, b) } // } /// Compares the lower element of `a` and `b` for equality. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomieq_sd) +// NOTE: Not modeled yet // pub fn _mm_ucomieq_sd(a: __m128d, b: __m128d) -> i32 { // { ucomieqsd(a, b) } // } /// Compares the lower element of `a` and `b` for less-than. 
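// Note: the `_mm_comi*_sd` and `_mm_ucomi*_sd` families return identical 0/1
// results; on hardware they differ only in exception signalling (COMISD raises an
// invalid-operation exception on QNaN operands, UCOMISD only on SNaN), a
// distinction a value-level model does not need to capture.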
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomilt_sd) +// NOTE: Not modeled yet // pub fn _mm_ucomilt_sd(a: __m128d, b: __m128d) -> i32 { // { ucomiltsd(a, b) } // } /// Compares the lower element of `a` and `b` for less-than-or-equal. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomile_sd) +// NOTE: Not modeled yet // pub fn _mm_ucomile_sd(a: __m128d, b: __m128d) -> i32 { // { ucomilesd(a, b) } // } /// Compares the lower element of `a` and `b` for greater-than. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomigt_sd) +// NOTE: Not modeled yet // pub fn _mm_ucomigt_sd(a: __m128d, b: __m128d) -> i32 { // { ucomigtsd(a, b) } // } /// Compares the lower element of `a` and `b` for greater-than-or-equal. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomige_sd) +// NOTE: Not modeled yet // pub fn _mm_ucomige_sd(a: __m128d, b: __m128d) -> i32 { // { ucomigesd(a, b) } // } /// Compares the lower element of `a` and `b` for not-equal. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomineq_sd) +// NOTE: Not modeled yet // pub fn _mm_ucomineq_sd(a: __m128d, b: __m128d) -> i32 { // { ucomineqsd(a, b) } // } @@ -1347,6 +1398,7 @@ pub fn _mm_cvtps_pd(a: __m128) -> __m128d { /// packed 32-bit integers. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_epi32) +// NOTE: Not modeled yet // pub fn _mm_cvtpd_epi32(a: __m128d) -> __m128i { // { transmute(cvtpd2dq(a)) } // } @@ -1354,6 +1406,7 @@ pub fn _mm_cvtps_pd(a: __m128) -> __m128d { /// a 32-bit integer. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si32) +// NOTE: Not modeled yet // pub fn _mm_cvtsd_si32(a: __m128d) -> i32 { // { cvtsd2si(a) } // } @@ -1363,12 +1416,14 @@ pub fn _mm_cvtps_pd(a: __m128) -> __m128d { /// to the upper element the return value. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_ss) +// NOTE: Not modeled yet // pub fn _mm_cvtsd_ss(a: __m128, b: __m128d) -> __m128 { // { cvtsd2ss(a, b) } // } /// Returns the lower double-precision (64-bit) floating-point element of `a`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_f64) +// NOTE: Not modeled yet // pub fn _mm_cvtsd_f64(a: __m128d) -> f64 { // { simd_extract(a, 0) } // } @@ -1378,6 +1433,7 @@ pub fn _mm_cvtps_pd(a: __m128) -> __m128d { /// to the upper element the return value. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_sd) +// NOTE: Not modeled yet // pub fn _mm_cvtss_sd(a: __m128d, b: __m128) -> __m128d { // { cvtss2sd(a, b) } // } @@ -1385,6 +1441,7 @@ pub fn _mm_cvtps_pd(a: __m128) -> __m128d { /// packed 32-bit integers with truncation. 
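// A possible model for `_mm_cvtsd_f64`, mirroring the `_mm256_cvtsd_f64` model in
// `avx.rs`; this is only a sketch and assumes the same `as_f64x2`/`simd_extract`
// helpers used by the other models:
//
// pub fn _mm_cvtsd_f64(a: __m128d) -> f64 {
//     simd_extract(a.as_f64x2(), 0)
// }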
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_epi32) +// NOTE: Not modeled yet // pub fn _mm_cvttpd_epi32(a: __m128d) -> __m128i { // { transmute(cvttpd2dq(a)) } // } @@ -1392,6 +1449,7 @@ pub fn _mm_cvtps_pd(a: __m128) -> __m128d { /// to a 32-bit integer with truncation. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si32) +// NOTE: Not modeled yet // pub fn _mm_cvttsd_si32(a: __m128d) -> i32 { // { cvttsd2si(a) } // } @@ -1399,6 +1457,7 @@ pub fn _mm_cvtps_pd(a: __m128) -> __m128d { /// packed 32-bit integers with truncation. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_epi32) +// NOTE: Not modeled yet // pub fn _mm_cvttps_epi32(a: __m128) -> __m128i { // { transmute(cvttps2dq(a)) } // } From 5b12d03c12cdeef49c1e3cf1e108dfb847e35987 Mon Sep 17 00:00:00 2001 From: karthikbhargavan Date: Thu, 31 Jul 2025 01:45:38 -0400 Subject: [PATCH 31/47] sse --- .../src/core_arch/x86/models/avx.rs | 84 +++++++++---------- .../src/core_arch/x86/models/avx2.rs | 30 ++++--- .../src/core_arch/x86/models/mod.rs | 1 + .../src/core_arch/x86/models/sse.rs | 20 +++++ 4 files changed, 75 insertions(+), 60 deletions(-) create mode 100644 testable-simd-models/src/core_arch/x86/models/sse.rs diff --git a/testable-simd-models/src/core_arch/x86/models/avx.rs b/testable-simd-models/src/core_arch/x86/models/avx.rs index 1789a2efa6d2b..5315f067c1a2f 100644 --- a/testable-simd-models/src/core_arch/x86/models/avx.rs +++ b/testable-simd-models/src/core_arch/x86/models/avx.rs @@ -14,6 +14,7 @@ //! [wiki]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions use super::avx_handwritten::*; +use super::sse::*; use super::sse2::*; use super::types::*; use crate::abstractions::simd::*; @@ -774,16 +775,15 @@ pub fn _mm256_permute_ps(a: __m256) -> __m256 { /// using the control in `imm8`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm_permute_ps) -// NOTE: Not modeled yet -// pub fn _mm_permute_ps(a: __m128) -> __m128 { -// static_assert_uimm_bits!(IMM8, 8); -// { -// transmute(simd_shuffle( -// a.as_f32x4(), _mm_undefined_ps(), [(IMM8 as u32 >> 0) & 0b11, (IMM8 as u32 >> 2) & 0b11, -// (IMM8 as u32 >> 4) & 0b11, (IMM8 as u32 >> 6) & 0b11,], -// )) -// } -// } +pub fn _mm_permute_ps(a: __m128) -> __m128 { + static_assert_uimm_bits!(IMM8, 8); + { + transmute(simd_shuffle( + a.as_f32x4(), _mm_undefined_ps().as_f32x4(), [(IMM8 as u32 >> 0) & 0b11, (IMM8 as u32 >> 2) & 0b11, + (IMM8 as u32 >> 4) & 0b11, (IMM8 as u32 >> 6) & 0b11,], + )) + } +} /// Shuffles double-precision (64-bit) floating-point elements in `a` /// within 256-bit lanes using the control in `b`. /// @@ -886,10 +886,9 @@ pub fn _mm256_broadcast_ss(f: &f32) -> __m256 { /// (32-bit) floating-point elements) to all elements of the returned vector. 
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_broadcast_ps) -// NOTE: Not modeled yet -// pub fn _mm256_broadcast_ps(a: &__m128) -> __m256 { -// { transmute(simd_shuffle((*a).as_f32x4(), _mm_setzero_ps(), [0, 1, 2, 3, 0, 1, 2, 3])) } -// } +pub fn _mm256_broadcast_ps(a: &__m128) -> __m256 { + { transmute(simd_shuffle((*a).as_f32x4(), _mm_setzero_ps().as_f32x4(), [0, 1, 2, 3, 0, 1, 2, 3])) } +} /// Broadcasts 128 bits from memory (composed of 2 packed double-precision /// (64-bit) floating-point elements) to all elements of the returned vector. /// @@ -906,30 +905,29 @@ pub fn _mm256_broadcast_pd(a: &__m128d) -> __m256d { /// at the location specified by `imm8`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_insertf128_ps) -// NOTE: Not modeled yet -// pub fn _mm256_insertf128_ps(a: __m256, b: __m128) -> __m256 { -// static_assert_uimm_bits!(IMM1, 1); -// { -// transmute(simd_shuffle( -// a.as_f32x8(), _mm256_castps128_ps256(b), [[8, 9, 10, 11, 4, 5, 6, 7], [0, 1, 2, 3, 8, 9, -// 10, 11]] [IMM1 as usize], -// )) -// } -// } +pub fn _mm256_insertf128_ps(a: __m256, b: __m128) -> __m256 { + static_assert_uimm_bits!(IMM1, 1); + { + transmute(simd_shuffle( + a.as_f32x8(), _mm256_castps128_ps256(b).as_f32x8(), [[8, 9, 10, 11, 4, 5, 6, 7], [0, 1, 2, 3, 8, 9, + 10, 11]] [IMM1 as usize], + )) + } +} /// Copies `a` to result, then inserts 128 bits (composed of 2 packed /// double-precision (64-bit) floating-point elements) from `b` into result /// at the location specified by `imm8`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_insertf128_pd) -// NOTE: Not modeled yet -// pub fn _mm256_insertf128_pd(a: __m256d, b: __m128d) -> __m256d { -// static_assert_uimm_bits!(IMM1, 1); -// { -// simd_shuffle( -// a, _mm256_castpd128_pd256(b), [[4, 5, 2, 3], [0, 1, 4, 5]] [IMM1 as usize], -// ) -// } -// } +pub fn _mm256_insertf128_pd(a: __m256d, b: __m128d) -> __m256d { + static_assert_uimm_bits!(IMM1, 1); + { + transmute(simd_shuffle( + a.as_f64x4(), _mm256_castpd128_pd256(b).as_f64x4(), + [[4, 5, 2, 3], [0, 1, 4, 5]] [IMM1 as usize], + )) + } +} /// Copies `a` to result, then inserts 128 bits from `b` into result /// at the location specified by `imm8`. /// @@ -1600,10 +1598,9 @@ pub fn _mm256_castsi256_si128(a: __m256i) -> __m128i { /// the upper 128 bits of the result are undefined. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_castps128_ps256) -// NOTE: Not modeled yet -// pub fn _mm256_castps128_ps256(a: __m128) -> __m256 { -// { simd_shuffle(a, _mm_undefined_ps(), [0, 1, 2, 3, 4, 4, 4, 4]) } -// } +pub fn _mm256_castps128_ps256(a: __m128) -> __m256 { + { transmute(simd_shuffle(a.as_f32x4(), _mm_undefined_ps().as_f32x4(), [0, 1, 2, 3, 4, 4, 4, 4])) } +} /// Casts vector of type __m128d to type __m256d; /// the upper 128 bits of the result are undefined. /// @@ -1632,10 +1629,9 @@ pub fn _mm256_castsi128_si256(a: __m128i) -> __m256i { /// the value of the source vector. The upper 128 bits are set to zero. 
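// Reading the `_mm256_insertf128_ps` shuffle table above: the shuffle sees the eight
// lanes of `a` as indices 0-7 and the lanes of the widened `b` as indices 8-11, so
// `IMM1 == 0` produces `[b0, b1, b2, b3, a4, a5, a6, a7]` (insert into the low half)
// while `IMM1 == 1` produces `[a0, a1, a2, a3, b0, b1, b2, b3]` (insert into the high
// half); `_mm256_insertf128_pd` uses the same scheme with two-lane halves.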
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_zextps128_ps256) -// NOTE: Not modeled yet -// pub fn _mm256_zextps128_ps256(a: __m128) -> __m256 { -// { simd_shuffle(a, _mm_setzero_ps(), [0, 1, 2, 3, 4, 5, 6, 7]) } -// } +pub fn _mm256_zextps128_ps256(a: __m128) -> __m256 { + { transmute(simd_shuffle(a.as_f32x4(), _mm_setzero_ps().as_f32x4(), [0, 1, 2, 3, 4, 5, 6, 7])) } +} /// Constructs a 256-bit integer vector from a 128-bit integer vector. /// The lower 128 bits contain the value of the source vector. The upper /// 128 bits are set to zero. @@ -1655,9 +1651,9 @@ pub fn _mm256_zextsi128_si256(a: __m128i) -> __m256i { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_zextpd128_pd256) // NOTE: Not modeled yet -// pub fn _mm256_zextpd128_pd256(a: __m128d) -> __m256d { -// { simd_shuffle(a, _mm_setzero_pd(), [0, 1, 2, 3]) } -// } +pub fn _mm256_zextpd128_pd256(a: __m128d) -> __m256d { + { transmute(simd_shuffle(a.as_f64x2(), _mm_setzero_pd().as_f64x2(), [0, 1, 2, 3])) } +} /// Returns vector of type `__m256` with indeterminate elements. /// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically /// picks some valid value and is not equivalent to [`mem::MaybeUninit`]. diff --git a/testable-simd-models/src/core_arch/x86/models/avx2.rs b/testable-simd-models/src/core_arch/x86/models/avx2.rs index eb7cebe85abaf..6b3829fa565b0 100644 --- a/testable-simd-models/src/core_arch/x86/models/avx2.rs +++ b/testable-simd-models/src/core_arch/x86/models/avx2.rs @@ -22,6 +22,8 @@ use crate::abstractions::simd::*; use crate::abstractions::utilities::*; +use super::sse::*; +use super::sse2::*; use super::avx::*; use super::avx2_handwritten::*; use super::types::*; @@ -386,18 +388,16 @@ pub fn _mm256_broadcastq_epi64(a: __m128i) -> __m256i { /// from `a` to all elements of the 128-bit returned value. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastsd_pd) -// NOTE: Not modeled yet -// pub fn _mm_broadcastsd_pd(a: __m128d) -> __m128d { -// { simd_shuffle(a, _mm_setzero_pd(), [0_u32; 2]) } -// } +pub fn _mm_broadcastsd_pd(a: __m128d) -> __m128d { + { transmute(simd_shuffle(a.as_f64x2(), _mm_setzero_pd().as_f64x2(), [0_u32; 2])) } +} /// Broadcasts the low double-precision (64-bit) floating-point element /// from `a` to all elements of the 256-bit returned value. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastsd_pd) -// NOTE: Not modeled yet -// pub fn _mm256_broadcastsd_pd(a: __m128d) -> __m256d { -// { simd_shuffle(a, _mm_setzero_pd(), [0_u32; 4]) } -// } +pub fn _mm256_broadcastsd_pd(a: __m128d) -> __m256d { + { transmute(simd_shuffle(a.as_f64x2(), _mm_setzero_pd().as_f64x2(), [0_u32; 4])) } +} /// Broadcasts 128 bits of integer data from a to all 128-bit lanes in /// the 256-bit returned value. /// @@ -422,18 +422,16 @@ pub fn _mm256_broadcastsi128_si256(a: __m128i) -> __m256i { /// from `a` to all elements of the 128-bit returned value. 
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastss_ps) -// NOTE: Not modeled yet -// pub fn _mm_broadcastss_ps(a: __m128) -> __m128 { -// { simd_shuffle(a, _mm_setzero_ps(), [0_u32; 4]) } -// } +pub fn _mm_broadcastss_ps(a: __m128) -> __m128 { + { transmute(simd_shuffle(a.as_f32x4(), _mm_setzero_ps().as_f32x4(), [0_u32; 4])) } +} /// Broadcasts the low single-precision (32-bit) floating-point element /// from `a` to all elements of the 256-bit returned value. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastss_ps) -// NOTE: Not modeled yet -// pub fn _mm256_broadcastss_ps(a: __m128) -> __m256 { -// { simd_shuffle(a, _mm_setzero_ps(), [0_u32; 8]) } -// } +pub fn _mm256_broadcastss_ps(a: __m128) -> __m256 { + { transmute(simd_shuffle(a.as_f32x4(), _mm_setzero_ps().as_f32x4(), [0_u32; 8])) } +} /// Broadcasts the low packed 16-bit integer from a to all elements of /// the 128-bit returned value /// diff --git a/testable-simd-models/src/core_arch/x86/models/mod.rs b/testable-simd-models/src/core_arch/x86/models/mod.rs index 3efc7f6791f03..b0a3416260e47 100644 --- a/testable-simd-models/src/core_arch/x86/models/mod.rs +++ b/testable-simd-models/src/core_arch/x86/models/mod.rs @@ -20,6 +20,7 @@ //! In general, it is best to gain an idea of how an implementation should be written by looking //! at how other functions are implemented. Also see `core::arch::x86` for [reference](https://github.com/rust-lang/stdarch/tree/master/crates/core_arch). +pub mod sse; pub mod avx; pub mod avx2; pub mod avx2_handwritten; diff --git a/testable-simd-models/src/core_arch/x86/models/sse.rs b/testable-simd-models/src/core_arch/x86/models/sse.rs new file mode 100644 index 0000000000000..6bfa7ec38e44b --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/models/sse.rs @@ -0,0 +1,20 @@ +//! Streaming SIMD Extensions (SSE) +use crate::abstractions::simd::*; +use crate::abstractions::utilities::*; +use super::types::*; + +/// Returns vector of type __m128 with indeterminate elements.with indetermination elements. +/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically +/// picks some valid value and is not equivalent to [`mem::MaybeUninit`]. +/// In practice, this is typically equivalent to [`mem::zeroed`]. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_ps) +pub fn _mm_undefined_ps() -> __m128 { + transmute(f32x4::ZERO()) +} + +/// Construct a `__m128` with all elements initialized to zero. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_ps) +pub fn _mm_setzero_ps() -> __m128 { + transmute(f32x4::ZERO()) } From 2bba06951d9223416baa4e3cf5900bbd1bd1b91b Mon Sep 17 00:00:00 2001 From: karthikbhargavan Date: Thu, 31 Jul 2025 01:46:01 -0400 Subject: [PATCH 32/47] format --- .../src/core_arch/x86/models/avx.rs | 52 +++++++++++++++---- .../src/core_arch/x86/models/avx2.rs | 36 ++++++++++--- .../src/core_arch/x86/models/mod.rs | 2 +- .../src/core_arch/x86/models/sse.rs | 7 +-- 4 files changed, 77 insertions(+), 20 deletions(-) diff --git a/testable-simd-models/src/core_arch/x86/models/avx.rs b/testable-simd-models/src/core_arch/x86/models/avx.rs index 5315f067c1a2f..3ea97b7c7bf29 100644 --- a/testable-simd-models/src/core_arch/x86/models/avx.rs +++ b/testable-simd-models/src/core_arch/x86/models/avx.rs @@ -779,8 +779,14 @@ pub fn _mm_permute_ps(a: __m128) -> __m128 { static_assert_uimm_bits!(IMM8, 8); { transmute(simd_shuffle( - a.as_f32x4(), _mm_undefined_ps().as_f32x4(), [(IMM8 as u32 >> 0) & 0b11, (IMM8 as u32 >> 2) & 0b11, - (IMM8 as u32 >> 4) & 0b11, (IMM8 as u32 >> 6) & 0b11,], + a.as_f32x4(), + _mm_undefined_ps().as_f32x4(), + [ + (IMM8 as u32 >> 0) & 0b11, + (IMM8 as u32 >> 2) & 0b11, + (IMM8 as u32 >> 4) & 0b11, + (IMM8 as u32 >> 6) & 0b11, + ], )) } } @@ -887,7 +893,13 @@ pub fn _mm256_broadcast_ss(f: &f32) -> __m256 { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_broadcast_ps) pub fn _mm256_broadcast_ps(a: &__m128) -> __m256 { - { transmute(simd_shuffle((*a).as_f32x4(), _mm_setzero_ps().as_f32x4(), [0, 1, 2, 3, 0, 1, 2, 3])) } + { + transmute(simd_shuffle( + (*a).as_f32x4(), + _mm_setzero_ps().as_f32x4(), + [0, 1, 2, 3, 0, 1, 2, 3], + )) + } } /// Broadcasts 128 bits from memory (composed of 2 packed double-precision /// (64-bit) floating-point elements) to all elements of the returned vector. @@ -909,8 +921,9 @@ pub fn _mm256_insertf128_ps(a: __m256, b: __m128) -> __m256 { static_assert_uimm_bits!(IMM1, 1); { transmute(simd_shuffle( - a.as_f32x8(), _mm256_castps128_ps256(b).as_f32x8(), [[8, 9, 10, 11, 4, 5, 6, 7], [0, 1, 2, 3, 8, 9, - 10, 11]] [IMM1 as usize], + a.as_f32x8(), + _mm256_castps128_ps256(b).as_f32x8(), + [[8, 9, 10, 11, 4, 5, 6, 7], [0, 1, 2, 3, 8, 9, 10, 11]][IMM1 as usize], )) } } @@ -923,8 +936,9 @@ pub fn _mm256_insertf128_pd(a: __m256d, b: __m128d) -> __m256d static_assert_uimm_bits!(IMM1, 1); { transmute(simd_shuffle( - a.as_f64x4(), _mm256_castpd128_pd256(b).as_f64x4(), - [[4, 5, 2, 3], [0, 1, 4, 5]] [IMM1 as usize], + a.as_f64x4(), + _mm256_castpd128_pd256(b).as_f64x4(), + [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize], )) } } @@ -1599,7 +1613,13 @@ pub fn _mm256_castsi256_si128(a: __m256i) -> __m128i { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_castps128_ps256) pub fn _mm256_castps128_ps256(a: __m128) -> __m256 { - { transmute(simd_shuffle(a.as_f32x4(), _mm_undefined_ps().as_f32x4(), [0, 1, 2, 3, 4, 4, 4, 4])) } + { + transmute(simd_shuffle( + a.as_f32x4(), + _mm_undefined_ps().as_f32x4(), + [0, 1, 2, 3, 4, 4, 4, 4], + )) + } } /// Casts vector of type __m128d to type __m256d; /// the upper 128 bits of the result are undefined. 
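// Worked example for the `_mm_permute_ps` model above: result lane `i` is
// `a[(IMM8 >> (2 * i)) & 0b11]`, so the `_mm_undefined_ps()` operand of the shuffle
// is never actually selected. For instance, `IMM8 = 0b1110_0100` (0xE4) is the
// identity permutation, while `IMM8 = 0b0001_1011` (0x1B) reverses the lanes to
// `[a3, a2, a1, a0]`.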
@@ -1630,7 +1650,13 @@ pub fn _mm256_castsi128_si256(a: __m128i) -> __m256i { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_zextps128_ps256) pub fn _mm256_zextps128_ps256(a: __m128) -> __m256 { - { transmute(simd_shuffle(a.as_f32x4(), _mm_setzero_ps().as_f32x4(), [0, 1, 2, 3, 4, 5, 6, 7])) } + { + transmute(simd_shuffle( + a.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + [0, 1, 2, 3, 4, 5, 6, 7], + )) + } } /// Constructs a 256-bit integer vector from a 128-bit integer vector. /// The lower 128 bits contain the value of the source vector. The upper @@ -1652,7 +1678,13 @@ pub fn _mm256_zextsi128_si256(a: __m128i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_zextpd128_pd256) // NOTE: Not modeled yet pub fn _mm256_zextpd128_pd256(a: __m128d) -> __m256d { - { transmute(simd_shuffle(a.as_f64x2(), _mm_setzero_pd().as_f64x2(), [0, 1, 2, 3])) } + { + transmute(simd_shuffle( + a.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + [0, 1, 2, 3], + )) + } } /// Returns vector of type `__m256` with indeterminate elements. /// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically diff --git a/testable-simd-models/src/core_arch/x86/models/avx2.rs b/testable-simd-models/src/core_arch/x86/models/avx2.rs index 6b3829fa565b0..e23c712c200b6 100644 --- a/testable-simd-models/src/core_arch/x86/models/avx2.rs +++ b/testable-simd-models/src/core_arch/x86/models/avx2.rs @@ -22,10 +22,10 @@ use crate::abstractions::simd::*; use crate::abstractions::utilities::*; -use super::sse::*; -use super::sse2::*; use super::avx::*; use super::avx2_handwritten::*; +use super::sse::*; +use super::sse2::*; use super::types::*; /// Computes the absolute values of packed 32-bit integers in `a`. @@ -389,14 +389,26 @@ pub fn _mm256_broadcastq_epi64(a: __m128i) -> __m256i { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastsd_pd) pub fn _mm_broadcastsd_pd(a: __m128d) -> __m128d { - { transmute(simd_shuffle(a.as_f64x2(), _mm_setzero_pd().as_f64x2(), [0_u32; 2])) } + { + transmute(simd_shuffle( + a.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + [0_u32; 2], + )) + } } /// Broadcasts the low double-precision (64-bit) floating-point element /// from `a` to all elements of the 256-bit returned value. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastsd_pd) pub fn _mm256_broadcastsd_pd(a: __m128d) -> __m256d { - { transmute(simd_shuffle(a.as_f64x2(), _mm_setzero_pd().as_f64x2(), [0_u32; 4])) } + { + transmute(simd_shuffle( + a.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + [0_u32; 4], + )) + } } /// Broadcasts 128 bits of integer data from a to all 128-bit lanes in /// the 256-bit returned value. @@ -423,14 +435,26 @@ pub fn _mm256_broadcastsi128_si256(a: __m128i) -> __m256i { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastss_ps) pub fn _mm_broadcastss_ps(a: __m128) -> __m128 { - { transmute(simd_shuffle(a.as_f32x4(), _mm_setzero_ps().as_f32x4(), [0_u32; 4])) } + { + transmute(simd_shuffle( + a.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + [0_u32; 4], + )) + } } /// Broadcasts the low single-precision (32-bit) floating-point element /// from `a` to all elements of the 256-bit returned value. 
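// In the broadcast models above and below, every shuffle index is 0, so each output
// lane copies lane 0 of `a`; the `_mm_setzero_ps()` / `_mm_setzero_pd()` argument is
// only there to satisfy `simd_shuffle`'s two-vector signature and is never read.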
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastss_ps) pub fn _mm256_broadcastss_ps(a: __m128) -> __m256 { - { transmute(simd_shuffle(a.as_f32x4(), _mm_setzero_ps().as_f32x4(), [0_u32; 8])) } + { + transmute(simd_shuffle( + a.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + [0_u32; 8], + )) + } } /// Broadcasts the low packed 16-bit integer from a to all elements of /// the 128-bit returned value diff --git a/testable-simd-models/src/core_arch/x86/models/mod.rs b/testable-simd-models/src/core_arch/x86/models/mod.rs index b0a3416260e47..79b660019c07c 100644 --- a/testable-simd-models/src/core_arch/x86/models/mod.rs +++ b/testable-simd-models/src/core_arch/x86/models/mod.rs @@ -20,11 +20,11 @@ //! In general, it is best to gain an idea of how an implementation should be written by looking //! at how other functions are implemented. Also see `core::arch::x86` for [reference](https://github.com/rust-lang/stdarch/tree/master/crates/core_arch). -pub mod sse; pub mod avx; pub mod avx2; pub mod avx2_handwritten; pub mod avx_handwritten; +pub mod sse; pub mod sse2; pub mod sse2_handwritten; pub mod ssse3; diff --git a/testable-simd-models/src/core_arch/x86/models/sse.rs b/testable-simd-models/src/core_arch/x86/models/sse.rs index 6bfa7ec38e44b..f975c2814438a 100644 --- a/testable-simd-models/src/core_arch/x86/models/sse.rs +++ b/testable-simd-models/src/core_arch/x86/models/sse.rs @@ -1,7 +1,7 @@ //! Streaming SIMD Extensions (SSE) +use super::types::*; use crate::abstractions::simd::*; use crate::abstractions::utilities::*; -use super::types::*; /// Returns vector of type __m128 with indeterminate elements.with indetermination elements. /// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically @@ -10,11 +10,12 @@ use super::types::*; /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_ps) pub fn _mm_undefined_ps() -> __m128 { - transmute(f32x4::ZERO()) + transmute(f32x4::ZERO()) } /// Construct a `__m128` with all elements initialized to zero. 
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_ps) pub fn _mm_setzero_ps() -> __m128 { - transmute(f32x4::ZERO()) } + transmute(f32x4::ZERO()) +} From 8da4fc0f901a6744e5b443c9468f0909aeaee72c Mon Sep 17 00:00:00 2001 From: karthikbhargavan Date: Thu, 31 Jul 2025 08:22:49 -0400 Subject: [PATCH 33/47] removed script --- testable-simd-models/modelize/Cargo.toml | 9 --- testable-simd-models/modelize/src/main.rs | 77 ----------------------- 2 files changed, 86 deletions(-) delete mode 100644 testable-simd-models/modelize/Cargo.toml delete mode 100644 testable-simd-models/modelize/src/main.rs diff --git a/testable-simd-models/modelize/Cargo.toml b/testable-simd-models/modelize/Cargo.toml deleted file mode 100644 index f1b4ab6152565..0000000000000 --- a/testable-simd-models/modelize/Cargo.toml +++ /dev/null @@ -1,9 +0,0 @@ -[package] -name = "modelize" -version = "0.1.0" -edition = "2024" - -[dependencies] -prettyplease = "0.2.36" -syn = { version = "2", features = ["full"] } - diff --git a/testable-simd-models/modelize/src/main.rs b/testable-simd-models/modelize/src/main.rs deleted file mode 100644 index 9010fd83907e4..0000000000000 --- a/testable-simd-models/modelize/src/main.rs +++ /dev/null @@ -1,77 +0,0 @@ -use syn::{parse_file, Item, File, Stmt, Expr, ExprBlock}; -use std::fs; -use std::env; - -fn remove_all_attributes(input_file_path: &str, handwritten_module: &str, output_file_path: &str) -> Result<(), Box> { - let source_code = fs::read_to_string(input_file_path)?; - let mut syntax_tree: File = parse_file(&source_code)?; - - syntax_tree.items.retain(|item| - match item { - Item::Use(_) => false, - Item::Fn(item_fn) => item_fn.sig.unsafety.is_none(), - _ => true - } - ); - - // let use_abstractions: Item = syn::parse_quote! { - // use crate::abstractions::simd::*; - // }; - - // let use_types: Item = syn::parse_quote! { - // use super::types::*; - // }; - - // let use_handwritten: Item = syn::parse_quote! { - // use super::avx_handwritten::*; - // }; - - // syntax_tree.items.insert(0, use_handwritten); - // syntax_tree.items.insert(0, use_types); - // syntax_tree.items.insert(0, use_abstractions); - - // Clear attributes from the file's top-level items - for item in &mut syntax_tree.items { - match item { - Item::Fn(item_fn) => { - item_fn.attrs.retain(|attr| attr.path().is_ident("doc")); - for stmt in &mut item_fn.block.stmts { - match stmt { - Stmt::Expr(Expr::Unsafe(u), tok) => *stmt = Stmt::Expr(Expr::Block( - ExprBlock {attrs : Vec::new(), label : None, block : u.block.clone()}), *tok), - _ => () - } - } - }, - Item::Struct(item_struct) => { - item_struct.attrs.clear(); - for field in &mut item_struct.fields { - field.attrs.retain(|attr| attr.path().is_ident("doc")); - } - }, - Item::Enum(item_enum) => { - item_enum.attrs.clear(); - for variant in &mut item_enum.variants { - variant.attrs.retain(|attr| attr.path().is_ident("doc")); - } - }, - // Add more cases for other Item types (e.g., Item::Mod, Item::Impl, etc.) - _ => { - // For other item types, if they have an 'attrs' field, clear it. - // This requires more specific matching or a helper trait. - } - } - } - -// let output_tokens = quote! 
{ #syntax_tree }; - let formatted_string = prettyplease::unparse(&syntax_tree); - - fs::write(output_file_path, formatted_string)?; - - Ok(()) -} - -fn main() -> Result<(), Box> { - let args: Vec = env::args().collect(); - remove_all_attributes(&args[1], &args[2], &args[3]) -} From 93964b6df4e5901e6187cc1d326163191e007b14 Mon Sep 17 00:00:00 2001 From: karthikbhargavan Date: Thu, 31 Jul 2025 11:14:16 -0400 Subject: [PATCH 34/47] README.md --- testable-simd-models/README.md | 157 ++++++++++++++++++++++++-------- testable-simd-models/src/lib.rs | 2 +- 2 files changed, 120 insertions(+), 39 deletions(-) diff --git a/testable-simd-models/README.md b/testable-simd-models/README.md index f2f6ec3b9b629..abb526672c252 100644 --- a/testable-simd-models/README.md +++ b/testable-simd-models/README.md @@ -12,7 +12,7 @@ Each such folder has 3 sub-folders, `models`, `tests`, and `specs`. The `models` folder contains the models of the intrinsics, with a file corresponding to different target features, and are written using the -various abstractions implementedin `crate::abstractions`, especially +various abstractions implemented in `crate::abstractions`, especially those in `crate::abstractions::simd`. These models are meant to closely resemble their implementations within the Rust core itself. @@ -25,9 +25,111 @@ outputs. The tests can run by executing `cargo test`. -## Modeling Process -The process of adding a specific intrinsic's model goes as follows. -For this example, let us say the intrinsic we are adding is +## Modeling a SIMD Intrinsic + +There are three kinds of SIMD intrinsics we find in `core::arch`. + +The first kind are builtin Rust compiler intrinsics, some of which are +in the [`intrinsics/simd.rs` file](https://github.com/model-checking/verify-rust-std/blob/main/library/core/src/intrinsics/simd.rs) +in the `core` crate, and others are in the [`simd.rs` file of `core_arch`](https://github.com/model-checking/verify-rust-std/blob/main/library/stdarch/crates/core_arch/src/simd.rs). +These builtin intrinsics define generic SIMD operations that the Rust compiler knows how to implement on each platform. + +The second kind are `extern` intrinsics that are links to definitions in LLVM. +See, for example, [this list](https://github.com/rust-lang/stdarch/blob/master/crates/core_arch/src/x86/avx2.rs#L3596C8-L3596C14) +of `extern` intrinsics used in the Intel x86 AVX2 library. +These extern intrinsics are typically platform-specific functions that map to low-level instructions. + +The third kind are `defined` intrinsics that are given proper definitions in Rust, and their code may +depend on the builtin intrinsics or the extern intrinsics. There defined intrinsics represent higher-level +operations that are wrappers around one or more assembly instructions. + +### Modeling builtin intrinsics manually + +We model all three kinds of intrinsics, but in slightly different +ways. For the builtin intrinsics, we can write implementations once +and for all, and to this end, we use a library within the +`abstractions/simd.rs` file, where we copy the signatures of the +intrinsics from Rust but give them our own implementation. In +particular, we model each SIMD vector as an array of scalars, and +define each generic operation as functions over such arrays. This can +be seen as a reference implementation of the builtin intrinsics. 
+ +Hence, for example, the SIMD add intrinsic `simd_add` is modeled as follows, +it takes two arrays of machine integers and adds them pointwise using a +`wrapping_add` operation: + +```rust +pub fn simd_add( + x: FunArray, + y: FunArray, +) -> FunArray { + FunArray::from_fn(|i| (x[i].wrapping_add(y[i]))) +} +``` + +Notably, we model a strongly typed version of `simd_add`, in contrast to the compiler +intrinsic which is too generic and unimplementable in safe Rust: + +```rust +/// Adds two simd vectors elementwise. +/// +/// `T` must be a vector of integers or floats. +#[rustc_intrinsic] +#[rustc_nounwind] +pub unsafe fn simd_add(x: T, y: T) -> T; +``` + +The main rules for writing these models is that they should be simple and self-contained, +relying only on the libraries in `abstractions` or on builtin Rust language features or on +other testable models. In particular, they should not themselves directly call Rust libraries +or external crates, without going through the abstractions API. + + +### Modeling extern intrinsics manually + +For each file in `core::arch`, we split the code into extern +intrinsics that must be modeled by hand and defined intrinsics whose +models can be derived semi-automatically. The extern intrinsics are +placed in a module suffixed with `_handwritten`. Hence, for example, +the extern intrinsics used in `avx2.rs` can be found in `avx2_handwritten.rs`. + +Modeling extern intrinsics is similar to modeling the builtin ones, +in that the models are written by hand and treat the SIMD vectors +as arrays of machine integers. The main difference is that these intrinsics +are platform specific and so their modeling requires looking at the Intel or ARM +documentation for the underlying operation. + +For example, the extern intrinsic `phaddw` used in `avx2` corresponds to an +Intel instruction called "Packed Horizontal Add" and is used in AVX2 intrinsics +like `_mm256_hadd_epi16` documented [here](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_epi16&ig_expand=3667_) +By inspecting the Intel documentation, we can write a Rust model for it +as follows + +```rust +pub fn phaddw(a: i16x16, b: i16x16) -> i16x16 { + i16x16::from_fn(|i| { + if i < 4 { + a[2 * i].wrapping_add(a[2 * i + 1]) + } else if i < 8 { + b[2 * (i - 4)].wrapping_add(b[2 * (i - 4) + 1]) + } else if i < 12 { + a[2 * (i - 4)].wrapping_add(a[2 * (i - 4) + 1]) + } else { + b[2 * (i - 8)].wrapping_add(b[2 * (i - 8) + 1]) + } + }) +} +``` + +### Modeling defined intrinsics semi-automatically + +To model the third category of intrinsics, we copy the Rust code of +the intrinsic and adapt it to use our underlying abstractions. The +changes needed to the code are sometimes scriptable, and indeed most +of our models were generated from a script, but some changes are still +needed by hand. + +For example, let us say the intrinsic we are modeling is `_mm256_bsrli_epi128` from the avx2 feature set. 1. We go to [rust-lang/stdarch/crates/core_arch/src/x86/](https://github.com/rust-lang/stdarch/tree/master/crates/core_arch/src/x86/), and find the implementation of the intrinsic in `avx2.rs`. @@ -69,39 +171,17 @@ pub fn _mm256_bsrli_epi128(a: __m256i) -> __m256i { transmute(r) } } - ``` -Thus, we then go to to `core_arch/x86/models/avx2.rs`, and add the implementation. After some modification, it ends up looking like this. -``` rust -/// Shifts 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_bsrli_epi128) +``` -pub fn _mm256_bsrli_epi128(a: __m256i) -> __m256i { - const fn mask(shift: i32, i: u32) -> u64 { - let shift = shift as u32 & 0xff; - if shift > 15 || (15 - (i % 16)) < shift { - 0 as u64 - } else { - (32 + (i + shift)) as u64 - } - } - - let a = BitVec::to_i8x32(a); - let r: i8x32 = simd_shuffle( - i8x32::from_fn(|_| 0), - a, - [ - mask(IMM8, 0), - mask(IMM8, 1), - mask(IMM8, 2), - mask(IMM8, 3), - ... - mask(IMM8, 31), - ], - ); - r.into() -} - ``` +Thus, we then go to to `core_arch/x86/models/avx2.rs`, and add this implementation. +The only change it requires here is that the `simd_shuffle` macro is a function in our model, +and we discard all the function attributes. + +For other intrinsics, sometimes we need to make more changes. Since our model of the builtin intrinsics +are more precise with respect to the type of their arguments compared to their Rust counterparts, we +sometimes need to add more type annotations in our defined models. We also remove all `unsafe` guards, +since our models are always in safe Rust. Otherwise, our code for the defined intrinsics looks very +similar to the upstream code in `core::arch`. 3. Next, we add a test for this intrinsic. For this, we navigate to `core_arch/avx2/tests/avx2.rs`. Since the value of `IMM8` can be up to 8 bits, we want to test constant arguments up to 255. Thus, we write the following macro invocation. @@ -116,9 +196,10 @@ pub fn _mm256_bsrli_epi128(a: __m256i) -> __m256i { To contribute new models of intrinsics, we expect the author to follow the above steps and provide comprehensive tests. It is important that the model author look carefully at both the Intel/ARM specification -and the Rust `stdarch` implementation, because the Rust implementation -may not necessarily be correct. +and the Rust `stdarch` implementation, because they may look quite different +from each other. +In some cases, the Rust implementation may not be correct. Indeed, the previous implementation of `_mm256_bsrli_epi128` (and a similar intrinsic called `_mm512_bsrli_epi128`) in `stdarch` had a bug, which we found during the process of modeling and testing this diff --git a/testable-simd-models/src/lib.rs b/testable-simd-models/src/lib.rs index fc76194526e20..13d6ba2e6e7cd 100644 --- a/testable-simd-models/src/lib.rs +++ b/testable-simd-models/src/lib.rs @@ -25,7 +25,7 @@ //! By providing a readable, testable, well-specified version of `core`'s behavior, it serves as a foundation for //! proof assistants and other verification tools. -// This recursion limit is necessary for mk! macro sued for tests. +// This recursion limit is necessary for mk! macro used for tests. // We test functions with const generics, the macro generate a test per possible (const generic) control value. 
#![recursion_limit = "4096"] pub mod abstractions; From e16c47a01e5e752df806f2642cf8daaad55ddf1c Mon Sep 17 00:00:00 2001 From: karthikbhargavan Date: Thu, 31 Jul 2025 11:26:27 -0400 Subject: [PATCH 35/47] fix text --- testable-simd-models/README.md | 16 ++--- .../src/core_arch/x86/models/avx.rs | 59 +++++++++++++++++++ .../src/core_arch/x86/models/avx2.rs | 3 + 3 files changed, 71 insertions(+), 7 deletions(-) diff --git a/testable-simd-models/README.md b/testable-simd-models/README.md index abb526672c252..0575d52a3b26f 100644 --- a/testable-simd-models/README.md +++ b/testable-simd-models/README.md @@ -8,13 +8,14 @@ The structure of this crate is based on [rust-lang/stdarch/crates/core_arch](htt Within the `core_arch` folder in this crate, there is a different folder for each architecture for which we have wrtten models. In particular, it contains folders for `x86` and `arm_shared`. -Each such folder has 3 sub-folders, `models`, `tests`, and `specs`. +Each such folder has 2 sub-folders: `models` and `tests`. -The `models` folder contains the models of the intrinsics, with a file -corresponding to different target features, and are written using the -various abstractions implemented in `crate::abstractions`, especially -those in `crate::abstractions::simd`. These models are meant to -closely resemble their implementations within the Rust core itself. +The `models` folder contains the models of the intrinsics, with +different files for different target features (e.g. `sse2`, `avx2` +etc.). The code in this folder is written using the various +abstractions implemented in `abstractions`, especially those in +`abstractions::simd`. These models are meant to closely +resemble their implementations within the Rust core itself. The `tests` folder contains the tests of these models, and is structured the same way as `models`. Each file additionally contains @@ -23,7 +24,8 @@ tests work by testing the models against the intrinsics in the Rust core, trying out random inputs (generally 1000), and comparing their outputs. -The tests can run by executing `cargo test`. +All tests can run by executing `cargo test` and we expect this to be +run as part of CI. ## Modeling a SIMD Intrinsic diff --git a/testable-simd-models/src/core_arch/x86/models/avx.rs b/testable-simd-models/src/core_arch/x86/models/avx.rs index 3ea97b7c7bf29..8e2fb37319d36 100644 --- a/testable-simd-models/src/core_arch/x86/models/avx.rs +++ b/testable-simd-models/src/core_arch/x86/models/avx.rs @@ -28,6 +28,7 @@ use crate::abstractions::utilities::*; // pub fn _mm256_add_pd(a: __m256d, b: __m256d) -> __m256d { // { transmute(simd_add(a.as_f64x4(), b.as_f64x4())) } // } + /// Adds packed single-precision (32-bit) floating-point elements in `a` and /// `b`. /// @@ -36,6 +37,7 @@ use crate::abstractions::utilities::*; // pub fn _mm256_add_ps(a: __m256, b: __m256) -> __m256 { // { transmute(simd_add(a.as_f32x8(), b.as_f32x8())) } // } + /// Computes the bitwise AND of a packed double-precision (64-bit) /// floating-point elements in `a` and `b`. 
/// @@ -153,6 +155,7 @@ pub fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 { // pub fn _mm256_max_pd(a: __m256d, b: __m256d) -> __m256d { // { vmaxpd(a, b) } // } + /// Compares packed single-precision (32-bit) floating-point elements in `a` /// and `b`, and returns packed maximum values /// @@ -161,6 +164,7 @@ pub fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 { // pub fn _mm256_max_ps(a: __m256, b: __m256) -> __m256 { // { vmaxps(a, b) } // } + /// Compares packed double-precision (64-bit) floating-point elements /// in `a` and `b`, and returns packed minimum values /// @@ -169,6 +173,7 @@ pub fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 { // pub fn _mm256_min_pd(a: __m256d, b: __m256d) -> __m256d { // { vminpd(a, b) } // } + /// Compares packed single-precision (32-bit) floating-point elements in `a` /// and `b`, and returns packed minimum values /// @@ -177,6 +182,7 @@ pub fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 { // pub fn _mm256_min_ps(a: __m256, b: __m256) -> __m256 { // { vminps(a, b) } // } + /// Multiplies packed double-precision (64-bit) floating-point elements /// in `a` and `b`. /// @@ -185,6 +191,7 @@ pub fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 { // pub fn _mm256_mul_pd(a: __m256d, b: __m256d) -> __m256d { // { transmute(simd_mul(a.as_f64x4(), b.as_f64x4())) } // } + /// Multiplies packed single-precision (32-bit) floating-point elements in `a` and /// `b`. /// @@ -193,6 +200,7 @@ pub fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 { // pub fn _mm256_mul_ps(a: __m256, b: __m256) -> __m256 { // { transmute(simd_mul(a.as_f32x8(), b.as_f32x8())) } // } + /// Alternatively adds and subtracts packed double-precision (64-bit) /// floating-point elements in `a` to/from packed elements in `b`. /// @@ -207,6 +215,7 @@ pub fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 { // simd_shuffle(add, sub, [4, 1, 6, 3]) // } // } + /// Alternatively adds and subtracts packed single-precision (32-bit) /// floating-point elements in `a` to/from packed elements in `b`. /// @@ -221,6 +230,7 @@ pub fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 { // simd_shuffle(add, sub, [8, 1, 10, 3, 12, 5, 14, 7]) // } // } + /// Subtracts packed double-precision (64-bit) floating-point elements in `b` /// from packed elements in `a`. /// @@ -229,6 +239,7 @@ pub fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 { // pub fn _mm256_sub_pd(a: __m256d, b: __m256d) -> __m256d { // { simd_sub(a, b) } // } + /// Subtracts packed single-precision (32-bit) floating-point elements in `b` /// from packed elements in `a`. /// @@ -237,6 +248,7 @@ pub fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 { // pub fn _mm256_sub_ps(a: __m256, b: __m256) -> __m256 { // { simd_sub(a, b) } // } + /// Computes the division of each of the 8 packed 32-bit floating-point elements /// in `a` by the corresponding packed elements in `b`. /// @@ -245,6 +257,7 @@ pub fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 { // pub fn _mm256_div_ps(a: __m256, b: __m256) -> __m256 { // { simd_div(a, b) } // } + /// Computes the division of each of the 4 packed 64-bit floating-point elements /// in `a` by the corresponding packed elements in `b`. /// @@ -253,6 +266,7 @@ pub fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 { // pub fn _mm256_div_pd(a: __m256d, b: __m256d) -> __m256d { // { simd_div(a, b) } // } + /// Rounds packed double-precision (64-bit) floating point elements in `a` /// according to the flag `ROUNDING`. 
The value of `ROUNDING` may be as follows: /// @@ -271,6 +285,7 @@ pub fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 { // static_assert_uimm_bits!(ROUNDING, 4); // { roundpd256(a, ROUNDING) } // } + /// Rounds packed double-precision (64-bit) floating point elements in `a` /// toward positive infinity. /// @@ -279,6 +294,7 @@ pub fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 { // pub fn _mm256_ceil_pd(a: __m256d) -> __m256d { // { simd_ceil(a) } // } + /// Rounds packed double-precision (64-bit) floating point elements in `a` /// toward negative infinity. /// @@ -287,6 +303,7 @@ pub fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 { // pub fn _mm256_floor_pd(a: __m256d) -> __m256d { // { simd_floor(a) } // } + /// Rounds packed single-precision (32-bit) floating point elements in `a` /// according to the flag `ROUNDING`. The value of `ROUNDING` may be as follows: /// @@ -305,6 +322,7 @@ pub fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 { // static_assert_uimm_bits!(ROUNDING, 4); // { roundps256(a, ROUNDING) } // } + /// Rounds packed single-precision (32-bit) floating point elements in `a` /// toward positive infinity. /// @@ -313,6 +331,7 @@ pub fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 { // pub fn _mm256_ceil_ps(a: __m256) -> __m256 { // { simd_ceil(a) } // } + /// Rounds packed single-precision (32-bit) floating point elements in `a` /// toward negative infinity. /// @@ -321,6 +340,7 @@ pub fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 { // pub fn _mm256_floor_ps(a: __m256) -> __m256 { // { simd_floor(a) } // } + /// Returns the square root of packed single-precision (32-bit) floating point /// elements in `a`. /// @@ -329,6 +349,7 @@ pub fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 { // pub fn _mm256_sqrt_ps(a: __m256) -> __m256 { // { simd_fsqrt(a) } // } + /// Returns the square root of packed double-precision (64-bit) floating point /// elements in `a`. /// @@ -337,6 +358,7 @@ pub fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 { // pub fn _mm256_sqrt_pd(a: __m256d) -> __m256d { // { simd_fsqrt(a) } // } + /// Blends packed double-precision (64-bit) floating-point elements from /// `a` and `b` using control mask `imm8`. /// @@ -410,6 +432,7 @@ pub fn _mm256_blendv_ps(a: __m256, b: __m256, c: __m256) -> __m256 { // static_assert_uimm_bits!(IMM8, 8); // { vdpps(a, b, IMM8 as i8) } // } + /// Horizontal addition of adjacent pairs in the two packed vectors /// of 4 64-bit floating points `a` and `b`. /// In the result, sums of elements from `a` are returned in even locations, @@ -420,6 +443,7 @@ pub fn _mm256_blendv_ps(a: __m256, b: __m256, c: __m256) -> __m256 { // pub fn _mm256_hadd_pd(a: __m256d, b: __m256d) -> __m256d { // { vhaddpd(a, b) } // } + /// Horizontal addition of adjacent pairs in the two packed vectors /// of 8 32-bit floating points `a` and `b`. /// In the result, sums of elements from `a` are returned in locations of @@ -431,6 +455,7 @@ pub fn _mm256_blendv_ps(a: __m256, b: __m256, c: __m256) -> __m256 { // pub fn _mm256_hadd_ps(a: __m256, b: __m256) -> __m256 { // { vhaddps(a, b) } // } + /// Horizontal subtraction of adjacent pairs in the two packed vectors /// of 4 64-bit floating points `a` and `b`. 
/// In the result, sums of elements from `a` are returned in even locations, @@ -441,6 +466,7 @@ pub fn _mm256_blendv_ps(a: __m256, b: __m256, c: __m256) -> __m256 { // pub fn _mm256_hsub_pd(a: __m256d, b: __m256d) -> __m256d { // { vhsubpd(a, b) } // } + /// Horizontal subtraction of adjacent pairs in the two packed vectors /// of 8 32-bit floating points `a` and `b`. /// In the result, sums of elements from `a` are returned in locations of @@ -452,6 +478,7 @@ pub fn _mm256_blendv_ps(a: __m256, b: __m256, c: __m256) -> __m256 { // pub fn _mm256_hsub_ps(a: __m256, b: __m256) -> __m256 { // { vhsubps(a, b) } // } + /// Computes the bitwise XOR of packed double-precision (64-bit) floating-point /// elements in `a` and `b`. /// @@ -548,6 +575,7 @@ pub const _CMP_TRUE_US: i32 = 0x1f; // static_assert_uimm_bits!(IMM5, 5); // { vcmppd(a, b, const { IMM5 as i8 }) } // } + /// Compares packed double-precision (64-bit) floating-point /// elements in `a` and `b` based on the comparison operand /// specified by `IMM5`. @@ -558,6 +586,7 @@ pub const _CMP_TRUE_US: i32 = 0x1f; // static_assert_uimm_bits!(IMM5, 5); // { vcmppd256(a, b, IMM5 as u8) } // } + /// Compares packed single-precision (32-bit) floating-point /// elements in `a` and `b` based on the comparison operand /// specified by `IMM5`. @@ -568,6 +597,7 @@ pub const _CMP_TRUE_US: i32 = 0x1f; // static_assert_uimm_bits!(IMM5, 5); // { vcmpps(a, b, const { IMM5 as i8 }) } // } + /// Compares packed single-precision (32-bit) floating-point /// elements in `a` and `b` based on the comparison operand /// specified by `IMM5`. @@ -578,6 +608,7 @@ pub const _CMP_TRUE_US: i32 = 0x1f; // static_assert_uimm_bits!(IMM5, 5); // { vcmpps256(a, b, const { IMM5 as u8 }) } // } + /// Compares the lower double-precision (64-bit) floating-point element in /// `a` and `b` based on the comparison operand specified by `IMM5`, /// store the result in the lower element of returned vector, @@ -590,6 +621,7 @@ pub const _CMP_TRUE_US: i32 = 0x1f; // static_assert_uimm_bits!(IMM5, 5); // { vcmpsd(a, b, IMM5 as i8) } // } + /// Compares the lower single-precision (32-bit) floating-point element in /// `a` and `b` based on the comparison operand specified by `IMM5`, /// store the result in the lower element of returned vector, @@ -602,6 +634,7 @@ pub const _CMP_TRUE_US: i32 = 0x1f; // static_assert_uimm_bits!(IMM5, 5); // { vcmpss(a, b, IMM5 as i8) } // } + /// Converts packed 32-bit integers in `a` to packed double-precision (64-bit) /// floating-point elements. /// @@ -631,6 +664,7 @@ pub fn _mm256_cvtpd_ps(a: __m256d) -> __m128 { // pub fn _mm256_cvtps_epi32(a: __m256) -> __m256i { // { transmute(vcvtps2dq(a)) } // } + /// Converts packed single-precision (32-bit) floating-point elements in `a` /// to packed double-precision (64-bit) floating-point elements. /// @@ -644,6 +678,7 @@ pub fn _mm256_cvtps_pd(a: __m128) -> __m256d { pub fn _mm256_cvtsd_f64(a: __m256d) -> f64 { simd_extract(a.as_f64x4(), 0) } + /// Converts packed double-precision (64-bit) floating-point elements in `a` /// to packed 32-bit integers with truncation. /// @@ -652,6 +687,7 @@ pub fn _mm256_cvtsd_f64(a: __m256d) -> f64 { // pub fn _mm256_cvttpd_epi32(a: __m256d) -> __m128i { // { transmute(vcvttpd2dq(a)) } // } + /// Converts packed double-precision (64-bit) floating-point elements in `a` /// to packed 32-bit integers. 
/// @@ -660,6 +696,7 @@ pub fn _mm256_cvtsd_f64(a: __m256d) -> f64 { // pub fn _mm256_cvtpd_epi32(a: __m256d) -> __m128i { // { transmute(vcvtpd2dq(a)) } // } + /// Converts packed single-precision (32-bit) floating-point elements in `a` /// to packed 32-bit integers with truncation. /// @@ -668,6 +705,7 @@ pub fn _mm256_cvtsd_f64(a: __m256d) -> f64 { // pub fn _mm256_cvttps_epi32(a: __m256) -> __m256i { // { transmute(vcvttps2dq(a)) } // } + /// Extracts 128 bits (composed of 4 packed single-precision (32-bit) /// floating-point elements) from `a`, selected with `imm8`. /// @@ -724,6 +762,7 @@ pub fn _mm256_cvtsi256_si32(a: __m256i) -> i32 { // pub fn _mm256_zeroall() { // { vzeroall() } // } + /// Zeroes the upper 128 bits of all YMM registers; /// the lower 128-bits of the registers are unmodified. /// @@ -732,6 +771,7 @@ pub fn _mm256_cvtsi256_si32(a: __m256i) -> i32 { // pub fn _mm256_zeroupper() { // { vzeroupper() } // } + /// Shuffles single-precision (32-bit) floating-point elements in `a` /// within 128-bit lanes using the control in `b`. /// @@ -740,6 +780,7 @@ pub fn _mm256_cvtsi256_si32(a: __m256i) -> i32 { // pub fn _mm256_permutevar_ps(a: __m256, b: __m256i) -> __m256 { // { vpermilps256(a, b.as_i32x8()) } // } + /// Shuffles single-precision (32-bit) floating-point elements in `a` /// using the control in `b`. /// @@ -748,6 +789,7 @@ pub fn _mm256_cvtsi256_si32(a: __m256i) -> i32 { // pub fn _mm_permutevar_ps(a: __m128, b: __m128i) -> __m128 { // { vpermilps(a, b.as_i32x4()) } // } + /// Shuffles single-precision (32-bit) floating-point elements in `a` /// within 128-bit lanes using the control in `imm8`. /// @@ -790,6 +832,7 @@ pub fn _mm_permute_ps(a: __m128) -> __m128 { )) } } + /// Shuffles double-precision (64-bit) floating-point elements in `a` /// within 256-bit lanes using the control in `b`. /// @@ -798,6 +841,7 @@ pub fn _mm_permute_ps(a: __m128) -> __m128 { // pub fn _mm256_permutevar_pd(a: __m256d, b: __m256i) -> __m256d { // { vpermilpd256(a, b.as_i64x4()) } // } + /// Shuffles double-precision (64-bit) floating-point elements in `a` /// using the control in `b`. /// @@ -806,6 +850,7 @@ pub fn _mm_permute_ps(a: __m128) -> __m128 { // pub fn _mm_permutevar_pd(a: __m128d, b: __m128i) -> __m128d { // { vpermilpd(a, b.as_i64x2()) } // } + /// Shuffles double-precision (64-bit) floating-point elements in `a` /// within 128-bit lanes using the control in `imm8`. /// @@ -1082,6 +1127,7 @@ pub fn _mm256_testz_si256(a: __m256i, b: __m256i) -> i32 { pub fn _mm256_testc_si256(a: __m256i, b: __m256i) -> i32 { ptestc256(a.as_i64x4(), b.as_i64x4()) } + /// Computes the bitwise AND of 256 bits (representing integer data) in `a` and /// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0. 
/// Computes the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if @@ -1093,6 +1139,7 @@ pub fn _mm256_testc_si256(a: __m256i, b: __m256i) -> i32 { // pub fn _mm256_testnzc_si256(a: __m256i, b: __m256i) -> i32 { // { ptestnzc256(a.as_i64x4(), b.as_i64x4()) } // } + /// Computes the bitwise AND of 256 bits (representing double-precision (64-bit) /// floating-point elements) in `a` and `b`, producing an intermediate 256-bit /// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the @@ -1106,6 +1153,7 @@ pub fn _mm256_testc_si256(a: __m256i, b: __m256i) -> i32 { // pub fn _mm256_testz_pd(a: __m256d, b: __m256d) -> i32 { // { vtestzpd256(a, b) } // } + /// Computes the bitwise AND of 256 bits (representing double-precision (64-bit) /// floating-point elements) in `a` and `b`, producing an intermediate 256-bit /// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the @@ -1119,6 +1167,7 @@ pub fn _mm256_testc_si256(a: __m256i, b: __m256i) -> i32 { // pub fn _mm256_testc_pd(a: __m256d, b: __m256d) -> i32 { // { vtestcpd256(a, b) } // } + /// Computes the bitwise AND of 256 bits (representing double-precision (64-bit) /// floating-point elements) in `a` and `b`, producing an intermediate 256-bit /// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the @@ -1133,6 +1182,7 @@ pub fn _mm256_testc_si256(a: __m256i, b: __m256i) -> i32 { // pub fn _mm256_testnzc_pd(a: __m256d, b: __m256d) -> i32 { // { vtestnzcpd256(a, b) } // } + /// Computes the bitwise AND of 128 bits (representing double-precision (64-bit) /// floating-point elements) in `a` and `b`, producing an intermediate 128-bit /// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the @@ -1146,6 +1196,7 @@ pub fn _mm256_testc_si256(a: __m256i, b: __m256i) -> i32 { // pub fn _mm_testz_pd(a: __m128d, b: __m128d) -> i32 { // { vtestzpd(a, b) } // } + /// Computes the bitwise AND of 128 bits (representing double-precision (64-bit) /// floating-point elements) in `a` and `b`, producing an intermediate 128-bit /// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the @@ -1159,6 +1210,7 @@ pub fn _mm256_testc_si256(a: __m256i, b: __m256i) -> i32 { // pub fn _mm_testc_pd(a: __m128d, b: __m128d) -> i32 { // { vtestcpd(a, b) } // } + /// Computes the bitwise AND of 128 bits (representing double-precision (64-bit) /// floating-point elements) in `a` and `b`, producing an intermediate 128-bit /// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the @@ -1173,6 +1225,7 @@ pub fn _mm256_testc_si256(a: __m256i, b: __m256i) -> i32 { // pub fn _mm_testnzc_pd(a: __m128d, b: __m128d) -> i32 { // { vtestnzcpd(a, b) } // } + /// Computes the bitwise AND of 256 bits (representing single-precision (32-bit) /// floating-point elements) in `a` and `b`, producing an intermediate 256-bit /// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the @@ -1186,6 +1239,7 @@ pub fn _mm256_testc_si256(a: __m256i, b: __m256i) -> i32 { // pub fn _mm256_testz_ps(a: __m256, b: __m256) -> i32 { // { vtestzps256(a, b) } // } + /// Computes the bitwise AND of 256 bits (representing single-precision (32-bit) /// floating-point elements) in `a` and `b`, producing an intermediate 256-bit /// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the @@ -1199,6 +1253,7 @@ pub fn _mm256_testc_si256(a: __m256i, b: __m256i) -> i32 { // pub fn _mm256_testc_ps(a: __m256, b: __m256) -> i32 { // { vtestcps256(a, b) } // } + /// Computes the bitwise AND of 
256 bits (representing single-precision (32-bit) /// floating-point elements) in `a` and `b`, producing an intermediate 256-bit /// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the @@ -1213,6 +1268,7 @@ pub fn _mm256_testc_si256(a: __m256i, b: __m256i) -> i32 { // pub fn _mm256_testnzc_ps(a: __m256, b: __m256) -> i32 { // { vtestnzcps256(a, b) } // } + /// Computes the bitwise AND of 128 bits (representing single-precision (32-bit) /// floating-point elements) in `a` and `b`, producing an intermediate 128-bit /// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the @@ -1226,6 +1282,7 @@ pub fn _mm256_testc_si256(a: __m256i, b: __m256i) -> i32 { // pub fn _mm_testz_ps(a: __m128, b: __m128) -> i32 { // { vtestzps(a, b) } // } + /// Computes the bitwise AND of 128 bits (representing single-precision (32-bit) /// floating-point elements) in `a` and `b`, producing an intermediate 128-bit /// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the @@ -1239,6 +1296,7 @@ pub fn _mm256_testc_si256(a: __m256i, b: __m256i) -> i32 { // pub fn _mm_testc_ps(a: __m128, b: __m128) -> i32 { // { vtestcps(a, b) } // } + /// Computes the bitwise AND of 128 bits (representing single-precision (32-bit) /// floating-point elements) in `a` and `b`, producing an intermediate 128-bit /// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the @@ -1253,6 +1311,7 @@ pub fn _mm256_testc_si256(a: __m256i, b: __m256i) -> i32 { // pub fn _mm_testnzc_ps(a: __m128, b: __m128) -> i32 { // { vtestnzcps(a, b) } // } + /// Sets each bit of the returned mask based on the most significant bit of the /// corresponding packed double-precision (64-bit) floating-point element in /// `a`. diff --git a/testable-simd-models/src/core_arch/x86/models/avx2.rs b/testable-simd-models/src/core_arch/x86/models/avx2.rs index e23c712c200b6..2626d04635bd6 100644 --- a/testable-simd-models/src/core_arch/x86/models/avx2.rs +++ b/testable-simd-models/src/core_arch/x86/models/avx2.rs @@ -1089,6 +1089,7 @@ pub fn _mm256_permute2x128_si256(a: __m256i, b: __m256i) -> __m // )) // } // } + /// Shuffles eight 32-bit floating-point elements in `a` across lanes using /// the corresponding 32-bit integer index in `idx`. /// @@ -1097,6 +1098,7 @@ pub fn _mm256_permute2x128_si256(a: __m256i, b: __m256i) -> __m // pub fn _mm256_permutevar8x32_ps(a: __m256, idx: __m256i) -> __m256 { // { permps(a, idx.as_i32x8()) } // } + /// Computes the absolute differences of packed unsigned 8-bit integers in `a` /// and `b`, then horizontally sum each consecutive 8 differences to /// produce four unsigned 16-bit integers, and pack these unsigned 16-bit @@ -1342,6 +1344,7 @@ pub fn _mm256_slli_si256(a: __m256i) -> __m256i { static_assert_uimm_bits!(IMM8, 8); _mm256_bslli_epi128::(a) } + /// Shifts 128-bit lanes in `a` left by `imm8` bytes while shifting in zeros. 
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_bslli_epi128) From 8bc25d4c27a25504b5f8e23ab575612a748a62f9 Mon Sep 17 00:00:00 2001 From: satiscugcat <23110026@iitgn.ac.in> Date: Thu, 31 Jul 2025 21:36:48 +0530 Subject: [PATCH 36/47] Added explanation for test adding --- testable-simd-models/README.md | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/testable-simd-models/README.md b/testable-simd-models/README.md index 0575d52a3b26f..df494480b03d5 100644 --- a/testable-simd-models/README.md +++ b/testable-simd-models/README.md @@ -185,12 +185,30 @@ sometimes need to add more type annotations in our defined models. We also remov since our models are always in safe Rust. Otherwise, our code for the defined intrinsics looks very similar to the upstream code in `core::arch`. -3. Next, we add a test for this intrinsic. For this, we navigate to `core_arch/avx2/tests/avx2.rs`. Since the value of - `IMM8` can be up to 8 bits, we want to test constant arguments up to 255. Thus, we write the following macro invocation. +3. Next, we add a test for this intrinsic in `core_arch/avx2/tests/avx2.rs`. For convenience purposes, we have defined a `mk!` macro, which can be used to automatically generate + tests. The test generated by the macro generates a number of random inputs (by default, 1000), and compares the output generated by the model + and that generated by the intrinsic in upstream `core::arch`. A valid test of the intrinsic above looks like this. ```rust mk!([100]_mm256_bsrli_epi128{<0>,<1>,<2>,<3>,...,<255>}(a: BitVec)); ``` - Here, the `[100]` means we test 100 random inputs for each constant value. This concludes the necessary steps for implementing an intrinsic. + ``` + ^ ^ ^ ^ + | | | | + (1) (2) (3) (4) + ``` + The macro invocation has four parts. + 1. By default the macro tests for a thousand randomly generated inputs. If needed, this can be modified, such as here, where the `[100]` is used so that + only 100 inputs are generated. + 2. This is the name of the intrinsic being tested, and is necessary in all cases. + 3. This is relevant because of the constant `IMM8` argument used by the intrinsic. We can gleam from the name the constant argument is supposed + to be at most 8 bits wide. We can confirm this by looking at the `core::arch` implementation, and spotting the `static_assert_uimm_bits!(IMM8, 8);` + line, which asserts the constant argument fits in at most 8 bits. Thus, we add `{<0>,<1>,<2>,<3>,...,<255>}` to test for each possible constant + value of the constant argument. If the intrinsic does not have a constant argument, then this part must be ignored. + 4. This signifies the arguments that the intrinsics take, and is also necessary. + + This surmises the steps needed to use the `mk!` macro to generate a test. There is a caveat however. In the case that the output of an intrinsic is _not_ + a bit-vector (and is instead say, an integer like `i32`), then the macro will not work, and a manual test has to be written. + ## Contributing Models From e03010ec6e26ce098421a30e09c985f9b97fd333 Mon Sep 17 00:00:00 2001 From: satiscugcat <23110026@iitgn.ac.in> Date: Thu, 31 Jul 2025 21:38:01 +0530 Subject: [PATCH 37/47] Indentation fix? 
--- testable-simd-models/README.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/testable-simd-models/README.md b/testable-simd-models/README.md index df494480b03d5..8aef7c060fe03 100644 --- a/testable-simd-models/README.md +++ b/testable-simd-models/README.md @@ -192,9 +192,7 @@ similar to the upstream code in `core::arch`. mk!([100]_mm256_bsrli_epi128{<0>,<1>,<2>,<3>,...,<255>}(a: BitVec)); ``` ``` - ^ ^ ^ ^ - | | | | - (1) (2) (3) (4) + (1) (2) (3) (4) ``` The macro invocation has four parts. 1. By default the macro tests for a thousand randomly generated inputs. If needed, this can be modified, such as here, where the `[100]` is used so that From 81f4943251985e858b5f6bb1bd78c224ad024496 Mon Sep 17 00:00:00 2001 From: satiscugcat <23110026@iitgn.ac.in> Date: Thu, 31 Jul 2025 21:38:47 +0530 Subject: [PATCH 38/47] Romanisation --- testable-simd-models/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/testable-simd-models/README.md b/testable-simd-models/README.md index 8aef7c060fe03..39b6ef5cfe308 100644 --- a/testable-simd-models/README.md +++ b/testable-simd-models/README.md @@ -192,7 +192,7 @@ similar to the upstream code in `core::arch`. mk!([100]_mm256_bsrli_epi128{<0>,<1>,<2>,<3>,...,<255>}(a: BitVec)); ``` ``` - (1) (2) (3) (4) + i. ii. iii. iv. ``` The macro invocation has four parts. 1. By default the macro tests for a thousand randomly generated inputs. If needed, this can be modified, such as here, where the `[100]` is used so that From 2ccdf2655963c4e0077613417b2244d6d1d93ffd Mon Sep 17 00:00:00 2001 From: satiscugcat <23110026@iitgn.ac.in> Date: Thu, 31 Jul 2025 21:39:23 +0530 Subject: [PATCH 39/47] Further finetuning --- testable-simd-models/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/testable-simd-models/README.md b/testable-simd-models/README.md index 39b6ef5cfe308..1cbd7285958d0 100644 --- a/testable-simd-models/README.md +++ b/testable-simd-models/README.md @@ -192,7 +192,7 @@ similar to the upstream code in `core::arch`. mk!([100]_mm256_bsrli_epi128{<0>,<1>,<2>,<3>,...,<255>}(a: BitVec)); ``` ``` - i. ii. iii. iv. + i. ii. iii. iv. ``` The macro invocation has four parts. 1. By default the macro tests for a thousand randomly generated inputs. If needed, this can be modified, such as here, where the `[100]` is used so that From b53a824c899a5d36c5214461a1fbac6f1d669c93 Mon Sep 17 00:00:00 2001 From: satiscugcat <23110026@iitgn.ac.in> Date: Thu, 31 Jul 2025 21:40:13 +0530 Subject: [PATCH 40/47] Again --- testable-simd-models/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/testable-simd-models/README.md b/testable-simd-models/README.md index 1cbd7285958d0..ebc4cd368e588 100644 --- a/testable-simd-models/README.md +++ b/testable-simd-models/README.md @@ -192,7 +192,7 @@ similar to the upstream code in `core::arch`. mk!([100]_mm256_bsrli_epi128{<0>,<1>,<2>,<3>,...,<255>}(a: BitVec)); ``` ``` - i. ii. iii. iv. + i. ii. iii. iv. ``` The macro invocation has four parts. 1. By default the macro tests for a thousand randomly generated inputs. 
If needed, this can be modified, such as here, where the `[100]` is used so that From feb404cb67a1c0c0db573886345c5b8bdf1a6582 Mon Sep 17 00:00:00 2001 From: satiscugcat <23110026@iitgn.ac.in> Date: Thu, 31 Jul 2025 21:40:53 +0530 Subject: [PATCH 41/47] Smarter way --- testable-simd-models/README.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/testable-simd-models/README.md b/testable-simd-models/README.md index ebc4cd368e588..b68343fb2886c 100644 --- a/testable-simd-models/README.md +++ b/testable-simd-models/README.md @@ -190,9 +190,7 @@ similar to the upstream code in `core::arch`. and that generated by the intrinsic in upstream `core::arch`. A valid test of the intrinsic above looks like this. ```rust mk!([100]_mm256_bsrli_epi128{<0>,<1>,<2>,<3>,...,<255>}(a: BitVec)); - ``` - ``` - i. ii. iii. iv. + // i. ii. iii. iv. ``` The macro invocation has four parts. 1. By default the macro tests for a thousand randomly generated inputs. If needed, this can be modified, such as here, where the `[100]` is used so that From 05823fcae777ddbdb3a27234d783bf02733623d7 Mon Sep 17 00:00:00 2001 From: karthikbhargavan Date: Thu, 31 Jul 2025 18:01:40 -0400 Subject: [PATCH 42/47] generated tests --- .../src/core_arch/x86/tests/avx.rs | 90 +++++++++++ .../src/core_arch/x86/tests/avx2.rs | 151 ++++++++++++++++++ 2 files changed, 241 insertions(+) diff --git a/testable-simd-models/src/core_arch/x86/tests/avx.rs b/testable-simd-models/src/core_arch/x86/tests/avx.rs index 4ffa0dc139b9d..60f19c2676cb1 100644 --- a/testable-simd-models/src/core_arch/x86/tests/avx.rs +++ b/testable-simd-models/src/core_arch/x86/tests/avx.rs @@ -130,3 +130,93 @@ mk!(_mm256_set_epi64x(a: i64, b: i64, c: i64, d: i64)); mk!(_mm256_set1_epi8(a: i8)); mk!(_mm256_set1_epi16(a: i16)); mk!(_mm256_set1_epi32(a: i32)); + +mk!(_mm256_and_pd(a: __m256d, b: __m256d)); +mk!(_mm256_and_ps(a: __m256, b: __m256)); +mk!(_mm256_or_pd(a: __m256d, b: __m256d)); +mk!(_mm256_or_ps(a: __m256, b: __m256)); +mk!(_mm256_shuffle_pd(a: __m256d, b: __m256d)); +mk!(_mm256_shuffle_ps(a: __m256, b: __m256)); +mk!(_mm256_andnot_pd(a: __m256d, b: __m256d)); +mk!(_mm256_andnot_ps(a: __m256, b: __m256)); +mk!(_mm256_blend_pd(a: __m256d, b: __m256d)); +mk!(_mm256_blend_ps(a: __m256, b: __m256)); +mk!(_mm256_blendv_pd(a: __m256d, b: __m256d, c: __m256d)); +mk!(_mm256_blendv_ps(a: __m256, b: __m256, c: __m256)); +mk!(_mm256_xor_pd(a: __m256d, b: __m256d)); +mk!(_mm256_xor_ps(a: __m256, b: __m256)); +mk!(_mm256_cvtepi32_pd(a: __m128i)); +mk!(_mm256_cvtepi32_ps(a: __m256i)); +mk!(_mm256_cvtpd_ps(a: __m256d)); +mk!(_mm256_cvtps_pd(a: __m128)); +mk!(_mm256_cvtsd_f64(a: __m256d)); +mk!(_mm256_extractf128_ps(a: __m256)); +mk!(_mm256_extractf128_pd(a: __m256d)); +mk!(_mm256_extractf128_si256(a: __m256i)); +mk!(_mm256_extract_epi32(a: __m256i)); +mk!(_mm256_cvtsi256_si32(a: __m256i)); +mk!(_mm256_permute_ps(a: __m256)); +mk!(_mm_permute_ps(a: __m128)); +mk!(_mm256_permute_pd(a: __m256d)); +mk!(_mm_permute_pd(a: __m128d)); +mk!(_mm256_permute2f128_si256(a: __m256i, b: __m256i)); +mk!(_mm256_broadcast_ss(f: &f32)); +mk!(_mm256_broadcast_ps(a: &__m128)); +mk!(_mm256_broadcast_pd(a: &__m128d)); +mk!(_mm256_insertf128_ps(a: __m256, b: __m128)); +mk!(_mm256_insertf128_pd(a: __m256d, b: __m128d)); +mk!(_mm256_insertf128_si256(a: __m256i, b: __m128i)); +mk!(_mm256_insert_epi8(a: __m256i, i: i8)); +mk!(_mm256_insert_epi16(a: __m256i, i: i16)); +mk!(_mm256_insert_epi32(a: __m256i, i: i32)); +mk!(_mm256_movehdup_ps(a: __m256)); +mk!(_mm256_moveldup_ps(a: __m256)); 
+mk!(_mm256_movedup_pd(a: __m256d)); +mk!(_mm256_unpackhi_pd(a: __m256d, b: __m256d)); +mk!(_mm256_unpackhi_ps(a: __m256, b: __m256)); +mk!(_mm256_unpacklo_pd(a: __m256d, b: __m256d)); +mk!(_mm256_unpacklo_ps(a: __m256, b: __m256)); +mk!(_mm256_testz_si256(a: __m256i, b: __m256i)); +mk!(_mm256_testc_si256(a: __m256i, b: __m256i)); +mk!(_mm256_movemask_pd(a: __m256d)); +mk!(_mm256_movemask_ps(a: __m256)); +mk!(_mm256_setzero_pd()); +mk!(_mm256_setzero_ps()); +mk!(_mm256_setzero_si256()); +mk!(_mm256_set_pd(a: f64, b: f64, c: f64, d: f64)); +mk!(_mm256_set_ps(a: f32, b: f32, c: f32, d: f32, e: f32, f: f32, g: f32, h: f32)); +mk!(_mm256_set_epi64x(a: i64, b: i64, c: i64, d: i64)); +mk!(_mm256_setr_pd(a: f64, b: f64, c: f64, d: f64)); +mk!(_mm256_setr_ps(a: f32, b: f32, c: f32, d: f32, e: f32, f: f32, g: f32, h: f32)); +mk!(_mm256_setr_epi64x(a: i64, b: i64, c: i64, d: i64)); +mk!(_mm256_set1_pd(a: f64)); +mk!(_mm256_set1_ps(a: f32)); +mk!(_mm256_set1_epi8(a: i8)); +mk!(_mm256_set1_epi16(a: i16)); +mk!(_mm256_set1_epi32(a: i32)); +mk!(_mm256_set1_epi64x(a: i64)); +mk!(_mm256_castpd_ps(a: __m256d)); +mk!(_mm256_castps_pd(a: __m256)); +mk!(_mm256_castps_si256(a: __m256)); +mk!(_mm256_castsi256_ps(a: __m256i)); +mk!(_mm256_castpd_si256(a: __m256d)); +mk!(_mm256_castsi256_pd(a: __m256i)); +mk!(_mm256_castps256_ps128(a: __m256)); +mk!(_mm256_castpd256_pd128(a: __m256d)); +mk!(_mm256_castsi256_si128(a: __m256i)); +mk!(_mm256_castps128_ps256(a: __m128)); +mk!(_mm256_castpd128_pd256(a: __m128d)); +mk!(_mm256_castsi128_si256(a: __m128i)); +mk!(_mm256_zextps128_ps256(a: __m128)); +mk!(_mm256_zextsi128_si256(a: __m128i)); +mk!(_mm256_zextpd128_pd256(a: __m128d)); +mk!(_mm256_undefined_ps()); +mk!(_mm256_undefined_pd()); +mk!(_mm256_undefined_si256()); +mk!(_mm256_set_m128(hi: __m128, lo: __m128)); +mk!(_mm256_set_m128d(hi: __m128d, lo: __m128d)); +mk!(_mm256_set_m128i(hi: __m128i, lo: __m128i)); +mk!(_mm256_setr_m128(lo: __m128, hi: __m128)); +mk!(_mm256_setr_m128d(lo: __m128d, hi: __m128d)); +mk!(_mm256_setr_m128i(lo: __m128i, hi: __m128i)); +mk!(_mm256_cvtss_f32(a: __m256)); diff --git a/testable-simd-models/src/core_arch/x86/tests/avx2.rs b/testable-simd-models/src/core_arch/x86/tests/avx2.rs index a1b8378566403..561a763a408a4 100644 --- a/testable-simd-models/src/core_arch/x86/tests/avx2.rs +++ b/testable-simd-models/src/core_arch/x86/tests/avx2.rs @@ -529,3 +529,154 @@ fn _mm256_extract_epi16() { ); } } + +mk!(_mm256_abs_epi32(a: __m256i)); +mk!(_mm256_abs_epi16(a: __m256i)); +mk!(_mm256_abs_epi8(a: __m256i)); +mk!(_mm256_add_epi64(a: __m256i, b: __m256i)); +mk!(_mm256_add_epi32(a: __m256i, b: __m256i)); +mk!(_mm256_add_epi16(a: __m256i, b: __m256i)); +mk!(_mm256_add_epi8(a: __m256i, b: __m256i)); +mk!(_mm256_adds_epi8(a: __m256i, b: __m256i)); +mk!(_mm256_adds_epi16(a: __m256i, b: __m256i)); +mk!(_mm256_adds_epu8(a: __m256i, b: __m256i)); +mk!(_mm256_adds_epu16(a: __m256i, b: __m256i)); +mk!(_mm256_alignr_epi8(a: __m256i, b: __m256i)); +mk!(_mm256_and_si256(a: __m256i, b: __m256i)); +mk!(_mm256_andnot_si256(a: __m256i, b: __m256i)); +mk!(_mm256_avg_epu16(a: __m256i, b: __m256i)); +mk!(_mm256_avg_epu8(a: __m256i, b: __m256i)); +mk!(_mm_blend_epi32(a: __m128i, b: __m128i)); +mk!(_mm256_blend_epi32(a: __m256i, b: __m256i)); +mk!(_mm256_blend_epi16(a: __m256i, b: __m256i)); +mk!(_mm256_blendv_epi8(a: __m256i, b: __m256i, mask: __m256i)); +mk!(_mm_broadcastb_epi8(a: __m128i)); +mk!(_mm256_broadcastb_epi8(a: __m128i)); +mk!(_mm_broadcastd_epi32(a: __m128i)); +mk!(_mm256_broadcastd_epi32(a: __m128i)); 
+mk!(_mm_broadcastq_epi64(a: __m128i)); +mk!(_mm256_broadcastq_epi64(a: __m128i)); +mk!(_mm_broadcastsd_pd(a: __m128d)); +mk!(_mm256_broadcastsd_pd(a: __m128d)); +mk!(_mm_broadcastsi128_si256(a: __m128i)); +mk!(_mm256_broadcastsi128_si256(a: __m128i)); +mk!(_mm_broadcastss_ps(a: __m128)); +mk!(_mm256_broadcastss_ps(a: __m128)); +mk!(_mm_broadcastw_epi16(a: __m128i)); +mk!(_mm256_broadcastw_epi16(a: __m128i)); +mk!(_mm256_cmpeq_epi64(a: __m256i, b: __m256i)); +mk!(_mm256_cmpeq_epi32(a: __m256i, b: __m256i)); +mk!(_mm256_cmpeq_epi16(a: __m256i, b: __m256i)); +mk!(_mm256_cmpeq_epi8(a: __m256i, b: __m256i)); +mk!(_mm256_cmpgt_epi64(a: __m256i, b: __m256i)); +mk!(_mm256_cmpgt_epi32(a: __m256i, b: __m256i)); +mk!(_mm256_cmpgt_epi16(a: __m256i, b: __m256i)); +mk!(_mm256_cmpgt_epi8(a: __m256i, b: __m256i)); +mk!(_mm256_cvtepi16_epi32(a: __m128i)); +mk!(_mm256_cvtepi16_epi64(a: __m128i)); +mk!(_mm256_cvtepi32_epi64(a: __m128i)); +mk!(_mm256_cvtepi8_epi16(a: __m128i)); +mk!(_mm256_cvtepi8_epi32(a: __m128i)); +mk!(_mm256_cvtepi8_epi64(a: __m128i)); +mk!(_mm256_cvtepu16_epi32(a: __m128i)); +mk!(_mm256_cvtepu16_epi64(a: __m128i)); +mk!(_mm256_cvtepu32_epi64(a: __m128i)); +mk!(_mm256_cvtepu8_epi16(a: __m128i)); +mk!(_mm256_cvtepu8_epi32(a: __m128i)); +mk!(_mm256_cvtepu8_epi64(a: __m128i)); +mk!(_mm256_extracti128_si256(a: __m256i)); +mk!(_mm256_hadd_epi16(a: __m256i, b: __m256i)); +mk!(_mm256_hadd_epi32(a: __m256i, b: __m256i)); +mk!(_mm256_hadds_epi16(a: __m256i, b: __m256i)); +mk!(_mm256_hsub_epi16(a: __m256i, b: __m256i)); +mk!(_mm256_hsub_epi32(a: __m256i, b: __m256i)); +mk!(_mm256_hsubs_epi16(a: __m256i, b: __m256i)); +mk!(_mm256_inserti128_si256(a: __m256i, b: __m128i)); +mk!(_mm256_madd_epi16(a: __m256i, b: __m256i)); +mk!(_mm256_maddubs_epi16(a: __m256i, b: __m256i)); +mk!(_mm256_max_epi16(a: __m256i, b: __m256i)); +mk!(_mm256_max_epi32(a: __m256i, b: __m256i)); +mk!(_mm256_max_epi8(a: __m256i, b: __m256i)); +mk!(_mm256_max_epu16(a: __m256i, b: __m256i)); +mk!(_mm256_max_epu32(a: __m256i, b: __m256i)); +mk!(_mm256_max_epu8(a: __m256i, b: __m256i)); +mk!(_mm256_min_epi16(a: __m256i, b: __m256i)); +mk!(_mm256_min_epi32(a: __m256i, b: __m256i)); +mk!(_mm256_min_epi8(a: __m256i, b: __m256i)); +mk!(_mm256_min_epu16(a: __m256i, b: __m256i)); +mk!(_mm256_min_epu32(a: __m256i, b: __m256i)); +mk!(_mm256_min_epu8(a: __m256i, b: __m256i)); +mk!(_mm256_movemask_epi8(a: __m256i)); +mk!(_mm256_mpsadbw_epu8(a: __m256i, b: __m256i)); +mk!(_mm256_mul_epi32(a: __m256i, b: __m256i)); +mk!(_mm256_mul_epu32(a: __m256i, b: __m256i)); +mk!(_mm256_mulhi_epi16(a: __m256i, b: __m256i)); +mk!(_mm256_mulhi_epu16(a: __m256i, b: __m256i)); +mk!(_mm256_mullo_epi16(a: __m256i, b: __m256i)); +mk!(_mm256_mullo_epi32(a: __m256i, b: __m256i)); +mk!(_mm256_mulhrs_epi16(a: __m256i, b: __m256i)); +mk!(_mm256_or_si256(a: __m256i, b: __m256i)); +mk!(_mm256_packs_epi16(a: __m256i, b: __m256i)); +mk!(_mm256_packs_epi32(a: __m256i, b: __m256i)); +mk!(_mm256_packus_epi16(a: __m256i, b: __m256i)); +mk!(_mm256_packus_epi32(a: __m256i, b: __m256i)); +mk!(_mm256_permutevar8x32_epi32(a: __m256i, b: __m256i)); +mk!(_mm256_permute4x64_epi64(a: __m256i)); +mk!(_mm256_permute2x128_si256(a: __m256i, b: __m256i)); +mk!(_mm256_sad_epu8(a: __m256i, b: __m256i)); +mk!(_mm256_shuffle_epi8(a: __m256i, b: __m256i)); +mk!(_mm256_shuffle_epi32(a: __m256i)); +mk!(_mm256_shufflehi_epi16(a: __m256i)); +mk!(_mm256_shufflelo_epi16(a: __m256i)); +mk!(_mm256_sign_epi16(a: __m256i, b: __m256i)); +mk!(_mm256_sign_epi32(a: __m256i, b: __m256i)); 
+mk!(_mm256_sign_epi8(a: __m256i, b: __m256i)); +mk!(_mm256_sll_epi16(a: __m256i, count: __m128i)); +mk!(_mm256_sll_epi32(a: __m256i, count: __m128i)); +mk!(_mm256_sll_epi64(a: __m256i, count: __m128i)); +mk!(_mm256_slli_epi16(a: __m256i)); +mk!(_mm256_slli_epi32(a: __m256i)); +mk!(_mm256_slli_epi64(a: __m256i)); +mk!(_mm256_slli_si256(a: __m256i)); +mk!(_mm256_bslli_epi128(a: __m256i)); +mk!(_mm_sllv_epi32(a: __m128i, count: __m128i)); +mk!(_mm256_sllv_epi32(a: __m256i, count: __m256i)); +mk!(_mm_sllv_epi64(a: __m128i, count: __m128i)); +mk!(_mm256_sllv_epi64(a: __m256i, count: __m256i)); +mk!(_mm256_sra_epi16(a: __m256i, count: __m128i)); +mk!(_mm256_sra_epi32(a: __m256i, count: __m128i)); +mk!(_mm256_srai_epi16(a: __m256i)); +mk!(_mm256_srai_epi32(a: __m256i)); +mk!(_mm_srav_epi32(a: __m128i, count: __m128i)); +mk!(_mm256_srav_epi32(a: __m256i, count: __m256i)); +mk!(_mm256_srli_si256(a: __m256i)); +mk!(_mm256_bsrli_epi128(a: __m256i)); +mk!(_mm256_srl_epi16(a: __m256i, count: __m128i)); +mk!(_mm256_srl_epi32(a: __m256i, count: __m128i)); +mk!(_mm256_srl_epi64(a: __m256i, count: __m128i)); +mk!(_mm256_srli_epi16(a: __m256i)); +mk!(_mm256_srli_epi32(a: __m256i)); +mk!(_mm256_srli_epi64(a: __m256i)); +mk!(_mm_srlv_epi32(a: __m128i, count: __m128i)); +mk!(_mm256_srlv_epi32(a: __m256i, count: __m256i)); +mk!(_mm_srlv_epi64(a: __m128i, count: __m128i)); +mk!(_mm256_srlv_epi64(a: __m256i, count: __m256i)); +mk!(_mm256_sub_epi16(a: __m256i, b: __m256i)); +mk!(_mm256_sub_epi32(a: __m256i, b: __m256i)); +mk!(_mm256_sub_epi64(a: __m256i, b: __m256i)); +mk!(_mm256_sub_epi8(a: __m256i, b: __m256i)); +mk!(_mm256_subs_epi16(a: __m256i, b: __m256i)); +mk!(_mm256_subs_epi8(a: __m256i, b: __m256i)); +mk!(_mm256_subs_epu16(a: __m256i, b: __m256i)); +mk!(_mm256_subs_epu8(a: __m256i, b: __m256i)); +mk!(_mm256_unpackhi_epi8(a: __m256i, b: __m256i)); +mk!(_mm256_unpacklo_epi8(a: __m256i, b: __m256i)); +mk!(_mm256_unpackhi_epi16(a: __m256i, b: __m256i)); +mk!(_mm256_unpacklo_epi16(a: __m256i, b: __m256i)); +mk!(_mm256_unpackhi_epi32(a: __m256i, b: __m256i)); +mk!(_mm256_unpacklo_epi32(a: __m256i, b: __m256i)); +mk!(_mm256_unpackhi_epi64(a: __m256i, b: __m256i)); +mk!(_mm256_unpacklo_epi64(a: __m256i, b: __m256i)); +mk!(_mm256_xor_si256(a: __m256i, b: __m256i)); +mk!(_mm256_extract_epi8(a: __m256i)); +mk!(_mm256_extract_epi16(a: __m256i)); From 2f622214cce332503e4c3c8c9ffb27fe2ff9c668 Mon Sep 17 00:00:00 2001 From: karthikbhargavan Date: Thu, 31 Jul 2025 18:05:25 -0400 Subject: [PATCH 43/47] tests --- .../src/core_arch/x86/tests/avx.rs | 19 -------------- .../src/core_arch/x86/tests/avx2.rs | 26 ------------------- 2 files changed, 45 deletions(-) diff --git a/testable-simd-models/src/core_arch/x86/tests/avx.rs b/testable-simd-models/src/core_arch/x86/tests/avx.rs index 60f19c2676cb1..cb1bec8d0ae95 100644 --- a/testable-simd-models/src/core_arch/x86/tests/avx.rs +++ b/testable-simd-models/src/core_arch/x86/tests/avx.rs @@ -135,12 +135,8 @@ mk!(_mm256_and_pd(a: __m256d, b: __m256d)); mk!(_mm256_and_ps(a: __m256, b: __m256)); mk!(_mm256_or_pd(a: __m256d, b: __m256d)); mk!(_mm256_or_ps(a: __m256, b: __m256)); -mk!(_mm256_shuffle_pd(a: __m256d, b: __m256d)); -mk!(_mm256_shuffle_ps(a: __m256, b: __m256)); mk!(_mm256_andnot_pd(a: __m256d, b: __m256d)); mk!(_mm256_andnot_ps(a: __m256, b: __m256)); -mk!(_mm256_blend_pd(a: __m256d, b: __m256d)); -mk!(_mm256_blend_ps(a: __m256, b: __m256)); mk!(_mm256_blendv_pd(a: __m256d, b: __m256d, c: __m256d)); mk!(_mm256_blendv_ps(a: __m256, b: __m256, c: __m256)); 
mk!(_mm256_xor_pd(a: __m256d, b: __m256d)); @@ -150,25 +146,10 @@ mk!(_mm256_cvtepi32_ps(a: __m256i)); mk!(_mm256_cvtpd_ps(a: __m256d)); mk!(_mm256_cvtps_pd(a: __m128)); mk!(_mm256_cvtsd_f64(a: __m256d)); -mk!(_mm256_extractf128_ps(a: __m256)); -mk!(_mm256_extractf128_pd(a: __m256d)); -mk!(_mm256_extractf128_si256(a: __m256i)); -mk!(_mm256_extract_epi32(a: __m256i)); mk!(_mm256_cvtsi256_si32(a: __m256i)); -mk!(_mm256_permute_ps(a: __m256)); -mk!(_mm_permute_ps(a: __m128)); -mk!(_mm256_permute_pd(a: __m256d)); -mk!(_mm_permute_pd(a: __m128d)); -mk!(_mm256_permute2f128_si256(a: __m256i, b: __m256i)); mk!(_mm256_broadcast_ss(f: &f32)); mk!(_mm256_broadcast_ps(a: &__m128)); mk!(_mm256_broadcast_pd(a: &__m128d)); -mk!(_mm256_insertf128_ps(a: __m256, b: __m128)); -mk!(_mm256_insertf128_pd(a: __m256d, b: __m128d)); -mk!(_mm256_insertf128_si256(a: __m256i, b: __m128i)); -mk!(_mm256_insert_epi8(a: __m256i, i: i8)); -mk!(_mm256_insert_epi16(a: __m256i, i: i16)); -mk!(_mm256_insert_epi32(a: __m256i, i: i32)); mk!(_mm256_movehdup_ps(a: __m256)); mk!(_mm256_moveldup_ps(a: __m256)); mk!(_mm256_movedup_pd(a: __m256d)); diff --git a/testable-simd-models/src/core_arch/x86/tests/avx2.rs b/testable-simd-models/src/core_arch/x86/tests/avx2.rs index 561a763a408a4..1c3d26e22ddf3 100644 --- a/testable-simd-models/src/core_arch/x86/tests/avx2.rs +++ b/testable-simd-models/src/core_arch/x86/tests/avx2.rs @@ -541,14 +541,10 @@ mk!(_mm256_adds_epi8(a: __m256i, b: __m256i)); mk!(_mm256_adds_epi16(a: __m256i, b: __m256i)); mk!(_mm256_adds_epu8(a: __m256i, b: __m256i)); mk!(_mm256_adds_epu16(a: __m256i, b: __m256i)); -mk!(_mm256_alignr_epi8(a: __m256i, b: __m256i)); mk!(_mm256_and_si256(a: __m256i, b: __m256i)); mk!(_mm256_andnot_si256(a: __m256i, b: __m256i)); mk!(_mm256_avg_epu16(a: __m256i, b: __m256i)); mk!(_mm256_avg_epu8(a: __m256i, b: __m256i)); -mk!(_mm_blend_epi32(a: __m128i, b: __m128i)); -mk!(_mm256_blend_epi32(a: __m256i, b: __m256i)); -mk!(_mm256_blend_epi16(a: __m256i, b: __m256i)); mk!(_mm256_blendv_epi8(a: __m256i, b: __m256i, mask: __m256i)); mk!(_mm_broadcastb_epi8(a: __m128i)); mk!(_mm256_broadcastb_epi8(a: __m128i)); @@ -584,14 +580,12 @@ mk!(_mm256_cvtepu32_epi64(a: __m128i)); mk!(_mm256_cvtepu8_epi16(a: __m128i)); mk!(_mm256_cvtepu8_epi32(a: __m128i)); mk!(_mm256_cvtepu8_epi64(a: __m128i)); -mk!(_mm256_extracti128_si256(a: __m256i)); mk!(_mm256_hadd_epi16(a: __m256i, b: __m256i)); mk!(_mm256_hadd_epi32(a: __m256i, b: __m256i)); mk!(_mm256_hadds_epi16(a: __m256i, b: __m256i)); mk!(_mm256_hsub_epi16(a: __m256i, b: __m256i)); mk!(_mm256_hsub_epi32(a: __m256i, b: __m256i)); mk!(_mm256_hsubs_epi16(a: __m256i, b: __m256i)); -mk!(_mm256_inserti128_si256(a: __m256i, b: __m128i)); mk!(_mm256_madd_epi16(a: __m256i, b: __m256i)); mk!(_mm256_maddubs_epi16(a: __m256i, b: __m256i)); mk!(_mm256_max_epi16(a: __m256i, b: __m256i)); @@ -607,7 +601,6 @@ mk!(_mm256_min_epu16(a: __m256i, b: __m256i)); mk!(_mm256_min_epu32(a: __m256i, b: __m256i)); mk!(_mm256_min_epu8(a: __m256i, b: __m256i)); mk!(_mm256_movemask_epi8(a: __m256i)); -mk!(_mm256_mpsadbw_epu8(a: __m256i, b: __m256i)); mk!(_mm256_mul_epi32(a: __m256i, b: __m256i)); mk!(_mm256_mul_epu32(a: __m256i, b: __m256i)); mk!(_mm256_mulhi_epi16(a: __m256i, b: __m256i)); @@ -621,42 +614,25 @@ mk!(_mm256_packs_epi32(a: __m256i, b: __m256i)); mk!(_mm256_packus_epi16(a: __m256i, b: __m256i)); mk!(_mm256_packus_epi32(a: __m256i, b: __m256i)); mk!(_mm256_permutevar8x32_epi32(a: __m256i, b: __m256i)); -mk!(_mm256_permute4x64_epi64(a: __m256i)); 
-mk!(_mm256_permute2x128_si256(a: __m256i, b: __m256i)); mk!(_mm256_sad_epu8(a: __m256i, b: __m256i)); mk!(_mm256_shuffle_epi8(a: __m256i, b: __m256i)); -mk!(_mm256_shuffle_epi32(a: __m256i)); -mk!(_mm256_shufflehi_epi16(a: __m256i)); -mk!(_mm256_shufflelo_epi16(a: __m256i)); mk!(_mm256_sign_epi16(a: __m256i, b: __m256i)); mk!(_mm256_sign_epi32(a: __m256i, b: __m256i)); mk!(_mm256_sign_epi8(a: __m256i, b: __m256i)); mk!(_mm256_sll_epi16(a: __m256i, count: __m128i)); mk!(_mm256_sll_epi32(a: __m256i, count: __m128i)); mk!(_mm256_sll_epi64(a: __m256i, count: __m128i)); -mk!(_mm256_slli_epi16(a: __m256i)); -mk!(_mm256_slli_epi32(a: __m256i)); -mk!(_mm256_slli_epi64(a: __m256i)); -mk!(_mm256_slli_si256(a: __m256i)); -mk!(_mm256_bslli_epi128(a: __m256i)); mk!(_mm_sllv_epi32(a: __m128i, count: __m128i)); mk!(_mm256_sllv_epi32(a: __m256i, count: __m256i)); mk!(_mm_sllv_epi64(a: __m128i, count: __m128i)); mk!(_mm256_sllv_epi64(a: __m256i, count: __m256i)); mk!(_mm256_sra_epi16(a: __m256i, count: __m128i)); mk!(_mm256_sra_epi32(a: __m256i, count: __m128i)); -mk!(_mm256_srai_epi16(a: __m256i)); -mk!(_mm256_srai_epi32(a: __m256i)); mk!(_mm_srav_epi32(a: __m128i, count: __m128i)); mk!(_mm256_srav_epi32(a: __m256i, count: __m256i)); -mk!(_mm256_srli_si256(a: __m256i)); -mk!(_mm256_bsrli_epi128(a: __m256i)); mk!(_mm256_srl_epi16(a: __m256i, count: __m128i)); mk!(_mm256_srl_epi32(a: __m256i, count: __m128i)); mk!(_mm256_srl_epi64(a: __m256i, count: __m128i)); -mk!(_mm256_srli_epi16(a: __m256i)); -mk!(_mm256_srli_epi32(a: __m256i)); -mk!(_mm256_srli_epi64(a: __m256i)); mk!(_mm_srlv_epi32(a: __m128i, count: __m128i)); mk!(_mm256_srlv_epi32(a: __m256i, count: __m256i)); mk!(_mm_srlv_epi64(a: __m128i, count: __m128i)); @@ -678,5 +654,3 @@ mk!(_mm256_unpacklo_epi32(a: __m256i, b: __m256i)); mk!(_mm256_unpackhi_epi64(a: __m256i, b: __m256i)); mk!(_mm256_unpacklo_epi64(a: __m256i, b: __m256i)); mk!(_mm256_xor_si256(a: __m256i, b: __m256i)); -mk!(_mm256_extract_epi8(a: __m256i)); -mk!(_mm256_extract_epi16(a: __m256i)); From 377ad8da7218d3fcb8b80bc36703f883dab1db2a Mon Sep 17 00:00:00 2001 From: karthikbhargavan Date: Thu, 31 Jul 2025 18:10:37 -0400 Subject: [PATCH 44/47] avx2 --- testable-simd-models/src/core_arch/x86/tests/avx2.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/testable-simd-models/src/core_arch/x86/tests/avx2.rs b/testable-simd-models/src/core_arch/x86/tests/avx2.rs index 1c3d26e22ddf3..2f776fc583629 100644 --- a/testable-simd-models/src/core_arch/x86/tests/avx2.rs +++ b/testable-simd-models/src/core_arch/x86/tests/avx2.rs @@ -1,6 +1,7 @@ use super::upstream; use crate::abstractions::bitvec::BitVec; use crate::helpers::test::HasRandom; +use super::types::*; /// Derives tests for a given intrinsics. Test that a given intrinsics and its model compute the same thing over random values (1000 by default). macro_rules! 
mk { From 27819ebb3f32a5bb6ba5b1da330ef5b67bdba868 Mon Sep 17 00:00:00 2001 From: karthikbhargavan Date: Fri, 1 Aug 2025 14:18:23 +0200 Subject: [PATCH 45/47] tests --- .../src/core_arch/x86/tests/avx.rs | 105 +++++++++++---- .../src/core_arch/x86/tests/avx2.rs | 120 +----------------- .../src/core_arch/x86/tests/mod.rs | 69 +++++++++- testable-simd-models/src/helpers.rs | 12 ++ 4 files changed, 158 insertions(+), 148 deletions(-) diff --git a/testable-simd-models/src/core_arch/x86/tests/avx.rs b/testable-simd-models/src/core_arch/x86/tests/avx.rs index cb1bec8d0ae95..02b1d81173ad0 100644 --- a/testable-simd-models/src/core_arch/x86/tests/avx.rs +++ b/testable-simd-models/src/core_arch/x86/tests/avx.rs @@ -3,6 +3,12 @@ use super::upstream; use crate::abstractions::bitvec::BitVec; use crate::helpers::test::HasRandom; +macro_rules! assert_feq { + ($lhs:expr, $rhs:expr) => { + assert!(($lhs.is_nan() && $rhs.is_nan()) || $lhs == $rhs) + }; +} + /// Derives tests for a given intrinsics. Test that a given intrinsics and its model compute the same thing over random values (1000 by default). macro_rules! mk { ($([$N:literal])?$name:ident$({$(<$($c:literal),*>),*})?($($x:ident : $ty:ident),*)) => { @@ -48,6 +54,19 @@ fn _mm256_movemask_ps() { } } +#[test] +fn _mm256_movemask_pd() { + let n = 1000; + + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx::_mm256_movemask_pd(a.into()), + unsafe { upstream::_mm256_movemask_pd(a.into()) } + ); + } +} + #[test] fn _mm256_testz_si256() { let n = 1000; @@ -62,6 +81,59 @@ fn _mm256_testz_si256() { } } +#[test] +fn _mm256_testc_si256() { + let n = 1000; + + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + let b: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx::_mm256_testc_si256(a.into(), b.into()), + unsafe { upstream::_mm256_testc_si256(a.into(), b.into()) } + ); + } +} + +#[test] +fn _mm256_cvtsd_f64() { + let n = 1000; + + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_feq!( + super::super::models::avx::_mm256_cvtsd_f64(a.into()), + unsafe { upstream::_mm256_cvtsd_f64(a.into()) } + ); + } +} + +#[test] +fn _mm256_cvtsi256_si32() { + let n = 1000; + + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx::_mm256_cvtsi256_si32(a.into()), + unsafe { upstream::_mm256_cvtsi256_si32(a.into()) } + ); + } +} + +#[test] +fn _mm256_cvtss_f32() { + let n = 1000; + + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_feq!( + super::super::models::avx::_mm256_cvtss_f32(a.into()), + unsafe { upstream::_mm256_cvtss_f32(a.into()) } + ); + } +} + mk!(_mm256_setzero_ps()); mk!(_mm256_setzero_si256()); mk!(_mm256_set_epi8( @@ -130,6 +202,14 @@ mk!(_mm256_set_epi64x(a: i64, b: i64, c: i64, d: i64)); mk!(_mm256_set1_epi8(a: i8)); mk!(_mm256_set1_epi16(a: i16)); mk!(_mm256_set1_epi32(a: i32)); +mk!(_mm256_set1_epi64x(a: i64)); +mk!(_mm256_set_pd(a: f64, b: f64, c: f64, d: f64)); +mk!(_mm256_set_ps(a: f32, b: f32, c: f32, d: f32, e: f32, f: f32, g: f32, h: f32)); +mk!(_mm256_setr_pd(a: f64, b: f64, c: f64, d: f64)); +mk!(_mm256_setr_ps(a: f32, b: f32, c: f32, d: f32, e: f32, f: f32, g: f32, h: f32)); +mk!(_mm256_setr_epi64x(a: i64, b: i64, c: i64, d: i64)); +mk!(_mm256_set1_pd(a: f64)); +mk!(_mm256_set1_ps(a: f32)); mk!(_mm256_and_pd(a: __m256d, b: __m256d)); mk!(_mm256_and_ps(a: __m256, b: __m256)); @@ -138,18 +218,12 @@ mk!(_mm256_or_ps(a: __m256, b: __m256)); mk!(_mm256_andnot_pd(a: __m256d, b: __m256d)); 
mk!(_mm256_andnot_ps(a: __m256, b: __m256)); mk!(_mm256_blendv_pd(a: __m256d, b: __m256d, c: __m256d)); -mk!(_mm256_blendv_ps(a: __m256, b: __m256, c: __m256)); mk!(_mm256_xor_pd(a: __m256d, b: __m256d)); mk!(_mm256_xor_ps(a: __m256, b: __m256)); mk!(_mm256_cvtepi32_pd(a: __m128i)); mk!(_mm256_cvtepi32_ps(a: __m256i)); mk!(_mm256_cvtpd_ps(a: __m256d)); mk!(_mm256_cvtps_pd(a: __m128)); -mk!(_mm256_cvtsd_f64(a: __m256d)); -mk!(_mm256_cvtsi256_si32(a: __m256i)); -mk!(_mm256_broadcast_ss(f: &f32)); -mk!(_mm256_broadcast_ps(a: &__m128)); -mk!(_mm256_broadcast_pd(a: &__m128d)); mk!(_mm256_movehdup_ps(a: __m256)); mk!(_mm256_moveldup_ps(a: __m256)); mk!(_mm256_movedup_pd(a: __m256d)); @@ -157,25 +231,7 @@ mk!(_mm256_unpackhi_pd(a: __m256d, b: __m256d)); mk!(_mm256_unpackhi_ps(a: __m256, b: __m256)); mk!(_mm256_unpacklo_pd(a: __m256d, b: __m256d)); mk!(_mm256_unpacklo_ps(a: __m256, b: __m256)); -mk!(_mm256_testz_si256(a: __m256i, b: __m256i)); -mk!(_mm256_testc_si256(a: __m256i, b: __m256i)); -mk!(_mm256_movemask_pd(a: __m256d)); -mk!(_mm256_movemask_ps(a: __m256)); mk!(_mm256_setzero_pd()); -mk!(_mm256_setzero_ps()); -mk!(_mm256_setzero_si256()); -mk!(_mm256_set_pd(a: f64, b: f64, c: f64, d: f64)); -mk!(_mm256_set_ps(a: f32, b: f32, c: f32, d: f32, e: f32, f: f32, g: f32, h: f32)); -mk!(_mm256_set_epi64x(a: i64, b: i64, c: i64, d: i64)); -mk!(_mm256_setr_pd(a: f64, b: f64, c: f64, d: f64)); -mk!(_mm256_setr_ps(a: f32, b: f32, c: f32, d: f32, e: f32, f: f32, g: f32, h: f32)); -mk!(_mm256_setr_epi64x(a: i64, b: i64, c: i64, d: i64)); -mk!(_mm256_set1_pd(a: f64)); -mk!(_mm256_set1_ps(a: f32)); -mk!(_mm256_set1_epi8(a: i8)); -mk!(_mm256_set1_epi16(a: i16)); -mk!(_mm256_set1_epi32(a: i32)); -mk!(_mm256_set1_epi64x(a: i64)); mk!(_mm256_castpd_ps(a: __m256d)); mk!(_mm256_castps_pd(a: __m256)); mk!(_mm256_castps_si256(a: __m256)); @@ -200,4 +256,3 @@ mk!(_mm256_set_m128i(hi: __m128i, lo: __m128i)); mk!(_mm256_setr_m128(lo: __m128, hi: __m128)); mk!(_mm256_setr_m128d(lo: __m128d, hi: __m128d)); mk!(_mm256_setr_m128i(lo: __m128i, hi: __m128i)); -mk!(_mm256_cvtss_f32(a: __m256)); diff --git a/testable-simd-models/src/core_arch/x86/tests/avx2.rs b/testable-simd-models/src/core_arch/x86/tests/avx2.rs index 2f776fc583629..dcabcbb58b1e0 100644 --- a/testable-simd-models/src/core_arch/x86/tests/avx2.rs +++ b/testable-simd-models/src/core_arch/x86/tests/avx2.rs @@ -1,7 +1,7 @@ +use super::types::*; use super::upstream; use crate::abstractions::bitvec::BitVec; use crate::helpers::test::HasRandom; -use super::types::*; /// Derives tests for a given intrinsics. Test that a given intrinsics and its model compute the same thing over random values (1000 by default). macro_rules! 
mk { @@ -81,7 +81,6 @@ mk!(_mm256_cvtepu8_epi64(a: BitVec)); mk!(_mm256_extracti128_si256{<0>,<1>}(a: BitVec)); mk!(_mm256_hadd_epi16(a: BitVec, b: BitVec)); mk!(_mm256_hadd_epi32(a: BitVec, b: BitVec)); -mk!(_mm256_hadds_epi16(a: BitVec, b: BitVec)); mk!(_mm256_hsub_epi16(a: BitVec, b: BitVec)); mk!(_mm256_hsub_epi32(a: BitVec, b: BitVec)); mk!(_mm256_hsubs_epi16(a: BitVec, b: BitVec)); @@ -183,6 +182,7 @@ mk!(_mm256_unpacklo_epi32(a: BitVec, b: BitVec)); mk!(_mm256_unpackhi_epi64(a: BitVec, b: BitVec)); mk!(_mm256_unpacklo_epi64(a: BitVec, b: BitVec)); mk!(_mm256_xor_si256(a: BitVec, b: BitVec)); + #[test] fn _mm256_extract_epi8() { let n = 100; @@ -531,127 +531,11 @@ fn _mm256_extract_epi16() { } } -mk!(_mm256_abs_epi32(a: __m256i)); -mk!(_mm256_abs_epi16(a: __m256i)); -mk!(_mm256_abs_epi8(a: __m256i)); -mk!(_mm256_add_epi64(a: __m256i, b: __m256i)); -mk!(_mm256_add_epi32(a: __m256i, b: __m256i)); -mk!(_mm256_add_epi16(a: __m256i, b: __m256i)); -mk!(_mm256_add_epi8(a: __m256i, b: __m256i)); -mk!(_mm256_adds_epi8(a: __m256i, b: __m256i)); -mk!(_mm256_adds_epi16(a: __m256i, b: __m256i)); -mk!(_mm256_adds_epu8(a: __m256i, b: __m256i)); -mk!(_mm256_adds_epu16(a: __m256i, b: __m256i)); mk!(_mm256_and_si256(a: __m256i, b: __m256i)); mk!(_mm256_andnot_si256(a: __m256i, b: __m256i)); mk!(_mm256_avg_epu16(a: __m256i, b: __m256i)); mk!(_mm256_avg_epu8(a: __m256i, b: __m256i)); -mk!(_mm256_blendv_epi8(a: __m256i, b: __m256i, mask: __m256i)); -mk!(_mm_broadcastb_epi8(a: __m128i)); -mk!(_mm256_broadcastb_epi8(a: __m128i)); -mk!(_mm_broadcastd_epi32(a: __m128i)); -mk!(_mm256_broadcastd_epi32(a: __m128i)); -mk!(_mm_broadcastq_epi64(a: __m128i)); -mk!(_mm256_broadcastq_epi64(a: __m128i)); mk!(_mm_broadcastsd_pd(a: __m128d)); mk!(_mm256_broadcastsd_pd(a: __m128d)); -mk!(_mm_broadcastsi128_si256(a: __m128i)); -mk!(_mm256_broadcastsi128_si256(a: __m128i)); mk!(_mm_broadcastss_ps(a: __m128)); mk!(_mm256_broadcastss_ps(a: __m128)); -mk!(_mm_broadcastw_epi16(a: __m128i)); -mk!(_mm256_broadcastw_epi16(a: __m128i)); -mk!(_mm256_cmpeq_epi64(a: __m256i, b: __m256i)); -mk!(_mm256_cmpeq_epi32(a: __m256i, b: __m256i)); -mk!(_mm256_cmpeq_epi16(a: __m256i, b: __m256i)); -mk!(_mm256_cmpeq_epi8(a: __m256i, b: __m256i)); -mk!(_mm256_cmpgt_epi64(a: __m256i, b: __m256i)); -mk!(_mm256_cmpgt_epi32(a: __m256i, b: __m256i)); -mk!(_mm256_cmpgt_epi16(a: __m256i, b: __m256i)); -mk!(_mm256_cmpgt_epi8(a: __m256i, b: __m256i)); -mk!(_mm256_cvtepi16_epi32(a: __m128i)); -mk!(_mm256_cvtepi16_epi64(a: __m128i)); -mk!(_mm256_cvtepi32_epi64(a: __m128i)); -mk!(_mm256_cvtepi8_epi16(a: __m128i)); -mk!(_mm256_cvtepi8_epi32(a: __m128i)); -mk!(_mm256_cvtepi8_epi64(a: __m128i)); -mk!(_mm256_cvtepu16_epi32(a: __m128i)); -mk!(_mm256_cvtepu16_epi64(a: __m128i)); -mk!(_mm256_cvtepu32_epi64(a: __m128i)); -mk!(_mm256_cvtepu8_epi16(a: __m128i)); -mk!(_mm256_cvtepu8_epi32(a: __m128i)); -mk!(_mm256_cvtepu8_epi64(a: __m128i)); -mk!(_mm256_hadd_epi16(a: __m256i, b: __m256i)); -mk!(_mm256_hadd_epi32(a: __m256i, b: __m256i)); -mk!(_mm256_hadds_epi16(a: __m256i, b: __m256i)); -mk!(_mm256_hsub_epi16(a: __m256i, b: __m256i)); -mk!(_mm256_hsub_epi32(a: __m256i, b: __m256i)); -mk!(_mm256_hsubs_epi16(a: __m256i, b: __m256i)); -mk!(_mm256_madd_epi16(a: __m256i, b: __m256i)); -mk!(_mm256_maddubs_epi16(a: __m256i, b: __m256i)); -mk!(_mm256_max_epi16(a: __m256i, b: __m256i)); -mk!(_mm256_max_epi32(a: __m256i, b: __m256i)); -mk!(_mm256_max_epi8(a: __m256i, b: __m256i)); -mk!(_mm256_max_epu16(a: __m256i, b: __m256i)); -mk!(_mm256_max_epu32(a: __m256i, b: __m256i)); 
-mk!(_mm256_max_epu8(a: __m256i, b: __m256i)); -mk!(_mm256_min_epi16(a: __m256i, b: __m256i)); -mk!(_mm256_min_epi32(a: __m256i, b: __m256i)); -mk!(_mm256_min_epi8(a: __m256i, b: __m256i)); -mk!(_mm256_min_epu16(a: __m256i, b: __m256i)); -mk!(_mm256_min_epu32(a: __m256i, b: __m256i)); -mk!(_mm256_min_epu8(a: __m256i, b: __m256i)); -mk!(_mm256_movemask_epi8(a: __m256i)); -mk!(_mm256_mul_epi32(a: __m256i, b: __m256i)); -mk!(_mm256_mul_epu32(a: __m256i, b: __m256i)); -mk!(_mm256_mulhi_epi16(a: __m256i, b: __m256i)); -mk!(_mm256_mulhi_epu16(a: __m256i, b: __m256i)); -mk!(_mm256_mullo_epi16(a: __m256i, b: __m256i)); -mk!(_mm256_mullo_epi32(a: __m256i, b: __m256i)); -mk!(_mm256_mulhrs_epi16(a: __m256i, b: __m256i)); -mk!(_mm256_or_si256(a: __m256i, b: __m256i)); -mk!(_mm256_packs_epi16(a: __m256i, b: __m256i)); -mk!(_mm256_packs_epi32(a: __m256i, b: __m256i)); -mk!(_mm256_packus_epi16(a: __m256i, b: __m256i)); -mk!(_mm256_packus_epi32(a: __m256i, b: __m256i)); -mk!(_mm256_permutevar8x32_epi32(a: __m256i, b: __m256i)); -mk!(_mm256_sad_epu8(a: __m256i, b: __m256i)); -mk!(_mm256_shuffle_epi8(a: __m256i, b: __m256i)); -mk!(_mm256_sign_epi16(a: __m256i, b: __m256i)); -mk!(_mm256_sign_epi32(a: __m256i, b: __m256i)); -mk!(_mm256_sign_epi8(a: __m256i, b: __m256i)); -mk!(_mm256_sll_epi16(a: __m256i, count: __m128i)); -mk!(_mm256_sll_epi32(a: __m256i, count: __m128i)); -mk!(_mm256_sll_epi64(a: __m256i, count: __m128i)); -mk!(_mm_sllv_epi32(a: __m128i, count: __m128i)); -mk!(_mm256_sllv_epi32(a: __m256i, count: __m256i)); -mk!(_mm_sllv_epi64(a: __m128i, count: __m128i)); -mk!(_mm256_sllv_epi64(a: __m256i, count: __m256i)); -mk!(_mm256_sra_epi16(a: __m256i, count: __m128i)); -mk!(_mm256_sra_epi32(a: __m256i, count: __m128i)); -mk!(_mm_srav_epi32(a: __m128i, count: __m128i)); -mk!(_mm256_srav_epi32(a: __m256i, count: __m256i)); -mk!(_mm256_srl_epi16(a: __m256i, count: __m128i)); -mk!(_mm256_srl_epi32(a: __m256i, count: __m128i)); -mk!(_mm256_srl_epi64(a: __m256i, count: __m128i)); -mk!(_mm_srlv_epi32(a: __m128i, count: __m128i)); -mk!(_mm256_srlv_epi32(a: __m256i, count: __m256i)); -mk!(_mm_srlv_epi64(a: __m128i, count: __m128i)); -mk!(_mm256_srlv_epi64(a: __m256i, count: __m256i)); -mk!(_mm256_sub_epi16(a: __m256i, b: __m256i)); -mk!(_mm256_sub_epi32(a: __m256i, b: __m256i)); -mk!(_mm256_sub_epi64(a: __m256i, b: __m256i)); -mk!(_mm256_sub_epi8(a: __m256i, b: __m256i)); -mk!(_mm256_subs_epi16(a: __m256i, b: __m256i)); -mk!(_mm256_subs_epi8(a: __m256i, b: __m256i)); -mk!(_mm256_subs_epu16(a: __m256i, b: __m256i)); -mk!(_mm256_subs_epu8(a: __m256i, b: __m256i)); -mk!(_mm256_unpackhi_epi8(a: __m256i, b: __m256i)); -mk!(_mm256_unpacklo_epi8(a: __m256i, b: __m256i)); -mk!(_mm256_unpackhi_epi16(a: __m256i, b: __m256i)); -mk!(_mm256_unpacklo_epi16(a: __m256i, b: __m256i)); -mk!(_mm256_unpackhi_epi32(a: __m256i, b: __m256i)); -mk!(_mm256_unpacklo_epi32(a: __m256i, b: __m256i)); -mk!(_mm256_unpackhi_epi64(a: __m256i, b: __m256i)); -mk!(_mm256_unpacklo_epi64(a: __m256i, b: __m256i)); -mk!(_mm256_xor_si256(a: __m256i, b: __m256i)); diff --git a/testable-simd-models/src/core_arch/x86/tests/mod.rs b/testable-simd-models/src/core_arch/x86/tests/mod.rs index b5a0c3a449715..217ff55623dbf 100644 --- a/testable-simd-models/src/core_arch/x86/tests/mod.rs +++ b/testable-simd-models/src/core_arch/x86/tests/mod.rs @@ -14,9 +14,9 @@ //! //! For example, some valid invocations are //! -//! `mk!([100]_mm256_extracti128_si256{<0>,<1>}(a: BitVec));` -//! `mk!(_mm256_extracti128_si256{<0>,<1>}(a: BitVec));` -//! 
`mk!(_mm256_abs_epi16(a: BitVec));` +//! `mk!([100]_mm256_extracti128_si256{<0>,<1>}(a: __m256i));` +//! `mk!(_mm256_extracti128_si256{<0>,<1>}(a: __m256i));` +//! `mk!(_mm256_abs_epi16(a: __m256i));` //! //! The number of random tests is optional. If not provided, it is taken to be 1000 by default. //! The const values are necessary if the function has constant arguments, but should be discarded if not. @@ -45,6 +45,12 @@ pub(crate) mod types { pub type __m256 = BitVec<256>; #[allow(non_camel_case_types)] pub type __m128i = BitVec<128>; + #[allow(non_camel_case_types)] + pub type __m256d = BitVec<256>; + #[allow(non_camel_case_types)] + pub type __m128 = BitVec<128>; + #[allow(non_camel_case_types)] + pub type __m128d = BitVec<128>; } pub(crate) mod upstream { @@ -56,8 +62,10 @@ pub(crate) mod upstream { mod conversions { use super::upstream::{ - __m128i, __m256, __m256i, _mm256_castps_si256, _mm256_castsi256_ps, _mm256_loadu_si256, - _mm256_storeu_si256, _mm_loadu_si128, _mm_storeu_si128, + __m128, __m128d, __m128i, __m256, __m256d, __m256i, _mm256_castpd_si256, + _mm256_castps_si256, _mm256_castsi256_pd, _mm256_castsi256_ps, _mm256_loadu_si256, + _mm256_storeu_si256, _mm_castpd_si128, _mm_castps_si128, _mm_castsi128_pd, + _mm_castsi128_ps, _mm_loadu_si128, _mm_storeu_si128, }; use super::BitVec; @@ -81,6 +89,27 @@ mod conversions { } } + impl From> for __m128 { + fn from(bv: BitVec<128>) -> __m128 { + let slice: &[u8] = &bv.to_vec()[..]; + unsafe { _mm_castsi128_ps(_mm_loadu_si128(slice.as_ptr() as *const __m128i)) } + } + } + + impl From> for __m128d { + fn from(bv: BitVec<128>) -> __m128d { + let slice: &[u8] = &bv.to_vec()[..]; + unsafe { _mm_castsi128_pd(_mm_loadu_si128(slice.as_ptr() as *const __m128i)) } + } + } + + impl From> for __m256d { + fn from(bv: BitVec<256>) -> __m256d { + let bv: &[u8] = &bv.to_vec()[..]; + unsafe { _mm256_castsi256_pd(_mm256_loadu_si256(bv.as_ptr() as *const _)) } + } + } + impl From<__m256i> for BitVec<256> { fn from(vec: __m256i) -> BitVec<256> { let mut v = [0u8; 32]; @@ -101,6 +130,16 @@ mod conversions { } } + impl From<__m256d> for BitVec<256> { + fn from(vec: __m256d) -> BitVec<256> { + let mut v = [0u8; 32]; + unsafe { + _mm256_storeu_si256(v.as_mut_ptr() as *mut _, _mm256_castpd_si256(vec)); + } + BitVec::from_slice(&v[..], 8) + } + } + impl From<__m128i> for BitVec<128> { fn from(vec: __m128i) -> BitVec<128> { let mut v = [0u8; 16]; @@ -110,4 +149,24 @@ mod conversions { BitVec::from_slice(&v[..], 8) } } + + impl From<__m128> for BitVec<128> { + fn from(vec: __m128) -> BitVec<128> { + let mut v = [0u8; 16]; + unsafe { + _mm_storeu_si128(v.as_mut_ptr() as *mut _, _mm_castps_si128(vec)); + } + BitVec::from_slice(&v[..], 8) + } + } + + impl From<__m128d> for BitVec<128> { + fn from(vec: __m128d) -> BitVec<128> { + let mut v = [0u8; 16]; + unsafe { + _mm_storeu_si128(v.as_mut_ptr() as *mut _, _mm_castpd_si128(vec)); + } + BitVec::from_slice(&v[..], 8) + } + } } diff --git a/testable-simd-models/src/helpers.rs b/testable-simd-models/src/helpers.rs index c2ea42b4b7ed6..1a30bf251a877 100644 --- a/testable-simd-models/src/helpers.rs +++ b/testable-simd-models/src/helpers.rs @@ -33,6 +33,18 @@ pub mod test { } } + impl HasRandom for f32 { + fn random() -> Self { + u32::random() as f32 + } + } + + impl HasRandom for f64 { + fn random() -> Self { + u64::random() as f64 + } + } + impl HasRandom for Bit { fn random() -> Self { crate::abstractions::bit::Bit::from(bool::random()) From d7b90252271374145f8f531a5692a5ed1bdb8a3e Mon Sep 17 00:00:00 2001 From: 
karthikbhargavan Date: Fri, 1 Aug 2025 14:23:38 +0200 Subject: [PATCH 46/47] readme edtis --- testable-simd-models/README.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/testable-simd-models/README.md b/testable-simd-models/README.md index b68343fb2886c..61505e96b2960 100644 --- a/testable-simd-models/README.md +++ b/testable-simd-models/README.md @@ -190,21 +190,21 @@ similar to the upstream code in `core::arch`. and that generated by the intrinsic in upstream `core::arch`. A valid test of the intrinsic above looks like this. ```rust mk!([100]_mm256_bsrli_epi128{<0>,<1>,<2>,<3>,...,<255>}(a: BitVec)); - // i. ii. iii. iv. ``` The macro invocation has four parts. - 1. By default the macro tests for a thousand randomly generated inputs. If needed, this can be modified, such as here, where the `[100]` is used so that + 1. `mk!([100]...`: By default the macro tests for a thousand randomly generated inputs. If needed, this can be modified, such as here, where the `[100]` is used so that only 100 inputs are generated. - 2. This is the name of the intrinsic being tested, and is necessary in all cases. - 3. This is relevant because of the constant `IMM8` argument used by the intrinsic. We can gleam from the name the constant argument is supposed - to be at most 8 bits wide. We can confirm this by looking at the `core::arch` implementation, and spotting the `static_assert_uimm_bits!(IMM8, 8);` - line, which asserts the constant argument fits in at most 8 bits. Thus, we add `{<0>,<1>,<2>,<3>,...,<255>}` to test for each possible constant - value of the constant argument. If the intrinsic does not have a constant argument, then this part must be ignored. - 4. This signifies the arguments that the intrinsics take, and is also necessary. + 2. `_mm256_bsrli_epi128`: This is the name of the intrinsic being tested, and is necessary in all cases. + 3. `{<0>,<1>,<2>,<3>,...,<255>}`: This part only appears when the intrinsic has a const generic argument, like the `IMM8` in this intrinsic. + As the name indicates, this constant argument is supposed to be at most 8 bits wide. + We can confirm this by looking at the implementation, and spotting the `static_assert_uimm_bits!(IMM8, 8);` + line, which asserts that constant argument is positive and fits in 8 bits. Thus, we add `{<0>,<1>,<2>,<3>,...,<255>}` to test for each possible constant + value of the constant argument. + 4. `(a: BitVec)`: This part contains all the arguments of the intrinsic and their types. This surmises the steps needed to use the `mk!` macro to generate a test. There is a caveat however. In the case that the output of an intrinsic is _not_ - a bit-vector (and is instead say, an integer like `i32`), then the macro will not work, and a manual test has to be written. - + a bit-vector (and is instead say, an integer like `i32`), then the macro will not work, and a manual test has to be written. You can see examples in the test files. 
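   For example, for an intrinsic such as `_mm256_movemask_pd`, whose result is an `i32` rather than a bit-vector, the hand-written test in `core_arch/x86/tests/avx.rs` follows the same shape that `mk!` expands to: draw random inputs, run both the model and the upstream intrinsic, and compare the results. A minimal sketch of such a test looks like this:
   ```rust
   // Manual test for an intrinsic whose result is a scalar (`i32`), so `mk!` cannot be used.
   // Random 256-bit inputs are fed to both the model and the upstream intrinsic,
   // and the two scalar results are compared directly.
   #[test]
   fn _mm256_movemask_pd() {
       let n = 1000;
       for _ in 0..n {
           let a: BitVec<256> = BitVec::random();
           assert_eq!(
               super::super::models::avx::_mm256_movemask_pd(a.into()),
               unsafe { upstream::_mm256_movemask_pd(a.into()) }
           );
       }
   }
   ```
   For intrinsics that return a floating-point scalar (such as `_mm256_cvtsd_f64`), the tests in the same file use an `assert_feq!` helper instead of `assert_eq!`, so that two `NaN` results are treated as equal.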
+ ## Contributing Models From 23339d963dde3033fc24afd0e1a6ca05378c66af Mon Sep 17 00:00:00 2001 From: karthikbhargavan Date: Fri, 1 Aug 2025 08:30:33 -0400 Subject: [PATCH 47/47] README text --- testable-simd-models/README.md | 38 +++++++++++++++++----------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/testable-simd-models/README.md b/testable-simd-models/README.md index 61505e96b2960..470c51072c8e5 100644 --- a/testable-simd-models/README.md +++ b/testable-simd-models/README.md @@ -6,7 +6,7 @@ The structure of this crate is based on [rust-lang/stdarch/crates/core_arch](htt ## Code Structure Within the `core_arch` folder in this crate, there is a different -folder for each architecture for which we have wrtten models. +folder for each architecture for which we have written models. In particular, it contains folders for `x86` and `arm_shared`. Each such folder has 2 sub-folders: `models` and `tests`. @@ -18,18 +18,18 @@ abstractions implemented in `abstractions`, especially those in resemble their implementations within the Rust core itself. The `tests` folder contains the tests of these models, and is -structured the same way as `models`. Each file additionally contains +structured the same way as `models`. Each file additionally includes the definition of a macro that makes writing these tests easier. The tests work by testing the models against the intrinsics in the Rust core, trying out random inputs (generally 1000), and comparing their outputs. -All tests can run by executing `cargo test` and we expect this to be +All tests can be run by executing `cargo test`, and we expect this to be run as part of CI. ## Modeling a SIMD Intrinsic -There are three kinds of SIMD intrinsics we find in `core::arch`. +There are three kinds of SIMD intrinsics in `core::arch`. The first kind are builtin Rust compiler intrinsics, some of which are in the [`intrinsics/simd.rs` file](https://github.com/model-checking/verify-rust-std/blob/main/library/core/src/intrinsics/simd.rs) @@ -42,7 +42,7 @@ of `extern` intrinsics used in the Intel x86 AVX2 library. These extern intrinsics are typically platform-specific functions that map to low-level instructions. The third kind are `defined` intrinsics that are given proper definitions in Rust, and their code may -depend on the builtin intrinsics or the extern intrinsics. There defined intrinsics represent higher-level +depend on the builtin intrinsics or the extern intrinsics. These defined intrinsics represent higher-level operations that are wrappers around one or more assembly instructions. ### Modeling builtin intrinsics manually @@ -70,7 +70,7 @@ pub fn simd_add( ``` Notably, we model a strongly typed version of `simd_add`, in contrast to the compiler -intrinsic which is too generic and unimplementable in safe Rust: +intrinsic, which is too generic and unimplementable in safe Rust: ```rust /// Adds two simd vectors elementwise. @@ -81,8 +81,8 @@ intrinsic which is too generic and unimplementable in safe Rust: pub unsafe fn simd_add(x: T, y: T) -> T; ``` -The main rules for writing these models is that they should be simple and self-contained, -relying only on the libraries in `abstractions` or on builtin Rust language features or on +The main rules for writing these models are that they should be simple and self-contained, +relying only on the libraries in `abstractions`, on builtin Rust language features, or other testable models. 
@@ -98,7 +98,7 @@ the extern intrinsics used in `avx2.rs` can be found in `avx2_handwritten.rs`.
 Modeling extern intrinsics is similar to modeling the builtin ones, in
 that the models are written by hand and treat the SIMD vectors as
 arrays of machine integers. The main difference is that these intrinsics
-are platform specific and so their modeling requires looking at the Intel or ARM
+are platform-specific and so their modeling requires looking at the Intel or ARM
 documentation for the underlying operation.
 
 For example, the extern intrinsic `phaddw` used in `avx2` corresponds to an
@@ -125,8 +125,8 @@ pub fn phaddw(a: i16x16, b: i16x16) -> i16x16 {
 
 ### Modeling defined intrinsics semi-automatically
 
-To model the third category of intrinsics, we copy the Rust code of
-the intrinsic and adapt it to use our underlying abstractions. The
+To model a defined intrinsic, we essentially copy the Rust code of
+the intrinsic from `core::arch` and adapt it to use our underlying abstractions. The
 changes needed to the code are sometimes scriptable, and indeed most
 of our models were generated from a script, but some changes are
 still needed by hand.
@@ -175,12 +175,12 @@ pub fn _mm256_bsrli_epi128(a: __m256i) -> __m256i {
 }
 ```
 
-Thus, we then go to to `core_arch/x86/models/avx2.rs`, and add this implementation.
+Thus, we then go to `core_arch/x86/models/avx2.rs`, and add this implementation.
 The only change it requires here is that the `simd_shuffle` macro is a
 function in our model, and we discard all the function attributes.
 
-For other intrinsics, sometimes we need to make more changes. Since our model of the builtin intrinsics
-are more precise with respect to the type of their arguments compared to their Rust counterparts, we
+For other intrinsics, we sometimes need to make more changes. Since our model of the builtin intrinsics
+is more precise concerning the type of their arguments compared to their Rust counterparts, we
 sometimes need to add more type annotations in our defined models. We also
 remove all `unsafe` guards, since our models are always in safe Rust.
 Otherwise, our code for the defined intrinsics looks very
@@ -192,18 +192,18 @@ similar to the upstream code in `core::arch`.
    mk!([100]_mm256_bsrli_epi128{<0>,<1>,<2>,<3>,...,<255>}(a: BitVec));
    ```
    The macro invocation has four parts.
-     1. `mk!([100]...`: By default the macro tests for a thousand randomly generated inputs. If needed, this can be modified, such as here, where the `[100]` is used so that
+     1. `mk!([100]...`: By default, the macro tests for a thousand randomly generated inputs. If needed, this can be modified, such as here, where the `[100]` is used so that
        only 100 inputs are generated.
     2. `_mm256_bsrli_epi128`: This is the name of the intrinsic being tested, and is necessary in all cases.
     3. `{<0>,<1>,<2>,<3>,...,<255>}`: This part only appears when the intrinsic has a const generic argument, like the `IMM8` in this intrinsic.
        As the name indicates, this constant argument is supposed to be at most 8 bits wide.
-        We can confirm this by looking at the implementation, and spotting the `static_assert_uimm_bits!(IMM8, 8);`
+        We can confirm this by looking at the implementation and spotting the `static_assert_uimm_bits!(IMM8, 8);`
        line, which asserts that the constant argument is positive and fits in 8 bits. Thus, we add `{<0>,<1>,<2>,<3>,...,<255>}` to test for each possible constant
        value of the constant argument.
     4. `(a: BitVec)`: This part contains all the arguments of the intrinsic and their types.
 
-   This surmises the steps needed to use the `mk!` macro to generate a test. There is a caveat however. In the case that the output of an intrinsic is _not_
-   a bit-vector (and is instead say, an integer like `i32`), then the macro will not work, and a manual test has to be written. You can see examples in the test files.
+   This summarizes the steps needed to use the `mk!` macro to generate a test. There is a caveat: in the case that the output of an intrinsic is _not_
+   a bit-vector (and is instead, say, an integer like `i32`), then the macro will not work, and a manual test has to be written. You can see examples in the test files.
 
 
 
@@ -211,7 +211,7 @@ similar to the upstream code in `core::arch`.
 
 To contribute new models of intrinsics, we expect the author to follow
 the above steps and provide comprehensive tests. It is important that
-the model author look carefully at both the Intel/ARM specification
+the model author looks carefully at both the Intel/ARM specifications
 and the Rust `stdarch` implementation, because they may look quite
 different from each other.
 
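
As a final illustration of the adaptation step described in this patch (the `simd_shuffle` macro becoming an ordinary function in the model), a lane shuffle over plain arrays can be sketched as follows. The name `shuffle4`, the lane count, and the element type are assumptions made for this sketch only; in the real model the intrinsic code passes the indices as a compile-time constant rather than a runtime argument.

```rust
// Illustrative stand-in for a shuffle written as a plain function: pick lanes
// from the concatenation of `a` and `b` according to an index list.
// Names and types are assumptions for this sketch, not the crate's actual API.
fn shuffle4(a: [i32; 4], b: [i32; 4], idx: [usize; 4]) -> [i32; 4] {
    let concat = [a[0], a[1], a[2], a[3], b[0], b[1], b[2], b[3]];
    [concat[idx[0]], concat[idx[1]], concat[idx[2]], concat[idx[3]]]
}

#[test]
fn shuffle4_selects_lanes_from_both_inputs() {
    // Indices 0..=3 pick lanes of `a`, indices 4..=7 pick lanes of `b`.
    assert_eq!(
        shuffle4([1, 2, 3, 4], [5, 6, 7, 8], [0, 4, 1, 5]),
        [1, 5, 2, 6]
    );
}
```

Writing the shuffle as a plain function keeps the adapted intrinsic bodies in safe Rust, which matches the rule above that the models drop all `unsafe` guards.
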