cmov: impl optimized Cmov for [u8; N] (#1350)

tarcieri · web-flow · commit 4bb07d1c4352 · 2026-01-15T14:37:02.000-07:00
Adds a specialized impl of `Cmov` for byte arrays of generic size which
first coalesces elements of the array into word-sized chunks, then calls
`Cmov` on those.

This should result in significantly more efficient codegen, which can
also take advantage of compile-time knowledge of `N` to potentially
unroll loops.

Unfortunately without specialization this means we can't impl `Cmov` for
other types of arrays, but downstream consumers can just iterate over
them and call `Cmov::cmov*` on each element easily enough, whereas this
optimized implementation for byte arrays actually provides something a
lot less trivial than looping over an array.
diff --git a/.gitignore b/.gitignore
@@ -1,2 +1 @@
 target
-**/*proptest-regressions
diff --git a/cmov/src/lib.rs b/cmov/src/lib.rs
@@ -196,6 +196,51 @@ macro_rules! impl_cmov_traits_for_signed_ints {
 
 impl_cmov_traits_for_signed_ints!(i8 => u8, i16 => u16, i32 => u32, i64 => u64, i128 => u128);
 
+/// Optimized implementation for byte arrays which coalesces them into word-sized chunks first,
+/// then performs [`Cmov`] at the word-level to cut down on the total number of instructions.
+///
+/// With compile-time knowledge of `N`, the compiler should also be able to unroll the loops in
+/// cases where efficiency would benefit, reducing the implementation to a sequence of word-sized
+/// [`Cmov`] ops (and if `N` isn't a multiple of the word size, followed by 1-byte ops).
+impl<const N: usize> Cmov for [u8; N] {
+    #[inline]
+    fn cmovnz(&mut self, value: &Self, condition: Condition) {
+        // Uses 64-bit words on 64-bit targets, 32-bit everywhere else
+        #[cfg(not(target_pointer_width = "64"))]
+        type Chunk = u32;
+        #[cfg(target_pointer_width = "64")]
+        type Chunk = u64;
+        const CHUNK_SIZE: usize = size_of::<Chunk>();
+
+        // Load a native-endian `Chunk` from a byte slice that is exactly `CHUNK_SIZE` bytes long
+        // TODO(tarcieri): use `array_chunks` when stable (rust-lang/rust#100450)
+        #[inline]
+        fn load_chunk(slice: &[u8]) -> Chunk {
+            Chunk::from_ne_bytes(slice.try_into().expect("should be the right size"))
+        }
+
+        let mut self_chunks = self.chunks_exact_mut(CHUNK_SIZE);
+        let mut value_chunks = value.chunks_exact(CHUNK_SIZE);
+
+        // Process as much input as we can a `Chunk`-at-a-time.
+        for (self_chunk, value_chunk) in self_chunks.by_ref().zip(value_chunks.by_ref()) {
+            let mut a = load_chunk(self_chunk);
+            let b = load_chunk(value_chunk);
+            a.cmovnz(&b, condition);
+            self_chunk.copy_from_slice(&a.to_ne_bytes());
+        }
+
+        // Process the remainder a byte-at-a-time.
+        for (a, b) in self_chunks
+            .into_remainder()
+            .iter_mut()
+            .zip(value_chunks.remainder().iter())
+        {
+            a.cmovnz(b, condition);
+        }
+    }
+}
+
 impl<T: CmovEq> CmovEq for [T] {
     fn cmoveq(&self, rhs: &Self, input: Condition, output: &mut Condition) {
         let mut tmp = 1u8;
diff --git a/cmov/tests/core_impls.rs b/cmov/tests/core_impls.rs
@@ -122,40 +122,88 @@ int_tests!(
     0x2222_2222_2222_2222_3333_3333_3333_3333u128
 );
 
+mod arrays {
+    use cmov::Cmov;
+
+    // 127 elements: large enough to test the chunk loop, odd-sized to test remainder handling,
+    // and with each element different to ensure the operations actually work
+    const EXAMPLE_A: [u8; 127] = [
+        0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11,
+        0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
+        0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+        0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e,
+        0x3f, 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d,
+        0x4e, 0x4f, 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c,
+        0x5d, 0x5e, 0x5f, 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b,
+        0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a,
+        0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
+    ];
+
+    const EXAMPLE_B: [u8; 127] = [
+        0xff, 0xfe, 0xfd, 0xfc, 0xfb, 0xfa, 0xf9, 0xf8, 0xf7, 0xf6, 0xf5, 0xf4, 0xf3, 0xf2, 0xf1,
+        0xf0, 0xef, 0xee, 0xed, 0xec, 0xeb, 0xea, 0xe9, 0xe8, 0xe7, 0xe6, 0xe5, 0xe4, 0xe3, 0xe2,
+        0xe1, 0xe0, 0xdf, 0xde, 0xdd, 0xdc, 0xdb, 0xda, 0xd9, 0xd8, 0xd7, 0xd6, 0xd5, 0xd4, 0xd3,
+        0xd2, 0xd1, 0xd0, 0xcf, 0xce, 0xcd, 0xcc, 0xcb, 0xca, 0xc9, 0xc8, 0xc7, 0xc6, 0xc5, 0xc4,
+        0xc3, 0xc2, 0xc1, 0xc0, 0xbf, 0xbe, 0xbd, 0xbc, 0xbb, 0xba, 0xb9, 0xb8, 0xb7, 0xb6, 0xb5,
+        0xb4, 0xb3, 0xb2, 0xb1, 0xb0, 0xaf, 0xae, 0xad, 0xac, 0xab, 0xaa, 0xa9, 0xa8, 0xa7, 0xa6,
+        0xa5, 0xa4, 0xa3, 0xa2, 0xa1, 0xa0, 0x9f, 0x9e, 0x9d, 0x9c, 0x9b, 0x9a, 0x99, 0x98, 0x97,
+        0x96, 0x95, 0x94, 0x93, 0x92, 0x91, 0x90, 0x8f, 0x8e, 0x8d, 0x8c, 0x8b, 0x8a, 0x89, 0x88,
+        0x87, 0x86, 0x85, 0x84, 0x83, 0x82, 0x81,
+    ];
+
+    /// Note: we only provide this impl for `[u8; N]` so we have some optimized way of operating
+    /// over byte arrays. Unfortunately without specialization we can't also provide a generalized
+    /// impl, but having good codegen for byte arrays is important.
+    #[test]
+    fn u8_cmovnz_works() {
+        let mut x = EXAMPLE_A;
+        x.cmovnz(&EXAMPLE_B, 0);
+        assert_eq!(x, EXAMPLE_A);
+        // Inclusive range: 255 is also a nonzero condition and must trigger the move.
+        for cond in 1..=u8::MAX {
+            let mut x = EXAMPLE_A;
+            x.cmovnz(&EXAMPLE_B, cond);
+            assert_eq!(x, EXAMPLE_B);
+        }
+    }
+}
+
 mod slices {
     use cmov::CmovEq;
 
     #[test]
     fn cmoveq_works() {
+        let example = [1u8, 2, 3].as_slice();
         let mut o = 0u8;
 
         // Same slices.
-        [1u8, 2, 3].cmoveq(&[1, 2, 3], 43, &mut o);
+        example.cmoveq(example, 43, &mut o);
         assert_eq!(o, 43);
 
         // Different lengths.
-        [1u8, 2, 3].cmoveq(&[1, 2], 44, &mut o);
+        example.cmoveq(&[1, 2], 44, &mut o);
         assert_ne!(o, 44);
 
         // Different contents.
-        [1u8, 2, 3].cmoveq(&[1, 2, 4], 45, &mut o);
+        example.cmoveq(&[1, 2, 4], 45, &mut o);
         assert_ne!(o, 45);
     }
 
     #[test]
     fn cmovne_works() {
+        let example = [1u8, 2, 3].as_slice();
         let mut o = 0u8;
 
         // Same slices.
-        [1u8, 2, 3].cmovne(&[1, 2, 3], 43, &mut o);
+        example.cmovne(example, 43, &mut o);
         assert_ne!(o, 43);
 
         // Different lengths.
-        [1u8, 2, 3].cmovne(&[1, 2], 44, &mut o);
+        example.cmovne(&[1, 2], 44, &mut o);
         assert_eq!(o, 44);
 
         // Different contents.
-        [1u8, 2, 3].cmovne(&[1, 2, 4], 45, &mut o);
+        example.cmovne(&[1, 2, 4], 45, &mut o);
         assert_eq!(o, 45);
     }
 }
diff --git a/cmov/tests/proptests.rs b/cmov/tests/proptests.rs
@@ -9,55 +9,51 @@ macro_rules! int_proptests {
                 proptest! {
                     #[test]
                     fn cmovz_works(mut a in any::<$int>(), b in any::<$int>(), cond in any::<u8>()) {
-                        a.cmovz(&b, cond);
-
                         let expected = if cond == 0 {
                             b
                         } else {
                             a
                         };
 
+                        a.cmovz(&b, cond);
                         prop_assert_eq!(expected, a);
                     }
 
                     #[test]
                     fn cmovnz_works(mut a in any::<$int>(), b in any::<$int>(), cond in any::<u8>()) {
-                        a.cmovnz(&b, cond);
-
                         let expected = if cond != 0 {
                             b
                         } else {
                             a
                         };
 
+                        a.cmovnz(&b, cond);
                         prop_assert_eq!(expected, a);
                     }
 
                     #[test]
                      fn cmoveq_works(a in any::<$int>(), b in any::<$int>(), cond in any::<u8>()) {
-                        let mut actual = 0;
-                        a.cmoveq(&b, cond, &mut actual);
-
                         let expected = if a == b {
                             cond
                         } else {
                             0
                         };
 
+                        let mut actual = 0;
+                        a.cmoveq(&b, cond, &mut actual);
                         prop_assert_eq!(expected, actual);
                      }
 
                     #[test]
                      fn cmovne_works(a in any::<$int>(), b in any::<$int>(), cond in any::<u8>()) {
-                        let mut actual = 0;
-                        a.cmovne(&b, cond, &mut actual);
-
                         let expected = if a != b {
                             cond
                         } else {
                             0
                         };
 
+                        let mut actual = 0;
+                        a.cmovne(&b, cond, &mut actual);
                         prop_assert_eq!(expected, actual);
                      }
                 }
@@ -66,4 +62,61 @@ macro_rules! int_proptests {
     };
 }
 
+/// Generate one module per `name: size` pair, each with a `cmovnz` proptest for `[u8; size]`.
+macro_rules! byte_array_proptests {
+    ( $($name:ident: $size:expr),+ ) => {
+        $(
+            mod $name {
+                use cmov::Cmov;
+                use proptest::prelude::*;
+
+                proptest! {
+                    #[test]
+                    fn cmovnz_works(
+                        mut a in any::<[u8; $size]>(),
+                        b in any::<[u8; $size]>(),
+                        cond in any::<u8>()
+                    ) {
+                        let expected = if cond == 0 {
+                            a
+                        } else {
+                            b
+                        };
+
+                        a.cmovnz(&b, cond);
+                        prop_assert_eq!(expected, a);
+                    }
+                }
+            }
+        )+
+    };
+}
+
 int_proptests!(i8, i16, i32, i64, i128, u8, u16, u32, u64, u128);
+byte_array_proptests!(
+    array0: 0,
+    array1: 1,
+    array2: 2,
+    array3: 3,
+    array4: 4,
+    array5: 5,
+    array6: 6,
+    array7: 7,
+    array8: 8,
+    array9: 9,
+    array10: 10,
+    array11: 11,
+    array12: 12,
+    array13: 13,
+    array14: 14,
+    array15: 15,
+    array16: 16,
+    array17: 17,
+    array18: 18,
+    array19: 19,
+    array20: 20,
+    array21: 21,
+    array22: 22,
+    array23: 23,
+    array24: 24
+);