cmov: impl Cmov for [u8] (#1354)

tarcieri · web-flow · commit db238d8ab9ef · 2026-01-15T18:45:01.000-07:00
Adapts the implementation written for core arrays in #1350/#1351 to work on byte slices as well, since the same code applies to slices unchanged. The `[u8; N]` impl now invokes the `[u8]` impl (hopefully inlining can take advantage of knowledge of static `N`). Unfortunately there's now an inconsistency between `Cmov` being impl'd for `[u8]` and `CmovEq` being impl'd for `[T]`. We should probably eventually make a breaking change to the latter, so the crate can provide an efficient impl of `CmovEq` for byte slices.
diff --git a/cmov/src/array.rs b/cmov/src/array.rs
@@ -1,41 +1,21 @@
+//! Trait impls for core arrays.
+
 use crate::{
     Cmov, CmovEq, Condition,
-    utils::{slice_as_chunks, slice_as_chunks_mut},
+    utils::{WORD_SIZE, Word, slice_as_chunks},
 };
 
-// Uses 64-bit words on 64-bit targets, 32-bit everywhere else
-#[cfg(not(target_pointer_width = "64"))]
-type Word = u32;
-#[cfg(target_pointer_width = "64")]
-type Word = u64;
-const WORD_SIZE: usize = size_of::<Word>();
-
 /// Optimized implementation for byte arrays which coalesces them into word-sized chunks first,
 /// then performs [`Cmov`] at the word-level to cut down on the total number of instructions.
-///
-/// With compile-time knowledge of `N`, the compiler should also be able to unroll the loops in
-/// cases where efficiency would benefit, reducing the implementation to a sequence of word-sized
-/// [`Cmov`] ops (and if `N` isn't word-aligned, followed by a series of 1-byte ops).
 impl<const N: usize> Cmov for [u8; N] {
     #[inline]
     fn cmovnz(&mut self, value: &Self, condition: Condition) {
-        let (self_chunks, self_remainder) = slice_as_chunks_mut::<u8, WORD_SIZE>(self);
-        let (value_chunks, value_remainder) = slice_as_chunks::<u8, WORD_SIZE>(value);
-
-        for (self_chunk, value_chunk) in self_chunks.iter_mut().zip(value_chunks.iter()) {
-            let mut a = Word::from_ne_bytes(*self_chunk);
-            let b = Word::from_ne_bytes(*value_chunk);
-            a.cmovnz(&b, condition);
-            self_chunk.copy_from_slice(&a.to_ne_bytes());
-        }
-
-        // Process the remainder a byte-at-a-time.
-        for (a, b) in self_remainder.iter_mut().zip(value_remainder.iter()) {
-            a.cmovnz(b, condition);
-        }
+        self.as_mut_slice().cmovnz(value, condition); // delegates to the `[u8]` slice impl
     }
 }
 
+/// Optimized implementation for byte arrays which coalesces them into word-sized chunks first,
+/// then performs [`CmovEq`] at the word-level to cut down on the total number of instructions.
 impl<const N: usize> CmovEq for [u8; N] {
     fn cmovne(&self, rhs: &Self, input: Condition, output: &mut Condition) {
         let (self_chunks, self_remainder) = slice_as_chunks::<u8, WORD_SIZE>(self);
diff --git a/cmov/src/lib.rs b/cmov/src/lib.rs
@@ -39,6 +39,7 @@ mod array;
     miri
 ))]
 mod portable;
+mod slice;
 #[cfg(not(miri))]
 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
 mod x86;
@@ -198,19 +199,3 @@ macro_rules! impl_cmov_traits_for_signed_ints {
 }
 
 impl_cmov_traits_for_signed_ints!(i8 => u8, i16 => u16, i32 => u32, i64 => u64, i128 => u128);
-
-impl<T: CmovEq> CmovEq for [T] {
-    fn cmovne(&self, rhs: &Self, input: Condition, output: &mut Condition) {
-        // Short-circuit the comparison if the slices are of different lengths, and set the output
-        // condition to the input condition.
-        if self.len() != rhs.len() {
-            *output = input;
-            return;
-        }
-
-        // Compare each byte.
-        for (a, b) in self.iter().zip(rhs.iter()) {
-            a.cmovne(b, input, output);
-        }
-    }
-}
diff --git a/cmov/src/slice.rs b/cmov/src/slice.rs
@@ -0,0 +1,52 @@
+//! Trait impls for core slices.
+
+use crate::utils::{WORD_SIZE, Word, slice_as_chunks, slice_as_chunks_mut};
+use crate::{Cmov, CmovEq, Condition};
+
+/// Optimized implementation for byte slices which coalesces them into word-sized chunks first,
+/// then performs [`Cmov`] at the word-level to cut down on the total number of instructions.
+///
+/// # Panics
+///
+/// Panics if `self` and `value` have different lengths, mirroring `copy_from_slice`.
+impl Cmov for [u8] {
+    #[inline]
+    #[track_caller]
+    fn cmovnz(&mut self, value: &Self, condition: Condition) {
+        // The chunk/remainder split below depends on each slice's own length, so with unequal
+        // lengths the remainder bytes would be paired at the wrong offsets. Reject that up front.
+        assert_eq!(self.len(), value.len(), "cmovnz requires slices of equal length");
+
+        let (self_chunks, self_remainder) = slice_as_chunks_mut::<u8, WORD_SIZE>(self);
+        let (value_chunks, value_remainder) = slice_as_chunks::<u8, WORD_SIZE>(value);
+
+        // Move whole words at a time; native-endian round-tripping preserves the byte order.
+        for (self_chunk, value_chunk) in self_chunks.iter_mut().zip(value_chunks.iter()) {
+            let mut a = Word::from_ne_bytes(*self_chunk);
+            let b = Word::from_ne_bytes(*value_chunk);
+            a.cmovnz(&b, condition);
+            self_chunk.copy_from_slice(&a.to_ne_bytes());
+        }
+
+        // Process the remainder a byte-at-a-time.
+        for (a, b) in self_remainder.iter_mut().zip(value_remainder.iter()) {
+            a.cmovnz(b, condition);
+        }
+    }
+}
+
+/// Element-wise [`CmovEq`] for slices: each pair of elements is compared in order.
+impl<T: CmovEq> CmovEq for [T] {
+    fn cmovne(&self, rhs: &Self, input: Condition, output: &mut Condition) {
+        if self.len() == rhs.len() {
+            // Same length: let every element pair update `output` in turn.
+            self.iter()
+                .zip(rhs)
+                .for_each(|(lhs, other)| lhs.cmovne(other, input, output));
+        } else {
+            // Slices of different lengths skip the element comparison entirely; the output
+            // condition is set straight to the input condition.
+            *output = input;
+        }
+    }
+}
diff --git a/cmov/src/utils.rs b/cmov/src/utils.rs
@@ -23,6 +23,14 @@ macro_rules! masknz {
     }};
 }
 
+// Word for chunked ops: 64-bit on 64-bit targets, else 32-bit; size is const-asserted below.
+#[cfg(not(target_pointer_width = "64"))]
+pub(crate) type Word = u32;
+#[cfg(target_pointer_width = "64")]
+pub(crate) type Word = u64;
+pub(crate) const WORD_SIZE: usize = size_of::<Word>();
+const _: () = assert!(size_of::<usize>() <= WORD_SIZE, "unexpected word size");
+
 /// Rust core `[T]::as_chunks` vendored because of its 1.88 MSRV.
 /// TODO(tarcieri): use upstream function when we bump MSRV
 #[inline]