Skip to content

Commit db238d8

Browse files
authored
cmov: impl Cmov for [u8] (#1354)
Adapts the implementation written for core arrays in #1350/#1351 to work on byte slices as well, since the same code applies to slices unchanged. The `[u8; N]` impl now invokes the `[u8]` impl (hopefully inlining can take advantage of the statically known `N`). Unfortunately, there's now an inconsistency between `Cmov` being impl'd for `[u8]` and `CmovEq` being impl'd for `[T]`. We should probably eventually make a breaking change to the latter, so the crate can provide an efficient impl of `CmovEq` for byte slices.
1 parent 46a43fb commit db238d8

File tree

4 files changed

+57
-42
lines changed

4 files changed

+57
-42
lines changed

cmov/src/array.rs

Lines changed: 6 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,41 +1,21 @@
1+
//! Trait impls for core arrays.
2+
13
use crate::{
24
Cmov, CmovEq, Condition,
3-
utils::{slice_as_chunks, slice_as_chunks_mut},
5+
utils::{WORD_SIZE, Word, slice_as_chunks},
46
};
57

6-
// Uses 64-bit words on 64-bit targets, 32-bit everywhere else
7-
#[cfg(not(target_pointer_width = "64"))]
8-
type Word = u32;
9-
#[cfg(target_pointer_width = "64")]
10-
type Word = u64;
11-
const WORD_SIZE: usize = size_of::<Word>();
12-
138
/// Optimized implementation for byte arrays which coalesces them into word-sized chunks first,
149
/// then performs [`Cmov`] at the word-level to cut down on the total number of instructions.
15-
///
16-
/// With compile-time knowledge of `N`, the compiler should also be able to unroll the loops in
17-
/// cases where efficiency would benefit, reducing the implementation to a sequence of word-sized
18-
/// [`Cmov`] ops (and if `N` isn't word-aligned, followed by a series of 1-byte ops).
1910
impl<const N: usize> Cmov for [u8; N] {
2011
#[inline]
2112
fn cmovnz(&mut self, value: &Self, condition: Condition) {
22-
let (self_chunks, self_remainder) = slice_as_chunks_mut::<u8, WORD_SIZE>(self);
23-
let (value_chunks, value_remainder) = slice_as_chunks::<u8, WORD_SIZE>(value);
24-
25-
for (self_chunk, value_chunk) in self_chunks.iter_mut().zip(value_chunks.iter()) {
26-
let mut a = Word::from_ne_bytes(*self_chunk);
27-
let b = Word::from_ne_bytes(*value_chunk);
28-
a.cmovnz(&b, condition);
29-
self_chunk.copy_from_slice(&a.to_ne_bytes());
30-
}
31-
32-
// Process the remainder a byte-at-a-time.
33-
for (a, b) in self_remainder.iter_mut().zip(value_remainder.iter()) {
34-
a.cmovnz(b, condition);
35-
}
13+
self.as_mut_slice().cmovnz(value, condition);
3614
}
3715
}
3816

17+
/// Optimized implementation for byte arrays which coalesces them into word-sized chunks first,
18+
/// then performs [`CmovEq`] at the word-level to cut down on the total number of instructions.
3919
impl<const N: usize> CmovEq for [u8; N] {
4020
fn cmovne(&self, rhs: &Self, input: Condition, output: &mut Condition) {
4121
let (self_chunks, self_remainder) = slice_as_chunks::<u8, WORD_SIZE>(self);

cmov/src/lib.rs

Lines changed: 1 addition & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ mod array;
3939
miri
4040
))]
4141
mod portable;
42+
mod slice;
4243
#[cfg(not(miri))]
4344
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
4445
mod x86;
@@ -198,19 +199,3 @@ macro_rules! impl_cmov_traits_for_signed_ints {
198199
}
199200

200201
impl_cmov_traits_for_signed_ints!(i8 => u8, i16 => u16, i32 => u32, i64 => u64, i128 => u128);
201-
202-
impl<T: CmovEq> CmovEq for [T] {
203-
fn cmovne(&self, rhs: &Self, input: Condition, output: &mut Condition) {
204-
// Short-circuit the comparison if the slices are of different lengths, and set the output
205-
// condition to the input condition.
206-
if self.len() != rhs.len() {
207-
*output = input;
208-
return;
209-
}
210-
211-
// Compare each byte.
212-
for (a, b) in self.iter().zip(rhs.iter()) {
213-
a.cmovne(b, input, output);
214-
}
215-
}
216-
}

cmov/src/slice.rs

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
//! Trait impls for core slices.
2+
3+
use crate::utils::{WORD_SIZE, Word, slice_as_chunks, slice_as_chunks_mut};
4+
use crate::{Cmov, CmovEq, Condition};
5+
6+
/// Optimized implementation for byte slices which coalesces them into word-sized chunks first,
7+
/// then performs [`Cmov`] at the word-level to cut down on the total number of instructions.
///
/// NOTE(review): the lengths of `self` and `value` are not compared here. Chunks and remainders
/// are zipped independently and `zip` stops at the shorter side, so slices of different lengths
/// are only partially processed and their remainder bytes may be paired at different offsets.
/// Callers presumably must pass equal-length slices — confirm.
8+
impl Cmov for [u8] {
9+
#[inline]
10+
fn cmovnz(&mut self, value: &Self, condition: Condition) {
11+
// Split both slices into `WORD_SIZE`-byte chunks plus a shorter trailing remainder.
let (self_chunks, self_remainder) = slice_as_chunks_mut::<u8, WORD_SIZE>(self);
12+
let (value_chunks, value_remainder) = slice_as_chunks::<u8, WORD_SIZE>(value);
13+
14+
// Load each pair of chunks as native-endian `Word`s, apply `cmovnz` at word granularity,
// then write the resulting word back into the destination chunk.
for (self_chunk, value_chunk) in self_chunks.iter_mut().zip(value_chunks.iter()) {
15+
let mut a = Word::from_ne_bytes(*self_chunk);
16+
let b = Word::from_ne_bytes(*value_chunk);
17+
a.cmovnz(&b, condition);
18+
self_chunk.copy_from_slice(&a.to_ne_bytes());
19+
}
20+
21+
// Process the remainder a byte-at-a-time.
22+
for (a, b) in self_remainder.iter_mut().zip(value_remainder.iter()) {
23+
a.cmovnz(b, condition);
24+
}
25+
}
26+
}
27+
28+
/// Generic element-wise [`CmovEq`] implementation for slices of any `T: CmovEq`.
///
/// When the slices differ in length, `output` is set to `input` and no elements are compared;
/// otherwise `cmovne` is invoked on every pair of elements with the same `input` and `output`.
impl<T: CmovEq> CmovEq for [T] {
29+
fn cmovne(&self, rhs: &Self, input: Condition, output: &mut Condition) {
30+
// Short-circuit the comparison if the slices are of different lengths, and set the output
31+
// condition to the input condition.
32+
if self.len() != rhs.len() {
33+
*output = input;
34+
return;
35+
}
36+
37+
// Compare each pair of elements (`T` is any `CmovEq` type, not only bytes).
38+
for (a, b) in self.iter().zip(rhs.iter()) {
39+
a.cmovne(b, input, output);
40+
}
41+
}
42+
}

cmov/src/utils.rs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,14 @@ macro_rules! masknz {
2323
}};
2424
}
2525

26+
// Uses 64-bit words on 64-bit targets, 32-bit everywhere else
27+
#[cfg(not(target_pointer_width = "64"))]
28+
pub(crate) type Word = u32;
29+
#[cfg(target_pointer_width = "64")]
30+
pub(crate) type Word = u64;
31+
pub(crate) const WORD_SIZE: usize = size_of::<Word>();
32+
// Compile-time check that `Word` is at least as wide as `usize`. A plain `assert!` is used
// instead of `debug_assert!` so the check is also enforced when `debug_assertions` is off
// (e.g. release builds); it lives in a `const` item, so it is evaluated during compilation.
const _: () = assert!(size_of::<usize>() <= WORD_SIZE, "unexpected word size");
33+
2634
/// Rust core `[T]::as_chunks` vendored because of its 1.88 MSRV.
2735
/// TODO(tarcieri): use upstream function when we bump MSRV
2836
#[inline]

0 commit comments

Comments
 (0)