
Commit c6eeb2a

WIP. Implement native XOR3 for supported platforms
Add support for NEON EOR3 and AVX512 three-way exclusive OR (XOR3).
1 parent 41ff0f1 commit c6eeb2a
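
Both additions compute a ^ b ^ c in a single instruction: EOR3 on NEON (part of the SHA3 extension) and vpternlog on AVX512. The 0x96 immediate passed to the ternary-logic intrinsics below is simply the truth table of three-way XOR; a scalar model (illustrative, not part of the commit) makes the lookup explicit:

// For each bit position, vpternlog forms a 3-bit index from the bits of
// a, b and c and looks up that entry of the 8-bit immediate.
// 0x96 = 0b1001_0110 is exactly the truth table of a ^ b ^ c.
fn ternarylogic_0x96(a: u64, b: u64, c: u64) -> u64 {
    const IMM8: u8 = 0x96;
    let mut out = 0u64;
    for bit in 0..64 {
        let idx = (((a >> bit) & 1) << 2) | (((b >> bit) & 1) << 1) | ((c >> bit) & 1);
        out |= u64::from((IMM8 >> idx) & 1) << bit;
    }
    out
}

fn main() {
    let (a, b, c) = (0xDEAD_BEEF_DEAD_BEEF_u64, 0x0123_4567_89AB_CDEF, 0x0F0F_0F0F_0F0F_0F0F);
    assert_eq!(ternarylogic_0x96(a, b, c), a ^ b ^ c);
}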

9 files changed, +112 -32 lines changed

Cargo.toml

Lines changed: 3 additions & 0 deletions

@@ -50,6 +50,9 @@ alloc = []
 # enable VPCLMULQDQ support in Rust for x86_64 using nightly toolchain builds
 vpclmulqdq = []
 
+# enable AVX512 support in Rust for x86_64 using nightly toolchain builds
+avx512 = []
+
 # enable using fast-crc32 optimized C implementations for CRC-32/ISCSI and CRC-32/ISO-HDLC, automatically detected
 optimize_crc32_auto = []
 
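
Like vpclmulqdq above, this is an opt-in cargo feature aimed at nightly toolchains; a build exercising the new paths would look like:

cargo +nightly build --features avx512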

src/algorithm.rs

Lines changed: 11 additions & 11 deletions

@@ -209,10 +209,10 @@ unsafe fn process_simd_chunks<T: ArchOps, W: EnhancedCrcWidth>(
         };
 
         // Fold 16 bytes
-        W::fold_16(&mut temp_state, coeff, ops);
+        W::fold_16(&mut temp_state, coeff, yi, ops);
 
         // XOR with new data
-        *xi = ops.xor_vectors(temp_state.value, yi);
+        *xi = temp_state.value;
     }
 }
 
@@ -235,8 +235,9 @@ unsafe fn process_simd_chunks<T: ArchOps, W: EnhancedCrcWidth>(
             value: x[i],
             reflected: state.reflected,
         };
-        W::fold_16(&mut temp_state, coeff, ops);
-        res = ops.xor_vectors(res, temp_state.value);
+        W::fold_16(&mut temp_state, coeff, res, ops);
+
+        res = temp_state.value
     }
 
     // Perform final reduction and update state
@@ -338,10 +339,9 @@ where
     };
 
     // Fold 16 bytes using width-specific method
-    W::fold_16(&mut temp_state, coefficient, ops);
+    W::fold_16(&mut temp_state, coefficient, new_data, ops);
 
-    // XOR with new data
-    ops.xor_vectors(temp_state.value, new_data)
+    temp_state.value
 }
 
 /// Process inputs between 17 and 31 bytes
@@ -513,9 +513,9 @@ where
            (xmm2_blended, temp_state)
        };
 
-        W::fold_16(&mut temp_state, coefficient, ops);
+        W::fold_16(&mut temp_state, coefficient, xmm2_blended, ops);
 
-        ops.xor_vectors(temp_state.value, xmm2_blended)
+        temp_state.value
    } else {
        // For non-reflected mode (CRC-32f, CRC-64f)
 
@@ -548,8 +548,8 @@ where
            reflected,
        };
 
-        W::fold_16(&mut temp_state, coefficient, ops);
+        W::fold_16(&mut temp_state, coefficient, xmm2_blended, ops);
 
-        ops.xor_vectors(temp_state.value, xmm2_blended)
+        temp_state.value
    }
 }

src/arch/aarch64.rs

Lines changed: 19 additions & 0 deletions

@@ -6,6 +6,7 @@
 
 use crate::traits::ArchOps;
 use std::arch::aarch64::*;
+use std::arch::is_aarch64_feature_detected;
 
 #[derive(Debug, Copy, Clone)]
 pub struct AArch64Ops;
@@ -255,6 +256,24 @@ impl ArchOps for AArch64Ops {
             vgetq_lane_p64(vreinterpretq_p64_u8(b), 1),
         ))
     }
+
+    #[inline]
+    #[cfg_attr(target_feature = "sha3", target_feature(enable = "neon,sha3"))]
+    #[cfg_attr(not(target_feature = "sha3"), target_feature(enable = "neon"))]
+    unsafe fn xor3_vectors(
+        &self,
+        a: Self::Vector,
+        b: Self::Vector,
+        c: Self::Vector,
+    ) -> Self::Vector {
+        if is_aarch64_feature_detected!("sha3") {
+            // Use native 3-way XOR instruction when available
+            return veor3q_u8(a, b, c);
+        }
+
+        // Fall back to two XOR operations
+        veorq_u8(veorq_u8(a, b), c)
+    }
 }
 
 impl AArch64Ops {
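
A quick cross-check of the two branches on SHA3-capable hardware (a sketch, not part of the commit; the function name is illustrative, and veor3q_u8 may require a recent toolchain):

#[cfg(target_arch = "aarch64")]
fn check_eor3_matches_fallback() {
    use std::arch::aarch64::*;
    use std::arch::is_aarch64_feature_detected;

    unsafe {
        let a = vdupq_n_u8(0xA5);
        let b = vdupq_n_u8(0x3C);
        let c = vdupq_n_u8(0x0F);

        // Reference result: the two-EOR fallback path.
        let expected = veorq_u8(veorq_u8(a, b), c);

        if is_aarch64_feature_detected!("sha3") {
            // Native EOR3 must agree lane for lane.
            let native = veor3q_u8(a, b, c);
            let (mut e, mut n) = ([0u8; 16], [0u8; 16]);
            vst1q_u8(e.as_mut_ptr(), expected);
            vst1q_u8(n.as_mut_ptr(), native);
            assert_eq!(e, n);
        }
    }
}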

src/arch/vpclmulqdq.rs

Lines changed: 21 additions & 9 deletions

@@ -37,14 +37,15 @@ impl Simd256 {
     }
 
     #[inline]
-    #[target_feature(enable = "avx2", enable = "vpclmulqdq")]
-    unsafe fn fold_32(&self, coeff: &Self) -> Self {
-        let result = _mm256_xor_si256(
+    #[target_feature(enable = "avx2,avx512f,avx512vl,vpclmulqdq")]
+    unsafe fn fold_32(&self, coeff: &Self, new_data: &Self) -> Self {
+        // XOR3
+        Self(_mm256_ternarylogic_epi64(
             _mm256_clmulepi64_epi128(self.0, coeff.0, 0x00),
             _mm256_clmulepi64_epi128(self.0, coeff.0, 0x11),
-        );
-
-        Self(result)
+            new_data.0,
+            0x96,
+        ))
     }
 
     #[inline]
@@ -118,7 +119,7 @@ impl Simd256 {
 impl VpclmulqdqOps {
     /// Process aligned blocks using VPCLMULQDQ
     #[inline]
-    #[target_feature(enable = "avx2,vpclmulqdq,sse2,sse4.1,pclmulqdq")]
+    #[target_feature(enable = "avx2,vpclmulqdq,sse2,sse4.1,pclmulqdq,avx512f,avx512vl")]
     unsafe fn process_vpclmulqdq_blocks<W: EnhancedCrcWidth>(
         &self,
         state: &mut CrcState<<VpclmulqdqOps as ArchOps>::Vector>,
@@ -165,7 +166,7 @@ impl VpclmulqdqOps {
                 Simd256::from_m128i_pair(block[i * 2 + 1], block[i * 2]),
             );
 
-            *chunk = chunk.fold_32(&coeff).xor(&reflected_chunk);
+            *chunk = chunk.fold_32(&coeff, &reflected_chunk);
         }
     }
 
@@ -325,7 +326,7 @@ impl ArchOps for VpclmulqdqOps {
     type Vector = __m128i;
 
     #[inline]
-    #[target_feature(enable = "avx2,vpclmulqdq,sse2,sse4.1,pclmulqdq")]
+    #[target_feature(enable = "avx2,vpclmulqdq,sse2,sse4.1,pclmulqdq,avx512f,avx512vl")]
    unsafe fn process_enhanced_simd_blocks<W: EnhancedCrcWidth>(
        &self,
        state: &mut CrcState<Self::Vector>,
@@ -535,4 +536,15 @@ impl ArchOps for VpclmulqdqOps {
     unsafe fn carryless_mul_11(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector {
         self.0.carryless_mul_11(a, b)
     }
+
+    #[inline]
+    #[target_feature(enable = "avx2,vpclmulqdq,avx512f,avx512vl")]
+    unsafe fn xor3_vectors(
+        &self,
+        a: Self::Vector,
+        b: Self::Vector,
+        c: Self::Vector,
+    ) -> Self::Vector {
+        self.0.xor3_vectors(a, b, c)
+    }
 }

src/arch/x86.rs

Lines changed: 24 additions & 0 deletions

@@ -226,6 +226,30 @@ impl ArchOps for X86Ops {
     unsafe fn carryless_mul_11(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector {
         _mm_clmulepi64_si128(a, b, 0x11)
     }
+
+    #[inline]
+    #[cfg_attr(
+        any(feature = "vpclmulqdq", feature = "avx512"),
+        target_feature(enable = "avx512f,avx512vl")
+    )]
+    #[cfg_attr(
+        all(not(feature = "vpclmulqdq"), not(feature = "avx512")),
+        target_feature(enable = "sse2,sse4.1")
+    )]
+    unsafe fn xor3_vectors(
+        &self,
+        a: Self::Vector,
+        b: Self::Vector,
+        c: Self::Vector,
+    ) -> Self::Vector {
+        #[cfg(any(feature = "vpclmulqdq", feature = "avx512"))]
+        if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") {
+            return _mm_ternarylogic_epi64(a, b, c, 0x96);
+        }
+
+        // x86 doesn't have native XOR3 in SSE, use two XORs
+        _mm_xor_si128(_mm_xor_si128(a, b), c)
+    }
 }
 
 impl X86Ops {
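
Note the difference from the aarch64 version: here the AVX512 branch is also gated at compile time, so a build without the vpclmulqdq or avx512 feature contains only the two-XOR path and never performs the runtime check. A sketch exercising that fallback (not part of the commit; names are illustrative):

#[cfg(target_arch = "x86_64")]
fn check_sse_fallback() {
    use std::arch::x86_64::*;

    // Copy a vector out to bytes for comparison; SSE2 is baseline on x86_64.
    fn to_bytes(v: __m128i) -> [u8; 16] {
        let mut out = [0u8; 16];
        unsafe { _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, v) };
        out
    }

    unsafe {
        let a = _mm_set1_epi8(0x5A);
        let b = _mm_set1_epi8(0x33);
        let c = _mm_set1_epi8(0x0F);

        // Two sequential XORs: exactly the fallback branch above.
        let r = _mm_xor_si128(_mm_xor_si128(a, b), c);
        assert!(to_bytes(r).iter().all(|&x| x == (0x5A ^ 0x33 ^ 0x0F)));
    }
}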

src/crc32/algorithm.rs

Lines changed: 7 additions & 3 deletions

@@ -76,8 +76,12 @@ impl EnhancedCrcWidth for crate::structs::Width32 {
     }
 
     #[inline(always)]
-    unsafe fn fold_16<T: ArchOps>(state: &mut CrcState<T::Vector>, coeff: T::Vector, ops: &T)
-    where
+    unsafe fn fold_16<T: ArchOps>(
+        state: &mut CrcState<T::Vector>,
+        coeff: T::Vector,
+        new_data: T::Vector,
+        ops: &T,
+    ) where
         T::Vector: Copy,
     {
         // For CRC-32, we need to handle the 32-bit sections of each 64-bit value
@@ -95,7 +99,7 @@ impl EnhancedCrcWidth for crate::structs::Width32 {
            )
        };
 
-        state.value = ops.xor_vectors(h, l);
+        state.value = ops.xor3_vectors(h, l, new_data);
     }
 
     /// CRC-32 specific implementation for folding 8 bytes to 4 bytes

src/crc64/algorithm.rs

Lines changed: 8 additions & 3 deletions

@@ -57,15 +57,20 @@ impl EnhancedCrcWidth for crate::structs::Width64 {
     }
 
     #[inline(always)]
-    unsafe fn fold_16<T: ArchOps>(state: &mut CrcState<T::Vector>, coeff: T::Vector, ops: &T)
-    where
+    unsafe fn fold_16<T: ArchOps>(
+        state: &mut CrcState<T::Vector>,
+        coeff: T::Vector,
+        new_data: T::Vector,
+        ops: &T,
+    ) where
         T::Vector: Copy,
     {
         // CRC-64 specific implementation for folding 16 bytes
         state.value = {
-            ops.xor_vectors(
+            ops.xor3_vectors(
                 ops.carryless_mul_00(state.value, coeff),
                 ops.carryless_mul_11(state.value, coeff),
+                new_data,
             )
         };
     }
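
In scalar terms, the CRC-64 fold step above is two 64x64 carryless multiplies whose results are XORed together with the incoming block; the commit only fuses that three-way XOR into one operation. A minimal model (illustrative names, not the crate's API):

// Carryless (polynomial) multiply over GF(2): shift-and-XOR, no carries.
fn clmul(a: u64, b: u64) -> u128 {
    let mut acc = 0u128;
    for i in 0..64 {
        if (b >> i) & 1 == 1 {
            acc ^= (a as u128) << i;
        }
    }
    acc
}

// One fold_16 step: carryless_mul_00 works on the low halves,
// carryless_mul_11 on the high halves, then xor3_vectors combines
// both products with the new data in a single XOR3.
fn fold_step(state: u128, coeff_low: u64, coeff_high: u64, new_data: u128) -> u128 {
    let (low, high) = (state as u64, (state >> 64) as u64);
    clmul(low, coeff_low) ^ clmul(high, coeff_high) ^ new_data
}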

src/lib.rs

Lines changed: 3 additions & 3 deletions

@@ -105,10 +105,10 @@
 //! assert_eq!(checksum.unwrap(), 0xcbf43926);
 //! ```
 
-// if VPCLMULQDQ is enabled, enable extra AVX512 features
+// if VPCLMULQDQ or AVX512 is enabled, enable extra AVX512 features
 #![cfg_attr(
-    feature = "vpclmulqdq",
-    feature(avx512_target_feature, stdarch_x86_avx512)
+    any(feature = "vpclmulqdq", feature = "avx512"),
+    feature(stdarch_x86_avx512)
 )]
 
 use crate::crc32::consts::{

src/traits.rs

Lines changed: 16 additions & 3 deletions

@@ -221,6 +221,15 @@ pub trait ArchOps: Sized + Copy + Clone {
 
     /// Perform carryless multiplication with immediate value 0x11 (high 64 bits of both vectors)
     unsafe fn carryless_mul_11(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector;
+
+    /// XOR three vectors together: a XOR b XOR c
+    /// Uses native XOR3 instructions when available, falls back to two XOR operations otherwise
+    unsafe fn xor3_vectors(
+        &self,
+        a: Self::Vector,
+        b: Self::Vector,
+        c: Self::Vector,
+    ) -> Self::Vector;
 }
 
 #[cfg(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64"))]
@@ -247,9 +256,13 @@ pub trait EnhancedCrcWidth: CrcWidth {
     where
         T::Vector: Copy;
 
-    /// Perform width-specific folding operations
-    unsafe fn fold_16<T: ArchOps>(state: &mut CrcState<T::Vector>, coefficient: T::Vector, ops: &T)
-    where
+    /// Perform width-specific folding operations using CLMUL and two XOR operations (or one XOR3)
+    unsafe fn fold_16<T: ArchOps>(
+        state: &mut CrcState<T::Vector>,
+        coefficient: T::Vector,
+        new_data: T::Vector,
+        ops: &T,
+    ) where
         T::Vector: Copy;
