
Commit c6eeb2a

WIP. Implement native XOR3 for supported platforms
Add support for NEON EOR3 and AVX512 three-way exclusive OR (XOR3).
1 parent 41ff0f1 commit c6eeb2a
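
Both additions compute a ^ b ^ c in a single instruction: EOR3 on NEON (part of the SHA3 extension) and vpternlog on AVX512. The 0x96 immediate passed to the ternary-logic intrinsics below is simply the truth table of three-way XOR; a scalar model (illustrative, not part of the commit) makes the lookup explicit:

// For each bit position, vpternlog forms a 3-bit index from the bits of
// a, b and c and looks up that entry of the 8-bit immediate.
// 0x96 = 0b1001_0110 is exactly the truth table of a ^ b ^ c.
fn ternarylogic_0x96(a: u64, b: u64, c: u64) -> u64 {
    const IMM8: u8 = 0x96;
    let mut out = 0u64;
    for bit in 0..64 {
        let idx = (((a >> bit) & 1) << 2) | (((b >> bit) & 1) << 1) | ((c >> bit) & 1);
        out |= u64::from((IMM8 >> idx) & 1) << bit;
    }
    out
}

fn main() {
    let (a, b, c) = (0xDEAD_BEEF_DEAD_BEEF_u64, 0x0123_4567_89AB_CDEF, 0x0F0F_0F0F_0F0F_0F0F);
    assert_eq!(ternarylogic_0x96(a, b, c), a ^ b ^ c);
}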

9 files changed, +112 -32 lines changed

Cargo.toml

Lines changed: 3 additions & 0 deletions

@@ -50,6 +50,9 @@ alloc = []
 # enable VPCLMULQDQ support in Rust for x86_64 using nightly toolchain builds
 vpclmulqdq = []
 
+# enable AVX512 support in Rust for x86_64 using nightly toolchain builds
+avx512 = []
+
 # enable using fast-crc32 optimized C implementations for CRC-32/ISCSI and CRC-32/ISO-HDLC, automatically detected
 optimize_crc32_auto = []
 
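
Like vpclmulqdq above, this is an opt-in cargo feature aimed at nightly toolchains; a build exercising the new paths would look like:

cargo +nightly build --features avx512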

src/algorithm.rs

Lines changed: 11 additions & 11 deletions

@@ -209,10 +209,10 @@ unsafe fn process_simd_chunks<T: ArchOps, W: EnhancedCrcWidth>(
         };
 
         // Fold 16 bytes
-        W::fold_16(&mut temp_state, coeff, ops);
+        W::fold_16(&mut temp_state, coeff, yi, ops);
 
         // XOR with new data
-        *xi = ops.xor_vectors(temp_state.value, yi);
+        *xi = temp_state.value;
     }
 }
 
@@ -235,8 +235,9 @@ unsafe fn process_simd_chunks<T: ArchOps, W: EnhancedCrcWidth>(
             value: x[i],
             reflected: state.reflected,
         };
-        W::fold_16(&mut temp_state, coeff, ops);
-        res = ops.xor_vectors(res, temp_state.value);
+        W::fold_16(&mut temp_state, coeff, res, ops);
+
+        res = temp_state.value
     }
 
     // Perform final reduction and update state
@@ -338,10 +339,9 @@ where
     };
 
     // Fold 16 bytes using width-specific method
-    W::fold_16(&mut temp_state, coefficient, ops);
+    W::fold_16(&mut temp_state, coefficient, new_data, ops);
 
-    // XOR with new data
-    ops.xor_vectors(temp_state.value, new_data)
+    temp_state.value
 }
 
 /// Process inputs between 17 and 31 bytes
@@ -513,9 +513,9 @@ where
            (xmm2_blended, temp_state)
        };
 
-        W::fold_16(&mut temp_state, coefficient, ops);
+        W::fold_16(&mut temp_state, coefficient, xmm2_blended, ops);
 
-        ops.xor_vectors(temp_state.value, xmm2_blended)
+        temp_state.value
    } else {
        // For non-reflected mode (CRC-32f, CRC-64f)
 
@@ -548,8 +548,8 @@ where
            reflected,
        };
 
-        W::fold_16(&mut temp_state, coefficient, ops);
+        W::fold_16(&mut temp_state, coefficient, xmm2_blended, ops);
 
-        ops.xor_vectors(temp_state.value, xmm2_blended)
+        temp_state.value
    }
 }

src/arch/aarch64.rs

Lines changed: 19 additions & 0 deletions

@@ -6,6 +6,7 @@
 
 use crate::traits::ArchOps;
 use std::arch::aarch64::*;
+use std::arch::is_aarch64_feature_detected;
 
 #[derive(Debug, Copy, Clone)]
 pub struct AArch64Ops;
@@ -255,6 +256,24 @@ impl ArchOps for AArch64Ops {
             vgetq_lane_p64(vreinterpretq_p64_u8(b), 1),
         ))
     }
+
+    #[inline]
+    #[cfg_attr(target_feature = "sha3", target_feature(enable = "neon,sha3"))]
+    #[cfg_attr(not(target_feature = "sha3"), target_feature(enable = "neon"))]
+    unsafe fn xor3_vectors(
+        &self,
+        a: Self::Vector,
+        b: Self::Vector,
+        c: Self::Vector,
+    ) -> Self::Vector {
+        if is_aarch64_feature_detected!("sha3") {
+            // Use native 3-way XOR instruction when available
+            return veor3q_u8(a, b, c);
+        }
+
+        // Fall back to two XOR operations
+        veorq_u8(veorq_u8(a, b), c)
+    }
 }
 
 impl AArch64Ops {
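
A quick cross-check of the two branches on SHA3-capable hardware (a sketch, not part of the commit; the function name is illustrative, and veor3q_u8 may require a recent toolchain):

#[cfg(target_arch = "aarch64")]
fn check_eor3_matches_fallback() {
    use std::arch::aarch64::*;
    use std::arch::is_aarch64_feature_detected;

    unsafe {
        let a = vdupq_n_u8(0xA5);
        let b = vdupq_n_u8(0x3C);
        let c = vdupq_n_u8(0x0F);

        // Reference result: the two-EOR fallback path.
        let expected = veorq_u8(veorq_u8(a, b), c);

        if is_aarch64_feature_detected!("sha3") {
            // Native EOR3 must agree lane for lane.
            let native = veor3q_u8(a, b, c);
            let (mut e, mut n) = ([0u8; 16], [0u8; 16]);
            vst1q_u8(e.as_mut_ptr(), expected);
            vst1q_u8(n.as_mut_ptr(), native);
            assert_eq!(e, n);
        }
    }
}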

src/arch/vpclmulqdq.rs

Lines changed: 21 additions & 9 deletions

@@ -37,14 +37,15 @@ impl Simd256 {
     }
 
     #[inline]
-    #[target_feature(enable = "avx2", enable = "vpclmulqdq")]
-    unsafe fn fold_32(&self, coeff: &Self) -> Self {
-        let result = _mm256_xor_si256(
+    #[target_feature(enable = "avx2,avx512f,avx512vl,vpclmulqdq")]
+    unsafe fn fold_32(&self, coeff: &Self, new_data: &Self) -> Self {
+        // XOR3
+        Self(_mm256_ternarylogic_epi64(
             _mm256_clmulepi64_epi128(self.0, coeff.0, 0x00),
             _mm256_clmulepi64_epi128(self.0, coeff.0, 0x11),
-        );
-
-        Self(result)
+            new_data.0,
+            0x96,
+        ))
     }
 
     #[inline]
@@ -118,7 +119,7 @@ impl Simd256 {
 impl VpclmulqdqOps {
     /// Process aligned blocks using VPCLMULQDQ
     #[inline]
-    #[target_feature(enable = "avx2,vpclmulqdq,sse2,sse4.1,pclmulqdq")]
+    #[target_feature(enable = "avx2,vpclmulqdq,sse2,sse4.1,pclmulqdq,avx512f,avx512vl")]
     unsafe fn process_vpclmulqdq_blocks<W: EnhancedCrcWidth>(
         &self,
         state: &mut CrcState<<VpclmulqdqOps as ArchOps>::Vector>,
@@ -165,7 +166,7 @@ impl VpclmulqdqOps {
                 Simd256::from_m128i_pair(block[i * 2 + 1], block[i * 2]),
             );
 
-            *chunk = chunk.fold_32(&coeff).xor(&reflected_chunk);
+            *chunk = chunk.fold_32(&coeff, &reflected_chunk);
         }
     }
 
@@ -325,7 +326,7 @@ impl ArchOps for VpclmulqdqOps {
     type Vector = __m128i;
 
     #[inline]
-    #[target_feature(enable = "avx2,vpclmulqdq,sse2,sse4.1,pclmulqdq")]
+    #[target_feature(enable = "avx2,vpclmulqdq,sse2,sse4.1,pclmulqdq,avx512f,avx512vl")]
    unsafe fn process_enhanced_simd_blocks<W: EnhancedCrcWidth>(
        &self,
        state: &mut CrcState<Self::Vector>,
@@ -535,4 +536,15 @@ impl ArchOps for VpclmulqdqOps {
     unsafe fn carryless_mul_11(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector {
         self.0.carryless_mul_11(a, b)
     }
+
+    #[inline]
+    #[target_feature(enable = "avx2,vpclmulqdq,avx512f,avx512vl")]
+    unsafe fn xor3_vectors(
+        &self,
+        a: Self::Vector,
+        b: Self::Vector,
+        c: Self::Vector,
+    ) -> Self::Vector {
+        self.0.xor3_vectors(a, b, c)
+    }
 }

src/arch/x86.rs

Lines changed: 24 additions & 0 deletions

@@ -226,6 +226,30 @@ impl ArchOps for X86Ops {
     unsafe fn carryless_mul_11(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector {
         _mm_clmulepi64_si128(a, b, 0x11)
     }
+
+    #[inline]
+    #[cfg_attr(
+        any(feature = "vpclmulqdq", feature = "avx512"),
+        target_feature(enable = "avx512f,avx512vl")
+    )]
+    #[cfg_attr(
+        all(not(feature = "vpclmulqdq"), not(feature = "avx512")),
+        target_feature(enable = "sse2,sse4.1")
+    )]
+    unsafe fn xor3_vectors(
+        &self,
+        a: Self::Vector,
+        b: Self::Vector,
+        c: Self::Vector,
+    ) -> Self::Vector {
+        #[cfg(any(feature = "vpclmulqdq", feature = "avx512"))]
+        if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") {
+            return _mm_ternarylogic_epi64(a, b, c, 0x96);
+        }
+
+        // x86 doesn't have native XOR3 in SSE, use two XORs
+        _mm_xor_si128(_mm_xor_si128(a, b), c)
+    }
 }
 
 impl X86Ops {
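
Note the difference from the aarch64 version: here the AVX512 branch is also gated at compile time, so a build without the vpclmulqdq or avx512 feature contains only the two-XOR path and never performs the runtime check. A sketch exercising that fallback (not part of the commit; names are illustrative):

#[cfg(target_arch = "x86_64")]
fn check_sse_fallback() {
    use std::arch::x86_64::*;

    // Copy a vector out to bytes for comparison; SSE2 is baseline on x86_64.
    fn to_bytes(v: __m128i) -> [u8; 16] {
        let mut out = [0u8; 16];
        unsafe { _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, v) };
        out
    }

    unsafe {
        let a = _mm_set1_epi8(0x5A);
        let b = _mm_set1_epi8(0x33);
        let c = _mm_set1_epi8(0x0F);

        // Two sequential XORs: exactly the fallback branch above.
        let r = _mm_xor_si128(_mm_xor_si128(a, b), c);
        assert!(to_bytes(r).iter().all(|&x| x == (0x5A ^ 0x33 ^ 0x0F)));
    }
}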

src/crc32/algorithm.rs

Lines changed: 7 additions & 3 deletions

@@ -76,8 +76,12 @@ impl EnhancedCrcWidth for crate::structs::Width32 {
     }
 
     #[inline(always)]
-    unsafe fn fold_16<T: ArchOps>(state: &mut CrcState<T::Vector>, coeff: T::Vector, ops: &T)
-    where
+    unsafe fn fold_16<T: ArchOps>(
+        state: &mut CrcState<T::Vector>,
+        coeff: T::Vector,
+        new_data: T::Vector,
+        ops: &T,
+    ) where
         T::Vector: Copy,
     {
         // For CRC-32, we need to handle the 32-bit sections of each 64-bit value
@@ -95,7 +99,7 @@ impl EnhancedCrcWidth for crate::structs::Width32 {
            )
        };
 
-        state.value = ops.xor_vectors(h, l);
+        state.value = ops.xor3_vectors(h, l, new_data);
     }
 
     /// CRC-32 specific implementation for folding 8 bytes to 4 bytes

src/crc64/algorithm.rs

Lines changed: 8 additions & 3 deletions

@@ -57,15 +57,20 @@ impl EnhancedCrcWidth for crate::structs::Width64 {
     }
 
     #[inline(always)]
-    unsafe fn fold_16<T: ArchOps>(state: &mut CrcState<T::Vector>, coeff: T::Vector, ops: &T)
-    where
+    unsafe fn fold_16<T: ArchOps>(
+        state: &mut CrcState<T::Vector>,
+        coeff: T::Vector,
+        new_data: T::Vector,
+        ops: &T,
+    ) where
         T::Vector: Copy,
     {
         // CRC-64 specific implementation for folding 16 bytes
         state.value = {
-            ops.xor_vectors(
+            ops.xor3_vectors(
                 ops.carryless_mul_00(state.value, coeff),
                 ops.carryless_mul_11(state.value, coeff),
+                new_data,
             )
         };
     }
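
In scalar terms, the CRC-64 fold step above is two 64x64 carryless multiplies whose results are XORed together with the incoming block; the commit only fuses that three-way XOR into one operation. A minimal model (illustrative names, not the crate's API):

// Carryless (polynomial) multiply over GF(2): shift-and-XOR, no carries.
fn clmul(a: u64, b: u64) -> u128 {
    let mut acc = 0u128;
    for i in 0..64 {
        if (b >> i) & 1 == 1 {
            acc ^= (a as u128) << i;
        }
    }
    acc
}

// One fold_16 step: carryless_mul_00 works on the low halves,
// carryless_mul_11 on the high halves, then xor3_vectors combines
// both products with the new data in a single XOR3.
fn fold_step(state: u128, coeff_low: u64, coeff_high: u64, new_data: u128) -> u128 {
    let (low, high) = (state as u64, (state >> 64) as u64);
    clmul(low, coeff_low) ^ clmul(high, coeff_high) ^ new_data
}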

src/lib.rs

Lines changed: 3 additions & 3 deletions

@@ -105,10 +105,10 @@
 //! assert_eq!(checksum.unwrap(), 0xcbf43926);
 //! ```
 
-// if VPCLMULQDQ is enabled, enable extra AVX512 features
+// if VPCLMULQDQ or AVX512 is enabled, enable extra AVX512 features
 #![cfg_attr(
-    feature = "vpclmulqdq",
-    feature(avx512_target_feature, stdarch_x86_avx512)
+    any(feature = "vpclmulqdq", feature = "avx512"),
+    feature(stdarch_x86_avx512)
 )]
 
 use crate::crc32::consts::{

src/traits.rs

Lines changed: 16 additions & 3 deletions

@@ -221,6 +221,15 @@ pub trait ArchOps: Sized + Copy + Clone {
 
     /// Perform carryless multiplication with immediate value 0x11 (high 64 bits of both vectors)
     unsafe fn carryless_mul_11(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector;
+
+    /// XOR three vectors together: a XOR b XOR c
+    /// Uses native XOR3 instructions when available, falls back to two XOR operations otherwise
+    unsafe fn xor3_vectors(
+        &self,
+        a: Self::Vector,
+        b: Self::Vector,
+        c: Self::Vector,
+    ) -> Self::Vector;
 }
 
 #[cfg(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64"))]
@@ -247,9 +256,13 @@ pub trait EnhancedCrcWidth: CrcWidth {
     where
         T::Vector: Copy;
 
-    /// Perform width-specific folding operations
-    unsafe fn fold_16<T: ArchOps>(state: &mut CrcState<T::Vector>, coefficient: T::Vector, ops: &T)
-    where
+    /// Perform width-specific folding operations using CLMUL and two XOR operations (or one XOR3)
+    unsafe fn fold_16<T: ArchOps>(
+        state: &mut CrcState<T::Vector>,
+        coefficient: T::Vector,
+        new_data: T::Vector,
+        ops: &T,
+    ) where
         T::Vector: Copy;
