Skip to content

Commit 75c2812

Browse files
committed
Merge branch 'main' into implement-vpclmulqdq-512-bits
2 parents 6eaf13e + 957a981 commit 75c2812

File tree

6 files changed

+56
-33
lines changed

6 files changed

+56
-33
lines changed

README.md

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -297,14 +297,13 @@ AKA `crc32` in many, but not all, implementations.
297297

298298
### CRC-64/NVME
299299

300-
| Arch | Brand | CPU | System | Target | 1KiB (GiB/s) | 1MiB (GiB/s) |
301-
|:--------|:------|:----------------|:--------------------------|:------------------|-------------:|-------------:|
302-
| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-48xl | avx512_vpclmulqdq | ~20.3 | ~94.1 |
303-
| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | avx512_vpclmulqdq | ~18.3 | ~53.9 |
304-
| aarch64 | AWS | Graviton4 | EC2 c8g.metal-48xl | neon_pclmulqdq | ~16.3 | ~16.3 |
305-
| aarch64 | Apple | M3 Ultra | Mac Studio (32 core) | neon_pclmulqdq | ~44.0 | ~71.9 |
306-
| aarch64 | Apple | M4 Max | MacBook Pro 16" (16 core) | neon_pclmulqdq | ~40.3 | ~72.3 |
307-
| aarch64 | Apple | M2 Ultra | Mac Studio (24 core) | neon_pclmulqdq | ~39.3 | ~65.0 |
300+
| Arch | Brand | CPU | System | Target | 1KiB (GiB/s) | 1MiB (GiB/s) |
301+
|:--------|:------|:----------------|:---------------------|:--------------------|-------------:|-------------:|
302+
| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-24xl | avx512_vpclmulqdq | ~24.9 | ~109.7 |
303+
| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | avx512_vpclmulqdq | ~24.4 | ~54.6 |
304+
| aarch64 | AWS | Graviton4 | EC2 c8g.metal-48xl | neon_pclmulqdq_eor3 | ~18.7 | ~36.8 |
305+
| aarch64 | AWS | Graviton2 | EC2 c6g.metal | neon_pclmulqdq | ~9.8 | ~15.9 |
306+
| aarch64 | Apple | M3 Ultra | Mac Studio (32 core) | neon_pclmulqdq_eor3 | ~49.5 | ~71.9 |
308307

309308
## Other CRC widths
310309

src/algorithm.rs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ use crate::{crc32, crc64};
2929
)]
3030
#[cfg_attr(
3131
all(target_arch = "x86_64", feature = "vpclmulqdq"),
32-
target_feature(enable = "sse2,sse4.1,pclmulqdq,avx2,vpclmulqdq")
32+
target_feature(enable = "avx2,vpclmulqdq,avx512f,avx512vl")
3333
)]
3434
#[cfg_attr(target_arch = "aarch64", target_feature(enable = "neon,aes"))]
3535
pub unsafe fn update<T: ArchOps, W: EnhancedCrcWidth>(
@@ -120,6 +120,10 @@ where
120120
any(target_arch = "x86", target_arch = "x86_64"),
121121
target_feature(enable = "sse2,sse4.1,pclmulqdq")
122122
)]
123+
#[cfg_attr(
124+
all(target_arch = "x86_64", feature = "vpclmulqdq"),
125+
target_feature(enable = "avx2,vpclmulqdq,avx512f,avx512vl")
126+
)]
123127
#[cfg_attr(target_arch = "aarch64", target_feature(enable = "neon,aes"))]
124128
unsafe fn process_large_aligned<T: ArchOps, W: EnhancedCrcWidth>(
125129
bytes: &[u8],

src/arch/aarch64.rs

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66

77
use crate::traits::ArchOps;
88
use std::arch::aarch64::*;
9-
use std::arch::is_aarch64_feature_detected;
109

1110
#[derive(Debug, Copy, Clone)]
1211
pub struct AArch64Ops;
@@ -258,20 +257,27 @@ impl ArchOps for AArch64Ops {
258257
}
259258

260259
#[inline]
261-
#[cfg_attr(target_feature = "sha3", target_feature(enable = "neon,sha3"))]
262-
#[cfg_attr(not(target_feature = "sha3"), target_feature(enable = "neon"))]
260+
#[cfg(target_feature = "sha3")]
261+
#[target_feature(enable = "neon,sha3")]
263262
unsafe fn xor3_vectors(
264263
&self,
265264
a: Self::Vector,
266265
b: Self::Vector,
267266
c: Self::Vector,
268267
) -> Self::Vector {
269-
if is_aarch64_feature_detected!("sha3") {
270-
// Use native 3-way XOR instruction when available
271-
return veor3q_u8(a, b, c);
272-
}
268+
veor3q_u8(a, b, c)
269+
}
273270

274-
// Fall back to two XOR operations
271+
#[inline]
272+
#[cfg(not(target_feature = "sha3"))]
273+
#[target_feature(enable = "neon")]
274+
unsafe fn xor3_vectors(
275+
&self,
276+
a: Self::Vector,
277+
b: Self::Vector,
278+
c: Self::Vector,
279+
) -> Self::Vector {
280+
// Fallback for when SHA3 is not available
275281
veorq_u8(veorq_u8(a, b), c)
276282
}
277283
}

src/arch/mod.rs

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,16 @@ mod x86;
3232
///
3333
/// # Safety
3434
/// May use native CPU features
35-
#[inline(always)]
35+
#[inline]
36+
#[cfg_attr(
37+
any(target_arch = "x86", target_arch = "x86_64"),
38+
target_feature(enable = "sse2,sse4.1,pclmulqdq")
39+
)]
40+
#[cfg_attr(
41+
all(target_arch = "x86_64", feature = "vpclmulqdq"),
42+
target_feature(enable = "avx2,vpclmulqdq,avx512f,avx512vl")
43+
)]
44+
#[cfg_attr(target_arch = "aarch64", target_feature(enable = "neon,aes"))]
3645
pub(crate) unsafe fn update(state: u64, bytes: &[u8], params: CrcParams) -> u64 {
3746
#[cfg(target_arch = "aarch64")]
3847
{
@@ -83,7 +92,10 @@ pub(crate) unsafe fn update(state: u64, bytes: &[u8], params: CrcParams) -> u64
8392
}
8493

8594
pub fn get_target() -> String {
86-
#[cfg(target_arch = "aarch64")]
95+
#[cfg(all(target_arch = "aarch64", target_feature = "sha3"))]
96+
return "internal-aarch64-neon-eor3".to_string();
97+
98+
#[cfg(all(target_arch = "aarch64", not(target_feature = "sha3")))]
8799
return "internal-aarch64-neon".to_string();
88100

89101
#[cfg(all(target_arch = "x86_64", feature = "vpclmulqdq"))]

src/arch/x86.rs

Lines changed: 15 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -228,27 +228,28 @@ impl ArchOps for X86Ops {
228228
}
229229

230230
#[inline]
231-
#[cfg_attr(
232-
any(feature = "vpclmulqdq", feature = "avx512"),
233-
target_feature(enable = "avx512f,avx512vl")
234-
)]
235-
#[cfg_attr(
236-
all(not(feature = "vpclmulqdq"), not(feature = "avx512")),
237-
target_feature(enable = "sse2,sse4.1")
238-
)]
231+
#[cfg(any(feature = "vpclmulqdq", feature = "avx512"))]
232+
#[target_feature(enable = "avx512f,avx512vl")]
239233
unsafe fn xor3_vectors(
240234
&self,
241235
a: Self::Vector,
242236
b: Self::Vector,
243237
c: Self::Vector,
244238
) -> Self::Vector {
245-
#[cfg(any(feature = "vpclmulqdq", feature = "avx512"))]
246-
if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") {
247-
return _mm_ternarylogic_epi64(
248-
a, b, c, 0x96, // XOR3
249-
);
250-
}
239+
_mm_ternarylogic_epi64(
240+
a, b, c, 0x96, // XOR3
241+
)
242+
}
251243

244+
#[inline]
245+
#[cfg(not(any(feature = "vpclmulqdq", feature = "avx512")))]
246+
#[target_feature(enable = "sse2,sse4.1")]
247+
unsafe fn xor3_vectors(
248+
&self,
249+
a: Self::Vector,
250+
b: Self::Vector,
251+
c: Self::Vector,
252+
) -> Self::Vector {
252253
// x86 doesn't have native XOR3 in SSE, use two XORs
253254
_mm_xor_si128(_mm_xor_si128(a, b), c)
254255
}

src/structs.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ pub struct CrcState<T> {
4747
pub(crate) struct Calculator {}
4848

4949
impl CrcCalculator for Calculator {
50+
#[inline(always)]
5051
fn calculate(state: u64, data: &[u8], params: CrcParams) -> u64 {
5152
unsafe { arch::update(state, data, params) }
5253
}

0 commit comments

Comments
 (0)