Update benchmarks in README

onethumb · onethumb · commit 41b760079f8e · 2025-06-06T20:35:31.000-07:00
Reflects the throughput improvement from using the wider AVX-512 registers.
diff --git a/README.md b/README.md
@@ -297,14 +297,14 @@ AKA `crc32` in many, but not all, implementations.
 
 ### CRC-64/NVME
 
-| Arch    | Brand | CPU             | System                    | Target          | 1KiB (GiB/s) | 1MiB (GiB/s) |
-|:--------|:------|:----------------|:--------------------------|:----------------|-------------:|-------------:|
-| x86_64  | Intel | Sapphire Rapids | EC2 c7i.metal-48xl        | avx2_vpclmulqdq |        ~17.0 |        ~56.4 |
-| x86_64  | AMD   | Genoa           | EC2 c7a.metal-48xl        | avx2_vpclmulqdq |        ~17.3 |        ~27.4 |
-| aarch64 | AWS   | Graviton4       | EC2 c8g.metal-48xl        | neon_pclmulqdq  |        ~16.3 |        ~16.3 |
-| aarch64 | Apple | M3 Ultra        | Mac Studio (32 core)      | neon_pclmulqdq  |        ~44.0 |        ~71.9 |
-| aarch64 | Apple | M4 Max          | MacBook Pro 16" (16 core) | neon_pclmulqdq  |        ~40.3 |        ~72.3 | 
-| aarch64 | Apple | M2 Ultra        | Mac Studio (24 core)      | neon_pclmulqdq  |        ~39.3 |        ~65.0 |
+| Arch    | Brand | CPU             | System                    | Target            | 1KiB (GiB/s) | 1MiB (GiB/s) |
+|:--------|:------|:----------------|:--------------------------|:------------------|-------------:|-------------:|
+| x86_64  | Intel | Sapphire Rapids | EC2 c7i.metal-48xl        | avx512_vpclmulqdq |        ~20.3 |        ~94.1 |
+| x86_64  | AMD   | Genoa           | EC2 c7a.metal-48xl        | avx512_vpclmulqdq |        ~18.3 |        ~53.9 |
+| aarch64 | AWS   | Graviton4       | EC2 c8g.metal-48xl        | neon_pclmulqdq    |        ~16.3 |        ~16.3 |
+| aarch64 | Apple | M3 Ultra        | Mac Studio (32 core)      | neon_pclmulqdq    |        ~44.0 |        ~71.9 |
+| aarch64 | Apple | M4 Max          | MacBook Pro 16" (16 core) | neon_pclmulqdq    |        ~40.3 |        ~72.3 | 
+| aarch64 | Apple | M2 Ultra        | Mac Studio (24 core)      | neon_pclmulqdq    |        ~39.3 |        ~65.0 |
 
 ## Other CRC widths
 
diff --git a/benches/benchmark.rs b/benches/benchmark.rs
@@ -51,6 +51,28 @@ fn random_data(size: i32) -> Vec<u8> {
     buf
 }
 
+fn create_aligned_data(input: &[u8]) -> Vec<u8> {
+    // Stages `input` at an aligned offset in a padded buffer; the alignment unit is size_of [[u64; 4]; 2]
+    let align_size = std::mem::size_of::<[[u64; 4]; 2]>(); // 64 bytes
+
+    // Over-allocate by one alignment unit so an aligned start address always exists in the buffer
+    let mut padded = Vec::with_capacity(input.len() + align_size);
+
+    // Zero-fill to the full padded length so the buffer can be sliced and written below
+    padded.resize(input.len() + align_size, 0);
+
+    // Byte distance from the buffer start to the next multiple of `align_size` (0 if already aligned)
+    let start_addr = padded.as_ptr() as usize;
+    let align_offset = (align_size - (start_addr % align_size)) % align_size;
+
+    // Copy the input into the aligned position
+    let aligned_start = &mut padded[align_offset..];
+    aligned_start[..input.len()].copy_from_slice(input);
+
+    // NOTE(review): `to_vec()` copies into a new allocation, so the returned Vec may not keep this alignment
+    aligned_start[..input.len()].to_vec()
+}
+
 #[inline(always)]
 fn bench_crc32(c: &mut Criterion) {
     let mut group = c.benchmark_group("CRC-32");
@@ -65,7 +87,7 @@ fn bench_crc32(c: &mut Criterion) {
     );
 
     for (size_name, size) in SIZES {
-        let buf = random_data(*size);
+        let buf = create_aligned_data(&*random_data(*size));
 
         let (part1, rest) = buf.split_at(buf.len() / 4);
         let (part2, rest) = rest.split_at(rest.len() / 3);
@@ -108,7 +130,7 @@ fn bench_crc64(c: &mut Criterion) {
     let mut group = c.benchmark_group("CRC-64");
 
     for (size_name, size) in SIZES {
-        let buf = random_data(*size);
+        let buf = create_aligned_data(&*random_data(*size));
 
         let (part1, rest) = buf.split_at(buf.len() / 4);
         let (part2, rest) = rest.split_at(rest.len() / 3);
@@ -122,6 +144,8 @@ fn bench_crc64(c: &mut Criterion) {
 
             group.throughput(Throughput::Bytes(*size as u64));
 
+            group.measurement_time(Duration::from_secs(60));
+
             let bench_name = [alg_suffix.unwrap(), "(checksum)"].join(" ");
 
             group.bench_function(BenchmarkId::new(bench_name, size_name), |b| {