Initial commit.

jorgecarleitao · jorgecarleitao · commit 2e0a9f0b6ce7 · 2021-11-11T05:54:35.000Z
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,2 @@
+/target
+Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -0,0 +1,23 @@
+[package]
+name = "simd_benches"
+version = "0.1.0"
+edition = "2021"
+
+[dependencies]
+core_simd = { git = "https://github.com/rust-lang/portable-simd" }
+packed_simd = { version = "0.3", package = "packed_simd_2" }
+
+[dev-dependencies]
+criterion = "0.3"
+
+[[bench]]
+name = "sum"
+harness = false
+
+[[bench]]
+name = "sum_nulls"
+harness = false
+
+[[bench]]
+name = "sum_nulls_bitmap"
+harness = false
diff --git a/README.md b/README.md
@@ -0,0 +1,64 @@
+# Benchmark Rust explicit simd
+
+This repository contains benchmarks for common vertical and horizontal operations that
+leverage SIMD, comparing different implementations of the same algorithms,
+written using `packed_simd_2` and `core_simd`.
+
+Things implemented:
+
+* sum of values
+* sum of nullable values where nulls are represented as `Vec<bool>`
+* sum of nullable values where nulls are represented as `Bitmap`
+
+Algorithms implemented:
+
+* `core_simd`: vertical sum over lanes with a reduce at the end using `core_simd`
+* `packed_simd`: vertical sum over lanes with a reduce at the end using `packed_simd`
+* `nonsimd`: vertical sum over lanes with a reduce at the end using Rust arrays
+* `naive`: sum using rust iterators
+
+## Bench results on my computer
+
+### Sum of values
+
+```
+core_simd_sum 2^20 f32     [184.95 us 185.86 us 186.97 us]
+packed_simd_sum 2^20 f32   [184.97 us 186.85 us 189.59 us]
+nonsimd_sum 2^20 f32       [191.35 us 192.67 us 194.46 us]
+naive_sum 2^20 f32         [1.6385 ms 1.6426 ms 1.6466 ms]
+```
+
+### Sum of nullable values (`Vec<bool>`)
+
+```
+core_simd_sum null 2^20 f32   [882.21 us 889.56 us 897.74 us]
+packed_simd_sum null 2^20 f32 [824.37 us 835.77 us 849.63 us]
+nonsimd_sum null 2^20 f32     [695.79 us 707.87 us 721.98 us]
+naive_sum null 2^20 f32       [1.6418 ms 1.6520 ms 1.6660 ms]
+```
+
+### Sum of nullable values (`Bitmap`)
+
+```
+core_simd_sum bitmap 2^20 f32  [929.95 us 936.31 us 943.64 us]
+nonsimd_sum bitmap 2^20 f32    [454.78 us 462.08 us 471.82 us]
+naive_sum bitmap 2^20 f32      [1.7633 ms 1.7736 ms 1.7855 ms]
+```
+
+### Conclusions so far:
+
+* for non-null sums, it is advantageous to use SIMD
+* for sums with nulls, it is not advantageous to use SIMD
+
+## License
+
+Licensed under either of
+
+ * Apache License, Version 2.0 ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0)
+ * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)
+
+at your option.
+
+### Contribution
+
+Unless you explicitly state otherwise, any contribution intentionally submitted for inclusion in the work by you, as defined in the Apache-2.0 license, shall be dual licensed as above, without any additional terms or conditions.
diff --git a/benches/sum.rs b/benches/sum.rs
@@ -0,0 +1,35 @@
+use criterion::{criterion_group, criterion_main, Criterion};
+
+use simd_benches::sum::*;
+
+/// Asserts that `l` and `r` agree to within a relative tolerance of 0.01% of `|l|`.
+///
+/// The tolerance is scaled by the magnitude of `l` so that negative values are compared
+/// correctly, and the comparison is inclusive so that two exactly equal values
+/// (including `0.0` and `0.0`) are accepted.
+///
+/// # Panics
+///
+/// Panics if the values differ by more than the tolerance, or if either value is NaN.
+fn close(l: f32, r: f32) {
+    let tolerance = l.abs() * 0.0001;
+    assert!(
+        (l - r).abs() <= tolerance,
+        "{} and {} differ by more than {}",
+        l,
+        r,
+        tolerance
+    );
+}
+
+/// Registers the plain-sum benchmarks with criterion.
+///
+/// For every input length 2^10, 2^12, ..., 2^20 an `f32` array is generated and each
+/// implementation is benchmarked on it. Every iteration also checks the produced sum
+/// against the value `naive_sum` returned for the same array, using `close`.
+fn add_benchmark(c: &mut Criterion) {
+    let name = "";
+    for log2_size in (10..=20).step_by(2) {
+        let size = 2usize.pow(log2_size);
+        let pi = std::f32::consts::PI;
+        let array: Vec<f32> = (0..size)
+            .map(|x| pi * x as f32 * x as f32 - pi * x as f32)
+            .collect();
+        // Reference value every implementation is compared against.
+        let expected = naive_sum(&array);
+
+        let id = format!("core_simd_sum{} 2^{} f32", name, log2_size);
+        c.bench_function(&id, |b| {
+            b.iter(|| close(core_simd_sum(&array), expected))
+        });
+
+        let id = format!("packed_simd_sum{} 2^{} f32", name, log2_size);
+        c.bench_function(&id, |b| {
+            b.iter(|| close(packed_simd_sum(&array), expected))
+        });
+
+        let id = format!("nonsimd_sum{} 2^{} f32", name, log2_size);
+        c.bench_function(&id, |b| {
+            b.iter(|| close(nonsimd_sum(&array), expected))
+        });
+
+        let id = format!("naive_sum{} 2^{} f32", name, log2_size);
+        c.bench_function(&id, |b| {
+            b.iter(|| close(naive_sum(&array), expected))
+        });
+    }
+}
+
+criterion_group!(benches, add_benchmark);
+criterion_main!(benches);
diff --git a/benches/sum_nulls.rs b/benches/sum_nulls.rs
@@ -0,0 +1,38 @@
+use criterion::{criterion_group, criterion_main, Criterion};
+
+use simd_benches::sum_nulls::*;
+
+/// Asserts that `l` and `r` agree to within a relative tolerance of 0.01% of `|l|`.
+///
+/// The tolerance is scaled by the magnitude of `l` so that negative values are compared
+/// correctly, and the comparison is inclusive so that two exactly equal values
+/// (including `0.0` and `0.0`) are accepted.
+///
+/// # Panics
+///
+/// Panics if the values differ by more than the tolerance, or if either value is NaN.
+fn close(l: f32, r: f32) {
+    let tolerance = l.abs() * 0.0001;
+    assert!(
+        (l - r).abs() <= tolerance,
+        "{} and {} differ by more than {}",
+        l,
+        r,
+        tolerance
+    );
+}
+
+/// Registers the masked-sum (`Vec<bool>` mask) benchmarks with criterion.
+///
+/// For every input length 2^10, 2^12, ..., 2^20 an `f32` array and a boolean mask of the
+/// same length (`true` at indices divisible by 123) are generated, and each implementation
+/// is benchmarked on them. Every iteration also checks the produced sum against the value
+/// `naive_sum` returned for the same inputs, using `close`.
+fn add_benchmark(c: &mut Criterion) {
+    let name = " null";
+    for log2_size in (10..=20).step_by(2) {
+        let size = 2usize.pow(log2_size);
+        let pi = std::f32::consts::PI;
+        let array: Vec<f32> = (0..size)
+            .map(|x| pi * x as f32 * x as f32 - pi * x as f32)
+            .collect();
+        let mask: Vec<bool> = (0..size).map(|x| x % 123 == 0).collect();
+
+        // Reference value every implementation is compared against.
+        let expected = naive_sum(&array, &mask);
+
+        let id = format!("core_simd_sum{} 2^{} f32", name, log2_size);
+        c.bench_function(&id, |b| {
+            b.iter(|| close(core_simd_sum(&array, &mask), expected))
+        });
+
+        let id = format!("packed_simd_sum{} 2^{} f32", name, log2_size);
+        c.bench_function(&id, |b| {
+            b.iter(|| close(packed_simd_sum(&array, &mask), expected))
+        });
+
+        let id = format!("nonsimd_sum{} 2^{} f32", name, log2_size);
+        c.bench_function(&id, |b| {
+            b.iter(|| close(nonsimd_sum(&array, &mask), expected))
+        });
+
+        let id = format!("naive_sum{} 2^{} f32", name, log2_size);
+        c.bench_function(&id, |b| {
+            b.iter(|| close(naive_sum(&array, &mask), expected))
+        });
+    }
+}
+
+criterion_group!(benches, add_benchmark);
+criterion_main!(benches);
diff --git a/benches/sum_nulls_bitmap.rs b/benches/sum_nulls_bitmap.rs
@@ -0,0 +1,43 @@
+use criterion::{criterion_group, criterion_main, Criterion};
+
+use simd_benches::bitmap_ops;
+use simd_benches::sum_nulls_bitmap::*;
+
+/// Asserts that `l` and `r` agree to within a relative tolerance of 0.01% of `|l|`.
+///
+/// The tolerance is scaled by the magnitude of `l` so that negative values are compared
+/// correctly, and the comparison is inclusive so that two exactly equal values
+/// (including `0.0` and `0.0`) are accepted.
+///
+/// # Panics
+///
+/// Panics if the values differ by more than the tolerance, or if either value is NaN.
+fn close(l: f32, r: f32) {
+    let tolerance = l.abs() * 0.0001;
+    assert!(
+        (l - r).abs() <= tolerance,
+        "{} and {} differ by more than {}",
+        l,
+        r,
+        tolerance
+    );
+}
+
+/// Registers the masked-sum (bitmap mask) benchmarks with criterion.
+///
+/// For every input length 2^10, 2^12, ..., 2^20 an `f32` array and a bitmap with one bit
+/// per element (bit set at indices divisible by 123) are generated, and each implementation
+/// is benchmarked on them. The mask is passed as a `(bytes, length_in_bits)` tuple. Every
+/// iteration also checks the produced sum against the value `naive_sum` returned for the
+/// same inputs, using `close`.
+fn add_benchmark(c: &mut Criterion) {
+    let name = " bitmap";
+    for log2_size in (10..=20).step_by(2) {
+        let size = 2usize.pow(log2_size);
+        let pi = std::f32::consts::PI;
+        let array: Vec<f32> = (0..size)
+            .map(|x| pi * x as f32 * x as f32 - pi * x as f32)
+            .collect();
+
+        // Round the byte count up so that every one of the `size` bits has a backing byte,
+        // even if `size` is ever changed to something that is not a multiple of 8.
+        let mut mask = vec![0u8; (size + 7) / 8];
+        for x in 0..size {
+            bitmap_ops::set_bit(&mut mask, x, x % 123 == 0);
+        }
+        let mask = (mask, size);
+
+        // Reference value every implementation is compared against.
+        let result = naive_sum(&array, &mask);
+
+        c.bench_function(&format!("core_simd_sum{} 2^{} f32", name, log2_size), |b| {
+            b.iter(|| close(core_simd_sum(&array, &mask), result))
+        });
+        // NOTE(review): the `packed_simd` variant is disabled here — presumably it is not
+        // implemented for bitmap masks yet; confirm before re-enabling.
+        /*
+        c.bench_function(
+            &format!("packed_simd_sum{} 2^{} f32", name, log2_size),
+            |b| b.iter(|| close(packed_simd_sum(&array, &mask), result)),
+        );
+         */
+        c.bench_function(&format!("nonsimd_sum{} 2^{} f32", name, log2_size), |b| {
+            b.iter(|| close(nonsimd_sum(&array, &mask), result))
+        });
+        c.bench_function(&format!("naive_sum{} 2^{} f32", name, log2_size), |b| {
+            b.iter(|| close(naive_sum(&array, &mask), result))
+        });
+    }
+}
+
+criterion_group!(benches, add_benchmark);
+criterion_main!(benches);
diff --git a/src/bitmap_ops.rs b/src/bitmap_ops.rs
@@ -0,0 +1,39 @@
+/// Single-bit masks: entry `i` has only bit `i` set.
+const BIT_MASK: [u8; 8] = [
+    1 << 0,
+    1 << 1,
+    1 << 2,
+    1 << 3,
+    1 << 4,
+    1 << 5,
+    1 << 6,
+    1 << 7,
+];
+/// Complements of the single-bit masks: entry `i` has every bit set except bit `i`.
+const UNSET_BIT_MASK: [u8; 8] = [!1, !2, !4, !8, !16, !32, !64, !128];
+
+/// Reports whether the bit at position `i` of `byte` is set.
+///
+/// `i` is used as an index into the 8-entry `BIT_MASK` table, so indexing panics when
+/// `i` is 8 or greater.
+#[inline]
+pub fn is_set(byte: u8, i: usize) -> bool {
+    let mask = BIT_MASK[i];
+    (byte & mask) != 0
+}
+
+/// Returns a copy of `byte` with the bit at position `i` set when `value` is `true`
+/// and cleared when `value` is `false`.
+///
+/// `i` is used as an index into 8-entry mask tables, so indexing panics when `i` is
+/// 8 or greater.
+#[inline]
+pub fn set(byte: u8, i: usize, value: bool) -> u8 {
+    if !value {
+        // Clearing: keep every bit except bit `i`.
+        return byte & UNSET_BIT_MASK[i];
+    }
+    byte | BIT_MASK[i]
+}
+
+/// Sets the bit at position `i` in `data` to `value`.
+///
+/// Bit `i` lives in byte `i / 8` at bit offset `i % 8`.
+///
+/// # Panics
+///
+/// Panics if `i / 8` is out of bounds for `data`.
+#[inline]
+pub fn set_bit(data: &mut [u8], i: usize, value: bool) {
+    let byte = &mut data[i / 8];
+    *byte = set(*byte, i % 8, value);
+}
+
+/// Reports whether the bit at position `i` in `data` is set.
+///
+/// Bit `i` is read from byte `i / 8` at bit offset `i % 8`; indexing panics if that
+/// byte is out of bounds for `data`.
+#[inline]
+pub fn get_bit(data: &[u8], i: usize) -> bool {
+    let (byte_index, bit_index) = (i / 8, i % 8);
+    is_set(data[byte_index], bit_index)
+}
diff --git a/src/lib.rs b/src/lib.rs
@@ -0,0 +1,6 @@
+#![feature(portable_simd)]
+
+pub mod bitmap_ops;
+pub mod sum;
+pub mod sum_nulls;
+pub mod sum_nulls_bitmap;
diff --git a/src/sum.rs b/src/sum.rs
@@ -0,0 +1,67 @@
+use std::convert::TryInto;
+
+use core_simd::f32x16;
+use packed_simd::f32x16 as p_f32x16;
+
+const LANES: usize = 16;
+
+/// Sums `values` using the 16-lane `f32` vector type from `packed_simd`.
+///
+/// Complete `LANES`-element chunks are added lane by lane into a vector accumulator,
+/// which is reduced with the vector's `sum()` at the end. Elements left over after the
+/// last complete chunk are summed separately and added to the result.
+pub fn packed_simd_sum(values: &[f32]) -> f32 {
+    let blocks = values.chunks_exact(LANES);
+    let tail: f32 = blocks.remainder().iter().copied().sum();
+
+    let mut acc = p_f32x16::default();
+    for block in blocks {
+        // `chunks_exact` only yields slices of exactly `LANES` elements, so this
+        // conversion cannot fail.
+        let lanes: [f32; LANES] = block.try_into().unwrap();
+        acc = acc + p_f32x16::from_slice_unaligned(&lanes);
+    }
+
+    acc.sum() + tail
+}
+
+/// Sums `values` using the 16-lane `f32` vector type from `core_simd`.
+///
+/// Complete `LANES`-element chunks are added lane by lane into a vector accumulator.
+/// The accumulator's lanes are then added up one after another, in index order, and the
+/// elements left over after the last complete chunk are summed separately and added
+/// to the result.
+pub fn core_simd_sum(values: &[f32]) -> f32 {
+    let blocks = values.chunks_exact(LANES);
+    let tail: f32 = blocks.remainder().iter().copied().sum();
+
+    let mut acc = f32x16::default();
+    for block in blocks {
+        // `chunks_exact` only yields slices of exactly `LANES` elements, so this
+        // conversion cannot fail.
+        let lanes: [f32; LANES] = block.try_into().unwrap();
+        acc = acc + f32x16::from_array(lanes);
+    }
+
+    // Sequential reduction over the lanes, starting from 0.0.
+    let reduced = (0..LANES).fold(0.0f32, |total, lane| total + acc[lane]);
+    reduced + tail
+}
+
+/// Sums `values` with the same lane-wise scheme as the SIMD versions, using a plain
+/// `[f32; LANES]` array as the accumulator.
+///
+/// Complete `LANES`-element chunks are added element by element into the accumulator
+/// array. The accumulator's entries are then added up one after another, in index order,
+/// and the elements left over after the last complete chunk are summed separately and
+/// added to the result.
+pub fn nonsimd_sum(values: &[f32]) -> f32 {
+    let blocks = values.chunks_exact(LANES);
+    let tail: f32 = blocks.remainder().iter().copied().sum();
+
+    let mut acc = [0.0f32; LANES];
+    for block in blocks {
+        // `chunks_exact` only yields slices of exactly `LANES` elements, so this
+        // conversion cannot fail.
+        let block: [f32; LANES] = block.try_into().unwrap();
+        for lane in 0..LANES {
+            acc[lane] += block[lane];
+        }
+    }
+
+    // Sequential reduction over the lanes, starting from 0.0.
+    let reduced = (0..LANES).fold(0.0f32, |total, lane| total + acc[lane]);
+    reduced + tail
+}
+
+/// Sums `values` front to back with the standard iterator `sum`.
+///
+/// This is the straightforward baseline the other implementations are compared against.
+pub fn naive_sum(values: &[f32]) -> f32 {
+    let total: f32 = values.iter().copied().sum();
+    total
+}
diff --git a/src/sum_nulls.rs b/src/sum_nulls.rs
diff --git a/src/sum_nulls_bitmap.rs b/src/sum_nulls_bitmap.rs