Skip to content

Commit 2e0a9f0

Browse files
Initial commit.
0 parents  commit 2e0a9f0

File tree

11 files changed

+547
-0
lines changed

11 files changed

+547
-0
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
/target
2+
Cargo.lock

Cargo.toml

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
[package]
2+
name = "simd_benches"
3+
version = "0.1.0"
4+
edition = "2021"
5+
6+
[dependencies]
7+
core_simd = { git = "https://github.com/rust-lang/portable-simd" }
8+
packed_simd = { version = "0.3", package = "packed_simd_2" }
9+
10+
[dev-dependencies]
11+
criterion = "0.3"
12+
13+
[[bench]]
14+
name = "sum"
15+
harness = false
16+
17+
[[bench]]
18+
name = "sum_nulls"
19+
harness = false
20+
21+
[[bench]]
22+
name = "sum_nulls_bitmap"
23+
harness = false

README.md

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
# Benchmark Rust explicit simd
2+
3+
This repository contains benchmarks for common vertical and horizontal operations that
4+
leverage SIMD, comparing different implementations of the same algorithms
5+
using `packed_simd_2` and `core_simd`.
6+
7+
Things implemented:
8+
9+
* sum of values
10+
* sum of nullable values where nulls are represented as `Vec<bool>`
11+
* sum of nullable values where nulls are represented as `Bitmap`
12+
13+
Algorithms implemented:
14+
15+
* `core_simd`: vertical sum over lanes with a reduce at the end using `core_simd`
16+
* `packed_simd`: vertical sum over lanes with a reduce at the end using `packed_simd`
17+
* `nonsimd`: vertical sum over lanes with a reduce at the end using Rust arrays
18+
* `naive`: sum using rust iterators
19+
20+
## Bench results on my computer
21+
22+
### Sum of values
23+
24+
```
25+
core_simd_sum 2^20 f32 [184.95 us 185.86 us 186.97 us]
26+
packed_simd_sum 2^20 f32 [184.97 us 186.85 us 189.59 us]
27+
nonsimd_sum 2^20 f32 [191.35 us 192.67 us 194.46 us]
28+
naive_sum 2^20 f32 [1.6385 ms 1.6426 ms 1.6466 ms]
29+
```
30+
31+
### Sum of nullable values (`Vec<bool>`)
32+
33+
```
34+
core_simd_sum null 2^20 f32 [882.21 us 889.56 us 897.74 us]
35+
packed_simd_sum null 2^20 f32 [824.37 us 835.77 us 849.63 us]
36+
nonsimd_sum null 2^20 f32 [695.79 us 707.87 us 721.98 us]
37+
naive_sum null 2^20 f32 [1.6418 ms 1.6520 ms 1.6660 ms]
38+
```
39+
40+
### Sum of nullable values (`Bitmap`)
41+
42+
```
43+
core_simd_sum bitmap 2^20 f32 [929.95 us 936.31 us 943.64 us]
44+
nonsimd_sum bitmap 2^20 f32 [454.78 us 462.08 us 471.82 us]
45+
naive_sum bitmap 2^20 f32 [1.7633 ms 1.7736 ms 1.7855 ms]
46+
```
47+
48+
### Conclusions so far:
49+
50+
* for non-null sums, it is advantageous to use SIMD
51+
* for sums with nulls, it is not advantageous to use SIMD
52+
53+
## License
54+
55+
Licensed under either of
56+
57+
* Apache License, Version 2.0 ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0)
58+
* MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)
59+
60+
at your option.
61+
62+
### Contribution
63+
64+
Unless you explicitly state otherwise, any contribution intentionally submitted for inclusion in the work by you, as defined in the Apache-2.0 license, shall be dual licensed as above, without any additional terms or conditions.

benches/sum.rs

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
use criterion::{criterion_group, criterion_main, Criterion};
2+
3+
use simd_benches::sum::*;
4+
5+
/// Asserts that `l` and `r` agree to within a relative tolerance of `1e-4`.
///
/// The tolerance is scaled by the magnitude of `l` (`l.abs()`), so negative
/// values are compared correctly, and the comparison is inclusive so that two
/// exact zeros count as close.
///
/// # Panics
///
/// Panics when the absolute difference exceeds the tolerance, or when the
/// difference is NaN.
fn close(l: f32, r: f32) {
    let tolerance = l.abs() * 0.0001;
    assert!(
        (l - r).abs() <= tolerance,
        "{} and {} differ by more than {}",
        l,
        r,
        tolerance
    );
}
8+
9+
fn add_benchmark(c: &mut Criterion) {
10+
let name = "";
11+
(10..=20).step_by(2).for_each(|log2_size| {
12+
let size = 2usize.pow(log2_size);
13+
let array = (0..size)
14+
.map(|x| std::f32::consts::PI * x as f32 * x as f32 - std::f32::consts::PI * x as f32)
15+
.collect::<Vec<_>>();
16+
let result = naive_sum(&array);
17+
18+
c.bench_function(&format!("core_simd_sum{} 2^{} f32", name, log2_size), |b| {
19+
b.iter(|| close(core_simd_sum(&array), result))
20+
});
21+
c.bench_function(
22+
&format!("packed_simd_sum{} 2^{} f32", name, log2_size),
23+
|b| b.iter(|| close(packed_simd_sum(&array), result)),
24+
);
25+
c.bench_function(&format!("nonsimd_sum{} 2^{} f32", name, log2_size), |b| {
26+
b.iter(|| close(nonsimd_sum(&array), result))
27+
});
28+
c.bench_function(&format!("naive_sum{} 2^{} f32", name, log2_size), |b| {
29+
b.iter(|| close(naive_sum(&array), result))
30+
});
31+
});
32+
}
33+
34+
criterion_group!(benches, add_benchmark);
35+
criterion_main!(benches);

benches/sum_nulls.rs

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
use criterion::{criterion_group, criterion_main, Criterion};
2+
3+
use simd_benches::sum_nulls::*;
4+
5+
/// Asserts that `l` and `r` agree to within a relative tolerance of `1e-4`.
///
/// The tolerance is scaled by the magnitude of `l` (`l.abs()`), so negative
/// values are compared correctly, and the comparison is inclusive so that two
/// exact zeros count as close.
///
/// # Panics
///
/// Panics when the absolute difference exceeds the tolerance, or when the
/// difference is NaN.
fn close(l: f32, r: f32) {
    let tolerance = l.abs() * 0.0001;
    assert!(
        (l - r).abs() <= tolerance,
        "{} and {} differ by more than {}",
        l,
        r,
        tolerance
    );
}
8+
9+
fn add_benchmark(c: &mut Criterion) {
10+
let name = " null";
11+
(10..=20).step_by(2).for_each(|log2_size| {
12+
let size = 2usize.pow(log2_size);
13+
let array = (0..size)
14+
.map(|x| std::f32::consts::PI * x as f32 * x as f32 - std::f32::consts::PI * x as f32)
15+
.collect::<Vec<_>>();
16+
17+
let mask = (0..size).map(|x| x % 123 == 0).collect::<Vec<_>>();
18+
19+
let result = naive_sum(&array, &mask);
20+
21+
c.bench_function(&format!("core_simd_sum{} 2^{} f32", name, log2_size), |b| {
22+
b.iter(|| close(core_simd_sum(&array, &mask), result))
23+
});
24+
c.bench_function(
25+
&format!("packed_simd_sum{} 2^{} f32", name, log2_size),
26+
|b| b.iter(|| close(packed_simd_sum(&array, &mask), result)),
27+
);
28+
c.bench_function(&format!("nonsimd_sum{} 2^{} f32", name, log2_size), |b| {
29+
b.iter(|| close(nonsimd_sum(&array, &mask), result))
30+
});
31+
c.bench_function(&format!("naive_sum{} 2^{} f32", name, log2_size), |b| {
32+
b.iter(|| close(naive_sum(&array, &mask), result))
33+
});
34+
});
35+
}
36+
37+
criterion_group!(benches, add_benchmark);
38+
criterion_main!(benches);

benches/sum_nulls_bitmap.rs

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
use criterion::{criterion_group, criterion_main, Criterion};
2+
3+
use simd_benches::bitmap_ops;
4+
use simd_benches::sum_nulls_bitmap::*;
5+
6+
/// Asserts that `l` and `r` agree to within a relative tolerance of `1e-4`.
///
/// The tolerance is scaled by the magnitude of `l` (`l.abs()`), so negative
/// values are compared correctly, and the comparison is inclusive so that two
/// exact zeros count as close.
///
/// # Panics
///
/// Panics when the absolute difference exceeds the tolerance, or when the
/// difference is NaN.
fn close(l: f32, r: f32) {
    let tolerance = l.abs() * 0.0001;
    assert!(
        (l - r).abs() <= tolerance,
        "{} and {} differ by more than {}",
        l,
        r,
        tolerance
    );
}
9+
10+
fn add_benchmark(c: &mut Criterion) {
11+
let name = " bitmap";
12+
(10..=20).step_by(2).for_each(|log2_size| {
13+
let size = 2usize.pow(log2_size);
14+
let array = (0..size)
15+
.map(|x| std::f32::consts::PI * x as f32 * x as f32 - std::f32::consts::PI * x as f32)
16+
.collect::<Vec<_>>();
17+
18+
let mut mask = vec![0u8; size / 8];
19+
(0..size).for_each(|x| bitmap_ops::set_bit(&mut mask, x, x % 123 == 0));
20+
let mask = (mask, size);
21+
22+
let result = naive_sum(&array, &mask);
23+
24+
c.bench_function(&format!("core_simd_sum{} 2^{} f32", name, log2_size), |b| {
25+
b.iter(|| close(core_simd_sum(&array, &mask), result))
26+
});
27+
/*
28+
c.bench_function(
29+
&format!("packed_simd_sum{} 2^{} f32", name, log2_size),
30+
|b| b.iter(|| close(packed_simd_sum(&array, &mask), result)),
31+
);
32+
*/
33+
c.bench_function(&format!("nonsimd_sum{} 2^{} f32", name, log2_size), |b| {
34+
b.iter(|| close(nonsimd_sum(&array, &mask), result))
35+
});
36+
c.bench_function(&format!("naive_sum{} 2^{} f32", name, log2_size), |b| {
37+
b.iter(|| close(naive_sum(&array, &mask), result))
38+
});
39+
});
40+
}
41+
42+
criterion_group!(benches, add_benchmark);
43+
criterion_main!(benches);

src/bitmap_ops.rs

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
/// Single-bit masks: `BIT_MASK[i]` has only bit `i` set.
const BIT_MASK: [u8; 8] = [1, 2, 4, 8, 16, 32, 64, 128];
/// Complements of [`BIT_MASK`]: `UNSET_BIT_MASK[i]` has every bit set except bit `i`.
const UNSET_BIT_MASK: [u8; 8] = [
    255 - 1,
    255 - 2,
    255 - 4,
    255 - 8,
    255 - 16,
    255 - 32,
    255 - 64,
    255 - 128,
];

/// Returns whether bit at position `i` in `byte` is set or not
///
/// # Panics
///
/// Panics if `i >= 8`.
#[inline]
pub fn is_set(byte: u8, i: usize) -> bool {
    (byte & BIT_MASK[i]) != 0
}

/// Returns a copy of `byte` with the bit at position `i` set to `value`
///
/// The input is taken by value, so the result must be used for the change
/// to have any effect.
///
/// # Panics
///
/// Panics if `i >= 8`.
#[inline]
#[must_use]
pub fn set(byte: u8, i: usize, value: bool) -> u8 {
    if value {
        byte | BIT_MASK[i]
    } else {
        byte & UNSET_BIT_MASK[i]
    }
}

/// Sets the bit at position `i` in `data` to `value`
///
/// Bit `i` lives in byte `i / 8` at bit offset `i % 8`.
///
/// # Panics
///
/// Panics if `i / 8` is out of bounds for `data`.
#[inline]
pub fn set_bit(data: &mut [u8], i: usize, value: bool) {
    data[i / 8] = set(data[i / 8], i % 8, value);
}

/// Returns whether bit at position `i` in `data` is set or not
///
/// Bit `i` lives in byte `i / 8` at bit offset `i % 8`.
///
/// # Panics
///
/// Panics if `i / 8` is out of bounds for `data`.
#[inline]
pub fn get_bit(data: &[u8], i: usize) -> bool {
    is_set(data[i / 8], i % 8)
}

src/lib.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
#![feature(portable_simd)]
2+
3+
pub mod bitmap_ops;
4+
pub mod sum;
5+
pub mod sum_nulls;
6+
pub mod sum_nulls_bitmap;

src/sum.rs

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
use std::convert::TryInto;
2+
3+
use core_simd::f32x16;
4+
use packed_simd::f32x16 as p_f32x16;
5+
6+
const LANES: usize = 16;
7+
8+
/// Sums `values` using 16-lane `packed_simd` vectors.
///
/// The slice is consumed in chunks of `LANES` elements that are accumulated
/// lane-wise into a single vector; that vector is reduced with its horizontal
/// `sum`, and the elements that do not fill a whole chunk are added on top.
pub fn packed_simd_sum(values: &[f32]) -> f32 {
    let chunks = values.chunks_exact(LANES);
    let tail: f32 = chunks.remainder().iter().copied().sum();

    let mut lanes = p_f32x16::default();
    for chunk in chunks {
        // `chunks_exact` yields exactly `LANES` elements, so the conversion cannot fail.
        let block: [f32; 16] = chunk.try_into().unwrap();
        lanes = lanes + p_f32x16::from_slice_unaligned(&block);
    }

    lanes.sum() + tail
}
23+
24+
/// Sums `values` using 16-lane `core_simd` vectors.
///
/// The slice is consumed in chunks of `LANES` elements that are accumulated
/// lane-wise into a single vector; the lanes are then added up one by one in
/// index order, and the elements that do not fill a whole chunk are added on top.
pub fn core_simd_sum(values: &[f32]) -> f32 {
    let chunks = values.chunks_exact(LANES);
    let tail: f32 = chunks.remainder().iter().copied().sum();

    let mut lanes = f32x16::default();
    for chunk in chunks {
        // `chunks_exact` yields exactly `LANES` elements, so the conversion cannot fail.
        let block: [f32; 16] = chunk.try_into().unwrap();
        lanes = lanes + f32x16::from_array(block);
    }

    // Horizontal reduction, lane 0 first, starting from 0.0.
    let reduced = (0..LANES).fold(0.0f32, |total, i| total + lanes[i]);
    reduced + tail
}
43+
44+
pub fn nonsimd_sum(values: &[f32]) -> f32 {
45+
let chunks = values.chunks_exact(LANES);
46+
let remainder = chunks.remainder();
47+
48+
let sum = chunks.fold([0.0f32; LANES], |mut acc, chunk| {
49+
let chunk: [f32; LANES] = chunk.try_into().unwrap();
50+
for i in 0..LANES {
51+
acc[i] += chunk[i];
52+
}
53+
acc
54+
});
55+
56+
let remainder: f32 = remainder.iter().copied().sum();
57+
58+
let mut reduced = 0.0f32;
59+
(0..LANES).for_each(|i| {
60+
reduced += sum[i];
61+
});
62+
reduced + remainder
63+
}
64+
65+
/// Sums `values` front to back with the standard iterator `sum`.
pub fn naive_sum(values: &[f32]) -> f32 {
    let total: f32 = values.iter().copied().sum();
    total
}

0 commit comments

Comments
 (0)