diff --git a/Cargo.toml b/Cargo.toml index 4a59910..5f99f32 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,3 +21,11 @@ harness = false [[bench]] name = "sum_nulls_bitmap" harness = false + +[[bench]] +name = "take" +harness = false + +[[bench]] +name = "take_nulls_bitmap" +harness = false diff --git a/README.md b/README.md index b755b8a..4e33027 100644 --- a/README.md +++ b/README.md @@ -51,6 +51,20 @@ nonsimd_sum bitmap 2^20 f32 [541.78 us 545.16 us 549.09 us] naive_sum bitmap 2^20 f32 [1.6740 ms 1.6922 ms 1.7149 ms] ``` +### Take of values + +``` +core_simd_take 2^20 f32 time: [911.13 us 912.21 us 913.33 us] +naive_take 2^20 f32 time: [912.39 us 915.22 us 918.41 us] +``` + +### Nullable take of values (`Bitmap`) + +``` +core_simd_take_nulls 2^20 f32 time: [950.40 us 954.08 us 958.88 us] +naive_take_nulls 2^20 f32 time: [2.3714 ms 2.3968 ms 2.4296 ms] +``` + ## Bench results on default Command: diff --git a/benches/take.rs b/benches/take.rs new file mode 100644 index 0000000..660b24e --- /dev/null +++ b/benches/take.rs @@ -0,0 +1,34 @@ +use criterion::{criterion_group, criterion_main, Criterion}; + +use simd_benches::take::*; + +fn close(l: &[f32], r: &[f32]) { + for (l, r) in l.iter().zip(r.iter()) { + assert!((l - r).abs() < l * 0.001); + } +} + +fn add_benchmark(c: &mut Criterion) { + let name = ""; + (10..=20).step_by(2).for_each(|log2_size| { + let size = 2usize.pow(log2_size); + let array = (0..size).map(|x| 1.0 + x as f32).collect::<Vec<_>>(); + let indices = (0..size).collect::<Vec<_>>(); + // check that they are equal... 
+ close( + &core_simd_take(&array, &indices), + &naive_take(&array, &indices), + ); + + c.bench_function( + &format!("core_simd_take{} 2^{} f32", name, log2_size), + |b| b.iter(|| core_simd_take(&array, &indices)), + ); + c.bench_function(&format!("naive_take{} 2^{} f32", name, log2_size), |b| { + b.iter(|| naive_take(&array, &indices)) + }); + }); +} + +criterion_group!(benches, add_benchmark); +criterion_main!(benches); diff --git a/benches/take_nulls_bitmap.rs b/benches/take_nulls_bitmap.rs new file mode 100644 index 0000000..c3a8058 --- /dev/null +++ b/benches/take_nulls_bitmap.rs @@ -0,0 +1,40 @@ +use criterion::{criterion_group, criterion_main, Criterion}; + +use simd_benches::bitmap_ops; +use simd_benches::take::*; + +fn close(l: &[f32], r: &[f32]) { + for (l, r) in l.iter().zip(r.iter()) { + assert!((l - r).abs() < l * 0.001 || (l.abs() < 0.000001 && r.abs() < 0.000001)); + } +} + +fn add_benchmark(c: &mut Criterion) { + let name = ""; + (10..=20).step_by(2).for_each(|log2_size| { + let size = 2usize.pow(log2_size); + let array = (0..size).map(|x| 1.0 + x as f32).collect::<Vec<_>>(); + let mut mask = vec![0u8; size / 8]; + // 10% nulls + (0..size).for_each(|x| bitmap_ops::set_bit(&mut mask, x, (1 + x) % 10 != 0)); + let mask = (mask, size); + let indices = (0..size).collect::<Vec<_>>(); + // check that they are equal... 
+ close( + &core_simd_take_nulls(&array, &indices, &mask), + &naive_take_nulls(&array, &indices, &mask), + ); + + c.bench_function( + &format!("core_simd_take_nulls{} 2^{} f32", name, log2_size), + |b| b.iter(|| core_simd_take_nulls(&array, &indices, &mask)), + ); + c.bench_function( + &format!("naive_take_nulls{} 2^{} f32", name, log2_size), + |b| b.iter(|| naive_take_nulls(&array, &indices, &mask)), + ); + }); +} + +criterion_group!(benches, add_benchmark); +criterion_main!(benches); diff --git a/src/lib.rs b/src/lib.rs index 101f6f8..01a5c31 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -4,3 +4,4 @@ pub mod bitmap_ops; pub mod sum; pub mod sum_nulls; pub mod sum_nulls_bitmap; +pub mod take; diff --git a/src/take.rs b/src/take.rs new file mode 100644 index 0000000..9245403 --- /dev/null +++ b/src/take.rs @@ -0,0 +1,70 @@ +use core_simd::*; + +use super::bitmap_ops::*; + +pub fn naive_take(values: &[f32], indices: &[usize]) -> Vec<f32> { + indices.iter().map(|i| values[*i]).collect() +} + +const LANES: usize = 8; +const MASK_LANES: usize = 8 / 8; + +pub fn core_simd_take(values: &[f32], indices: &[usize]) -> Vec<f32> { + let chunks = indices.chunks_exact(LANES); + // todo handle remainder + + let mut result = vec![0.0; indices.len()]; // todo: maybeUninit + let result_chunks = result.chunks_exact_mut(LANES); + chunks.zip(result_chunks).for_each(|(chunk, r_chunk)| { + let idxs: [usize; LANES] = chunk.try_into().unwrap(); + let idxs: usizex8 = usizex8::from_array(idxs); + + let r = Simd::gather_or_default(&values, idxs); + let r: [f32; LANES] = r.to_array(); + + let r_chunk: &mut [f32; LANES] = r_chunk.try_into().unwrap(); + *r_chunk = r; + }); + + result +} + +type Bitmap = (Vec<u8>, usize); + +pub fn naive_take_nulls(values: &[f32], indices: &[usize], mask: &Bitmap) -> Vec<f32> { + let mask = (0..mask.1).map(|x| get_bit(&mask.0, x)); + + indices + .iter() + .zip(mask) + .map(|(x, m)| if m { values[*x] } else { 0.0f32 }) + .collect() +} + +pub fn core_simd_take_nulls(values: &[f32], 
indices: &[usize], mask: &Bitmap) -> Vec<f32> { + assert_eq!(mask.1 % 16, 0); // todo: handle remainders + let chunks = indices.chunks_exact(LANES); + let mask_chunks = mask.0.chunks_exact(MASK_LANES); + //let remainder = chunks.remainder(); + //let mask_remainder = mask_chunks.remainder(); + + let mut result = vec![0.0; indices.len()]; // todo: maybeUninit + let result_chunks = result.chunks_exact_mut(LANES); + chunks + .zip(mask_chunks) + .zip(result_chunks) + .for_each(|((chunk, mask_chunk), r_chunk)| { + let idxs: [usize; LANES] = chunk.try_into().unwrap(); + let idxs: usizex8 = usizex8::from_array(idxs); + + let mask: [u8; MASK_LANES] = mask_chunk.try_into().unwrap(); + let mask = masksizex8::from_bitmask(mask); + + let r = Simd::gather_select(&values, mask, idxs, Simd::splat(f32::default())); + let r: [f32; LANES] = r.to_array(); + + let r_chunk: &mut [f32; LANES] = r_chunk.try_into().unwrap(); + *r_chunk = r; + }); + result +}