Skip to content

Commit 3b9f233

Browse files
authored
perf: optimize expand (#5285)
prev: ``` ╰─ expand_selectivity │ │ │ │ │ ├─ u8 │ │ │ │ │ │ ├─ 0.01 749.7 ns │ 2.52 µs │ 838.4 ns │ 841.1 ns │ 1000 │ 8000 │ ├─ 0.1 739.2 ns │ 2.182 µs │ 833.1 ns │ 835.9 ns │ 1000 │ 8000 │ ├─ 0.2 728.9 ns │ 1.796 µs │ 817.5 ns │ 819.9 ns │ 1000 │ 8000 │ ├─ 0.3 723.6 ns │ 2.635 µs │ 812.2 ns │ 810.9 ns │ 1000 │ 8000 │ ├─ 0.4 708 ns │ 6.885 µs │ 801.7 ns │ 803.6 ns │ 1000 │ 4000 │ ├─ 0.5 687.2 ns │ 3.541 µs │ 781 ns │ 770.9 ns │ 1000 │ 4000 │ ├─ 0.6 682 ns │ 2.072 µs │ 765.4 ns │ 756.5 ns │ 1000 │ 8000 │ ├─ 0.7 661.2 ns │ 2.354 µs │ 744.6 ns │ 742.9 ns │ 1000 │ 8000 │ ├─ 0.8 650.7 ns │ 2.104 µs │ 729 ns │ 733.6 ns │ 1000 │ 8000 │ ├─ 0.9 630 ns │ 1.421 µs │ 713.4 ns │ 710.9 ns │ 1000 │ 8000 │ ╰─ 0.99 598.7 ns │ 2.197 µs │ 676.9 ns │ 660.7 ns │ 1000 │ 8000 ├─ u32 │ │ │ │ │ │ ├─ 0.01 760 ns │ 1.645 µs │ 854 ns │ 852.6 ns │ 1000 │ 4000 │ ├─ 0.1 760 ns │ 3.656 µs │ 854 ns │ 862.5 ns │ 1000 │ 4000 │ ├─ 0.2 749.7 ns │ 2.567 µs │ 848.7 ns │ 850.2 ns │ 1000 │ 8000 │ ├─ 0.3 728.9 ns │ 2.291 µs │ 827.9 ns │ 827.7 ns │ 1000 │ 8000 │ ├─ 0.4 708 ns │ 2.176 µs │ 801.7 ns │ 797.4 ns │ 1000 │ 8000 │ ├─ 0.5 687.2 ns │ 2.208 µs │ 775.7 ns │ 767.6 ns │ 1000 │ 8000 │ ├─ 0.6 676.9 ns │ 2.432 µs │ 770.5 ns │ 757.1 ns │ 1000 │ 8000 │ ├─ 0.7 656 ns │ 2.255 µs │ 749.7 ns │ 739.2 ns │ 1000 │ 8000 │ ├─ 0.8 650.7 ns │ 1.697 µs │ 734.1 ns │ 725.9 ns │ 1000 │ 8000 │ ├─ 0.9 629.9 ns │ 3.629 µs │ 713.2 ns │ 691.3 ns │ 1000 │ 8000 │ ╰─ 0.99 604 ns │ 2.984 µs │ 687.2 ns │ 672.7 ns │ 1000 │ 8000 ╰─ u64 │ │ │ │ │ ├─ 0.01 546.6 ns │ 1.885 µs │ 635.1 ns │ 635.8 ns │ 1000 │ 8000 ├─ 0.1 572.6 ns │ 1.546 µs │ 661.2 ns │ 659.5 ns │ 1000 │ 8000 ├─ 0.2 572.6 ns │ 3.031 µs │ 661.2 ns │ 661.3 ns │ 1000 │ 8000 ├─ 0.3 577.9 ns │ 2.447 µs │ 666.4 ns │ 666.8 ns │ 1000 │ 8000 ├─ 0.4 583.1 ns │ 1.791 µs │ 666.4 ns │ 655 ns │ 1000 │ 8000 ├─ 0.5 598.6 ns │ 2.265 µs │ 687.2 ns │ 682.3 ns │ 1000 │ 8000 ├─ 0.6 598.7 ns │ 2.296 µs │ 687.2 ns │ 685.9 ns │ 1000 │ 8000 ├─ 0.7 614.2 ns │ 2.114 µs │ 692.5 ns │ 691.3 ns │ 1000 │ 
8000 ├─ 0.8 609.1 ns │ 2.171 µs │ 697.6 ns │ 688.8 ns │ 1000 │ 8000 ├─ 0.9 614.2 ns │ 2.093 µs │ 697.6 ns │ 687.9 ns │ 1000 │ 8000 ╰─ 0.99 609.1 ns │ 2.468 µs │ 692.4 ns │ 683.3 ns │ 1000 │ 8000 ``` now: ``` expand_buffer fastest │ slowest │ median │ mean │ samples │ iters ╰─ expand_buffer │ │ │ │ │ ├─ u8 │ │ │ │ │ │ ├─ (256, 0.1) 73.33 ns │ 222.4 ns │ 82.46 ns │ 81.68 ns │ 1000 │ 64000 │ ├─ (256, 0.5) 79.85 ns │ 273.8 ns │ 90.91 ns │ 89.55 ns │ 1000 │ 64000 │ ├─ (256, 0.9) 89.61 ns │ 390.4 ns │ 96.11 ns │ 98.66 ns │ 1000 │ 32000 │ ├─ (1024, 0.1) 161.2 ns │ 238 ns │ 182 ns │ 176.4 ns │ 1000 │ 32000 │ ├─ (1024, 0.5) 195 ns │ 463.3 ns │ 210.7 ns │ 215.1 ns │ 1000 │ 32000 │ ├─ (1024, 0.9) 236.7 ns │ 1.692 µs │ 247.2 ns │ 260.1 ns │ 1000 │ 16000 │ ├─ (4096, 0.1) 520.5 ns │ 1.228 µs │ 588.2 ns │ 570.5 ns │ 1000 │ 8000 │ ├─ (4096, 0.5) 702.9 ns │ 1.067 µs │ 718.5 ns │ 750.1 ns │ 1000 │ 8000 │ ├─ (4096, 0.9) 885.1 ns │ 2.546 µs │ 895.6 ns │ 930.1 ns │ 1000 │ 8000 │ ├─ (16384, 0.1) 2.041 µs │ 9.291 µs │ 2.291 µs │ 2.22 µs │ 1000 │ 2000 │ ├─ (16384, 0.5) 2.749 µs │ 10.83 µs │ 2.895 µs │ 3.002 µs │ 1000 │ 2000 │ ╰─ (16384, 0.9) 3.457 µs │ 6.332 µs │ 3.541 µs │ 3.679 µs │ 1000 │ 1000 ├─ u32 │ │ │ │ │ │ ├─ (256, 0.1) 57.06 ns │ 116.6 ns │ 64.55 ns │ 63.1 ns │ 1000 │ 128000 │ ├─ (256, 0.5) 61.61 ns │ 92.21 ns │ 69.44 ns │ 67.91 ns │ 1000 │ 64000 │ ├─ (256, 0.9) 75.29 ns │ 284.2 ns │ 77.9 ns │ 81.76 ns │ 1000 │ 64000 │ ├─ (1024, 0.1) 146.9 ns │ 515.4 ns │ 165.1 ns │ 161.9 ns │ 1000 │ 32000 │ ├─ (1024, 0.5) 193.7 ns │ 292.7 ns │ 204.2 ns │ 211.2 ns │ 1000 │ 32000 │ ├─ (1024, 0.9) 239.3 ns │ 2.669 µs │ 244.5 ns │ 259.7 ns │ 1000 │ 16000 │ ├─ (4096, 0.1) 520.6 ns │ 3.687 µs │ 538.8 ns │ 570.3 ns │ 1000 │ 8000 │ ├─ (4096, 0.5) 697.5 ns │ 4.614 µs │ 728.7 ns │ 763.2 ns │ 1000 │ 4000 │ ├─ (4096, 0.9) 885 ns │ 4.624 µs │ 906 ns │ 950.2 ns │ 1000 │ 4000 │ ├─ (16384, 0.1) 2.207 µs │ 5.041 µs │ 2.479 µs │ 2.411 µs │ 1000 │ 1000 │ ├─ (16384, 0.5) 2.812 µs │ 10.49 µs │ 2.874 µs │ 2.994 µs │ 
1000 │ 2000 │ ╰─ (16384, 0.9) 3.499 µs │ 40.16 µs │ 3.583 µs │ 3.765 µs │ 1000 │ 1000 ╰─ u64 │ │ │ │ │ ├─ (256, 0.1) 56.41 ns │ 104.9 ns │ 63.57 ns │ 62.17 ns │ 1000 │ 128000 ├─ (256, 0.5) 62.91 ns │ 95.47 ns │ 71.38 ns │ 69.82 ns │ 1000 │ 64000 ├─ (256, 0.9) 77.9 ns │ 230.9 ns │ 88.32 ns │ 85.89 ns │ 1000 │ 64000 ├─ (1024, 0.1) 145.6 ns │ 348.7 ns │ 166.4 ns │ 161.5 ns │ 1000 │ 32000 ├─ (1024, 0.5) 192.4 ns │ 291.4 ns │ 197.7 ns │ 206.2 ns │ 1000 │ 16000 ├─ (1024, 0.9) 244.5 ns │ 892.9 ns │ 252.3 ns │ 267.6 ns │ 1000 │ 16000 ├─ (4096, 0.1) 525.7 ns │ 2.541 µs │ 588.4 ns │ 613 ns │ 1000 │ 8000 ├─ (4096, 0.5) 713.2 ns │ 2.598 µs │ 728.9 ns │ 781.7 ns │ 1000 │ 8000 ├─ (4096, 0.9) 895.5 ns │ 4.593 µs │ 926.7 ns │ 987.1 ns │ 1000 │ 4000 ├─ (16384, 0.1) 2.416 µs │ 9.708 µs │ 2.749 µs │ 2.75 µs │ 1000 │ 1000 ├─ (16384, 0.5) 2.978 µs │ 9.291 µs │ 3.041 µs │ 3.289 µs │ 1000 │ 2000 ╰─ (16384, 0.9) 3.54 µs │ 14.74 µs │ 3.624 µs │ 3.932 µs │ 1000 │ 1000 ``` --------- Signed-off-by: Alexander Droste <[email protected]>
1 parent ecb43e5 commit 3b9f233

File tree

6 files changed

+171
-198
lines changed

6 files changed

+171
-198
lines changed

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

vortex-buffer/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ workspace = true
4343
[dev-dependencies]
4444
arrow-buffer = { workspace = true }
4545
divan = { workspace = true }
46+
rstest = { workspace = true }
4647

4748
[[bench]]
4849
name = "vortex_buffer"

vortex-buffer/src/bit/buf.rs

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -406,6 +406,67 @@ impl BitBuffer {
406406
pub fn bitand_not(&self, rhs: &BitBuffer) -> BitBuffer {
407407
bitwise_binary_op(self, rhs, |a, b| a & !b)
408408
}
409+
410+
/// Iterate through bits in a buffer.
411+
///
412+
/// # Arguments
413+
///
414+
/// * `f` - Callback function taking (bit_index, is_set)
415+
///
416+
/// # Panics
417+
///
418+
/// Panics if the range is outside valid bounds of the buffer.
419+
#[inline]
420+
pub fn iter_bits<F>(&self, mut f: F)
421+
where
422+
F: FnMut(usize, bool),
423+
{
424+
let total_bits = self.len;
425+
if total_bits == 0 {
426+
return;
427+
}
428+
429+
let is_bit_set = |byte: u8, bit_idx: usize| (byte & (1 << bit_idx)) != 0;
430+
let bit_offset = self.offset % 8;
431+
let mut buffer_ptr = unsafe { self.buffer.as_ptr().add(self.offset / 8) };
432+
let mut callback_idx = 0;
433+
434+
// Handle incomplete first byte.
435+
if bit_offset > 0 {
436+
let bits_in_first_byte = (8 - bit_offset).min(total_bits);
437+
let byte = unsafe { *buffer_ptr };
438+
439+
for bit_idx in 0..bits_in_first_byte {
440+
f(callback_idx, is_bit_set(byte, bit_offset + bit_idx));
441+
callback_idx += 1;
442+
}
443+
444+
buffer_ptr = unsafe { buffer_ptr.add(1) };
445+
}
446+
447+
// Process complete bytes.
448+
let complete_bytes = (total_bits - callback_idx) / 8;
449+
for _ in 0..complete_bytes {
450+
let byte = unsafe { *buffer_ptr };
451+
452+
for bit_idx in 0..8 {
453+
f(callback_idx, is_bit_set(byte, bit_idx));
454+
callback_idx += 1;
455+
}
456+
buffer_ptr = unsafe { buffer_ptr.add(1) };
457+
}
458+
459+
// Handle remaining bits at the end.
460+
let remaining_bits = total_bits - callback_idx;
461+
if remaining_bits > 0 {
462+
let byte = unsafe { *buffer_ptr };
463+
464+
for bit_idx in 0..remaining_bits {
465+
f(callback_idx, is_bit_set(byte, bit_idx));
466+
callback_idx += 1;
467+
}
468+
}
469+
}
409470
}
410471

411472
impl<'a> IntoIterator for &'a BitBuffer {
@@ -419,6 +480,8 @@ impl<'a> IntoIterator for &'a BitBuffer {
419480

420481
#[cfg(test)]
421482
mod tests {
483+
use rstest::rstest;
484+
422485
use crate::bit::BitBuffer;
423486
use crate::{ByteBuffer, buffer};
424487

@@ -488,4 +551,57 @@ mod tests {
488551
"Buffer slices with different bits should not be equal (`PartialEq` needs `iter_padded()`)"
489552
);
490553
}
554+
555+
#[test]
fn test_slice_offset_calculation() {
    // A slice starting at bit 10 of a 16-bit buffer must report a bit offset
    // of 10.
    let all_set = BitBuffer::collect_bool(16, |_| true);
    let tail = all_set.slice(10..16);

    assert_eq!(tail.offset(), 10);
}
561+
562+
#[rstest]
563+
#[case(5)]
564+
#[case(8)]
565+
#[case(10)]
566+
#[case(13)]
567+
#[case(16)]
568+
#[case(23)]
569+
#[case(100)]
570+
fn test_iter_bits(#[case] len: usize) {
571+
let buf = BitBuffer::collect_bool(len, |i| i % 2 == 0);
572+
573+
let mut collected = Vec::new();
574+
buf.iter_bits(|idx, is_set| {
575+
collected.push((idx, is_set));
576+
});
577+
578+
assert_eq!(collected.len(), len);
579+
580+
for (idx, is_set) in collected {
581+
assert_eq!(is_set, idx % 2 == 0);
582+
}
583+
}
584+
585+
#[rstest]
586+
#[case(3, 5)]
587+
#[case(3, 8)]
588+
#[case(5, 10)]
589+
#[case(2, 16)]
590+
fn test_iter_bits_with_offset(#[case] offset: usize, #[case] len: usize) {
591+
let total_bits = offset + len;
592+
let buf = BitBuffer::collect_bool(total_bits, |i| i % 2 == 0);
593+
let buf_with_offset = BitBuffer::new_with_offset(buf.inner().clone(), len, offset);
594+
595+
let mut collected = Vec::new();
596+
buf_with_offset.iter_bits(|idx, is_set| {
597+
collected.push((idx, is_set));
598+
});
599+
600+
assert_eq!(collected.len(), len);
601+
602+
for (idx, is_set) in collected {
603+
// The bits should match the original buffer at positions offset + idx
604+
assert_eq!(is_set, (offset + idx) % 2 == 0);
605+
}
606+
}
491607
}

vortex-compute/benches/expand_buffer.rs

Lines changed: 23 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -3,21 +3,30 @@
33

44
//! Expand benchmarks for `Buffer`.
55
6-
use divan::Bencher;
76
use vortex_buffer::Buffer;
87
use vortex_compute::expand::Expand;
98
use vortex_mask::Mask;
109

10+
/// Benchmark inputs as `(buffer size, selectivity)` pairs: each of the four
/// buffer sizes is paired with each of the three selectivities.
#[rustfmt::skip]
const PARAMETERS: &[(usize, f64)] = &[
    (256, 0.1),   (256, 0.5),   (256, 0.9),
    (1024, 0.1),  (1024, 0.5),  (1024, 0.9),
    (4096, 0.1),  (4096, 0.5),  (4096, 0.9),
    (16384, 0.1), (16384, 0.5), (16384, 0.9),
];
25+
1126
// Benchmark binary entry point: delegates entirely to `divan::main()`.
fn main() {
1227
divan::main();
1328
}
1429

15-
const BUFFER_SIZE: usize = 1024;
16-
17-
const SELECTIVITIES: &[f64] = &[
18-
0.01, 0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90, 0.99,
19-
];
20-
2130
fn create_test_buffer<T>(size: usize) -> Buffer<T>
2231
where
2332
T: Copy + Default + From<u8> + Send + 'static,
@@ -49,20 +58,20 @@ fn generate_mask(len: usize, selectivity: f64) -> Mask {
4958
Mask::from_iter(selection)
5059
}
5160

52-
#[divan::bench(types = [u8, u32, u64], args = SELECTIVITIES, sample_count = 1000)]
53-
fn expand_selectivity<T: Copy + Default + From<u8> + Send + 'static>(
54-
bencher: Bencher,
55-
selectivity: f64,
61+
#[divan::bench(types = [u8, u32, u64], args = PARAMETERS, sample_count = 1000)]
62+
fn expand_buffer<T: Copy + Default + From<u8> + Send + 'static>(
63+
bencher: divan::Bencher,
64+
(buffer_size, selectivity): (usize, f64),
5665
) {
5766
bencher
5867
.with_inputs(|| {
59-
let mask = generate_mask(BUFFER_SIZE, selectivity);
68+
let mask = generate_mask(buffer_size, selectivity);
6069
let true_count = mask.true_count();
6170
let buffer = create_test_buffer::<T>(true_count);
6271
(buffer, mask)
6372
})
64-
.bench_values(|(buffer, mask)| {
65-
let result = buffer.expand(&mask);
73+
.bench_refs(|(buffer, mask)| {
74+
let result = buffer.expand(mask);
6675
divan::black_box(result);
6776
});
6877
}

0 commit comments

Comments
 (0)