Skip to content

Commit a091281

Browse files
authored
feat: small mask improvements (#2035)
fixes #2030
1 parent 64b6087 commit a091281

File tree

4 files changed

+68
-9
lines changed

4 files changed

+68
-9
lines changed

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

vortex-mask/Cargo.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "vortex-mask"
3-
description = "Vortex Mask - sorted, unique, positive integers"
3+
description = "Vortex Mask - sorted, unique, non-negative integers"
44
version.workspace = true
55
homepage.workspace = true
66
repository.workspace = true
@@ -15,6 +15,7 @@ categories.workspace = true
1515

1616
[dependencies]
1717
arrow-buffer = { workspace = true }
18+
itertools = { workspace = true }
1819
vortex-error = { workspace = true }
1920

2021
[lints]

vortex-mask/src/intersect_by_rank.rs

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,24 @@
11
use crate::Mask;
22

33
impl Mask {
4-
/// take the intersection of the `mask` with the set of true values in `self`.
4+
/// Take the intersection of the `mask` with the set of true values in `self`.
55
///
66
/// We are more interested in low selectivity `self` (as indices) with a boolean buffer mask,
77
/// so we don't optimize for other cases, yet.
8+
///
9+
/// # Examples
10+
///
11+
/// Keep the third and fifth set values from mask `m1`:
12+
/// ```
13+
/// use vortex_mask::Mask;
14+
///
15+
/// let m1 = Mask::from_iter([true, false, false, true, true, true, false, true]);
16+
/// let m2 = Mask::from_iter([false, false, true, false, true]);
17+
/// assert_eq!(
18+
/// m1.intersect_by_rank(&m2),
19+
/// Mask::from_iter([false, false, false, false, true, false, false, true])
20+
/// );
21+
/// ```
822
pub fn intersect_by_rank(&self, mask: &Mask) -> Mask {
923
assert_eq!(self.true_count(), mask.len());
1024

vortex-mask/src/lib.rs

Lines changed: 50 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ use std::cmp::Ordering;
66
use std::sync::{Arc, OnceLock};
77

88
use arrow_buffer::{BooleanBuffer, BooleanBufferBuilder};
9+
use itertools::Itertools;
910
use vortex_error::vortex_panic;
1011

1112
/// If the mask selects more than this fraction of rows, iterate over slices instead of indices.
@@ -31,6 +32,7 @@ struct Inner {
3132
// Pre-computed values.
3233
len: usize,
3334
true_count: usize,
35+
// i.e., the fraction of values that are true
3436
selectivity: f64,
3537
}
3638

@@ -51,6 +53,7 @@ impl Inner {
5153
// TODO(ngates): for dense indices, we can do better by collecting into u64s.
5254
buf.append_n(self.len, false);
5355
indices.iter().for_each(|idx| buf.set_bit(*idx, true));
56+
debug_assert_eq!(buf.len(), self.len);
5457
return BooleanBuffer::from(buf);
5558
}
5659

@@ -85,12 +88,16 @@ impl Inner {
8588
if let Some(buffer) = self.buffer.get() {
8689
let mut indices = Vec::with_capacity(self.true_count);
8790
indices.extend(buffer.set_indices());
91+
debug_assert!(indices.is_sorted());
92+
assert_eq!(indices.len(), self.true_count);
8893
return indices;
8994
}
9095

9196
if let Some(slices) = self.slices.get() {
9297
let mut indices = Vec::with_capacity(self.true_count);
9398
indices.extend(slices.iter().flat_map(|(start, end)| *start..*end));
99+
debug_assert!(indices.is_sorted());
100+
assert_eq!(indices.len(), self.true_count);
94101
return indices;
95102
}
96103

@@ -99,6 +106,7 @@ impl Inner {
99106
}
100107

101108
/// Constructs a slices vector from one of the other representations.
109+
#[allow(clippy::cast_possible_truncation)]
102110
fn slices(&self) -> &[(usize, usize)] {
103111
self.slices.get_or_init(|| {
104112
if self.true_count == self.len {
@@ -110,7 +118,10 @@ impl Inner {
110118
}
111119

112120
if let Some(indices) = self.indices.get() {
113-
let mut slices = Vec::with_capacity(self.true_count); // Upper bound
121+
// Expected number of contiguous slices assuming a uniform distribution of true values.
122+
let expected_num_slices =
123+
(self.selectivity * (self.len - self.true_count + 1) as f64).round() as usize;
124+
let mut slices = Vec::with_capacity(expected_num_slices);
114125
let mut iter = indices.iter().copied();
115126

116127
// Handle empty input
@@ -200,7 +211,11 @@ impl Mask {
200211
/// Create a new [`Mask`] from a [`Vec<usize>`].
201212
pub fn from_indices(len: usize, vec: Vec<usize>) -> Self {
202213
let true_count = vec.len();
203-
assert!(vec.iter().all(|&idx| idx < len));
214+
assert!(vec.is_sorted(), "Mask indices must be sorted");
215+
assert!(
216+
vec.last().is_none_or(|&idx| idx < len),
217+
"Mask indices must be in bounds (len={len})"
218+
);
204219
Self(Arc::new(Inner {
205220
buffer: Default::default(),
206221
indices: OnceLock::from(vec),
@@ -214,7 +229,14 @@ impl Mask {
214229
/// Create a new [`Mask`] from a [`Vec<(usize, usize)>`] where each range
215230
/// represents a contiguous range of true values.
216231
pub fn from_slices(len: usize, vec: Vec<(usize, usize)>) -> Self {
217-
assert!(vec.iter().all(|&(b, e)| b < e && e <= len));
232+
Self::check_slices(len, &vec);
233+
Self::from_slices_unchecked(len, vec)
234+
}
235+
236+
fn from_slices_unchecked(len: usize, vec: Vec<(usize, usize)>) -> Self {
237+
#[cfg(debug_assertions)]
238+
Self::check_slices(len, &vec);
239+
218240
let true_count = vec.iter().map(|(b, e)| e - b).sum();
219241
Self(Arc::new(Inner {
220242
buffer: Default::default(),
@@ -226,6 +248,25 @@ impl Mask {
226248
}))
227249
}
228250

251+
#[inline(always)]
252+
fn check_slices(len: usize, vec: &[(usize, usize)]) {
253+
assert!(vec.iter().all(|&(b, e)| b < e && e <= len));
254+
for (first, second) in vec.iter().tuple_windows() {
255+
assert!(
256+
first.0 < second.0,
257+
"Slices must be sorted, got {:?} and {:?}",
258+
first,
259+
second
260+
);
261+
assert!(
262+
first.1 <= second.0,
263+
"Slices must be non-overlapping, got {:?} and {:?}",
264+
first,
265+
second
266+
);
267+
}
268+
}
269+
229270
/// Create a new [`Mask`] from the intersection of two indices slices.
230271
pub fn from_intersection_indices(
231272
len: usize,
@@ -254,7 +295,7 @@ impl Mask {
254295
}
255296

256297
#[inline]
257-
// There is good definition of is_empty, does it mean len == 0 or true_count == 0?
298+
// There is no good definition of is_empty, does it mean len == 0 or true_count == 0?
258299
#[allow(clippy::len_without_is_empty)]
259300
pub fn len(&self) -> usize {
260301
self.0.len
@@ -328,7 +369,8 @@ impl Mask {
328369
let indices = indices
329370
.iter()
330371
.copied()
331-
.filter(|&idx| offset <= idx && idx < end)
372+
.skip_while(|idx| *idx < offset)
373+
.take_while(|idx| *idx < end)
332374
.map(|idx| idx - offset)
333375
.collect();
334376
return Self::from_indices(length, indices);
@@ -338,10 +380,11 @@ impl Mask {
338380
let slices = slices
339381
.iter()
340382
.copied()
341-
.filter(|(s, e)| *s < end && *e > offset)
383+
.skip_while(|(_, e)| *e <= offset)
384+
.take_while(|(s, _)| *s < end)
342385
.map(|(s, e)| (s.max(offset), e.min(end)))
343386
.collect();
344-
return Self::from_slices(length, slices);
387+
return Self::from_slices_unchecked(length, slices);
345388
}
346389

347390
vortex_panic!("No mask representation found")

0 commit comments

Comments
 (0)