Skip to content

Commit ee7abec

Browse files
authored
feat: teach RunEndArray NullCount and TrueCount (#2007)
1 parent 2a46ba9 commit ee7abec

File tree

6 files changed

+312
-25
lines changed

6 files changed

+312
-25
lines changed

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

encodings/runend/Cargo.toml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,12 @@ workspace = true
3131
[dev-dependencies]
3232
vortex-array = { workspace = true, features = ["test-harness"] }
3333
criterion = { workspace = true }
34+
rand = { workspace = true }
3435

3536
[[bench]]
3637
name = "run_end_filter"
3738
harness = false
39+
40+
[[bench]]
41+
name = "run_end_null_count"
42+
harness = false
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
#![allow(clippy::unwrap_used)]
2+
3+
use std::iter::Iterator;
4+
5+
use criterion::{black_box, criterion_group, criterion_main, Criterion};
6+
use rand::rngs::StdRng;
7+
use rand::{Rng, SeedableRng as _};
8+
use vortex_array::array::PrimitiveArray;
9+
use vortex_array::stats::Stat;
10+
use vortex_array::IntoArrayData;
11+
use vortex_buffer::Buffer;
12+
use vortex_runend::RunEndArray;
13+
14+
const LENS: [usize; 2] = [1000, 100_000];
15+
16+
/// Create RunEnd arrays where the runs are equal size, and the null_count mask is evenly spaced.
17+
fn run_end_null_count(c: &mut Criterion) {
18+
let mut rng = StdRng::seed_from_u64(0);
19+
let mut group = c.benchmark_group("run_end_null_count");
20+
21+
for &n in LENS.iter().rev() {
22+
for run_step in [1usize << 2, 1 << 4, 1 << 8, 1 << 16] {
23+
let ends = (0..=n)
24+
.step_by(run_step)
25+
.map(|x| x as u64)
26+
.collect::<Buffer<_>>()
27+
.into_array();
28+
let run_count = ends.len() - 1;
29+
for valid_density in [0.01, 0.1, 0.5] {
30+
let values = PrimitiveArray::from_option_iter(
31+
(0..ends.len()).map(|x| rng.gen_bool(valid_density).then_some(x as u64)),
32+
)
33+
.into_array();
34+
let array = RunEndArray::try_new(ends.clone(), values)
35+
.unwrap()
36+
.into_array();
37+
38+
group.bench_function(
39+
format!(
40+
"null_count_run_end n: {}, run_count: {}, valid_density: {}",
41+
n, run_count, valid_density
42+
),
43+
|b| {
44+
b.iter(|| {
45+
black_box(
46+
array
47+
.encoding()
48+
.compute_statistics(&array, Stat::NullCount)
49+
.unwrap(),
50+
)
51+
});
52+
},
53+
);
54+
}
55+
}
56+
}
57+
}
58+
59+
criterion_group!(benches, run_end_null_count);
60+
criterion_main!(benches);

encodings/runend/src/array.rs

Lines changed: 1 addition & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ use vortex_array::compute::{
66
scalar_at, search_sorted_usize, search_sorted_usize_many, SearchSortedSide,
77
};
88
use vortex_array::encoding::ids;
9-
use vortex_array::stats::{ArrayStatistics, Stat, StatisticsVTable, StatsSet};
9+
use vortex_array::stats::{ArrayStatistics, StatsSet};
1010
use vortex_array::validate::ValidateVTable;
1111
use vortex_array::validity::{ArrayValidity, LogicalValidity, ValidityVTable};
1212
use vortex_array::variants::{BoolArrayTrait, PrimitiveArrayTrait, VariantsVTable};
@@ -18,7 +18,6 @@ use vortex_array::{
1818
use vortex_buffer::Buffer;
1919
use vortex_dtype::{DType, PType};
2020
use vortex_error::{vortex_bail, VortexExpect as _, VortexResult};
21-
use vortex_scalar::Scalar;
2221

2322
use crate::compress::{runend_decode_bools, runend_decode_primitive, runend_encode};
2423

@@ -224,29 +223,6 @@ impl VisitorVTable<RunEndArray> for RunEndEncoding {
224223
}
225224
}
226225

227-
impl StatisticsVTable<RunEndArray> for RunEndEncoding {
228-
fn compute_statistics(&self, array: &RunEndArray, stat: Stat) -> VortexResult<StatsSet> {
229-
let maybe_stat = match stat {
230-
Stat::Min | Stat::Max => array.values().statistics().compute(stat),
231-
Stat::IsSorted => Some(Scalar::from(
232-
array
233-
.values()
234-
.statistics()
235-
.compute_is_sorted()
236-
.unwrap_or(false)
237-
&& array.logical_validity().all_valid(),
238-
)),
239-
_ => None,
240-
};
241-
242-
let mut stats = StatsSet::default();
243-
if let Some(stat_value) = maybe_stat {
244-
stats.set(stat, stat_value);
245-
}
246-
Ok(stats)
247-
}
248-
}
249-
250226
#[cfg(test)]
251227
mod tests {
252228
use vortex_array::compute::scalar_at;

encodings/runend/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ mod array;
44
pub mod compress;
55
mod compute;
66
mod iter;
7+
mod statistics;
78

89
#[doc(hidden)]
910
pub mod _benchmarking {

encodings/runend/src/statistics.rs

Lines changed: 244 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,244 @@
1+
use std::cmp;
2+
3+
use arrow_buffer::BooleanBuffer;
4+
use itertools::Itertools;
5+
use vortex_array::stats::{ArrayStatistics as _, Stat, StatisticsVTable, StatsSet};
6+
use vortex_array::validity::{ArrayValidity as _, LogicalValidity};
7+
use vortex_array::variants::PrimitiveArrayTrait;
8+
use vortex_array::{ArrayDType as _, ArrayLen as _, IntoArrayVariant as _};
9+
use vortex_dtype::{match_each_unsigned_integer_ptype, DType, NativePType};
10+
use vortex_error::VortexResult;
11+
use vortex_scalar::Scalar;
12+
13+
use crate::{RunEndArray, RunEndEncoding};
14+
15+
impl StatisticsVTable<RunEndArray> for RunEndEncoding {
16+
fn compute_statistics(&self, array: &RunEndArray, stat: Stat) -> VortexResult<StatsSet> {
17+
let maybe_stat = match stat {
18+
Stat::Min | Stat::Max => array.values().statistics().compute(stat),
19+
Stat::IsSorted => Some(Scalar::from(
20+
array
21+
.values()
22+
.statistics()
23+
.compute_is_sorted()
24+
.unwrap_or(false)
25+
&& array.logical_validity().all_valid(),
26+
)),
27+
Stat::TrueCount => match array.dtype() {
28+
DType::Bool(_) => Some(Scalar::from(array.true_count()?)),
29+
_ => None,
30+
},
31+
Stat::NullCount => Some(Scalar::from(array.null_count()?)),
32+
_ => None,
33+
};
34+
35+
let mut stats = StatsSet::default();
36+
if let Some(stat_value) = maybe_stat {
37+
stats.set(stat, stat_value);
38+
}
39+
Ok(stats)
40+
}
41+
}
42+
43+
impl RunEndArray {
44+
fn true_count(&self) -> VortexResult<u64> {
45+
let ends = self.ends().into_primitive()?;
46+
let values = self.values().into_bool()?.boolean_buffer();
47+
48+
match_each_unsigned_integer_ptype!(ends.ptype(), |$P| self.typed_true_count(ends.as_slice::<$P>(), values))
49+
}
50+
51+
fn typed_true_count<P: NativePType + Into<u64>>(
52+
&self,
53+
decompressed_ends: &[P],
54+
decompressed_values: BooleanBuffer,
55+
) -> VortexResult<u64> {
56+
Ok(match self.values().logical_validity() {
57+
LogicalValidity::AllValid(_) => {
58+
let mut begin = self.offset() as u64;
59+
decompressed_ends
60+
.iter()
61+
.copied()
62+
.zip_eq(&decompressed_values)
63+
.map(|(end, bool_value)| {
64+
let end: u64 = end.into();
65+
let len = end - begin;
66+
begin = end;
67+
len * u64::from(bool_value)
68+
})
69+
.sum()
70+
}
71+
LogicalValidity::AllInvalid(_) => 0,
72+
LogicalValidity::Array(is_valid) => {
73+
let is_valid = is_valid.into_bool()?.boolean_buffer();
74+
let mut is_valid = is_valid.set_indices();
75+
match is_valid.next() {
76+
None => self.len() as u64,
77+
Some(valid_index) => {
78+
let mut true_count: u64 = 0;
79+
let offsetted_begin = self.offset() as u64;
80+
let offsetted_len = (self.len() + self.offset()) as u64;
81+
let begin = if valid_index == 0 {
82+
offsetted_begin
83+
} else {
84+
decompressed_ends[valid_index - 1].into()
85+
};
86+
87+
let end = cmp::min(decompressed_ends[valid_index].into(), offsetted_len);
88+
true_count += decompressed_values.value(valid_index) as u64 * (end - begin);
89+
90+
for valid_index in is_valid {
91+
let valid_end: u64 = decompressed_ends[valid_index].into();
92+
let end = cmp::min(valid_end, offsetted_len);
93+
true_count +=
94+
decompressed_values.value(valid_index) as u64 * (end - valid_end);
95+
}
96+
97+
true_count
98+
}
99+
}
100+
}
101+
})
102+
}
103+
104+
fn null_count(&self) -> VortexResult<u64> {
105+
let ends = self.ends().into_primitive()?;
106+
let null_count = match self.values().logical_validity() {
107+
LogicalValidity::AllValid(_) => 0u64,
108+
LogicalValidity::AllInvalid(_) => self.len() as u64,
109+
LogicalValidity::Array(is_valid) => {
110+
let is_valid = is_valid.into_bool()?.boolean_buffer();
111+
match_each_unsigned_integer_ptype!(ends.ptype(), |$P| self.null_count_with_array_validity(ends.as_slice::<$P>(), is_valid))
112+
}
113+
};
114+
Ok(null_count)
115+
}
116+
117+
fn null_count_with_array_validity<P: NativePType + Into<u64>>(
118+
&self,
119+
decompressed_ends: &[P],
120+
is_valid: BooleanBuffer,
121+
) -> u64 {
122+
let mut is_valid = is_valid.set_indices();
123+
match is_valid.next() {
124+
None => self.len() as u64,
125+
Some(valid_index) => {
126+
let offsetted_len = (self.len() + self.offset()) as u64;
127+
let mut null_count: u64 = self.len() as u64;
128+
let begin = if valid_index == 0 {
129+
0
130+
} else {
131+
decompressed_ends[valid_index - 1].into()
132+
};
133+
134+
let end = cmp::min(decompressed_ends[valid_index].into(), offsetted_len);
135+
null_count -= end - begin;
136+
137+
for valid_index in is_valid {
138+
let end = cmp::min(decompressed_ends[valid_index].into(), offsetted_len);
139+
null_count -= end - decompressed_ends[valid_index - 1].into();
140+
}
141+
142+
null_count
143+
}
144+
}
145+
}
146+
}
147+
148+
#[cfg(test)]
149+
mod tests {
150+
use arrow_buffer::BooleanBuffer;
151+
use vortex_array::array::BoolArray;
152+
use vortex_array::compute::slice;
153+
use vortex_array::stats::{ArrayStatistics as _, Stat};
154+
use vortex_array::validity::Validity;
155+
use vortex_array::IntoArrayData;
156+
use vortex_buffer::buffer;
157+
158+
use crate::RunEndArray;
159+
160+
#[test]
161+
fn test_runend_int_stats() {
162+
let arr = RunEndArray::try_new(
163+
buffer![2u32, 5, 10].into_array(),
164+
buffer![1i32, 2, 3].into_array(),
165+
)
166+
.unwrap();
167+
168+
assert_eq!(arr.statistics().compute_as::<i32>(Stat::Min).unwrap(), 1);
169+
assert_eq!(arr.statistics().compute_as::<i32>(Stat::Max).unwrap(), 3);
170+
assert_eq!(
171+
arr.statistics().compute_as::<u64>(Stat::NullCount).unwrap(),
172+
0
173+
);
174+
assert!(arr.statistics().compute_as::<bool>(Stat::IsSorted).unwrap());
175+
}
176+
177+
#[test]
178+
fn test_runend_bool_stats() {
179+
let arr = RunEndArray::try_new(
180+
buffer![2u32, 5, 10].into_array(),
181+
BoolArray::try_new(
182+
BooleanBuffer::from_iter([true, true, false]),
183+
Validity::Array(BoolArray::from_iter([true, false, true]).into_array()),
184+
)
185+
.unwrap()
186+
.into_array(),
187+
)
188+
.unwrap();
189+
190+
assert!(!arr.statistics().compute_as::<bool>(Stat::Min).unwrap());
191+
assert!(arr.statistics().compute_as::<bool>(Stat::Max).unwrap());
192+
assert_eq!(
193+
arr.statistics().compute_as::<u64>(Stat::NullCount).unwrap(),
194+
3
195+
);
196+
assert!(!arr.statistics().compute_as::<bool>(Stat::IsSorted).unwrap());
197+
assert_eq!(
198+
arr.statistics().compute_as::<u64>(Stat::TrueCount).unwrap(),
199+
2
200+
);
201+
202+
let sliced = slice(arr, 4, 7).unwrap();
203+
204+
assert!(!sliced.statistics().compute_as::<bool>(Stat::Min).unwrap());
205+
assert!(!sliced.statistics().compute_as::<bool>(Stat::Max).unwrap());
206+
assert_eq!(
207+
sliced
208+
.statistics()
209+
.compute_as::<u64>(Stat::NullCount)
210+
.unwrap(),
211+
1
212+
);
213+
// Not sorted because null must come last
214+
assert!(!sliced
215+
.statistics()
216+
.compute_as::<bool>(Stat::IsSorted)
217+
.unwrap());
218+
assert_eq!(
219+
sliced
220+
.statistics()
221+
.compute_as::<u64>(Stat::TrueCount)
222+
.unwrap(),
223+
0
224+
);
225+
}
226+
227+
#[test]
228+
fn test_all_invalid_true_count() {
229+
let arr = RunEndArray::try_new(
230+
buffer![2u32, 5, 10].into_array(),
231+
BoolArray::from_iter([None, None, None]).into_array(),
232+
)
233+
.unwrap()
234+
.into_array();
235+
assert_eq!(
236+
arr.statistics().compute_as::<u64>(Stat::TrueCount).unwrap(),
237+
0
238+
);
239+
assert_eq!(
240+
arr.statistics().compute_as::<u64>(Stat::NullCount).unwrap(),
241+
10
242+
);
243+
}
244+
}

0 commit comments

Comments
 (0)