Skip to content

Commit 34459fb

Browse files
authored
Faster boolean stats (#1301)
For the true count, we just need to count the set bits (`count_ones`) in the u64s. This change delegates true counts to Arrow, which does the right thing. Overall there is a 15% reduction in time for TPC-H Q15; I didn't run it on any other queries.
1 parent 8484445 commit 34459fb

File tree

2 files changed

+62
-41
lines changed

2 files changed

+62
-41
lines changed

vortex-array/src/array/bool/stats.rs

Lines changed: 34 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,10 @@
1+
use std::ops::BitAnd;
2+
13
use arrow_buffer::BooleanBuffer;
24
use itertools::Itertools;
35
use vortex_dtype::{DType, Nullability};
46
use vortex_error::VortexResult;
7+
use vortex_scalar::Scalar;
58

69
use crate::aliases::hash_map::HashMap;
710
use crate::array::BoolArray;
@@ -30,7 +33,18 @@ impl ArrayStatisticsCompute for BoolArray {
3033
struct NullableBools<'a>(&'a BooleanBuffer, &'a BooleanBuffer);
3134

3235
impl ArrayStatisticsCompute for NullableBools<'_> {
33-
fn compute_statistics(&self, _stat: Stat) -> VortexResult<StatsSet> {
36+
fn compute_statistics(&self, stat: Stat) -> VortexResult<StatsSet> {
37+
// Fast-path if we just want the true-count
38+
if matches!(
39+
stat,
40+
Stat::TrueCount | Stat::Min | Stat::Max | Stat::IsConstant
41+
) {
42+
return Ok(true_count_stats(
43+
self.0.bitand(self.1).count_set_bits(),
44+
self.0.len(),
45+
));
46+
}
47+
3448
let first_non_null_idx = self
3549
.1
3650
.iter()
@@ -59,9 +73,13 @@ impl ArrayStatisticsCompute for NullableBools<'_> {
5973
}
6074

6175
impl ArrayStatisticsCompute for BooleanBuffer {
62-
fn compute_statistics(&self, _stat: Stat) -> VortexResult<StatsSet> {
63-
if self.is_empty() {
64-
return Ok(StatsSet::new());
76+
fn compute_statistics(&self, stat: Stat) -> VortexResult<StatsSet> {
77+
// Fast-path if we just want the true-count
78+
if matches!(
79+
stat,
80+
Stat::TrueCount | Stat::Min | Stat::Max | Stat::IsConstant
81+
) {
82+
return Ok(true_count_stats(self.count_set_bits(), self.len()));
6583
}
6684

6785
let mut stats = BoolStatsAccumulator::new(self.value(0));
@@ -70,6 +88,18 @@ impl ArrayStatisticsCompute for BooleanBuffer {
7088
}
7189
}
7290

91+
fn true_count_stats(true_count: usize, len: usize) -> StatsSet {
92+
StatsSet::from(HashMap::<Stat, Scalar>::from([
93+
(Stat::TrueCount, true_count.into()),
94+
(Stat::Min, (true_count == len).into()),
95+
(Stat::Max, (true_count > 0).into()),
96+
(
97+
Stat::IsConstant,
98+
(true_count == 0 || true_count == len).into(),
99+
),
100+
]))
101+
}
102+
73103
struct BoolStatsAccumulator {
74104
prev: bool,
75105
is_sorted: bool,
@@ -123,23 +153,14 @@ impl BoolStatsAccumulator {
123153

124154
pub fn finish(self) -> StatsSet {
125155
StatsSet::from(HashMap::from([
126-
(Stat::Min, (self.true_count == self.len).into()),
127-
(Stat::Max, (self.true_count > 0).into()),
128156
(Stat::NullCount, self.null_count.into()),
129-
(
130-
Stat::IsConstant,
131-
(self.null_count == 0 && (self.true_count == self.len || self.true_count == 0)
132-
|| self.null_count == self.len)
133-
.into(),
134-
),
135157
(Stat::IsSorted, self.is_sorted.into()),
136158
(
137159
Stat::IsStrictSorted,
138160
(self.is_sorted && (self.len < 2 || (self.len == 2 && self.true_count == 1)))
139161
.into(),
140162
),
141163
(Stat::RunCount, self.run_count.into()),
142-
(Stat::TrueCount, self.true_count.into()),
143164
]))
144165
}
145166
}

vortex-array/src/stats/statsset.rs

Lines changed: 28 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -87,8 +87,8 @@ impl StatsSet {
8787
stats
8888
}
8989

90-
pub fn of(stat: Stat, value: Scalar) -> Self {
91-
Self::from(HashMap::from([(stat, value)]))
90+
pub fn of<S: Into<Scalar>>(stat: Stat, value: S) -> Self {
91+
Self::from(HashMap::from([(stat, value.into())]))
9292
}
9393

9494
pub fn get(&self, stat: Stat) -> Option<&Scalar> {
@@ -317,71 +317,71 @@ mod test {
317317

318318
#[test]
319319
fn merge_into_min() {
320-
let mut first = StatsSet::of(Stat::Min, 42.into());
320+
let mut first = StatsSet::of(Stat::Min, 42);
321321
first.merge_ordered(&StatsSet::new());
322322
assert_eq!(first.get(Stat::Min), None);
323323
}
324324

325325
#[test]
326326
fn merge_from_min() {
327327
let mut first = StatsSet::new();
328-
first.merge_ordered(&StatsSet::of(Stat::Min, 42.into()));
328+
first.merge_ordered(&StatsSet::of(Stat::Min, 42));
329329
assert_eq!(first.get(Stat::Min), None);
330330
}
331331

332332
#[test]
333333
fn merge_mins() {
334-
let mut first = StatsSet::of(Stat::Min, 37.into());
335-
first.merge_ordered(&StatsSet::of(Stat::Min, 42.into()));
334+
let mut first = StatsSet::of(Stat::Min, 37);
335+
first.merge_ordered(&StatsSet::of(Stat::Min, 42));
336336
assert_eq!(first.get(Stat::Min).cloned(), Some(37.into()));
337337
}
338338

339339
#[test]
340340
fn merge_into_max() {
341-
let mut first = StatsSet::of(Stat::Max, 42.into());
341+
let mut first = StatsSet::of(Stat::Max, 42);
342342
first.merge_ordered(&StatsSet::new());
343343
assert_eq!(first.get(Stat::Max), None);
344344
}
345345

346346
#[test]
347347
fn merge_from_max() {
348348
let mut first = StatsSet::new();
349-
first.merge_ordered(&StatsSet::of(Stat::Max, 42.into()));
349+
first.merge_ordered(&StatsSet::of(Stat::Max, 42));
350350
assert_eq!(first.get(Stat::Max), None);
351351
}
352352

353353
#[test]
354354
fn merge_maxes() {
355-
let mut first = StatsSet::of(Stat::Max, 37.into());
356-
first.merge_ordered(&StatsSet::of(Stat::Max, 42.into()));
355+
let mut first = StatsSet::of(Stat::Max, 37);
356+
first.merge_ordered(&StatsSet::of(Stat::Max, 42));
357357
assert_eq!(first.get(Stat::Max).cloned(), Some(42.into()));
358358
}
359359

360360
#[test]
361361
fn merge_into_scalar() {
362-
let mut first = StatsSet::of(Stat::TrueCount, 42.into());
362+
let mut first = StatsSet::of(Stat::TrueCount, 42);
363363
first.merge_ordered(&StatsSet::new());
364364
assert_eq!(first.get(Stat::TrueCount), None);
365365
}
366366

367367
#[test]
368368
fn merge_from_scalar() {
369369
let mut first = StatsSet::new();
370-
first.merge_ordered(&StatsSet::of(Stat::TrueCount, 42.into()));
370+
first.merge_ordered(&StatsSet::of(Stat::TrueCount, 42));
371371
assert_eq!(first.get(Stat::TrueCount), None);
372372
}
373373

374374
#[test]
375375
fn merge_scalars() {
376-
let mut first = StatsSet::of(Stat::TrueCount, 37.into());
377-
first.merge_ordered(&StatsSet::of(Stat::TrueCount, 42.into()));
376+
let mut first = StatsSet::of(Stat::TrueCount, 37);
377+
first.merge_ordered(&StatsSet::of(Stat::TrueCount, 42));
378378
assert_eq!(first.get(Stat::TrueCount).cloned(), Some(79u64.into()));
379379
}
380380

381381
#[test]
382382
fn merge_into_freq() {
383383
let vec = (0..255).collect_vec();
384-
let mut first = StatsSet::of(Stat::BitWidthFreq, vec.into());
384+
let mut first = StatsSet::of(Stat::BitWidthFreq, vec);
385385
first.merge_ordered(&StatsSet::new());
386386
assert_eq!(first.get(Stat::BitWidthFreq), None);
387387
}
@@ -390,48 +390,48 @@ mod test {
390390
fn merge_from_freq() {
391391
let vec = (0..255).collect_vec();
392392
let mut first = StatsSet::new();
393-
first.merge_ordered(&StatsSet::of(Stat::BitWidthFreq, vec.into()));
393+
first.merge_ordered(&StatsSet::of(Stat::BitWidthFreq, vec));
394394
assert_eq!(first.get(Stat::BitWidthFreq), None);
395395
}
396396

397397
#[test]
398398
fn merge_freqs() {
399399
let vec_in = vec![5u64; 256];
400400
let vec_out = vec![10u64; 256];
401-
let mut first = StatsSet::of(Stat::BitWidthFreq, vec_in.clone().into());
402-
first.merge_ordered(&StatsSet::of(Stat::BitWidthFreq, vec_in.into()));
401+
let mut first = StatsSet::of(Stat::BitWidthFreq, vec_in.clone());
402+
first.merge_ordered(&StatsSet::of(Stat::BitWidthFreq, vec_in));
403403
assert_eq!(first.get(Stat::BitWidthFreq).cloned(), Some(vec_out.into()));
404404
}
405405

406406
#[test]
407407
fn merge_into_sortedness() {
408-
let mut first = StatsSet::of(Stat::IsStrictSorted, true.into());
408+
let mut first = StatsSet::of(Stat::IsStrictSorted, true);
409409
first.merge_ordered(&StatsSet::new());
410410
assert_eq!(first.get(Stat::IsStrictSorted), None);
411411
}
412412

413413
#[test]
414414
fn merge_from_sortedness() {
415415
let mut first = StatsSet::new();
416-
first.merge_ordered(&StatsSet::of(Stat::IsStrictSorted, true.into()));
416+
first.merge_ordered(&StatsSet::of(Stat::IsStrictSorted, true));
417417
assert_eq!(first.get(Stat::IsStrictSorted), None);
418418
}
419419

420420
#[test]
421421
fn merge_sortedness() {
422-
let mut first = StatsSet::of(Stat::IsStrictSorted, true.into());
422+
let mut first = StatsSet::of(Stat::IsStrictSorted, true);
423423
first.set(Stat::Max, 1.into());
424-
let mut second = StatsSet::of(Stat::IsStrictSorted, true.into());
424+
let mut second = StatsSet::of(Stat::IsStrictSorted, true);
425425
second.set(Stat::Min, 2.into());
426426
first.merge_ordered(&second);
427427
assert_eq!(first.get(Stat::IsStrictSorted).cloned(), Some(true.into()));
428428
}
429429

430430
#[test]
431431
fn merge_sortedness_out_of_order() {
432-
let mut first = StatsSet::of(Stat::IsStrictSorted, true.into());
432+
let mut first = StatsSet::of(Stat::IsStrictSorted, true);
433433
first.set(Stat::Min, 1.into());
434-
let mut second = StatsSet::of(Stat::IsStrictSorted, true.into());
434+
let mut second = StatsSet::of(Stat::IsStrictSorted, true);
435435
second.set(Stat::Max, 2.into());
436436
second.merge_ordered(&first);
437437
assert_eq!(
@@ -442,9 +442,9 @@ mod test {
442442

443443
#[test]
444444
fn merge_sortedness_only_one_sorted() {
445-
let mut first = StatsSet::of(Stat::IsStrictSorted, true.into());
445+
let mut first = StatsSet::of(Stat::IsStrictSorted, true);
446446
first.set(Stat::Max, 1.into());
447-
let mut second = StatsSet::of(Stat::IsStrictSorted, false.into());
447+
let mut second = StatsSet::of(Stat::IsStrictSorted, false);
448448
second.set(Stat::Min, 2.into());
449449
first.merge_ordered(&second);
450450
assert_eq!(
@@ -455,9 +455,9 @@ mod test {
455455

456456
#[test]
457457
fn merge_sortedness_missing_min() {
458-
let mut first = StatsSet::of(Stat::IsStrictSorted, true.into());
458+
let mut first = StatsSet::of(Stat::IsStrictSorted, true);
459459
first.set(Stat::Max, 1.into());
460-
let second = StatsSet::of(Stat::IsStrictSorted, true.into());
460+
let second = StatsSet::of(Stat::IsStrictSorted, true);
461461
first.merge_ordered(&second);
462462
assert_eq!(first.get(Stat::IsStrictSorted).cloned(), None);
463463
}

0 commit comments

Comments
 (0)