Skip to content

Commit ab7d4ef

Browse files
authored
Use more stats in the btrblocks compressor (#2720)
taken out of #2718
1 parent 0ba29b3 commit ab7d4ef

File tree

3 files changed

+31
-38
lines changed

3 files changed

+31
-38
lines changed

vortex-btrblocks/src/float/stats.rs

Lines changed: 8 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -135,11 +135,12 @@ where
135135
};
136136
}
137137

138-
let validity = array.validity_mask().vortex_expect("logical_validity");
139-
let null_count = validity.false_count();
140-
let value_count = validity.true_count();
141-
let mut min = T::max_value();
142-
let mut max = T::min_value();
138+
let null_count = array
139+
.statistics()
140+
.compute_null_count()
141+
.vortex_expect("null count");
142+
let value_count = array.len() - null_count;
143+
143144
// Keep a HashMap of T, then convert the keys into PValue afterward since value is
144145
// so much more efficient to hash and search for.
145146
let mut distinct_values = if count_distinct_values {
@@ -148,6 +149,8 @@ where
148149
HashMap::with_hasher(FxBuildHasher)
149150
};
150151

152+
let validity = array.validity_mask().vortex_expect("logical_validity");
153+
151154
let mut runs = 1;
152155
let head_idx = validity
153156
.first()
@@ -159,9 +162,6 @@ where
159162
match validity.boolean_buffer() {
160163
AllOr::All => {
161164
for value in first_valid_buff {
162-
min = min.min(value);
163-
max = max.max(value);
164-
165165
if count_distinct_values {
166166
*distinct_values.entry(value.to_bits()).or_insert(0) += 1;
167167
}
@@ -179,9 +179,6 @@ where
179179
.zip_eq(v.slice(head_idx, array.len() - head_idx).iter())
180180
{
181181
if valid {
182-
min = min.min(value);
183-
max = max.max(value);
184-
185182
if count_distinct_values {
186183
*distinct_values.entry(value.to_bits()).or_insert(0) += 1;
187184
}

vortex-btrblocks/src/integer/stats.rs

Lines changed: 17 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,13 @@ use num_traits::PrimInt;
55
use rustc_hash::FxBuildHasher;
66
use vortex_array::aliases::hash_map::HashMap;
77
use vortex_array::arrays::PrimitiveArray;
8+
use vortex_array::stats::Stat;
89
use vortex_array::variants::PrimitiveArrayTrait;
910
use vortex_array::{Array, ToCanonical};
1011
use vortex_dtype::{NativePType, match_each_integer_ptype};
11-
use vortex_error::{VortexExpect, VortexUnwrap};
12+
use vortex_error::{VortexError, VortexExpect, VortexUnwrap};
1213
use vortex_mask::AllOr;
13-
use vortex_scalar::PValue;
14+
use vortex_scalar::{PValue, ScalarValue};
1415

1516
use crate::sample::sample;
1617
use crate::{CompressorStats, GenerateStatsOptions};
@@ -150,11 +151,9 @@ impl CompressorStats for IntegerStats {
150151
}
151152
}
152153

153-
fn typed_int_stats<T: NativePType + Hash + PrimInt>(
154-
array: &PrimitiveArray,
155-
count_distinct_values: bool,
156-
) -> IntegerStats
154+
fn typed_int_stats<T>(array: &PrimitiveArray, count_distinct_values: bool) -> IntegerStats
157155
where
156+
T: NativePType + Hash + PrimInt + for<'a> TryFrom<&'a ScalarValue, Error = VortexError>,
158157
TypedStats<T>: Into<ErasedStats>,
159158
{
160159
// Special case: empty array
@@ -204,8 +203,6 @@ where
204203
let head = buffer[head_idx];
205204

206205
let mut loop_state = LoopState {
207-
min: head,
208-
max: head,
209206
distinct_values: if count_distinct_values {
210207
HashMap::with_capacity_and_hasher(array.len() / 2, FxBuildHasher)
211208
} else {
@@ -281,9 +278,19 @@ where
281278
u32::MAX
282279
};
283280

281+
let min = array
282+
.statistics()
283+
.compute_as::<T>(Stat::Min)
284+
.vortex_expect("min should be computed");
285+
286+
let max = array
287+
.statistics()
288+
.compute_as::<T>(Stat::Max)
289+
.vortex_expect("max should be computed");
290+
284291
let typed = TypedStats {
285-
min: loop_state.min,
286-
max: loop_state.max,
292+
min,
293+
max,
287294
distinct_values: loop_state.distinct_values,
288295
top_value,
289296
top_count,
@@ -307,8 +314,6 @@ where
307314
}
308315

309316
struct LoopState<T> {
310-
min: T,
311-
max: T,
312317
prev: T,
313318
runs: u32,
314319
distinct_values: HashMap<T, u32, FxBuildHasher>,
@@ -321,9 +326,6 @@ fn inner_loop_nonnull<T: PrimInt + Hash>(
321326
state: &mut LoopState<T>,
322327
) {
323328
for &value in values {
324-
state.min = state.min.min(value);
325-
state.max = state.max.max(value);
326-
327329
if count_distinct_values {
328330
*state.distinct_values.entry(value).or_insert(0) += 1;
329331
}
@@ -344,9 +346,6 @@ fn inner_loop_nullable<T: PrimInt + Hash>(
344346
) {
345347
for (idx, &value) in values.iter().enumerate() {
346348
if is_valid.value(idx) {
347-
state.min = state.min.min(value);
348-
state.max = state.max.max(value);
349-
350349
if count_distinct_values {
351350
*state.distinct_values.entry(value).or_insert(0) += 1;
352351
}
@@ -368,9 +367,6 @@ fn inner_loop_naive<T: PrimInt + Hash>(
368367
) {
369368
for (idx, &value) in values.iter().enumerate() {
370369
if is_valid.value(idx) {
371-
state.min = state.min.min(value);
372-
state.max = state.max.max(value);
373-
374370
if count_distinct_values {
375371
*state.distinct_values.entry(value).or_insert(0) += 1;
376372
}

vortex-btrblocks/src/string.rs

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -29,15 +29,15 @@ fn estimate_distinct_count(strings: &VarBinViewArray) -> u32 {
2929
// Iterate the views. Two strings which are equal must have the same first 8-bytes.
3030
// NOTE: there are cases where this performs pessimally, e.g. when we have strings that all
3131
// share a 4-byte prefix and have the same length.
32-
let mut disinct = HashSet::with_capacity(views.len() / 2);
32+
let mut distinct = HashSet::with_capacity(views.len() / 2);
3333
views.iter().for_each(|&view| {
3434
// SAFETY: we're doing bitwise manipulations. Did you expect that to be safe??
3535
let view_u128: u128 = unsafe { std::mem::transmute(view) };
3636
let len_and_prefix = view_u128 as u64;
37-
disinct.insert(len_and_prefix);
37+
distinct.insert(len_and_prefix);
3838
});
3939

40-
disinct
40+
distinct
4141
.len()
4242
.try_into()
4343
.vortex_expect("distinct count must fit in u32")
@@ -48,9 +48,9 @@ impl CompressorStats for StringStats {
4848

4949
fn generate_opts(input: &Self::ArrayType, opts: GenerateStatsOptions) -> Self {
5050
let null_count = input
51-
.validity()
52-
.null_count(input.len())
53-
.vortex_expect("null_count");
51+
.statistics()
52+
.compute_null_count()
53+
.vortex_expect("null count");
5454
let value_count = input.len() - null_count;
5555
let estimated_distinct = if opts.count_distinct_values {
5656
estimate_distinct_count(input)

0 commit comments

Comments
 (0)