Skip to content

Commit b05207e

Browse files
authored
feat: eagerly compute pruning stats during compression (#1252)
follow up to #1236
1 parent 0a1a715 commit b05207e

File tree

2 files changed

+16
-1
lines changed

2 files changed

+16
-1
lines changed

vortex-sampling-compressor/src/compressors/chunked.rs

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@ use std::sync::Arc;
44
use log::warn;
55
use vortex_array::aliases::hash_set::HashSet;
66
use vortex_array::array::{Chunked, ChunkedArray};
7+
use vortex_array::compress::compute_pruning_stats;
78
use vortex_array::encoding::EncodingRef;
89
use vortex_array::stats::ArrayStatistics as _;
910
use vortex_array::{Array, ArrayDType, ArrayDef, IntoArray};
@@ -116,6 +117,12 @@ impl ChunkedCompressor {
116117
)?;
117118
let mut compressed_chunks = Vec::with_capacity(less_chunked.nchunks());
118119
for (index, chunk) in less_chunked.chunks().enumerate() {
120+
// these are extremely valuable when reading/writing, but are potentially much more expensive
121+
// to compute post-compression. That's because not all encodings implement stats, so we would
122+
// potentially have to canonicalize during writes just to get stats, which would be silly.
123+
// Also, we only really require them for column chunks, not for every array.
124+
compute_pruning_stats(&chunk)?;
125+
119126
let like = previous.as_ref().map(|(like, _)| like);
120127
let (compressed_chunk, tree) = ctx
121128
.named(&format!("chunk-{}", index))

vortex-sampling-compressor/src/compressors/struct_.rs

Lines changed: 9 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
use itertools::Itertools;
22
use vortex_array::aliases::hash_set::HashSet;
33
use vortex_array::array::{Struct, StructArray};
4+
use vortex_array::compress::compute_pruning_stats;
45
use vortex_array::encoding::EncodingRef;
56
use vortex_array::stats::ArrayStatistics as _;
67
use vortex_array::variants::StructArrayTrait;
@@ -45,7 +46,14 @@ impl EncodingCompressor for StructCompressor {
4546
let (arrays, trees) = array
4647
.children()
4748
.zip_eq(children_trees)
48-
.map(|(array, like)| ctx.compress(&array, like.as_ref()))
49+
.map(|(array, like)| {
50+
// these are extremely valuable when reading/writing, but are potentially much more expensive
51+
// to compute post-compression. That's because not all encodings implement stats, so we would
52+
// potentially have to canonicalize during writes just to get stats, which would be silly.
53+
// Also, we only really require them for column chunks, not for every array.
54+
compute_pruning_stats(&array)?;
55+
ctx.compress(&array, like.as_ref())
56+
})
4957
.process_results(|iter| iter.map(|x| (x.array, x.path)).unzip())?;
5058

5159
Ok(CompressedArray::compressed(

0 commit comments

Comments (0)