Skip to content

Commit b05207e

Browse files
authored
feat: eagerly compute pruning stats during compression (#1252)
follow up to #1236
1 parent 0a1a715 commit b05207e

File tree

2 files changed

+16
-1
lines changed

2 files changed

+16
-1
lines changed

vortex-sampling-compressor/src/compressors/chunked.rs

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@ use std::sync::Arc;
44
use log::warn;
55
use vortex_array::aliases::hash_set::HashSet;
66
use vortex_array::array::{Chunked, ChunkedArray};
7+
use vortex_array::compress::compute_pruning_stats;
78
use vortex_array::encoding::EncodingRef;
89
use vortex_array::stats::ArrayStatistics as _;
910
use vortex_array::{Array, ArrayDType, ArrayDef, IntoArray};
@@ -116,6 +117,12 @@ impl ChunkedCompressor {
116117
)?;
117118
let mut compressed_chunks = Vec::with_capacity(less_chunked.nchunks());
118119
for (index, chunk) in less_chunked.chunks().enumerate() {
120+
// these are extremely valuable when reading/writing, but are potentially much more expensive
121+
// to compute post-compression. That's because not all encodings implement stats, so we would
122+
// potentially have to canonicalize during writes just to get stats, which would be silly.
123+
// Also, we only really require them for column chunks, not for every array.
124+
compute_pruning_stats(&chunk)?;
125+
119126
let like = previous.as_ref().map(|(like, _)| like);
120127
let (compressed_chunk, tree) = ctx
121128
.named(&format!("chunk-{}", index))

vortex-sampling-compressor/src/compressors/struct_.rs

Lines changed: 9 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
use itertools::Itertools;
22
use vortex_array::aliases::hash_set::HashSet;
33
use vortex_array::array::{Struct, StructArray};
4+
use vortex_array::compress::compute_pruning_stats;
45
use vortex_array::encoding::EncodingRef;
56
use vortex_array::stats::ArrayStatistics as _;
67
use vortex_array::variants::StructArrayTrait;
@@ -45,7 +46,14 @@ impl EncodingCompressor for StructCompressor {
4546
let (arrays, trees) = array
4647
.children()
4748
.zip_eq(children_trees)
48-
.map(|(array, like)| ctx.compress(&array, like.as_ref()))
49+
.map(|(array, like)| {
50+
// these are extremely valuable when reading/writing, but are potentially much more expensive
51+
// to compute post-compression. That's because not all encodings implement stats, so we would
52+
// potentially have to canonicalize during writes just to get stats, which would be silly.
53+
// Also, we only really require them for column chunks, not for every array.
54+
compute_pruning_stats(&array)?;
55+
ctx.compress(&array, like.as_ref())
56+
})
4957
.process_results(|iter| iter.map(|x| (x.array, x.path)).unzip())?;
5058

5159
Ok(CompressedArray::compressed(

0 commit comments

Comments (0)