Minimal docs for vortex-btrblocks (#4730)

AdamGS · web-flow · commit 060b2d6f9ead · 2025-09-23T14:58:51.000Z
Signed-off-by: Adam Gutglick <adam@spiraldb.com>
diff --git a/vortex-btrblocks/benches/compress.rs b/vortex-btrblocks/benches/compress.rs
@@ -11,8 +11,7 @@ mod benchmarks {
     use rand::prelude::StdRng;
     use rand::{RngCore, SeedableRng};
     use vortex_array::{ArrayRef, IntoArray, ToCanonical};
-    use vortex_btrblocks::Compressor;
-    use vortex_btrblocks::integer::IntCompressor;
+    use vortex_btrblocks::{Compressor, IntCompressor};
     use vortex_buffer::buffer_mut;
     use vortex_utils::aliases::hash_set::HashSet;
 
diff --git a/vortex-btrblocks/benches/dict_encode.rs b/vortex-btrblocks/benches/dict_encode.rs
@@ -7,9 +7,7 @@ use divan::Bencher;
 use vortex_array::IntoArray;
 use vortex_array::arrays::{BoolArray, PrimitiveArray};
 use vortex_array::validity::Validity;
-use vortex_btrblocks::CompressorStats;
-use vortex_btrblocks::integer::IntegerStats;
-use vortex_btrblocks::integer::dictionary::dictionary_encode;
+use vortex_btrblocks::{CompressorStats, IntegerStats, integer_dictionary_encode};
 use vortex_buffer::BufferMut;
 use vortex_dict::builders::dict_encode;
 
@@ -38,7 +36,7 @@ fn encode_generic(bencher: Bencher) {
 fn encode_specialized(bencher: Bencher) {
     bencher
         .with_inputs(|| IntegerStats::generate(&make_array()))
-        .bench_values(|stats| dictionary_encode(&stats));
+        .bench_values(|stats| integer_dictionary_encode(&stats));
 }
 
 fn main() {
diff --git a/vortex-btrblocks/benches/stats_calc.rs b/vortex-btrblocks/benches/stats_calc.rs
@@ -10,8 +10,7 @@ mod benchmarks {
     use divan::Bencher;
     use vortex_array::arrays::PrimitiveArray;
     use vortex_array::validity::Validity;
-    use vortex_btrblocks::integer::IntegerStats;
-    use vortex_btrblocks::{CompressorStats, GenerateStatsOptions};
+    use vortex_btrblocks::{CompressorStats, GenerateStatsOptions, IntegerStats};
     use vortex_buffer::{Buffer, BufferMut};
 
     fn generate_dataset(max_run: u32, distinct: u32) -> Buffer<u32> {
diff --git a/vortex-btrblocks/src/float.rs b/vortex-btrblocks/src/float.rs
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
-mod dictionary;
+pub(crate) mod dictionary;
 mod stats;
 
 use vortex_alp::{ALPArray, ALPEncoding, ALPVTable, RDEncoder};
@@ -11,7 +11,7 @@ use vortex_dict::DictArray;
 use vortex_dtype::PType;
 use vortex_error::{VortexExpect, VortexResult, vortex_panic};
 
-use self::stats::FloatStats;
+pub use self::stats::FloatStats;
 use crate::float::dictionary::dictionary_encode;
 use crate::integer::{IntCompressor, IntegerStats};
 use crate::patches::compress_patches;
@@ -24,6 +24,7 @@ pub trait FloatScheme: Scheme<StatsType = FloatStats, CodeType = FloatCode> {}
 
 impl<T> FloatScheme for T where T: Scheme<StatsType = FloatStats, CodeType = FloatCode> {}
 
+/// [`Compressor`] for floating-point numbers.
 pub struct FloatCompressor;
 
 impl Compressor for FloatCompressor {
diff --git a/vortex-btrblocks/src/float/dictionary.rs b/vortex-btrblocks/src/float/dictionary.rs
@@ -43,6 +43,7 @@ macro_rules! typed_encode {
     }};
 }
 
+/// Compresses a floating-point array into a dictionary array according to the attached stats.
 pub fn dictionary_encode(stats: &FloatStats) -> DictArray {
     let validity = stats.src.validity();
     match &stats.distinct_values {
diff --git a/vortex-btrblocks/src/float/stats.rs b/vortex-btrblocks/src/float/stats.rs
@@ -43,7 +43,7 @@ impl_from_typed!(f16, ErasedDistinctValues::F16);
 impl_from_typed!(f32, ErasedDistinctValues::F32);
 impl_from_typed!(f64, ErasedDistinctValues::F64);
 
-// We want to allow not rebuilding all of the stats every time.
+/// Array of floating-point numbers and relevant stats for compression.
 #[derive(Debug, Clone)]
 pub struct FloatStats {
     pub(super) src: PrimitiveArray,
diff --git a/vortex-btrblocks/src/integer.rs b/vortex-btrblocks/src/integer.rs
@@ -27,6 +27,7 @@ use crate::{
     estimate_compression_ratio_with_sampling,
 };
 
+/// [`Compressor`] for signed and unsigned integers.
 pub struct IntCompressor;
 
 impl Compressor for IntCompressor {
@@ -57,7 +58,7 @@ impl Compressor for IntCompressor {
 }
 
 impl IntCompressor {
-    pub fn compress_no_dict(
+    pub(crate) fn compress_no_dict(
         array: &PrimitiveArray,
         is_sample: bool,
         allowed_cascading: usize,
diff --git a/vortex-btrblocks/src/integer/dictionary.rs b/vortex-btrblocks/src/integer/dictionary.rs
@@ -43,6 +43,7 @@ macro_rules! typed_encode {
     }};
 }
 
+/// Compresses an integer array into a dictionary array according to the attached stats.
 #[allow(clippy::cognitive_complexity)]
 pub fn dictionary_encode(stats: &IntegerStats) -> DictArray {
     // We need to preserve the nullability somehow from the original
diff --git a/vortex-btrblocks/src/integer/stats.rs b/vortex-btrblocks/src/integer/stats.rs
@@ -119,6 +119,7 @@ impl_from_typed!(i16, ErasedStats::I16);
 impl_from_typed!(i32, ErasedStats::I32);
 impl_from_typed!(i64, ErasedStats::I64);
 
+/// Array of integers and relevant stats for compression.
 #[derive(Clone, Debug)]
 pub struct IntegerStats {
     pub(super) src: PrimitiveArray,
diff --git a/vortex-btrblocks/src/lib.rs b/vortex-btrblocks/src/lib.rs
@@ -1,6 +1,35 @@
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
+#![deny(missing_docs)]
+
+//! Vortex's [BtrBlocks]-inspired adaptive compression framework.
+//!
+//! This crate provides a sophisticated multi-level compression system that adaptively selects
+//! optimal compression schemes based on data characteristics. The compressor analyzes arrays
+//! to determine the best encoding strategy, supporting cascaded compression with multiple
+//! encoding layers for maximum efficiency.
+//!
+//! # Key Features
+//!
+//! - **Adaptive Compression**: Automatically selects the best compression scheme based on data patterns
+//! - **Type-Specific Compressors**: Specialized compression for integers, floats, strings, and temporal data
+//! - **Cascaded Encoding**: Multiple compression layers can be applied for optimal results
+//! - **Statistical Analysis**: Uses data sampling and statistics to predict compression ratios
+//! - **Recursive Structure Handling**: Compresses nested structures like structs and lists
+//!
+//! # Example
+//!
+//! ```rust
+//! use vortex_btrblocks::BtrBlocksCompressor;
+//! use vortex_array::Array;
+//!
+//! let compressor = BtrBlocksCompressor::default();
+//! // let compressed = compressor.compress(&array)?;
+//! ```
+//!
+//! [BtrBlocks]: https://www.cs.cit.tum.de/fileadmin/w00cfj/dis/papers/btrblocks.pdf
+
 use std::fmt::Debug;
 use std::hash::Hash;
 
@@ -15,20 +44,24 @@ use vortex_dtype::{DType, Nullability};
 use vortex_error::{VortexResult, VortexUnwrap};
 
 use crate::decimal::compress_decimal;
-pub use crate::float::FloatCompressor;
-pub use crate::integer::IntCompressor;
-pub use crate::string::StringCompressor;
+pub use crate::float::dictionary::dictionary_encode as float_dictionary_encode;
+pub use crate::float::{FloatCompressor, FloatStats};
+pub use crate::integer::dictionary::dictionary_encode as integer_dictionary_encode;
+pub use crate::integer::{IntCompressor, IntegerStats};
+pub use crate::string::{StringCompressor, StringStats};
 pub use crate::temporal::compress_temporal;
 
 mod decimal;
 mod float;
-pub mod integer;
+mod integer;
 mod patches;
 mod sample;
 mod string;
 mod temporal;
 
+/// Configures how stats are generated.
 pub struct GenerateStatsOptions {
+    /// Whether distinct values should be counted during stats generation.
     pub count_distinct_values: bool,
     // pub count_runs: bool,
     // should this be scheme-specific?
@@ -47,33 +80,39 @@ const SAMPLE_SIZE: u32 = 64;
 
 /// Stats for the compressor.
 pub trait CompressorStats: Debug + Clone {
+    /// The type of the underlying source array vtable.
     type ArrayVTable: VTable;
 
-    /// Generate stats with default options
+    /// Generates stats with default options.
     fn generate(input: &<Self::ArrayVTable as VTable>::Array) -> Self {
         Self::generate_opts(input, GenerateStatsOptions::default())
     }
 
-    /// Generate stats with provided options
+    /// Generates stats with provided options.
     fn generate_opts(
         input: &<Self::ArrayVTable as VTable>::Array,
         opts: GenerateStatsOptions,
     ) -> Self;
 
+    /// Returns the underlying source array that statistics were generated from.
     fn source(&self) -> &<Self::ArrayVTable as VTable>::Array;
 
+    /// Samples the array with default options.
     fn sample(&self, sample_size: u32, sample_count: u32) -> Self {
         self.sample_opts(sample_size, sample_count, GenerateStatsOptions::default())
     }
 
+    /// Samples the array with provided options.
     fn sample_opts(&self, sample_size: u32, sample_count: u32, opts: GenerateStatsOptions) -> Self;
 }
 
 /// Top-level compression scheme trait.
 ///
 /// Variants are specialized for each data type, e.g. see `IntegerScheme`, `FloatScheme`, etc.
 pub trait Scheme: Debug {
+    /// Type of the stats generated by the compression scheme.
     type StatsType: CompressorStats;
+    /// Type of the code used to uniquely identify the compression scheme.
     type CodeType: Copy + Eq + Hash;
 
     /// Scheme unique identifier.
@@ -117,17 +156,7 @@ pub trait Scheme: Debug {
     ) -> VortexResult<ArrayRef>;
 }
 
-pub struct SchemeTree {
-    /// Scheme to use for the array.
-    ///
-    /// This is in the type-specific code space, for example either the `IntCompressor` or
-    /// `FloatCompressor` code space.
-    pub scheme: u8,
-    /// Specified schemes to use for children.
-    pub children: Vec<SchemeTree>,
-}
-
-pub fn estimate_compression_ratio_with_sampling<T: Scheme + ?Sized>(
+fn estimate_compression_ratio_with_sampling<T: Scheme + ?Sized>(
     compressor: &T,
     stats: &T::StatsType,
     is_sample: bool,
@@ -172,20 +201,32 @@ const MAX_CASCADE: usize = 3;
 
 /// A compressor for a particular input type.
 ///
-/// The `Input` type should be one of the canonical array variants, e.g. `PrimitiveArray`.
+/// This trait defines the interface for type-specific compressors that can adaptively
+/// choose and apply compression schemes based on data characteristics. Compressors
+/// analyze input arrays, select optimal compression schemes, and handle cascading
+/// compression with multiple encoding layers.
 ///
-/// Compressors expose a `compress` function.
+/// The compressor works by generating statistics on the input data, evaluating
+/// available compression schemes, and selecting the one with the best compression ratio.
 pub trait Compressor {
+    /// The VTable type for arrays this compressor operates on.
     type ArrayVTable: VTable;
+    /// The compression scheme type used by this compressor.
     type SchemeType: Scheme<StatsType = Self::StatsType> + ?Sized;
-
-    // Stats type instead?
+    /// The statistics type used to analyze arrays for compression.
     type StatsType: CompressorStats<ArrayVTable = Self::ArrayVTable>;
 
+    /// Returns all available compression schemes for this compressor.
     fn schemes() -> &'static [&'static Self::SchemeType];
+    /// Returns the default fallback compression scheme.
     fn default_scheme() -> &'static Self::SchemeType;
+    /// Returns the scheme code for dictionary compression.
     fn dict_scheme_code() -> <Self::SchemeType as Scheme>::CodeType;
 
+    /// Compresses an array using the optimal compression scheme.
+    ///
+    /// Generates statistics on the input array, selects the best compression scheme,
+    /// and applies it. Returns the original array if compression would increase size.
     fn compress(
         array: &<Self::ArrayVTable as VTable>::Array,
         is_sample: bool,
@@ -222,6 +263,11 @@ pub trait Compressor {
         }
     }
 
+    /// Selects the best compression scheme based on expected compression ratios.
+    ///
+    /// Evaluates all available schemes against the provided statistics and returns
+    /// the one with the highest compression ratio. Falls back to the default scheme
+    /// if no scheme provides compression benefits.
     fn choose_scheme(
         stats: &Self::StatsType,
         is_sample: bool,
@@ -272,12 +318,41 @@ pub trait Compressor {
     }
 }
 
+/// The main compressor type implementing BtrBlocks-inspired compression.
+///
+/// This compressor applies adaptive compression schemes to arrays based on their data types
+/// and characteristics. It recursively compresses nested structures like structs and lists,
+/// and chooses optimal compression schemes for primitive types.
+///
+/// The compressor works by:
+/// 1. Canonicalizing input arrays to a standard representation
+/// 2. Analyzing data characteristics to choose optimal compression schemes
+/// 3. Recursively compressing nested structures
+/// 4. Applying type-specific compression for primitives, strings, and temporal data
+///
+/// # Examples
+///
+/// ```rust
+/// use vortex_btrblocks::BtrBlocksCompressor;
+/// use vortex_array::Array;
+///
+/// let compressor = BtrBlocksCompressor::default();
+/// // let compressed = compressor.compress(&array)?;
+/// ```
 #[derive(Default, Debug, Clone)]
 pub struct BtrBlocksCompressor {
+    /// Whether to exclude ints from dictionary encoding.
+    ///
+    /// When `true`, integer arrays will not use dictionary compression schemes,
+    /// which can be useful when the data has high cardinality or when dictionary
+    /// overhead would exceed compression benefits.
     pub exclude_int_dict_encoding: bool,
 }
 
 impl BtrBlocksCompressor {
+    /// Compresses an array using BtrBlocks-inspired compression.
+    ///
+    /// First canonicalizes and compacts the array, then applies optimal compression schemes.
     pub fn compress(&self, array: &dyn Array) -> VortexResult<ArrayRef> {
         // Canonicalize the array
         let canonical = array.to_canonical();
@@ -288,6 +363,9 @@ impl BtrBlocksCompressor {
         self.compress_canonical(compact)
     }
 
+    /// Compresses a canonical array by dispatching to type-specific compressors.
+    ///
+    /// Recursively compresses nested structures and applies optimal schemes for each data type.
     pub fn compress_canonical(&self, array: Canonical) -> VortexResult<ArrayRef> {
         match array {
             Canonical::Null(null_array) => Ok(null_array.into_array()),
diff --git a/vortex-btrblocks/src/string.rs b/vortex-btrblocks/src/string.rs
@@ -17,6 +17,7 @@ use crate::{
     estimate_compression_ratio_with_sampling, integer,
 };
 
+/// Array of variable-length byte arrays, and relevant stats for compression.
 #[derive(Clone, Debug)]
 pub struct StringStats {
     src: VarBinViewArray,
@@ -76,6 +77,7 @@ impl CompressorStats for StringStats {
     }
 }
 
+/// [`Compressor`] for strings.
 pub struct StringCompressor;
 
 impl Compressor for StringCompressor {

Original file line number	Diff line number	Diff line change
`@@ -43,6 +43,7 @@ macro_rules! typed_encode {`
`43`	`43`	`}};`
`44`	`44`	`}`
`45`	`45`
	`46`	`+/// Compresses a floating-point array into a dictionary array according to the attached stats.`
`46`	`47`	`pub fn dictionary_encode(stats: &FloatStats) -> DictArray {`
`47`	`48`	`let validity = stats.src.validity();`
`48`	`49`	`match &stats.distinct_values {`