11// SPDX-License-Identifier: Apache-2.0
22// SPDX-FileCopyrightText: Copyright the Vortex contributors
33
4+ #![ deny( missing_docs) ]
5+
6+ //! Vortex's [BtrBlocks]-inspired adaptive compression framework.
7+ //!
8+ //! This crate provides a sophisticated multi-level compression system that adaptively selects
9+ //! optimal compression schemes based on data characteristics. The compressor analyzes arrays
10+ //! to determine the best encoding strategy, supporting cascaded compression with multiple
11+ //! encoding layers for maximum efficiency.
12+ //!
13+ //! # Key Features
14+ //!
15+ //! - **Adaptive Compression**: Automatically selects the best compression scheme based on data patterns
16+ //! - **Type-Specific Compressors**: Specialized compression for integers, floats, strings, and temporal data
17+ //! - **Cascaded Encoding**: Multiple compression layers can be applied for optimal results
18+ //! - **Statistical Analysis**: Uses data sampling and statistics to predict compression ratios
19+ //! - **Recursive Structure Handling**: Compresses nested structures like structs and lists
20+ //!
21+ //! # Example
22+ //!
23+ //! ```rust
24+ //! use vortex_btrblocks::BtrBlocksCompressor;
25+ //! use vortex_array::Array;
26+ //!
27+ //! let compressor = BtrBlocksCompressor::default();
28+ //! // let compressed = compressor.compress(&array)?;
29+ //! ```
30+ //!
31+ //! [BtrBlocks]: https://www.cs.cit.tum.de/fileadmin/w00cfj/dis/papers/btrblocks.pdf
32+
433use std:: fmt:: Debug ;
534use std:: hash:: Hash ;
635
@@ -15,20 +44,24 @@ use vortex_dtype::{DType, Nullability};
1544use vortex_error:: { VortexResult , VortexUnwrap } ;
1645
1746use crate :: decimal:: compress_decimal;
18- pub use crate :: float:: FloatCompressor ;
19- pub use crate :: integer:: IntCompressor ;
20- pub use crate :: string:: StringCompressor ;
47+ pub use crate :: float:: dictionary:: dictionary_encode as float_dictionary_encode;
48+ pub use crate :: float:: { FloatCompressor , FloatStats } ;
49+ pub use crate :: integer:: dictionary:: dictionary_encode as integer_dictionary_encode;
50+ pub use crate :: integer:: { IntCompressor , IntegerStats } ;
51+ pub use crate :: string:: { StringCompressor , StringStats } ;
2152pub use crate :: temporal:: compress_temporal;
2253
2354mod decimal;
2455mod float;
25- pub mod integer;
56+ mod integer;
2657mod patches;
2758mod sample;
2859mod string;
2960mod temporal;
3061
62+ /// Configures how stats are generated.
3163pub struct GenerateStatsOptions {
64+ /// Whether distinct values should be counted during stats generation.
3265 pub count_distinct_values : bool ,
3366 // pub count_runs: bool,
3467 // should this be scheme-specific?
@@ -47,33 +80,39 @@ const SAMPLE_SIZE: u32 = 64;
4780
4881/// Stats for the compressor.
4982pub trait CompressorStats : Debug + Clone {
83+ /// The type of the underlying source array vtable.
5084 type ArrayVTable : VTable ;
5185
52- /// Generate stats with default options
86+ /// Generates stats with default options.
5387 fn generate ( input : & <Self :: ArrayVTable as VTable >:: Array ) -> Self {
5488 Self :: generate_opts ( input, GenerateStatsOptions :: default ( ) )
5589 }
5690
57- /// Generate stats with provided options
91+ /// Generates stats with provided options.
5892 fn generate_opts (
5993 input : & <Self :: ArrayVTable as VTable >:: Array ,
6094 opts : GenerateStatsOptions ,
6195 ) -> Self ;
6296
97+ /// Returns the underlying source array that statistics were generated from.
6398 fn source ( & self ) -> & <Self :: ArrayVTable as VTable >:: Array ;
6499
100+ /// Samples the source array and returns stats for the sample, generated with default options.
65101 fn sample ( & self , sample_size : u32 , sample_count : u32 ) -> Self {
66102 self . sample_opts ( sample_size, sample_count, GenerateStatsOptions :: default ( ) )
67103 }
68104
105+ /// Samples the source array and returns stats for the sample, generated with the provided options.
69106 fn sample_opts ( & self , sample_size : u32 , sample_count : u32 , opts : GenerateStatsOptions ) -> Self ;
70107}
71108
72109/// Top-level compression scheme trait.
73110///
74111/// Variants are specialized for each data type, e.g. see `IntegerScheme`, `FloatScheme`, etc.
75112pub trait Scheme : Debug {
113+ /// Type of the stats generated by the compression scheme.
76114 type StatsType : CompressorStats ;
115+ /// Type of the code used to uniquely identify the compression scheme.
77116 type CodeType : Copy + Eq + Hash ;
78117
79118 /// Scheme unique identifier.
@@ -117,17 +156,7 @@ pub trait Scheme: Debug {
117156 ) -> VortexResult < ArrayRef > ;
118157}
119158
120- pub struct SchemeTree {
121- /// Scheme to use for the array.
122- ///
123- /// This is in the type-specific code space, for example either the `IntCompressor` or
124- /// `FloatCompressor` code space.
125- pub scheme : u8 ,
126- /// Specified schemes to use for children.
127- pub children : Vec < SchemeTree > ,
128- }
129-
130- pub fn estimate_compression_ratio_with_sampling < T : Scheme + ?Sized > (
159+ fn estimate_compression_ratio_with_sampling < T : Scheme + ?Sized > (
131160 compressor : & T ,
132161 stats : & T :: StatsType ,
133162 is_sample : bool ,
@@ -172,20 +201,32 @@ const MAX_CASCADE: usize = 3;
172201
173202/// A compressor for a particular input type.
174203///
175- /// The `Input` type should be one of the canonical array variants, e.g. `PrimitiveArray`.
204+ /// This trait defines the interface for type-specific compressors that can adaptively
205+ /// choose and apply compression schemes based on data characteristics. Compressors
206+ /// analyze input arrays, select optimal compression schemes, and handle cascading
207+ /// compression with multiple encoding layers.
176208///
177- /// Compressors expose a `compress` function.
209+ /// The compressor works by generating statistics on the input data, evaluating
210+ /// available compression schemes, and selecting the one with the best compression ratio.
178211pub trait Compressor {
212+ /// The VTable type for arrays this compressor operates on.
179213 type ArrayVTable : VTable ;
214+ /// The compression scheme type used by this compressor.
180215 type SchemeType : Scheme < StatsType = Self :: StatsType > + ?Sized ;
181-
182- // Stats type instead?
216+ /// The statistics type used to analyze arrays for compression.
183217 type StatsType : CompressorStats < ArrayVTable = Self :: ArrayVTable > ;
184218
219+ /// Returns all available compression schemes for this compressor.
185220 fn schemes ( ) -> & ' static [ & ' static Self :: SchemeType ] ;
221+ /// Returns the default fallback compression scheme.
186222 fn default_scheme ( ) -> & ' static Self :: SchemeType ;
223+ /// Returns the scheme code for dictionary compression.
187224 fn dict_scheme_code ( ) -> <Self :: SchemeType as Scheme >:: CodeType ;
188225
226+ /// Compresses an array using the optimal compression scheme.
227+ ///
228+ /// Generates statistics on the input array, selects the best compression scheme,
229+ /// and applies it. Returns the original array if compression would increase size.
189230 fn compress (
190231 array : & <Self :: ArrayVTable as VTable >:: Array ,
191232 is_sample : bool ,
@@ -222,6 +263,11 @@ pub trait Compressor {
222263 }
223264 }
224265
266+ /// Selects the best compression scheme based on expected compression ratios.
267+ ///
268+ /// Evaluates all available schemes against the provided statistics and returns
269+ /// the one with the highest compression ratio. Falls back to the default scheme
270+ /// if no scheme provides compression benefits.
225271 fn choose_scheme (
226272 stats : & Self :: StatsType ,
227273 is_sample : bool ,
@@ -272,12 +318,41 @@ pub trait Compressor {
272318 }
273319}
274320
321+ /// The main compressor type implementing BtrBlocks-inspired compression.
322+ ///
323+ /// This compressor applies adaptive compression schemes to arrays based on their data types
324+ /// and characteristics. It recursively compresses nested structures like structs and lists,
325+ /// and chooses optimal compression schemes for primitive types.
326+ ///
327+ /// The compressor works by:
328+ /// 1. Canonicalizing input arrays to a standard representation
329+ /// 2. Analyzing data characteristics to choose optimal compression schemes
330+ /// 3. Recursively compressing nested structures
331+ /// 4. Applying type-specific compression for primitives, strings, and temporal data
332+ ///
333+ /// # Examples
334+ ///
335+ /// ```rust
336+ /// use vortex_btrblocks::BtrBlocksCompressor;
337+ /// use vortex_array::Array;
338+ ///
339+ /// let compressor = BtrBlocksCompressor::default();
340+ /// // let compressed = compressor.compress(&array)?;
341+ /// ```
275342#[ derive( Default , Debug , Clone ) ]
276343pub struct BtrBlocksCompressor {
344+ /// Whether to exclude integer arrays from dictionary encoding.
345+ ///
346+ /// When `true`, integer arrays will not use dictionary compression schemes,
347+ /// which can be useful when the data has high cardinality or when dictionary
348+ /// overhead would exceed compression benefits.
277349 pub exclude_int_dict_encoding : bool ,
278350}
279351
280352impl BtrBlocksCompressor {
353+ /// Compresses an array using BtrBlocks-inspired compression.
354+ ///
355+ /// First canonicalizes and compacts the array, then applies optimal compression schemes.
281356 pub fn compress ( & self , array : & dyn Array ) -> VortexResult < ArrayRef > {
282357 // Canonicalize the array
283358 let canonical = array. to_canonical ( ) ;
@@ -288,6 +363,9 @@ impl BtrBlocksCompressor {
288363 self . compress_canonical ( compact)
289364 }
290365
366+ /// Compresses a canonical array by dispatching to type-specific compressors.
367+ ///
368+ /// Recursively compresses nested structures and applies optimal schemes for each data type.
291369 pub fn compress_canonical ( & self , array : Canonical ) -> VortexResult < ArrayRef > {
292370 match array {
293371 Canonical :: Null ( null_array) => Ok ( null_array. into_array ( ) ) ,
0 commit comments