Skip to content

Commit 060b2d6

Browse files
authored
Minimal docs for vortex-btrblocks (#4730)
Signed-off-by: Adam Gutglick <[email protected]>
1 parent 606b33c commit 060b2d6

File tree

11 files changed

+114
-33
lines changed

11 files changed

+114
-33
lines changed

vortex-btrblocks/benches/compress.rs

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,7 @@ mod benchmarks {
1111
use rand::prelude::StdRng;
1212
use rand::{RngCore, SeedableRng};
1313
use vortex_array::{ArrayRef, IntoArray, ToCanonical};
14-
use vortex_btrblocks::Compressor;
15-
use vortex_btrblocks::integer::IntCompressor;
14+
use vortex_btrblocks::{Compressor, IntCompressor};
1615
use vortex_buffer::buffer_mut;
1716
use vortex_utils::aliases::hash_set::HashSet;
1817

vortex-btrblocks/benches/dict_encode.rs

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,7 @@ use divan::Bencher;
77
use vortex_array::IntoArray;
88
use vortex_array::arrays::{BoolArray, PrimitiveArray};
99
use vortex_array::validity::Validity;
10-
use vortex_btrblocks::CompressorStats;
11-
use vortex_btrblocks::integer::IntegerStats;
12-
use vortex_btrblocks::integer::dictionary::dictionary_encode;
10+
use vortex_btrblocks::{CompressorStats, IntegerStats, integer_dictionary_encode};
1311
use vortex_buffer::BufferMut;
1412
use vortex_dict::builders::dict_encode;
1513

@@ -38,7 +36,7 @@ fn encode_generic(bencher: Bencher) {
3836
fn encode_specialized(bencher: Bencher) {
3937
bencher
4038
.with_inputs(|| IntegerStats::generate(&make_array()))
41-
.bench_values(|stats| dictionary_encode(&stats));
39+
.bench_values(|stats| integer_dictionary_encode(&stats));
4240
}
4341

4442
fn main() {

vortex-btrblocks/benches/stats_calc.rs

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,7 @@ mod benchmarks {
1010
use divan::Bencher;
1111
use vortex_array::arrays::PrimitiveArray;
1212
use vortex_array::validity::Validity;
13-
use vortex_btrblocks::integer::IntegerStats;
14-
use vortex_btrblocks::{CompressorStats, GenerateStatsOptions};
13+
use vortex_btrblocks::{CompressorStats, GenerateStatsOptions, IntegerStats};
1514
use vortex_buffer::{Buffer, BufferMut};
1615

1716
fn generate_dataset(max_run: u32, distinct: u32) -> Buffer<u32> {

vortex-btrblocks/src/float.rs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
// SPDX-License-Identifier: Apache-2.0
22
// SPDX-FileCopyrightText: Copyright the Vortex contributors
33

4-
mod dictionary;
4+
pub(crate) mod dictionary;
55
mod stats;
66

77
use vortex_alp::{ALPArray, ALPEncoding, ALPVTable, RDEncoder};
@@ -11,7 +11,7 @@ use vortex_dict::DictArray;
1111
use vortex_dtype::PType;
1212
use vortex_error::{VortexExpect, VortexResult, vortex_panic};
1313

14-
use self::stats::FloatStats;
14+
pub use self::stats::FloatStats;
1515
use crate::float::dictionary::dictionary_encode;
1616
use crate::integer::{IntCompressor, IntegerStats};
1717
use crate::patches::compress_patches;
@@ -24,6 +24,7 @@ pub trait FloatScheme: Scheme<StatsType = FloatStats, CodeType = FloatCode> {}
2424

2525
impl<T> FloatScheme for T where T: Scheme<StatsType = FloatStats, CodeType = FloatCode> {}
2626

27+
/// [`Compressor`] for floating-point numbers.
2728
pub struct FloatCompressor;
2829

2930
impl Compressor for FloatCompressor {

vortex-btrblocks/src/float/dictionary.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ macro_rules! typed_encode {
4343
}};
4444
}
4545

46+
/// Compresses a floating-point array into a dictionary array according to the attached stats.
4647
pub fn dictionary_encode(stats: &FloatStats) -> DictArray {
4748
let validity = stats.src.validity();
4849
match &stats.distinct_values {

vortex-btrblocks/src/float/stats.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ impl_from_typed!(f16, ErasedDistinctValues::F16);
4343
impl_from_typed!(f32, ErasedDistinctValues::F32);
4444
impl_from_typed!(f64, ErasedDistinctValues::F64);
4545

46-
// We want to allow not rebuilding all of the stats every time.
46+
/// Array of floating-point numbers and relevant stats for compression.
4747
#[derive(Debug, Clone)]
4848
pub struct FloatStats {
4949
pub(super) src: PrimitiveArray,

vortex-btrblocks/src/integer.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ use crate::{
2727
estimate_compression_ratio_with_sampling,
2828
};
2929

30+
/// [`Compressor`] for signed and unsigned integers.
3031
pub struct IntCompressor;
3132

3233
impl Compressor for IntCompressor {
@@ -57,7 +58,7 @@ impl Compressor for IntCompressor {
5758
}
5859

5960
impl IntCompressor {
60-
pub fn compress_no_dict(
61+
pub(crate) fn compress_no_dict(
6162
array: &PrimitiveArray,
6263
is_sample: bool,
6364
allowed_cascading: usize,

vortex-btrblocks/src/integer/dictionary.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ macro_rules! typed_encode {
4343
}};
4444
}
4545

46+
/// Compresses an integer array into a dictionary array according to the attached stats.
4647
#[allow(clippy::cognitive_complexity)]
4748
pub fn dictionary_encode(stats: &IntegerStats) -> DictArray {
4849
// We need to preserve the nullability somehow from the original

vortex-btrblocks/src/integer/stats.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,7 @@ impl_from_typed!(i16, ErasedStats::I16);
119119
impl_from_typed!(i32, ErasedStats::I32);
120120
impl_from_typed!(i64, ErasedStats::I64);
121121

122+
/// Array of integers and relevant stats for compression.
122123
#[derive(Clone, Debug)]
123124
pub struct IntegerStats {
124125
pub(super) src: PrimitiveArray,

vortex-btrblocks/src/lib.rs

Lines changed: 99 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,35 @@
11
// SPDX-License-Identifier: Apache-2.0
22
// SPDX-FileCopyrightText: Copyright the Vortex contributors
33

4+
#![deny(missing_docs)]
5+
6+
//! Vortex's [BtrBlocks]-inspired adaptive compression framework.
7+
//!
8+
//! This crate provides a sophisticated multi-level compression system that adaptively selects
9+
//! optimal compression schemes based on data characteristics. The compressor analyzes arrays
10+
//! to determine the best encoding strategy, supporting cascaded compression with multiple
11+
//! encoding layers for maximum efficiency.
12+
//!
13+
//! # Key Features
14+
//!
15+
//! - **Adaptive Compression**: Automatically selects the best compression scheme based on data patterns
16+
//! - **Type-Specific Compressors**: Specialized compression for integers, floats, strings, and temporal data
17+
//! - **Cascaded Encoding**: Multiple compression layers can be applied for optimal results
18+
//! - **Statistical Analysis**: Uses data sampling and statistics to predict compression ratios
19+
//! - **Recursive Structure Handling**: Compresses nested structures like structs and lists
20+
//!
21+
//! # Example
22+
//!
23+
//! ```rust
24+
//! use vortex_btrblocks::BtrBlocksCompressor;
25+
//! use vortex_array::Array;
26+
//!
27+
//! let compressor = BtrBlocksCompressor::default();
28+
//! // let compressed = compressor.compress(&array)?;
29+
//! ```
30+
//!
31+
//! [BtrBlocks]: https://www.cs.cit.tum.de/fileadmin/w00cfj/dis/papers/btrblocks.pdf
32+
433
use std::fmt::Debug;
534
use std::hash::Hash;
635

@@ -15,20 +44,24 @@ use vortex_dtype::{DType, Nullability};
1544
use vortex_error::{VortexResult, VortexUnwrap};
1645

1746
use crate::decimal::compress_decimal;
18-
pub use crate::float::FloatCompressor;
19-
pub use crate::integer::IntCompressor;
20-
pub use crate::string::StringCompressor;
47+
pub use crate::float::dictionary::dictionary_encode as float_dictionary_encode;
48+
pub use crate::float::{FloatCompressor, FloatStats};
49+
pub use crate::integer::dictionary::dictionary_encode as integer_dictionary_encode;
50+
pub use crate::integer::{IntCompressor, IntegerStats};
51+
pub use crate::string::{StringCompressor, StringStats};
2152
pub use crate::temporal::compress_temporal;
2253

2354
mod decimal;
2455
mod float;
25-
pub mod integer;
56+
mod integer;
2657
mod patches;
2758
mod sample;
2859
mod string;
2960
mod temporal;
3061

62+
/// Configures how stats are generated.
3163
pub struct GenerateStatsOptions {
64+
/// Whether distinct values should be counted during stats generation.
3265
pub count_distinct_values: bool,
3366
// pub count_runs: bool,
3467
// should this be scheme-specific?
@@ -47,33 +80,39 @@ const SAMPLE_SIZE: u32 = 64;
4780

4881
/// Stats for the compressor.
4982
pub trait CompressorStats: Debug + Clone {
83+
/// The type of the underlying source array vtable.
5084
type ArrayVTable: VTable;
5185

52-
/// Generate stats with default options
86+
/// Generates stats with default options.
5387
fn generate(input: &<Self::ArrayVTable as VTable>::Array) -> Self {
5488
Self::generate_opts(input, GenerateStatsOptions::default())
5589
}
5690

57-
/// Generate stats with provided options
91+
/// Generates stats with provided options.
5892
fn generate_opts(
5993
input: &<Self::ArrayVTable as VTable>::Array,
6094
opts: GenerateStatsOptions,
6195
) -> Self;
6296

97+
/// Returns the underlying source array that statistics were generated from.
6398
fn source(&self) -> &<Self::ArrayVTable as VTable>::Array;
6499

100+
/// Samples the array with default options.
65101
fn sample(&self, sample_size: u32, sample_count: u32) -> Self {
66102
self.sample_opts(sample_size, sample_count, GenerateStatsOptions::default())
67103
}
68104

105+
/// Samples the array with provided options.
69106
fn sample_opts(&self, sample_size: u32, sample_count: u32, opts: GenerateStatsOptions) -> Self;
70107
}
71108

72109
/// Top-level compression scheme trait.
73110
///
74111
/// Variants are specialized for each data type, e.g. see `IntegerScheme`, `FloatScheme`, etc.
75112
pub trait Scheme: Debug {
113+
/// Type of the stats generated by the compression scheme.
76114
type StatsType: CompressorStats;
115+
/// Type of the code used to uniquely identify the compression scheme.
77116
type CodeType: Copy + Eq + Hash;
78117

79118
/// Scheme unique identifier.
@@ -117,17 +156,7 @@ pub trait Scheme: Debug {
117156
) -> VortexResult<ArrayRef>;
118157
}
119158

120-
pub struct SchemeTree {
121-
/// Scheme to use for the array.
122-
///
123-
/// This is in the type-specific code space, for example either the `IntCompressor` or
124-
/// `FloatCompressor` code space.
125-
pub scheme: u8,
126-
/// Specified schemes to use for children.
127-
pub children: Vec<SchemeTree>,
128-
}
129-
130-
pub fn estimate_compression_ratio_with_sampling<T: Scheme + ?Sized>(
159+
fn estimate_compression_ratio_with_sampling<T: Scheme + ?Sized>(
131160
compressor: &T,
132161
stats: &T::StatsType,
133162
is_sample: bool,
@@ -172,20 +201,32 @@ const MAX_CASCADE: usize = 3;
172201

173202
/// A compressor for a particular input type.
174203
///
175-
/// The `Input` type should be one of the canonical array variants, e.g. `PrimitiveArray`.
204+
/// This trait defines the interface for type-specific compressors that can adaptively
205+
/// choose and apply compression schemes based on data characteristics. Compressors
206+
/// analyze input arrays, select optimal compression schemes, and handle cascading
207+
/// compression with multiple encoding layers.
176208
///
177-
/// Compressors expose a `compress` function.
209+
/// The compressor works by generating statistics on the input data, evaluating
210+
/// available compression schemes, and selecting the one with the best compression ratio.
178211
pub trait Compressor {
212+
/// The VTable type for arrays this compressor operates on.
179213
type ArrayVTable: VTable;
214+
/// The compression scheme type used by this compressor.
180215
type SchemeType: Scheme<StatsType = Self::StatsType> + ?Sized;
181-
182-
// Stats type instead?
216+
/// The statistics type used to analyze arrays for compression.
183217
type StatsType: CompressorStats<ArrayVTable = Self::ArrayVTable>;
184218

219+
/// Returns all available compression schemes for this compressor.
185220
fn schemes() -> &'static [&'static Self::SchemeType];
221+
/// Returns the default fallback compression scheme.
186222
fn default_scheme() -> &'static Self::SchemeType;
223+
/// Returns the scheme code for dictionary compression.
187224
fn dict_scheme_code() -> <Self::SchemeType as Scheme>::CodeType;
188225

226+
/// Compresses an array using the optimal compression scheme.
227+
///
228+
/// Generates statistics on the input array, selects the best compression scheme,
229+
/// and applies it. Returns the original array if compression would increase size.
189230
fn compress(
190231
array: &<Self::ArrayVTable as VTable>::Array,
191232
is_sample: bool,
@@ -222,6 +263,11 @@ pub trait Compressor {
222263
}
223264
}
224265

266+
/// Selects the best compression scheme based on expected compression ratios.
267+
///
268+
/// Evaluates all available schemes against the provided statistics and returns
269+
/// the one with the highest compression ratio. Falls back to the default scheme
270+
/// if no scheme provides compression benefits.
225271
fn choose_scheme(
226272
stats: &Self::StatsType,
227273
is_sample: bool,
@@ -272,12 +318,41 @@ pub trait Compressor {
272318
}
273319
}
274320

321+
/// The main compressor type implementing BtrBlocks-inspired compression.
322+
///
323+
/// This compressor applies adaptive compression schemes to arrays based on their data types
324+
/// and characteristics. It recursively compresses nested structures like structs and lists,
325+
/// and chooses optimal compression schemes for primitive types.
326+
///
327+
/// The compressor works by:
328+
/// 1. Canonicalizing input arrays to a standard representation
329+
/// 2. Analyzing data characteristics to choose optimal compression schemes
330+
/// 3. Recursively compressing nested structures
331+
/// 4. Applying type-specific compression for primitives, strings, and temporal data
332+
///
333+
/// # Examples
334+
///
335+
/// ```rust
336+
/// use vortex_btrblocks::BtrBlocksCompressor;
337+
/// use vortex_array::Array;
338+
///
339+
/// let compressor = BtrBlocksCompressor::default();
340+
/// // let compressed = compressor.compress(&array)?;
341+
/// ```
275342
#[derive(Default, Debug, Clone)]
276343
pub struct BtrBlocksCompressor {
344+
/// Whether to exclude ints from dictionary encoding.
345+
///
346+
/// When `true`, integer arrays will not use dictionary compression schemes,
347+
/// which can be useful when the data has high cardinality or when dictionary
348+
/// overhead would exceed compression benefits.
277349
pub exclude_int_dict_encoding: bool,
278350
}
279351

280352
impl BtrBlocksCompressor {
353+
/// Compresses an array using BtrBlocks-inspired compression.
354+
///
355+
/// First canonicalizes and compacts the array, then applies optimal compression schemes.
281356
pub fn compress(&self, array: &dyn Array) -> VortexResult<ArrayRef> {
282357
// Canonicalize the array
283358
let canonical = array.to_canonical();
@@ -288,6 +363,9 @@ impl BtrBlocksCompressor {
288363
self.compress_canonical(compact)
289364
}
290365

366+
/// Compresses a canonical array by dispatching to type-specific compressors.
367+
///
368+
/// Recursively compresses nested structures and applies optimal schemes for each data type.
291369
pub fn compress_canonical(&self, array: Canonical) -> VortexResult<ArrayRef> {
292370
match array {
293371
Canonical::Null(null_array) => Ok(null_array.into_array()),

0 commit comments

Comments
 (0)