
Commit 6b04887

a10y and claude authored
feat: add some more examples in vortex crate (#5019)
Two examples:

1. A fairly trivial showcase of compression performance.
2. A more involved example of building a `tracing` Subscriber that writes to a sequence of Vortex files using the Compact compressor.

Claude-tested, Duffy-approved.

Signed-off-by: Andrew Duffy <[email protected]>
Co-authored-by: Claude <[email protected]>
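The tracing-subscriber example itself is not part of the diff reproduced below, so as a rough orientation, here is a minimal sketch of the general shape such a Subscriber can take: a `tracing_subscriber::Layer` that records event fields with a `Visit` implementation and buffers them into batches. The `BatchingLayer` and `JsonVisitor` names, the batch-size threshold, and the `flush_batch` placeholder (which in the real example would write the next Vortex file in the sequence using the Compact compressor) are assumptions for illustration, not the code this commit adds.

use std::fmt::Debug;
use std::sync::Mutex;

use tracing::field::{Field, Visit};
use tracing::{Event, Subscriber};
use tracing_subscriber::layer::{Context, Layer};

/// Hypothetical batching layer: collects each event's fields as a JSON row and
/// hands full batches to `flush_batch`, which stands in for the Vortex file
/// write performed by the real example.
struct BatchingLayer {
    rows: Mutex<Vec<serde_json::Value>>,
    batch_size: usize,
}

/// Visitor that records every field of an event into a JSON map.
struct JsonVisitor(serde_json::Map<String, serde_json::Value>);

impl Visit for JsonVisitor {
    fn record_debug(&mut self, field: &Field, value: &dyn Debug) {
        self.0.insert(
            field.name().to_string(),
            serde_json::Value::String(format!("{value:?}")),
        );
    }
}

impl<S: Subscriber> Layer<S> for BatchingLayer {
    fn on_event(&self, event: &Event<'_>, _ctx: Context<'_, S>) {
        let mut visitor = JsonVisitor(serde_json::Map::new());
        event.record(&mut visitor);

        let mut rows = self.rows.lock().unwrap();
        rows.push(serde_json::Value::Object(visitor.0));
        if rows.len() >= self.batch_size {
            // Placeholder: the real example would turn the batch into Vortex
            // arrays and write the next file in the sequence here.
            flush_batch(std::mem::take(&mut *rows));
        }
    }
}

/// Hypothetical flush hook; not part of this commit.
fn flush_batch(rows: Vec<serde_json::Value>) {
    println!("flushing {} events", rows.len());
}

Installing such a layer would go through the usual `tracing_subscriber` composition route, e.g. `tracing_subscriber::registry().with(layer)` with `SubscriberExt` in scope; the actual wiring in the committed example may differ.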
1 parent b6d3661 commit 6b04887

File tree: 5 files changed (+684, -0 lines)


Cargo.lock

Lines changed: 3 additions & 0 deletions
Generated file; diff not rendered by default.

codecov.yml

Lines changed: 4 additions & 0 deletions
@@ -18,3 +18,7 @@ comment:
   require_head: true
   hide_project_coverage: false
   after_n_builds: 3
+
+# ignore example binaries
+ignore:
+  - "**/examples/*.rs"

vortex/Cargo.toml

Lines changed: 3 additions & 0 deletions
@@ -59,7 +59,10 @@ itertools = { workspace = true }
 mimalloc = { workspace = true }
 parquet = { workspace = true }
 rand = { workspace = true }
+serde_json = { workspace = true }
 tokio = { workspace = true, features = ["full"] }
+tracing = { workspace = true }
+tracing-subscriber = { workspace = true }
 vortex = { path = ".", features = ["tokio"] }

 [features]
Lines changed: 238 additions & 0 deletions (new file: the compression showcase example)
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

//! Compression Strategies Showcase
//!
//! This example demonstrates Vortex's powerful compression capabilities,
//! comparing different encoding strategies for various data patterns.
//!
//! Run with: cargo run --example compression_showcase

use vortex::arrays::{PrimitiveArray, StructArray, VarBinArray};
use vortex::compressor::BtrBlocksCompressor;
use vortex::dtype::{DType, Nullability};
use vortex::validity::Validity;
use vortex::{Array, IntoArray};
use vortex_buffer::Buffer;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    println!("=== Vortex Compression Showcase ===\n");

    println!("This example demonstrates how Vortex automatically selects");
    println!("optimal compression strategies for different data patterns.\n");

    // 1. Compress sequential/monotonic data
    println!("1. Sequential Data Compression:");
    compress_sequential_data()?;

    // 2. Compress repetitive data
    println!("\n2. Repetitive Data Compression:");
    compress_repetitive_data()?;

    // 3. Compress string data
    println!("\n3. String Data Compression:");
    compress_string_data()?;

    // 4. Compress floating-point data
    println!("\n4. Floating-Point Data Compression:");
    compress_float_data()?;

    // 5. Compress sparse data
    println!("\n5. Sparse Data Compression:");
    compress_sparse_data()?;

    // 6. Compress structured data
    println!("\n6. Structured Data Compression:");
    compress_structured_data()?;

    println!("\n=== Compression showcase completed! ===");
    Ok(())
}

fn compress_sequential_data() -> Result<(), Box<dyn std::error::Error>> {
    // Create sequential data (e.g., timestamps, IDs)
    let sequential: PrimitiveArray = (1000..11000).map(|i| i as u64).collect();

    let uncompressed_size = estimate_size(sequential.as_ref());
    println!("   Original sequential data (10,000 values):");
    println!("   Uncompressed size: ~{} bytes", uncompressed_size);

    // Compress using default strategy
    let compressor = BtrBlocksCompressor::default();
    let compressed = compressor.compress(sequential.as_ref())?;

    let compressed_size = compressed.nbytes();
    let ratio = uncompressed_size as f64 / compressed_size as f64;

    println!("   Compressed size: ~{} bytes", compressed_size);
    println!("   Compression ratio: {:.2}x", ratio);
    println!("   Encoding: {}", compressed.encoding().id());
    println!("   Note: Sequential data often compresses well with Delta or FoR encoding");

    Ok(())
}

fn compress_repetitive_data() -> Result<(), Box<dyn std::error::Error>> {
    // Create highly repetitive data (run-length encoding opportunity)
    let mut repetitive = Vec::new();
    for i in 0..100 {
        for _ in 0..100 {
            repetitive.push(i as u32);
        }
    }
    let array: PrimitiveArray = repetitive.into_iter().collect();

    let uncompressed_size = estimate_size(array.as_ref());
    println!("   Repetitive data (100 values, each repeated 100 times):");
    println!("   Uncompressed size: ~{} bytes", uncompressed_size);

    let compressor = BtrBlocksCompressor::default();
    let compressed = compressor.compress(array.as_ref())?;

    let compressed_size = compressed.nbytes();
    let ratio = uncompressed_size as f64 / compressed_size as f64;

    println!("   Compressed size: ~{} bytes", compressed_size);
    println!("   Compression ratio: {:.2}x", ratio);
    println!("   Encoding: {}", compressed.encoding().id());
    println!("   Note: RLE (Run-Length Encoding) is ideal for repetitive data");

    Ok(())
}

fn compress_string_data() -> Result<(), Box<dyn std::error::Error>> {
    // Create string data with patterns
    let categories = vec!["Electronics", "Clothing", "Food", "Books"];
    let mut strings = Vec::new();

    // Repeat categories multiple times (good for dictionary encoding)
    for _ in 0..2500 {
        for category in &categories {
            strings.push(Some(*category));
        }
    }

    let array = VarBinArray::from_iter(strings, DType::Utf8(Nullability::NonNullable));

    let uncompressed_size = estimate_size(array.as_ref());
    println!("   Categorical string data (10,000 strings, 4 categories):");
    println!("   Uncompressed size: ~{} bytes", uncompressed_size);

    let compressor = BtrBlocksCompressor::default();
    let compressed = compressor.compress(array.as_ref())?;

    let compressed_size = compressed.nbytes();
    let ratio = uncompressed_size as f64 / compressed_size as f64;

    println!("   Compressed size: ~{} bytes", compressed_size);
    println!("   Compression ratio: {:.2}x", ratio);
    println!("   Encoding: {}", compressed.encoding().id());
    println!("   Note: Dictionary encoding is excellent for categorical/repetitive strings");

    Ok(())
}

fn compress_float_data() -> Result<(), Box<dyn std::error::Error>> {
    // Create floating-point data with patterns
    let floats: Buffer<f64> = (0..10000).map(|i| (i as f64) * 0.1 + 100.0).collect();
    let array = floats.into_array();

    let uncompressed_size = estimate_size(&array);
    println!("   Floating-point data (10,000 values):");
    println!("   Uncompressed size: ~{} bytes", uncompressed_size);

    let compressor = BtrBlocksCompressor::default();
    let compressed = compressor.compress(array.as_ref())?;

    let compressed_size = compressed.nbytes();
    let ratio = uncompressed_size as f64 / compressed_size as f64;

    println!("   Compressed size: ~{} bytes", compressed_size);
    println!("   Compression ratio: {:.2}x", ratio);
    println!("   Encoding: {}", compressed.encoding().id());
    println!("   Note: ALP or PCO encodings are optimized for floating-point data");

    Ok(())
}

fn compress_sparse_data() -> Result<(), Box<dyn std::error::Error>> {
    // Create sparse data (mostly zeros with few non-zero values)
    let mut sparse = vec![0i64; 10000];
    for i in (0..10000).step_by(100) {
        sparse[i] = (i * 42) as i64;
    }
    let array: PrimitiveArray = sparse.into_iter().collect();

    let uncompressed_size = estimate_size(array.as_ref());
    println!("   Sparse data (10,000 values, 99% zeros):");
    println!("   Uncompressed size: ~{} bytes", uncompressed_size);

    let compressor = BtrBlocksCompressor::default();
    let compressed = compressor.compress(array.as_ref())?;

    let compressed_size = compressed.nbytes();
    let ratio = uncompressed_size as f64 / compressed_size as f64;

    println!("   Compressed size: ~{} bytes", compressed_size);
    println!("   Compression ratio: {:.2}x", ratio);
    println!("   Encoding: {}", compressed.encoding().id());
    println!("   Note: Sparse encoding stores only non-zero indices and values");

    Ok(())
}

fn compress_structured_data() -> Result<(), Box<dyn std::error::Error>> {
    // Create a struct array with multiple columns
    let size = 5000;

    // ID column (sequential)
    let ids: PrimitiveArray = (1..=size).map(|i| i as u64).collect();

    // Status column (categorical)
    let statuses: Vec<Option<&str>> = (0..size)
        .map(|i| match i % 3 {
            0 => "active",
            1 => "pending",
            _ => "completed",
        })
        .map(Some)
        .collect();
    let status_array = VarBinArray::from_iter(statuses, DType::Utf8(Nullability::NonNullable));

    // Value column (floats)
    let values: PrimitiveArray = (0..size).map(|i| (i as f64) * 1.5).collect();

    let struct_array = StructArray::try_new(
        ["id", "status", "value"].into(),
        vec![
            ids.into_array(),
            status_array.into_array(),
            values.into_array(),
        ],
        size,
        Validity::NonNullable,
    )?;

    let uncompressed_size = estimate_size(struct_array.as_ref());
    println!("   Structured data (5,000 records, 3 columns):");
    println!("   Uncompressed size: ~{} bytes", uncompressed_size);

    let compressor = BtrBlocksCompressor::default();
    let compressed = compressor.compress(struct_array.as_ref())?;

    let compressed_size = compressed.nbytes();
    let ratio = uncompressed_size as f64 / compressed_size as f64;

    println!("   Compressed size: ~{} bytes", compressed_size);
    println!("   Compression ratio: {:.2}x", ratio);
    println!("   Encoding: {}", compressed.encoding().id());
    println!("   Note: Each column can be compressed with its optimal strategy");

    Ok(())
}

/// Estimate the size of an array in bytes (approximation)
#[allow(clippy::cast_possible_truncation)]
fn estimate_size(array: &dyn Array) -> usize {
    array.nbytes() as usize
}
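One quick way to extend the showcase (not included in this commit) is to point the same measurement loop at a data pattern of your own. The sketch below reuses only calls that already appear above (`PrimitiveArray` collection, `BtrBlocksCompressor::compress`, `nbytes`, `encoding().id()`); the timestamp generator and its constants are made up for illustration.

use vortex::arrays::PrimitiveArray;
use vortex::compressor::BtrBlocksCompressor;
use vortex::Array;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Hypothetical near-sorted data: millisecond timestamps with a small jitter term.
    let timestamps: PrimitiveArray = (0..10_000u64)
        .map(|i| 1_700_000_000_000 + i * 1_000 + (i % 7))
        .collect();
    let uncompressed: &dyn Array = timestamps.as_ref();

    let compressor = BtrBlocksCompressor::default();
    let compressed = compressor.compress(uncompressed)?;

    // Report the same metrics the showcase prints for its built-in patterns.
    println!("uncompressed: ~{} bytes", uncompressed.nbytes());
    println!("compressed:   ~{} bytes", compressed.nbytes());
    println!("encoding:     {}", compressed.encoding().id());
    Ok(())
}

Any such file placed under the crate's examples/ directory runs the same way as the showcase, with cargo run --example <name>.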
