Skip to content

Commit a200417

Browse files
committed
Merge branch 'develop' into ngates/pipeline-vector
2 parents 6aa020c + 7226a1b commit a200417

File tree

26 files changed

+689
-61
lines changed

26 files changed

+689
-61
lines changed

Cargo.lock

Lines changed: 5 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

README.md

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -9,48 +9,48 @@
99
[![Maven - Version](https://img.shields.io/maven-central/v/dev.vortex/vortex-spark)](https://central.sonatype.com/artifact/dev.vortex/vortex-spark)
1010
[![codecov](https://codecov.io/github/vortex-data/vortex/graph/badge.svg)](https://codecov.io/github/vortex-data/vortex)
1111

12-
🫶 [Join the community on Slack!](https://vortex.dev/slack) | 📚 [Documentation](https://docs.vortex.dev/) | 📊 [Performance Benchmarks](https://bench.vortex.dev)
12+
[Join the community on Slack!](https://vortex.dev/slack) | [Documentation](https://docs.vortex.dev/) | [Performance Benchmarks](https://bench.vortex.dev)
1313

1414
## Overview
1515

1616
Vortex is a next-generation columnar file format and toolkit designed for high-performance data processing.
1717
It is the fastest and most extensible format for building data systems backed by object storage. It provides:
1818

19-
- **⚡️ Blazing Fast Performance**
19+
- **Blazing Fast Performance**
2020
- 100x faster random access reads (vs. modern Apache Parquet)
2121
- 10-20x faster scans
2222
- 5x faster writes
2323
- Similar compression ratios
2424
- Efficient support for wide tables with zero-copy/zero-parse metadata
2525

26-
- **🔧 Extensible Architecture**
26+
- **Extensible Architecture**
2727
- Modeled after Apache DataFusion's extensible approach
2828
- Pluggable encoding system, type system, compression strategy, & layout strategy
2929
- Zero-copy compatibility with Apache Arrow
3030

31-
- **🗳️ Open Source, Neutral Governance**
31+
- **Open Source, Neutral Governance**
3232
- A Linux Foundation (LF AI & Data) Project
3333
- Apache-2.0 Licensed
3434

35-
- **↔️ Integrations**
35+
- **Integrations**
3636
- Arrow, DataFusion, DuckDB, Spark, Pandas, Polars, & more
3737
- Apache Iceberg (coming soon)
3838

3939
> 🟢 **Development Status**: Library APIs may change from version to version, but we now consider
40-
> the file format <ins>*stable*</ins>. From release 0.36.0, all future releases of Vortex should
40+
> the file format <ins>_stable_</ins>. From release 0.36.0, all future releases of Vortex should
4141
> maintain backwards compatibility of the file format (i.e., be able to read files written by
4242
> any earlier version >= 0.36.0).
4343
4444
## Key Features
4545

4646
### Core Capabilities
4747

48-
- **Logical Types** - Clean separation between logical schema and physical layout
49-
- 🔄 **Zero-Copy Arrow Integration** - Seamless conversion to/from Apache Arrow arrays
50-
- 🧩 **Extensible Encodings** - Pluggable physical layouts with built-in optimizations
51-
- 📦 **Cascading Compression** - Support for nested encoding schemes
52-
- 🚀 **High-Performance Computing** - Optimized compute kernels for encoded data
53-
- 📊 **Rich Statistics** - Lazy-loaded summary statistics for optimization
48+
- **Logical Types** - Clean separation between logical schema and physical layout
49+
- **Zero-Copy Arrow Integration** - Seamless conversion to/from Apache Arrow arrays
50+
- **Extensible Encodings** - Pluggable physical layouts with built-in optimizations
51+
- **Cascading Compression** - Support for nested encoding schemes
52+
- **High-Performance Computing** - Optimized compute kernels for encoded data
53+
- **Rich Statistics** - Lazy-loaded summary statistics for optimization
5454

5555
### Technical Architecture
5656

@@ -152,7 +152,7 @@ If you discover a security vulnerability, please email <[email protected]>
152152
Copyright © Vortex a Series of LF Projects, LLC.
153153
For terms of use, trademark policy, and other project policies please see <https://lfprojects.org>
154154

155-
## Acknowledgments 🏆
155+
## Acknowledgments
156156

157157
The Vortex project benefits enormously from groundbreaking work from the academic & open-source communities.
158158

encodings/pco/Cargo.toml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,11 +22,20 @@ pco = { workspace = true }
2222
prost = { workspace = true }
2323
vortex-array = { workspace = true }
2424
vortex-buffer = { workspace = true }
25+
vortex-compute = { workspace = true }
2526
vortex-dtype = { workspace = true }
2627
vortex-error = { workspace = true }
2728
vortex-mask = { workspace = true }
2829
vortex-scalar = { workspace = true }
30+
vortex-vector = { workspace = true }
2931

3032
[dev-dependencies]
33+
divan = { workspace = true }
34+
mimalloc = { workspace = true }
35+
rand = { workspace = true }
3136
rstest = { workspace = true }
3237
vortex-array = { workspace = true, features = ["test-harness"] }
38+
39+
[[bench]]
40+
name = "pco"
41+
harness = false

encodings/pco/benches/pco.rs

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
// SPDX-License-Identifier: Apache-2.0
2+
// SPDX-FileCopyrightText: Copyright the Vortex contributors
3+
4+
#![allow(clippy::unwrap_used)]
5+
6+
use divan::Bencher;
7+
use mimalloc::MiMalloc;
8+
use rand::prelude::StdRng;
9+
use rand::{Rng, SeedableRng};
10+
use vortex_array::compute::{filter, warm_up_vtables};
11+
use vortex_array::{IntoArray, ToCanonical};
12+
use vortex_buffer::{BitBuffer, BufferMut};
13+
use vortex_mask::Mask;
14+
use vortex_pco::PcoArray;
15+
16+
#[global_allocator]
17+
static GLOBAL: MiMalloc = MiMalloc;
18+
19+
pub fn main() {
20+
warm_up_vtables();
21+
divan::main();
22+
}
23+
24+
#[divan::bench(args = [
25+
(10_000, 0.1),
26+
(10_000, 0.5),
27+
(10_000, 0.9),
28+
(10_000, 1.0),
29+
(50_000, 0.1),
30+
(50_000, 0.5),
31+
(50_000, 0.9),
32+
(50_000, 1.0),
33+
(100_000, 0.1),
34+
(100_000, 0.5),
35+
(100_000, 0.9),
36+
(100_000, 1.0)]
37+
)]
38+
pub fn pco_pipeline(bencher: Bencher, (size, selectivity): (usize, f64)) {
39+
let mut rng = StdRng::seed_from_u64(42);
40+
#[allow(clippy::cast_possible_truncation)]
41+
let values = (0..size)
42+
.map(|i| (i % 10000) as i32)
43+
.collect::<BufferMut<i32>>()
44+
.into_array()
45+
.to_primitive();
46+
47+
let pco_array = PcoArray::from_primitive(&values, 3, 0).unwrap();
48+
let mask = (0..size)
49+
.map(|_| rng.random_bool(selectivity))
50+
.collect::<BitBuffer>();
51+
52+
bencher
53+
.with_inputs(|| (Mask::from_buffer(mask.clone()), pco_array.clone()))
54+
.bench_refs(|(mask, pco_array)| pco_array.execute_with_selection(mask).unwrap());
55+
}
56+
57+
#[divan::bench(args = [
58+
(10_000, 0.1),
59+
(10_000, 0.5),
60+
(10_000, 0.9),
61+
(10_000, 1.0),
62+
(50_000, 0.1),
63+
(50_000, 0.5),
64+
(50_000, 0.9),
65+
(50_000, 1.0),
66+
(100_000, 0.1),
67+
(100_000, 0.5),
68+
(100_000, 0.9),
69+
(100_000, 1.0)]
70+
)]
71+
pub fn pco_canonical(bencher: Bencher, (size, selectivity): (usize, f64)) {
72+
let mut rng = StdRng::seed_from_u64(42);
73+
#[allow(clippy::cast_possible_truncation)]
74+
let values = (0..size)
75+
.map(|i| (i % 10000) as i32)
76+
.collect::<BufferMut<i32>>()
77+
.into_array()
78+
.to_primitive();
79+
80+
let pco_array = PcoArray::from_primitive(&values, 3, 0).unwrap();
81+
let mask = (0..size)
82+
.map(|_| rng.random_bool(selectivity))
83+
.collect::<BitBuffer>();
84+
85+
bencher
86+
.with_inputs(|| (Mask::from_buffer(mask.clone()), pco_array.clone()))
87+
.bench_refs(|(mask, pco_array)| filter(pco_array.to_canonical().as_ref(), mask).unwrap());
88+
}

encodings/pco/src/array.rs

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,14 @@ use pco::{ChunkConfig, PagingSpec, match_number_enum};
1313
use prost::Message;
1414
use vortex_array::arrays::{PrimitiveArray, PrimitiveVTable};
1515
use vortex_array::compute::filter;
16+
use vortex_array::pipeline::PipelinedNode;
1617
use vortex_array::serde::ArrayChildren;
1718
use vortex_array::stats::{ArrayStats, StatsSetRef};
1819
use vortex_array::validity::Validity;
1920
use vortex_array::vtable::{
20-
ArrayVTable, CanonicalVTable, EncodeVTable, NotSupported, OperationsVTable, VTable,
21-
ValidityHelper, ValiditySliceHelper, ValidityVTableFromValiditySliceHelper, VisitorVTable,
21+
ArrayVTable, CanonicalVTable, EncodeVTable, NotSupported, OperationsVTable, OperatorVTable,
22+
VTable, ValidityHelper, ValiditySliceHelper, ValidityVTableFromValiditySliceHelper,
23+
VisitorVTable,
2224
};
2325
use vortex_array::{
2426
ArrayBufferVisitor, ArrayChildVisitor, ArrayEq, ArrayHash, ArrayRef, Canonical, EncodingId,
@@ -67,7 +69,7 @@ impl VTable for PcoVTable {
6769
type VisitorVTable = Self;
6870
type ComputeVTable = NotSupported;
6971
type EncodeVTable = Self;
70-
type OperatorVTable = NotSupported;
72+
type OperatorVTable = Self;
7173

7274
fn id(_encoding: &Self::Encoding) -> EncodingId {
7375
EncodingId::new_ref("vortex.pco")
@@ -129,7 +131,7 @@ impl VTable for PcoVTable {
129131
}
130132
}
131133

132-
fn number_type_from_dtype(dtype: &DType) -> NumberType {
134+
pub(crate) fn number_type_from_dtype(dtype: &DType) -> NumberType {
133135
let ptype = dtype.as_ptype();
134136
match ptype {
135137
PType::F16 => NumberType::F16,
@@ -150,7 +152,7 @@ fn collect_valid(parray: &PrimitiveArray) -> VortexResult<PrimitiveArray> {
150152
Ok(filter(&parray.to_array(), &mask)?.to_primitive())
151153
}
152154

153-
fn vortex_err_from_pco(err: PcoError) -> VortexError {
155+
pub(crate) fn vortex_err_from_pco(err: PcoError) -> VortexError {
154156
use pco::errors::ErrorKind::*;
155157
match err.kind {
156158
Io(io_kind) => VortexError::from(std::io::Error::new(io_kind, err.message)),
@@ -509,6 +511,12 @@ impl VisitorVTable<PcoVTable> for PcoVTable {
509511
}
510512
}
511513

514+
impl OperatorVTable<PcoVTable> for PcoVTable {
515+
fn pipeline_node(array: &PcoArray) -> Option<&dyn PipelinedNode> {
516+
Some(array)
517+
}
518+
}
519+
512520
#[cfg(test)]
513521
mod tests {
514522
use vortex_array::arrays::PrimitiveArray;

encodings/pco/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
mod array;
55
mod compute;
6+
mod pipeline;
67
#[cfg(test)]
78
mod test;
89

0 commit comments

Comments
 (0)