Skip to content

Commit 416d488

Browse files
authored
Compressed Statistics (#2686)
Compress the stats tables
1 parent e764a7a commit 416d488

File tree

16 files changed

+58
-47
lines changed

16 files changed

+58
-47
lines changed

bench-vortex/src/tpch/mod.rs

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,7 @@ use vortex::arrays::ChunkedArray;
2525
use vortex::arrow::FromArrowArray;
2626
use vortex::dtype::DType;
2727
use vortex::error::VortexExpect as _;
28-
use vortex::file::{DEFAULT_REGISTRY, VORTEX_FILE_EXTENSION, VortexWriteOptions};
29-
use vortex::layout::{LayoutRegistry, LayoutRegistryExt};
28+
use vortex::file::{VORTEX_FILE_EXTENSION, VortexWriteOptions};
3029
use vortex::{Array, ArrayRef, TryIntoArray};
3130
use vortex_datafusion::SessionContextExt;
3231
use vortex_datafusion::persistent::VortexFormat;
@@ -352,10 +351,7 @@ async fn register_vortex_file(
352351
.await?;
353352
}
354353

355-
let format = Arc::new(VortexFormat::new(
356-
DEFAULT_REGISTRY.clone(),
357-
Arc::new(LayoutRegistry::default()),
358-
));
354+
let format = Arc::new(VortexFormat::default());
359355
let table_url = ListingTableUrl::parse(vtx_file.as_str())?;
360356
let config = ListingTableConfig::new(table_url)
361357
.with_listing_options(ListingOptions::new(format as _))

vortex-array/src/compute/is_sorted.rs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,11 @@ where
5959
}
6060

6161
pub fn is_sorted(array: &dyn Array) -> VortexResult<bool> {
62+
// We currently don't support sorting struct arrays.
63+
if array.dtype().is_struct() {
64+
return Ok(false);
65+
}
66+
6267
if let Some(Precision::Exact(value)) = array.statistics().get_as::<bool>(Stat::IsSorted) {
6368
return Ok(value);
6469
}
@@ -76,6 +81,11 @@ pub fn is_sorted(array: &dyn Array) -> VortexResult<bool> {
7681
Ok(is_sorted)
7782
}
7883
pub fn is_strict_sorted(array: &dyn Array) -> VortexResult<bool> {
84+
// We currently don't support sorting struct arrays.
85+
if array.dtype().is_struct() {
86+
return Ok(false);
87+
}
88+
7989
if let Some(Precision::Exact(value)) = array.statistics().get_as::<bool>(Stat::IsStrictSorted) {
8090
return Ok(value);
8191
}

vortex-datafusion/src/persistent/format.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ use vortex_dtype::arrow::FromArrowType;
2828
use vortex_error::{VortexExpect, VortexResult, vortex_err};
2929
use vortex_expr::datafusion::convert_expr_to_vortex;
3030
use vortex_expr::{VortexExpr, and};
31-
use vortex_file::{VORTEX_FILE_EXTENSION, VortexOpenOptions};
31+
use vortex_file::{DEFAULT_REGISTRY, VORTEX_FILE_EXTENSION, VortexOpenOptions};
3232
use vortex_io::ObjectStoreReadAt;
3333
use vortex_layout::{LayoutRegistry, LayoutRegistryExt};
3434

@@ -69,7 +69,7 @@ impl VortexFormatFactory {
6969
// Because FileFormatFactory has a default method
7070
/// Create a new [`VortexFormatFactory`] with the default encoding context.
7171
pub fn default_config() -> Self {
72-
Self::with_registry(Arc::new(ArrayRegistry::default()))
72+
Self::with_registry(DEFAULT_REGISTRY.clone())
7373
}
7474

7575
/// Create a new [`VortexFormatFactory`] that creates [`VortexFormat`] instances with the provided [`Context`](vortex_array::ArrayContext).

vortex-file/src/strategy.rs

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
33
use std::sync::Arc;
44

5+
use vortex_array::arcref::ArcRef;
56
use vortex_array::stats::{PRUNING_STATS, STATS_TO_WRITE};
67
use vortex_array::{Array, ArrayContext, ArrayRef};
78
use vortex_btrblocks::BtrBlocksCompressor;
@@ -34,7 +35,7 @@ impl LayoutStrategy for VortexLayoutStrategy {
3435
ctx.clone(),
3536
&DType::Null,
3637
ChunkedLayoutOptions {
37-
chunk_strategy: Arc::new(FlatLayoutOptions::default()),
38+
chunk_strategy: ArcRef::new_arc(Arc::new(FlatLayoutOptions::default()) as _),
3839
},
3940
)
4041
.boxed(),
@@ -59,7 +60,9 @@ impl LayoutStrategy for VortexLayoutStrategy {
5960
ctx.clone(),
6061
dtype,
6162
writer,
62-
Arc::new(FlatLayout),
63+
ArcRef::new_arc(Arc::new(BtrBlocksCompressedStrategy {
64+
child: ArcRef::new_ref(&FlatLayout),
65+
})),
6366
StatsLayoutOptions {
6467
block_size: 8192,
6568
stats: PRUNING_STATS.into(),
@@ -79,6 +82,17 @@ impl LayoutStrategy for VortexLayoutStrategy {
7982
}
8083
}
8184

85+
struct BtrBlocksCompressedStrategy {
86+
child: ArcRef<dyn LayoutStrategy>,
87+
}
88+
89+
impl LayoutStrategy for BtrBlocksCompressedStrategy {
90+
fn new_writer(&self, ctx: &ArrayContext, dtype: &DType) -> VortexResult<Box<dyn LayoutWriter>> {
91+
let child = self.child.new_writer(ctx, dtype)?;
92+
Ok(BtrBlocksCompressedWriter { child }.boxed())
93+
}
94+
}
95+
8296
/// A layout writer that compresses chunks using a sampling compressor, and re-uses the previous
8397
/// compressed chunk as a hint for the next.
8498
struct BtrBlocksCompressedWriter {

vortex-layout/src/data.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -260,7 +260,7 @@ impl Layout {
260260
&self,
261261
segment_reader: Arc<dyn AsyncSegmentReader>,
262262
ctx: ArrayContext,
263-
) -> VortexResult<Arc<dyn LayoutReader + 'static>> {
263+
) -> VortexResult<Arc<dyn LayoutReader>> {
264264
self.vtable().reader(self.clone(), ctx, segment_reader)
265265
}
266266

vortex-layout/src/layouts/chunked/eval_expr.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ impl ExprEvaluator for ChunkedReader {
5151
Ok(ChunkedArray::new_unchecked(chunks, dtype).into_array())
5252
}
5353

54-
async fn prune_mask(&self, row_mask: RowMask, _expr: ExprRef) -> VortexResult<RowMask> {
54+
async fn refine_mask(&self, row_mask: RowMask, _expr: ExprRef) -> VortexResult<RowMask> {
5555
// TODO(ngates): we should push-down to each child
5656
Ok(row_mask)
5757
}

vortex-layout/src/layouts/chunked/mod.rs

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,6 @@ use crate::{CHUNKED_LAYOUT_ID, LayoutId};
2020
pub struct ChunkedLayout;
2121

2222
/// In-memory representation of Chunked layout.
23-
///
24-
/// First child in the list is the metadata table
25-
/// Subsequent children are consecutive chunks of this layout
2623
impl LayoutVTable for ChunkedLayout {
2724
fn id(&self) -> LayoutId {
2825
CHUNKED_LAYOUT_ID
@@ -44,9 +41,8 @@ impl LayoutVTable for ChunkedLayout {
4441
row_offset: u64,
4542
splits: &mut BTreeSet<u64>,
4643
) -> VortexResult<()> {
47-
let nchunks = layout.nchildren() - (if layout.metadata().is_some() { 1 } else { 0 });
4844
let mut offset = row_offset;
49-
for i in 0..nchunks {
45+
for i in 0..layout.nchildren() {
5046
let child = layout.child(i, layout.dtype().clone(), format!("[{}]", i))?;
5147
child.register_splits(field_mask, offset, splits)?;
5248
offset += child.row_count();
@@ -63,9 +59,8 @@ impl LayoutVTable for ChunkedLayout {
6359
projection_field_mask: &[FieldMask],
6460
segments: &mut SegmentCollector,
6561
) -> VortexResult<()> {
66-
let nchunks = layout.nchildren() - (if layout.metadata().is_some() { 1 } else { 0 });
6762
let mut offset = row_offset;
68-
for i in 0..nchunks {
63+
for i in 0..layout.nchildren() {
6964
let child = layout.child(i, layout.dtype().clone(), format!("[{i}]"))?;
7065
child.required_segments(offset, filter_field_mask, projection_field_mask, segments)?;
7166
offset += child.row_count();

vortex-layout/src/layouts/chunked/reader.rs

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -33,11 +33,7 @@ impl ChunkedReader {
3333
}
3434

3535
// The number of chunks
36-
let mut nchunks = layout.nchildren();
37-
if layout.metadata().is_some() {
38-
// The final child is the statistics table.
39-
nchunks -= 1;
40-
}
36+
let nchunks = layout.nchildren();
4137

4238
// Construct a lazy scan for each chunk of the layout.
4339
let chunk_readers = (0..nchunks).map(|_| OnceLock::new()).collect();

vortex-layout/src/layouts/chunked/writer.rs

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
1-
use std::sync::Arc;
2-
1+
use vortex_array::arcref::ArcRef;
32
use vortex_array::{ArrayContext, ArrayRef};
43
use vortex_dtype::DType;
54
use vortex_error::{VortexExpect, VortexResult};
@@ -14,13 +13,13 @@ use crate::writer::LayoutWriter;
1413

1514
pub struct ChunkedLayoutOptions {
1615
/// The layout strategy for each chunk.
17-
pub chunk_strategy: Arc<dyn LayoutStrategy>,
16+
pub chunk_strategy: ArcRef<dyn LayoutStrategy>,
1817
}
1918

2019
impl Default for ChunkedLayoutOptions {
2120
fn default() -> Self {
2221
Self {
23-
chunk_strategy: Arc::new(FlatLayout),
22+
chunk_strategy: ArcRef::new_ref(&FlatLayout),
2423
}
2524
}
2625
}

vortex-layout/src/layouts/flat/eval_expr.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ impl ExprEvaluator for FlatReader {
4040
Ok(array)
4141
}
4242

43-
async fn prune_mask(&self, row_mask: RowMask, _expr: ExprRef) -> VortexResult<RowMask> {
43+
async fn refine_mask(&self, row_mask: RowMask, _expr: ExprRef) -> VortexResult<RowMask> {
4444
// No cheap pruning for us to do without fetching data.
4545
Ok(row_mask)
4646
}

0 commit comments

Comments
 (0)