Skip to content

Commit b5127b8

Browse files
authored
Store present stats as a bitset in metadata of chunked layout and remove inline dtype layout (#1555)
1 parent a017de9 commit b5127b8

File tree

12 files changed

+140
-257
lines changed

12 files changed

+140
-257
lines changed

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

vortex-array/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ humansize = { workspace = true }
4242
itertools = { workspace = true }
4343
log = { workspace = true }
4444
num-traits = { workspace = true }
45+
num_enum = { workspace = true }
4546
paste = { workspace = true }
4647
pin-project = { workspace = true }
4748
rand = { workspace = true }

vortex-array/src/stats/mod.rs

Lines changed: 56 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,16 @@
22
33
use std::fmt::{Display, Formatter};
44
use std::hash::Hash;
5+
use std::sync::Arc;
56

6-
use enum_iterator::Sequence;
7+
use arrow_buffer::{BooleanBuffer, BooleanBufferBuilder, Buffer, MutableBuffer};
8+
use enum_iterator::{cardinality, Sequence};
79
use enum_map::Enum;
810
use itertools::Itertools;
11+
use num_enum::{IntoPrimitive, TryFromPrimitive};
912
pub use statsset::*;
1013
use vortex_dtype::Nullability::NonNullable;
11-
use vortex_dtype::{DType, NativePType};
14+
use vortex_dtype::{DType, NativePType, PType};
1215
use vortex_error::{vortex_err, vortex_panic, VortexError, VortexResult};
1316
use vortex_scalar::Scalar;
1417

@@ -21,7 +24,10 @@ mod statsset;
2124
/// Statistics that are used for pruning files (i.e., we want to ensure they are computed when compressing/writing).
2225
pub const PRUNING_STATS: &[Stat] = &[Stat::Min, Stat::Max, Stat::TrueCount, Stat::NullCount];
2326

24-
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Sequence, Enum)]
27+
#[derive(
28+
Debug, Clone, Copy, PartialEq, Eq, Hash, Sequence, Enum, IntoPrimitive, TryFromPrimitive,
29+
)]
30+
#[repr(u8)]
2531
pub enum Stat {
2632
/// Frequency of each bit width (nulls are treated as 0)
2733
BitWidthFreq,
@@ -69,6 +75,53 @@ impl Stat {
6975
pub fn has_same_dtype_as_array(&self) -> bool {
// True only for `Stat::Min` and `Stat::Max`; every other stat yields false.
7076
matches!(self, Stat::Min | Stat::Max)
7177
}
78+
79+
pub fn dtype(&self, data_type: &DType) -> DType {
80+
match self {
81+
Stat::BitWidthFreq => DType::List(
82+
Arc::new(DType::Primitive(PType::U64, NonNullable)),
83+
NonNullable,
84+
),
85+
Stat::TrailingZeroFreq => DType::List(
86+
Arc::new(DType::Primitive(PType::U64, NonNullable)),
87+
NonNullable,
88+
),
89+
Stat::IsConstant => DType::Bool(NonNullable),
90+
Stat::IsSorted => DType::Bool(NonNullable),
91+
Stat::IsStrictSorted => DType::Bool(NonNullable),
92+
Stat::Max => data_type.clone(),
93+
Stat::Min => data_type.clone(),
94+
Stat::RunCount => DType::Primitive(PType::U64, NonNullable),
95+
Stat::TrueCount => DType::Primitive(PType::U64, NonNullable),
96+
Stat::NullCount => DType::Primitive(PType::U64, NonNullable),
97+
Stat::UncompressedSizeInBytes => DType::Primitive(PType::U64, NonNullable),
98+
}
99+
}
100+
}
101+
102+
pub fn as_stat_bitset_bytes(stats: &[Stat]) -> Vec<u8> {
103+
let stat_count = cardinality::<Stat>();
104+
let mut stat_bitset = BooleanBufferBuilder::new_from_buffer(
105+
MutableBuffer::from_len_zeroed(stat_count.div_ceil(8)),
106+
stat_count,
107+
);
108+
for stat in stats {
109+
stat_bitset.set_bit(u8::from(*stat) as usize, true);
110+
}
111+
112+
stat_bitset
113+
.finish()
114+
.into_inner()
115+
.into_vec()
116+
.unwrap_or_else(|b| b.to_vec())
117+
}
118+
119+
pub fn stats_from_bitset_bytes(bytes: &[u8]) -> Vec<Stat> {
120+
BooleanBuffer::new(Buffer::from(bytes), 0, bytes.len() * 8)
121+
.set_indices()
122+
// Filter out indices failing conversion, these are stats written by newer version of library
123+
.filter_map(|i| Stat::try_from(i as u8).ok())
124+
.collect::<Vec<_>>()
72125
}
73126

74127
impl Display for Stat {

vortex-file/src/lib.rs

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -171,8 +171,6 @@ mod forever_constant {
171171
pub const CHUNKED_LAYOUT_ID: LayoutId = LayoutId(2);
172172
/// The layout ID for a column layout
173173
pub const COLUMNAR_LAYOUT_ID: LayoutId = LayoutId(3);
174-
/// The layout ID for an inline schema layout
175-
pub const INLINE_SCHEMA_LAYOUT_ID: LayoutId = LayoutId(4);
176174

177175
#[cfg(test)]
178176
mod test {
@@ -187,7 +185,6 @@ mod forever_constant {
187185
assert_eq!(FLAT_LAYOUT_ID, LayoutId(1));
188186
assert_eq!(CHUNKED_LAYOUT_ID, LayoutId(2));
189187
assert_eq!(COLUMNAR_LAYOUT_ID, LayoutId(3));
190-
assert_eq!(INLINE_SCHEMA_LAYOUT_ID, LayoutId(4));
191188
}
192189
}
193190
}

vortex-file/src/read/context.rs

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ use vortex_array::Context;
77
use vortex_error::{vortex_err, VortexResult};
88
use vortex_flatbuffers::footer as fb;
99

10-
use crate::layouts::{ChunkedLayout, ColumnarLayout, FlatLayout, InlineDTypeLayout};
10+
use crate::layouts::{ChunkedLayout, ColumnarLayout, FlatLayout};
1111
use crate::{LayoutReader, RelativeLayoutCache, Scan};
1212

1313
#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash)]
@@ -52,15 +52,10 @@ impl LayoutContext {
5252
// Default context: built via `Self::new` from the columnar, chunked and flat layouts,
// each paired with its own `id()`. The removed (`-`) lines below additionally listed
// `InlineDTypeLayout`.
impl Default for LayoutContext {
5353
fn default() -> Self {
5454
Self::new(
55-
[
56-
&ColumnarLayout as LayoutRef,
57-
&ChunkedLayout,
58-
&InlineDTypeLayout,
59-
&FlatLayout,
60-
]
61-
.into_iter()
62-
.map(|l| (l.id(), l))
63-
.collect(),
55+
[&ColumnarLayout as LayoutRef, &ChunkedLayout, &FlatLayout]
56+
.into_iter()
57+
.map(|l| (l.id(), l))
58+
.collect(),
6459
)
6560
}
6661
}

vortex-file/src/read/layouts/chunked.rs

Lines changed: 36 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,25 @@
11
use std::collections::BTreeSet;
2-
use std::sync::{OnceLock, RwLock};
2+
use std::sync::{Arc, OnceLock, RwLock};
33

44
use bytes::Bytes;
55
use itertools::Itertools;
66
use vortex_array::aliases::hash_map::HashMap;
77
use vortex_array::array::ChunkedArray;
88
use vortex_array::compute::{scalar_at, take, TakeOptions};
9-
use vortex_array::stats::{ArrayStatistics as _, Stat};
9+
use vortex_array::stats::{stats_from_bitset_bytes, ArrayStatistics as _, Stat};
1010
use vortex_array::{ArrayDType, ArrayData, IntoArrayData};
11+
use vortex_dtype::{DType, Nullability, StructDType};
1112
use vortex_error::{vortex_bail, vortex_err, vortex_panic, VortexExpect as _, VortexResult};
13+
use vortex_expr::Select;
1214
use vortex_flatbuffers::footer;
1315

1416
use crate::layouts::RangedLayoutReader;
1517
use crate::pruning::PruningPredicate;
1618
use crate::read::cache::RelativeLayoutCache;
1719
use crate::read::mask::RowMask;
1820
use crate::{
19-
BatchRead, Layout, LayoutDeserializer, LayoutId, LayoutPartId, LayoutReader, MessageLocator,
20-
MetadataRead, PruningRead, Scan, CHUNKED_LAYOUT_ID,
21+
BatchRead, Layout, LayoutDeserializer, LayoutId, LayoutPartId, LayoutReader, LazyDType,
22+
MessageLocator, MetadataRead, PruningRead, Scan, CHUNKED_LAYOUT_ID,
2123
};
2224

2325
#[derive(Default, Debug)]
@@ -76,38 +78,44 @@ impl ChunkedLayoutBuilder {
7678
}
7779

7880
/// Builds the reader for the metadata child of this chunked layout, if there is one.
///
/// In the added (`+`) revision: returns `Ok(None)` when the flatbuffer has no metadata.
/// Otherwise the metadata bytes are decoded with `stats_from_bitset_bytes`, and child 0
/// is read with a `Select::include` projection over the decoded stat names and a dtype
/// produced by `stats_table_dtype`.
///
/// # Errors
///
/// Fails when metadata is present but the layout has no children, or when the
/// `message_cache` dtype or the child layout cannot be read.
fn metadata_layout(&self) -> VortexResult<Option<Box<dyn LayoutReader>>> {
79-
self.has_metadata()
80-
.then(|| {
81+
self.flatbuffer()
82+
.metadata()
83+
.map(|m| {
84+
let set_stats = stats_from_bitset_bytes(m.bytes());
8185
let metadata_fb = self
8286
.flatbuffer()
8387
.children()
84-
.ok_or_else(|| vortex_err!("must have metadata"))?
88+
.ok_or_else(|| vortex_err!("Must have children if layout has metadata"))?
8589
.get(0);
8690
self.layout_builder.read_layout(
8791
self.fb_bytes.clone(),
8892
metadata_fb._tab.loc(),
89-
// TODO(robert): Create stats projection
90-
Scan::empty(),
91-
self.message_cache.unknown_dtype(METADATA_LAYOUT_PART_ID),
93+
Scan::new(Some(Arc::new(Select::include(
94+
set_stats.iter().map(|s| s.to_string().into()).collect(),
95+
)))),
96+
self.message_cache.relative(
97+
METADATA_LAYOUT_PART_ID,
98+
Arc::new(LazyDType::from_dtype(stats_table_dtype(
99+
&set_stats,
100+
self.message_cache.dtype().value()?,
101+
))),
102+
),
92103
)
93104
})
94105
.transpose()
95106
}
96107

97-
fn has_metadata(&self) -> bool {
98-
self.flatbuffer()
99-
.metadata()
100-
.map(|b| b.bytes()[0] != 0)
101-
.unwrap_or(false)
102-
}
103-
104108
/// Iterates over the child layouts paired with their index in the flatbuffer.
///
/// When the layout has metadata, the first child is skipped (`metadata_layout` reads
/// child 0 in that case), so yielded indices then start at 1. A layout without
/// children yields nothing.
fn children(&self) -> impl Iterator<Item = (usize, footer::Layout)> {
105109
self.flatbuffer()
106110
.children()
107111
.unwrap_or_default()
108112
.iter()
109113
.enumerate()
110-
.skip(if self.has_metadata() { 1 } else { 0 })
114+
.skip(if self.flatbuffer().metadata().is_some() {
115+
1
116+
} else {
117+
0
118+
})
111119
}
112120

113121
fn children_ranges(&self) -> Vec<(usize, usize)> {
@@ -146,6 +154,15 @@ impl ChunkedLayoutBuilder {
146154
}
147155
}
148156

157+
fn stats_table_dtype(stats: &[Stat], dtype: &DType) -> DType {
158+
let dtypes = stats.iter().map(|s| s.dtype(dtype).as_nullable()).collect();
159+
160+
DType::Struct(
161+
StructDType::new(stats.iter().map(|s| s.to_string().into()).collect(), dtypes),
162+
Nullability::NonNullable,
163+
)
164+
}
165+
149166
#[derive(Debug, Default, Clone)]
150167
enum ChildRead {
151168
#[default]
@@ -457,7 +474,7 @@ mod tests {
457474
let written = writer.into_inner();
458475

459476
let mut fb = FlatBufferBuilder::new();
460-
let chunked_layout = write::LayoutSpec::chunked(flat_layouts.into(), len as u64, false);
477+
let chunked_layout = write::LayoutSpec::chunked(flat_layouts.into(), len as u64, None);
461478
let flat_buf = chunked_layout.write_flatbuffer(&mut fb);
462479
fb.finish_minimal(flat_buf);
463480
let fb_bytes = Bytes::copy_from_slice(fb.finished_data());

0 commit comments

Comments
 (0)