Skip to content

Commit 852f1d2

Browse files
authored
Array Metadata (#1985)
The current ArrayMetadata system was put together somewhat quickly and uses `serde` + Flexbuffers to serialize all metadata.

The end state looks like this:
- [ ] An array has a fixed 8 bytes of metadata. If it needs more, it should use a buffer (made possible by #1743).
- [ ] Rkyv can optionally be used to help with serde for these bytes.
- [ ] No eager deserialization of metadata is performed, although arrays should validate metadata in the `ValidateVTable` (see #1979).

To support 8-byte metadata, we need to:
- [ ] Move scalars and scalar values out of metadata (e.g. ConstantArray).
- [ ] Move shift from FoR into BitPacking (this is a bit cheeky, it's not strictly necessary, but FoR is then left with an 8-byte PValue for metadata, and shifting feels like it should live in BitPacking anyway?).
- [ ] All other metadata should easily fit into 8 bytes.
1 parent e1a7d47 commit 852f1d2

File tree

59 files changed

+1004
-521
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

59 files changed

+1004
-521
lines changed

Cargo.lock

Lines changed: 122 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,10 +114,12 @@ prost-build = "0.13.0"
114114
prost-types = "0.13.0"
115115
pyo3 = { version = ">= 0.22", features = ["extension-module", "abi3-py310"] }
116116
pyo3-log = ">= 0.11"
117+
rancor = "0.1.0"
117118
rand = "0.8.5"
118119
rayon = "1.10.0"
119120
regex = "1.11.0"
120121
reqwest = { version = "0.12.0", features = ["blocking"] }
122+
rkyv = { version = "0.8", features = ["little_endian", "pointer_width_32", "bytecheck"] }
121123
rstest = "0.24"
122124
serde = "1.0.197"
123125
serde_json = "1.0.116"

docs/quickstart.rst

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ Vortex array:
3535
>>> parquet = pq.read_table("_static/example.parquet")
3636
>>> vtx = vortex.array(parquet)
3737
>>> vtx.nbytes
38-
141069
38+
141057
3939

4040
Compress
4141
^^^^^^^^
@@ -46,7 +46,7 @@ Use :func:`~vortex.encoding.compress` to compress the Vortex array and check the
4646

4747
>>> cvtx = vortex.compress(vtx)
4848
>>> cvtx.nbytes
49-
16791
49+
15888
5050
>>> cvtx.nbytes / vtx.nbytes
5151
0.11...
5252

encodings/alp/src/alp/array.rs

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
use std::fmt::{Debug, Display};
1+
use std::fmt::Debug;
22

33
use serde::{Deserialize, Serialize};
44
use vortex_array::array::PrimitiveArray;
@@ -10,27 +10,22 @@ use vortex_array::validity::{ArrayValidity, LogicalValidity, ValidityVTable};
1010
use vortex_array::variants::{PrimitiveArrayTrait, VariantsVTable};
1111
use vortex_array::visitor::{ArrayVisitor, VisitorVTable};
1212
use vortex_array::{
13-
impl_encoding, ArrayDType, ArrayData, ArrayLen, Canonical, IntoArrayData, IntoCanonical,
13+
impl_encoding, ArrayDType, ArrayData, ArrayLen, Canonical, DeserializeMetadata, IntoArrayData,
14+
IntoCanonical, SerdeMetadata,
1415
};
1516
use vortex_dtype::{DType, PType};
1617
use vortex_error::{vortex_bail, vortex_panic, VortexExpect as _, VortexResult};
1718

1819
use crate::alp::{alp_encode, decompress, Exponents};
1920

20-
impl_encoding!("vortex.alp", ids::ALP, ALP);
21+
impl_encoding!("vortex.alp", ids::ALP, ALP, SerdeMetadata<ALPMetadata>);
2122

2223
#[derive(Debug, Clone, Serialize, Deserialize)]
2324
pub struct ALPMetadata {
2425
pub(crate) exponents: Exponents,
2526
pub(crate) patches: Option<PatchesMetadata>,
2627
}
2728

28-
impl Display for ALPMetadata {
29-
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
30-
Debug::fmt(self, f)
31-
}
32-
}
33-
3429
impl ALPArray {
3530
pub fn try_new(
3631
encoded: ArrayData,
@@ -60,7 +55,7 @@ impl ALPArray {
6055
Self::try_from_parts(
6156
dtype,
6257
length,
63-
ALPMetadata { exponents, patches },
58+
SerdeMetadata(ALPMetadata { exponents, patches }),
6459
None,
6560
Some(children.into()),
6661
Default::default(),
@@ -75,6 +70,12 @@ impl ALPArray {
7570
}
7671
}
7772

73+
fn metadata(&self) -> ALPMetadata {
74+
SerdeMetadata::<ALPMetadata>::deserialize(self.as_ref().metadata_bytes())
75+
.vortex_expect("ALPMetadata metadata")
76+
.0
77+
}
78+
7879
pub fn encoded(&self) -> ArrayData {
7980
self.as_ref()
8081
.child(0, &self.encoded_dtype(), self.len())
@@ -156,6 +157,7 @@ impl StatisticsVTable<ALPArray> for ALPEncoding {}
156157
mod tests {
157158
use vortex_array::patches::PatchesMetadata;
158159
use vortex_array::test_harness::check_metadata;
160+
use vortex_array::SerdeMetadata;
159161
use vortex_dtype::PType;
160162

161163
use crate::{ALPMetadata, Exponents};
@@ -165,13 +167,13 @@ mod tests {
165167
fn test_alp_metadata() {
166168
check_metadata(
167169
"alp.metadata",
168-
ALPMetadata {
170+
SerdeMetadata(ALPMetadata {
169171
patches: Some(PatchesMetadata::new(usize::MAX, PType::U64)),
170172
exponents: Exponents {
171173
e: u8::MAX,
172174
f: u8::MAX,
173175
},
174-
},
176+
}),
175177
);
176178
}
177179
}

0 commit comments

Comments
 (0)