Skip to content

Commit 01f4f92

Browse files
authored
fix: FlexBuffer serialization ambiguity for binary/string/list(u8) (#1859)
After the recent switch to ByteBuffer, FlexBuffer would have am ambiguity serializing the different sequence variants. This was resulting in Binary ScalarValues written to files to be reinterpreted as List(u8) on read. Instead, we explicitly handle serialization.
1 parent c16e4de commit 01f4f92

File tree

5 files changed

+50
-12
lines changed

5 files changed

+50
-12
lines changed

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

vortex-array/src/array/list/mod.rs

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ use crate::array::PrimitiveArray;
1919
use crate::builders::{ArrayBuilder, ListBuilder};
2020
use crate::compute::{scalar_at, slice};
2121
use crate::encoding::ids;
22-
use crate::stats::{Stat, StatisticsVTable, StatsSet};
22+
use crate::stats::{StatisticsVTable, StatsSet};
2323
use crate::validity::{LogicalValidity, Validity, ValidityMetadata, ValidityVTable};
2424
use crate::variants::{ListArrayTrait, PrimitiveArrayTrait, VariantsVTable};
2525
use crate::visitor::{ArrayVisitor, VisitorVTable};
@@ -174,11 +174,7 @@ impl IntoCanonical for ListArray {
174174
}
175175
}
176176

177-
impl StatisticsVTable<ListArray> for ListEncoding {
178-
fn compute_statistics(&self, _array: &ListArray, _stat: Stat) -> VortexResult<StatsSet> {
179-
Ok(StatsSet::default())
180-
}
181-
}
177+
impl StatisticsVTable<ListArray> for ListEncoding {}
182178

183179
impl ListArrayTrait for ListArray {}
184180

vortex-sampling-compressor/src/sampling_compressor.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -62,16 +62,16 @@ impl Default for SamplingCompressor<'_> {
6262
}
6363

6464
impl<'a> SamplingCompressor<'a> {
65-
pub fn new(compressors: HashSet<CompressorRef<'a>>) -> Self {
65+
pub fn new(compressors: impl Into<HashSet<CompressorRef<'a>>>) -> Self {
6666
Self::new_with_options(compressors, Default::default())
6767
}
6868

6969
pub fn new_with_options(
70-
compressors: HashSet<CompressorRef<'a>>,
70+
compressors: impl Into<HashSet<CompressorRef<'a>>>,
7171
options: CompressConfig,
7272
) -> Self {
7373
Self {
74-
compressors,
74+
compressors: compressors.into(),
7575
options,
7676
path: Vec::new(),
7777
depth: 0,

vortex-scalar/Cargo.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,10 @@ vortex-error = { workspace = true }
3232
vortex-flatbuffers = { workspace = true, optional = true }
3333
vortex-proto = { workspace = true, optional = true }
3434

35+
[dev-dependencies]
36+
flexbuffers = { workspace = true }
37+
rstest = { workspace = true }
38+
3539
[lints]
3640
workspace = true
3741

vortex-scalar/src/serde/serde.rs

Lines changed: 40 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
use std::fmt::Formatter;
22

33
use serde::de::{Error, SeqAccess, Visitor};
4+
use serde::ser::SerializeSeq;
45
use serde::{Deserialize, Deserializer, Serialize, Serializer};
56
use vortex_buffer::{BufferString, ByteBuffer};
67

@@ -25,9 +26,19 @@ impl Serialize for InnerScalarValue {
2526
Self::Null => ().serialize(serializer),
2627
Self::Bool(b) => b.serialize(serializer),
2728
Self::Primitive(p) => p.serialize(serializer),
28-
Self::Buffer(buffer) => buffer.as_slice().serialize(serializer),
29-
Self::BufferString(buffer) => buffer.as_str().serialize(serializer),
30-
Self::List(l) => l.serialize(serializer),
29+
// NOTE: we explicitly handle the serialization of bytes, strings and lists so as not
30+
// to create ambiguities amongst them. The serde data model has specific representations
31+
// of binary data, UTF-8 strings and sequences.
32+
// See https://serde.rs/data-model.html.
33+
Self::Buffer(buffer) => serializer.serialize_bytes(buffer.as_slice()),
34+
Self::BufferString(buffer) => serializer.serialize_str(buffer.as_str()),
35+
Self::List(l) => {
36+
let mut seq = serializer.serialize_seq(Some(l.len()))?;
37+
for item in l.iter() {
38+
seq.serialize_element(item)?;
39+
}
40+
seq.end()
41+
}
3142
}
3243
}
3344
}
@@ -197,3 +208,29 @@ impl<'de> Deserialize<'de> for PValue {
197208
})
198209
}
199210
}
211+
212+
#[cfg(test)]
213+
mod tests {
214+
use std::sync::Arc;
215+
216+
use flexbuffers::{FlexbufferSerializer, Reader};
217+
use rstest::rstest;
218+
use vortex_dtype::{Nullability, PType};
219+
220+
use super::*;
221+
use crate::Scalar;
222+
223+
#[rstest]
224+
#[case(Scalar::binary(ByteBuffer::copy_from(b"hello"), Nullability::NonNullable).into_value())]
225+
#[case(Scalar::utf8("hello", Nullability::NonNullable).into_value())]
226+
#[case(Scalar::primitive(1u8, Nullability::NonNullable).into_value())]
227+
#[case(Scalar::list(Arc::new(PType::U8.into()), vec![Scalar::primitive(1u8, Nullability::NonNullable)], Nullability::NonNullable).into_value())]
228+
fn test_scalar_value_serde_roundtrip(#[case] scalar_value: ScalarValue) {
229+
let mut serializer = FlexbufferSerializer::new();
230+
scalar_value.serialize(&mut serializer).unwrap();
231+
let written = serializer.take_buffer();
232+
let reader = Reader::get_root(written.as_ref()).unwrap();
233+
let scalar_read_back = ScalarValue::deserialize(reader).unwrap();
234+
assert_eq!(scalar_value, scalar_read_back);
235+
}
236+
}

0 commit comments

Comments
 (0)