
Commit 468714b

variable size array headers
1 parent 14c91e9 commit 468714b


6 files changed (+240 −43 lines)


crates/batson/src/array.rs

Lines changed: 179 additions & 27 deletions
@@ -1,51 +1,61 @@
-use std::mem::size_of;
-use std::sync::Arc;
-
+use bytemuck::NoUninit;
 use jiter::{JsonArray, JsonValue};
 use smallvec::SmallVec;
+use std::mem::size_of;
+use std::sync::Arc;
 
 use crate::decoder::Decoder;
 use crate::encoder::Encoder;
 use crate::errors::{DecodeResult, EncodeResult, ToJsonResult};
 use crate::header::{Category, Header, Length, NumberHint, Primitive};
 use crate::json_writer::JsonWriter;
+use crate::object::minimum_value_size_estimate;
+use crate::EncodeError;
 
 #[cfg(target_endian = "big")]
 compile_error!("big-endian architectures are not yet supported as we use `bytemuck` for zero-copy header decoding.");
 
 /// Batson heterogeneous array representation
 #[derive(Debug)]
 pub(crate) struct HetArray<'b> {
-    offsets: &'b [u32],
+    offsets: HetArrayOffsets<'b>,
 }
 
 impl<'b> HetArray<'b> {
     pub fn decode_header(d: &mut Decoder<'b>, length: Length) -> DecodeResult<Self> {
-        if matches!(length, Length::Empty) {
-            Ok(Self { offsets: &[] })
-        } else {
-            let length = length.decode(d)?;
-            let positions = d.take_slice_as(length)?;
-            Ok(Self { offsets: positions })
-        }
+        let offsets = match length {
+            Length::Empty => HetArrayOffsets::U8(&[]),
+            Length::U32 => HetArrayOffsets::U32(take_slice_as(d, length)?),
+            Length::U16 => HetArrayOffsets::U16(take_slice_as(d, length)?),
+            _ => HetArrayOffsets::U8(take_slice_as(d, length)?),
+        };
+        Ok(Self { offsets })
     }
 
     pub fn len(&self) -> usize {
-        self.offsets.len()
+        match self.offsets {
+            HetArrayOffsets::U8(v) => v.len(),
+            HetArrayOffsets::U16(v) => v.len(),
+            HetArrayOffsets::U32(v) => v.len(),
+        }
     }
 
     pub fn get(&self, d: &mut Decoder<'b>, index: usize) -> bool {
-        if let Some(offset) = self.offsets.get(index) {
-            d.index += *offset as usize;
+        let opt_offset = match &self.offsets {
+            HetArrayOffsets::U8(v) => v.get(index).map(|&o| o as usize),
+            HetArrayOffsets::U16(v) => v.get(index).map(|&o| o as usize),
+            HetArrayOffsets::U32(v) => v.get(index).map(|&o| o as usize),
+        };
+        if let Some(offset) = opt_offset {
+            d.index += offset;
             true
         } else {
            false
        }
    }
 
     pub fn to_json(&self, d: &mut Decoder<'b>) -> DecodeResult<JsonArray<'b>> {
-        self.offsets
-            .iter()
+        (0..self.len())
            .map(|_| d.take_value())
            .collect::<DecodeResult<SmallVec<_, 8>>>()
            .map(Arc::new)
@@ -66,6 +76,18 @@ impl<'b> HetArray<'b> {
     }
 }
 
+fn take_slice_as<'b, T: bytemuck::Pod>(d: &mut Decoder<'b>, length: Length) -> DecodeResult<&'b [T]> {
+    let length = length.decode(d)?;
+    d.take_slice_as(length)
+}
+
+#[derive(Debug)]
+enum HetArrayOffsets<'b> {
+    U8(&'b [u8]),
+    U16(&'b [u16]),
+    U32(&'b [u32]),
+}
+
 pub(crate) fn header_array_get(d: &mut Decoder, length: Length, index: usize) -> DecodeResult<Option<Header>> {
     u8_array_get(d, length, index)?
         .map(|b| Header::decode(b, d))
@@ -161,22 +183,58 @@ pub(crate) fn encode_array(encoder: &mut Encoder, array: &JsonArray) -> EncodeResult<()> {
         }
         Ok(())
     } else {
-        encoder.encode_length(Category::HetArray, array.len())?;
+        let min_size = minimum_array_size_estimate(array);
+        let encoder_position = encoder.position();
 
-        let mut offsets: Vec<u32> = Vec::with_capacity(array.len());
-        encoder.align::<u32>();
-        let positions_start = encoder.ring_fence(array.len() * size_of::<u32>());
+        if min_size <= u8::MAX as usize {
+            encoder.encode_length(Category::HetArray, array.len())?;
+            if encode_array_sized::<u8>(encoder, array)? {
+                return Ok(());
+            }
+            encoder.reset_position(encoder_position);
+        }
 
-        let offset_start = encoder.position();
-        for value in array.iter() {
-            offsets.push((encoder.position() - offset_start) as u32);
-            encoder.encode_value(value)?;
+        if min_size <= u16::MAX as usize {
+            encoder.encode_len_u16(Category::HetArray, u16::try_from(array.len()).unwrap());
+            if encode_array_sized::<u16>(encoder, array)? {
+                return Ok(());
+            }
+            encoder.reset_position(encoder_position);
+        }
+
+        encoder.encode_len_u32(Category::HetArray, array.len())?;
+        if encode_array_sized::<u32>(encoder, array)? {
+            Ok(())
+        } else {
+            Err(EncodeError::ArrayTooLarge)
         }
-        encoder.set_range(positions_start, bytemuck::cast_slice(&offsets));
-        Ok(())
     }
 }
 
+fn encode_array_sized<T: TryFrom<usize> + NoUninit>(encoder: &mut Encoder, array: &JsonArray) -> EncodeResult<bool> {
+    let mut offsets: Vec<T> = Vec::with_capacity(array.len());
+    encoder.align::<T>();
+    let positions_start = encoder.ring_fence(array.len() * size_of::<T>());
+
+    let offset_start = encoder.position();
+    for value in array.iter() {
+        let Ok(offset) = T::try_from(encoder.position() - offset_start) else {
+            return Ok(false);
+        };
+        offsets.push(offset);
+        encoder.encode_value(value)?;
+    }
+    encoder.set_range(positions_start, bytemuck::cast_slice(&offsets));
+    Ok(true)
+}
+
+/// Estimate the minimum amount of space needed to encode the array.
+///
+/// This is NOT recursive, instead it makes very optimistic guesses about how long arrays and objects might be.
+fn minimum_array_size_estimate(array: &JsonArray) -> usize {
+    array.iter().map(minimum_value_size_estimate).sum()
+}
+
 #[derive(Debug)]
 enum PackedArray {
     Header(Vec<u8>),
@@ -295,6 +353,9 @@ mod test {
     #[test]
     fn array_round_trip() {
         let array = Arc::new(smallvec![JsonValue::Null, JsonValue::Int(123), JsonValue::Bool(false),]);
+        let min_size = minimum_array_size_estimate(&array);
+        assert_eq!(min_size, 4);
+
         let mut encoder = Encoder::new();
         encoder.encode_array(&array).unwrap();
         let bytes: Vec<u8> = encoder.into();
@@ -305,7 +366,13 @@
 
         let het_array = HetArray::decode_header(&mut decoder, 3.into()).unwrap();
         assert_eq!(het_array.len(), 3);
-        assert_eq!(het_array.offsets, &[0, 1, 3]);
+
+        let offsets = match het_array.offsets {
+            HetArrayOffsets::U8(v) => v,
+            _ => panic!("expected u8 offsets"),
+        };
+
+        assert_eq!(offsets, &[0, 1, 3]);
         let decode_array = het_array.to_json(&mut decoder).unwrap();
         assert_arrays_eq!(decode_array, array);
     }
@@ -388,4 +455,89 @@
         let i64_array = i64_array_to_json(&mut decoder, 5.into()).unwrap();
         assert_arrays_eq!(i64_array, array);
     }
+
+    #[test]
+    fn test_u16_array() {
+        let mut array = vec![JsonValue::Bool(true); 100];
+        array.extend(vec![JsonValue::Int(i64::MAX); 100]);
+        let array = Arc::new(array.into());
+
+        let mut encoder = Encoder::new();
+        encoder.encode_array(&array).unwrap();
+        let bytes: Vec<u8> = encoder.into();
+
+        let mut decoder = Decoder::new(&bytes);
+        let header = decoder.take_header().unwrap();
+        assert_eq!(header, Header::HetArray(Length::U16));
+
+        let het_array = HetArray::decode_header(&mut decoder, Length::U16).unwrap();
+        assert_eq!(het_array.len(), 200);
+
+        let offsets = match het_array.offsets {
+            HetArrayOffsets::U16(v) => v,
+            _ => panic!("expected U16 offsets"),
+        };
+        assert_eq!(offsets.len(), 200);
+        assert_eq!(offsets[0], 0);
+        assert_eq!(offsets[1], 1);
+
+        let mut d = decoder.clone();
+        assert!(het_array.get(&mut d, 0));
+        assert!(compare_json_values(&d.take_value().unwrap(), &JsonValue::Bool(true)));
+
+        let mut d = decoder.clone();
+        assert!(het_array.get(&mut d, 99));
+        assert!(compare_json_values(&d.take_value().unwrap(), &JsonValue::Bool(true)));
+
+        let mut d = decoder.clone();
+        assert!(het_array.get(&mut d, 100));
+        assert!(compare_json_values(&d.take_value().unwrap(), &JsonValue::Int(i64::MAX)));
+
+        let mut d = decoder.clone();
+        assert!(het_array.get(&mut d, 199));
+        assert!(compare_json_values(&d.take_value().unwrap(), &JsonValue::Int(i64::MAX)));
+
+        let mut d = decoder.clone();
+        assert!(!het_array.get(&mut d, 200));
+
+        let decode_array = het_array.to_json(&mut decoder).unwrap();
+        assert_arrays_eq!(decode_array, array);
+    }
+
+    #[test]
+    fn test_u32_array() {
+        let long_string = "a".repeat(u16::MAX as usize);
+        let array = Arc::new(smallvec![
+            JsonValue::Str(long_string.clone().into()),
+            JsonValue::Int(42),
+        ]);
+
+        let mut encoder = Encoder::new();
+        encoder.encode_array(&array).unwrap();
+        let bytes: Vec<u8> = encoder.into();
+
+        let mut decoder = Decoder::new(&bytes);
+        let header = decoder.take_header().unwrap();
+        assert_eq!(header, Header::HetArray(Length::U32));
+
+        let het_array = HetArray::decode_header(&mut decoder, Length::U32).unwrap();
+        assert_eq!(het_array.len(), 2);
+
+        let offsets = match het_array.offsets {
+            HetArrayOffsets::U32(v) => v,
+            _ => panic!("expected U32 offsets"),
+        };
+        assert_eq!(offsets, [0, 65538]);
+
+        let mut d = decoder.clone();
+        assert!(het_array.get(&mut d, 0));
+        assert!(compare_json_values(
+            &d.take_value().unwrap(),
+            &JsonValue::Str(long_string.into())
+        ));
+
+        let mut d = decoder.clone();
+        assert!(het_array.get(&mut d, 1));
+        assert!(compare_json_values(&d.take_value().unwrap(), &JsonValue::Int(42)));
+    }
 }
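The core idea of the change is that the offset table in front of a heterogeneous array now uses the narrowest integer type that can hold every element offset, falling back u8 → u16 → u32. Below is a minimal, self-contained sketch of that idea, not the batson API: `Width` and `narrowest_offset_width` are illustrative names only, and unlike the real `encode_array` (which estimates a minimum size, tries to encode, and resets the encoder when an offset overflows the width being tried) this sketch assumes the offsets are already known.

// Illustrative sketch only; `Width` and `narrowest_offset_width` are not part of batson.
#[derive(Debug, PartialEq)]
enum Width {
    U8,
    U16,
    U32,
}

fn narrowest_offset_width(offsets: &[usize]) -> Option<Width> {
    // The largest offset determines which integer type the whole offset table needs.
    let max = offsets.iter().copied().max().unwrap_or(0);
    if u8::try_from(max).is_ok() {
        Some(Width::U8)
    } else if u16::try_from(max).is_ok() {
        Some(Width::U16)
    } else if u32::try_from(max).is_ok() {
        Some(Width::U32)
    } else {
        None // analogous to EncodeError::ArrayTooLarge
    }
}

fn main() {
    // Matches the tests above: small offsets fit in u8, while a ~64 KiB string
    // pushes the next offset past u16::MAX, so u32 offsets are required.
    assert_eq!(narrowest_offset_width(&[0, 1, 3]), Some(Width::U8));
    assert_eq!(narrowest_offset_width(&[0, 65_538]), Some(Width::U32));
}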

crates/batson/src/encoder.rs

Lines changed: 5 additions & 1 deletion
@@ -160,8 +160,12 @@ impl Encoder {
     }
 
     pub fn encode_len_u32(&mut self, cat: Category, len: usize) -> EncodeResult<()> {
+        let int = u32::try_from(len).map_err(|_| match cat {
+            Category::Str => EncodeError::StrTooLong,
+            Category::HetArray => EncodeError::ObjectTooLarge,
+            _ => EncodeError::ArrayTooLarge,
+        })?;
         self.push(cat.encode_with(Length::U32 as u8));
-        let int = u32::try_from(len).map_err(|_| EncodeError::StrTooLong)?;
         self.extend(&int.to_le_bytes());
         Ok(())
     }
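For context, here is a self-contained sketch of the overflow handling the updated `encode_len_u32` now uses: the length is converted with `u32::try_from`, and an overflow maps to an error variant chosen by category instead of always `StrTooLong`. The `Category` and `EncodeError` enums and the `check_len_u32` helper below are simplified stand-ins for the crate's types, and the `HetArray => ObjectTooLarge` arm simply mirrors the diff above.

// Simplified stand-ins for the crate's Category and EncodeError types.
#[derive(Debug, PartialEq)]
enum Category {
    Str,
    HetArray,
    Other,
}

#[derive(Debug, PartialEq)]
enum EncodeError {
    StrTooLong,
    ObjectTooLarge,
    ArrayTooLarge,
}

// Mirrors the map_err logic in the diff: a length that does not fit in u32 is
// reported with a category-specific error.
fn check_len_u32(cat: &Category, len: usize) -> Result<u32, EncodeError> {
    u32::try_from(len).map_err(|_| match cat {
        Category::Str => EncodeError::StrTooLong,
        Category::HetArray => EncodeError::ObjectTooLarge,
        _ => EncodeError::ArrayTooLarge,
    })
}

fn main() {
    assert_eq!(check_len_u32(&Category::Str, 10), Ok(10));
    // On 64-bit targets usize::MAX does not fit in u32, so the conversion fails.
    assert_eq!(check_len_u32(&Category::HetArray, usize::MAX), Err(EncodeError::ObjectTooLarge));
    assert_eq!(check_len_u32(&Category::Other, usize::MAX), Err(EncodeError::ArrayTooLarge));
}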

crates/batson/src/errors.rs

Lines changed: 1 addition & 0 deletions
@@ -10,6 +10,7 @@ pub type EncodeResult<T> = Result<T, EncodeError>;
 pub enum EncodeError {
     StrTooLong,
     ObjectTooLarge,
+    ArrayTooLarge,
 }
 
 pub type DecodeResult<T> = Result<T, DecodeError>;

0 commit comments
