Skip to content

Commit 49768f3

Browse files
committed
Implement Variant::List (renamed from Variant::Array)
1 parent 04bedf9 commit 49768f3

File tree

2 files changed

+275
-47
lines changed

2 files changed

+275
-47
lines changed

parquet-variant/src/variant.rs

Lines changed: 274 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -346,18 +346,24 @@ impl VariantObjectHeader {
346346
let last_field_offset_byte =
347347
field_offsets_start_byte + (num_elements + 1) * field_offset_size as usize;
348348
if last_field_offset_byte > value.len() {
349-
return Err(ArrowError::InvalidArgumentError(
350-
"Last field offset array entry is outside the value slice".to_string(),
351-
));
349+
return Err(ArrowError::InvalidArgumentError(format!(
350+
"Last field offset array entry at offset {} with length {} is outside the value slice of length {}",
351+
last_field_offset_byte,
352+
field_offset_size as usize,
353+
value.len()
354+
)));
352355
}
353356

354357
// Verify that the value of the last field offset array entry fits inside the value slice
355358
let last_field_offset =
356359
field_offset_size.unpack_usize(value, field_offsets_start_byte, num_elements)?;
357360
if values_start_byte + last_field_offset > value.len() {
358-
return Err(ArrowError::InvalidArgumentError(
359-
"Last field offset value is outside the value slice".to_string(),
360-
));
361+
return Err(ArrowError::InvalidArgumentError(format!(
362+
"Last field offset value {} at offset {} is outside the value slice of length {}",
363+
last_field_offset,
364+
values_start_byte,
365+
value.len()
366+
)));
361367
}
362368
Ok(Self {
363369
field_offset_size,
@@ -482,46 +488,33 @@ impl<'m, 'v> VariantObject<'m, 'v> {
482488
}
483489
}
484490

485-
#[derive(Clone, Copy, Debug, PartialEq)]
486-
pub struct VariantArray<'m, 'v> {
487-
pub metadata: &'m VariantMetadata<'m>,
488-
pub value: &'v [u8],
491+
#[derive(Clone, Debug, PartialEq)]
492+
pub struct VariantListHeader {
493+
offset_size: OffsetSizeBytes,
494+
is_large: bool,
495+
num_elements: usize,
496+
first_offset_byte: usize,
497+
first_value_byte: usize,
489498
}
490499

491-
impl<'m, 'v> VariantArray<'m, 'v> {
492-
/// Return the length of this array
493-
pub fn len(&self) -> usize {
494-
todo!()
495-
}
496-
497-
/// Is the array of zero length
498-
pub fn is_empty(&self) -> bool {
499-
self.len() == 0
500-
}
501-
502-
pub fn values(&self) -> Result<impl Iterator<Item = Variant<'m, 'v>>, ArrowError> {
503-
todo!();
504-
#[allow(unreachable_code)] // Just to infer the return type
505-
Ok(vec![].into_iter())
506-
}
507-
508-
pub fn get(&self, index: usize) -> Result<Variant<'m, 'v>, ArrowError> {
500+
impl VariantListHeader {
501+
pub fn try_new(value: &[u8]) -> Result<Self, ArrowError> {
509502
// The 6 first bits to the left are the value_header and the 2 bits
510503
// to the right are the basic type, so we shift to get only the value_header
511-
let value_header = first_byte_from_slice(self.value)? >> 2;
504+
let value_header = first_byte_from_slice(value)? >> 2;
512505
let is_large = (value_header & 0x04) != 0; // 3rd bit from the right
513506
let field_offset_size_minus_one = value_header & 0x03; // Last two bits
514507
let offset_size = OffsetSizeBytes::try_new(field_offset_size_minus_one)?;
508+
515509
// The size of the num_elements entry in the array value_data is 4 bytes if
516510
// is_large is true, otherwise 1 byte.
517511
let num_elements_size = match is_large {
518512
true => OffsetSizeBytes::Four,
519513
false => OffsetSizeBytes::One,
520514
};
515+
521516
// Skip the header byte to read the num_elements
522-
// The size of the num_elements entry in the array value_data is 4 bytes if
523-
// is_large is true, otherwise 1 byte.
524-
let num_elements = num_elements_size.unpack_usize(self.value, 1, 0)?;
517+
let num_elements = num_elements_size.unpack_usize(value, 1, 0)?;
525518
let first_offset_byte = 1 + num_elements_size as usize;
526519

527520
let overflow =
@@ -540,28 +533,132 @@ impl<'m, 'v> VariantArray<'m, 'v> {
540533
.checked_add(value_bytes)
541534
.ok_or_else(overflow)?;
542535

536+
// Verify that the last offset array entry is inside the value slice
537+
let last_offset_byte = first_offset_byte + n_offsets * offset_size as usize;
538+
if last_offset_byte > value.len() {
539+
return Err(ArrowError::InvalidArgumentError(format!(
540+
"Last offset array entry at offset {} with length {} is outside the value slice of length {}",
541+
last_offset_byte,
542+
offset_size as usize,
543+
value.len()
544+
)));
545+
}
546+
547+
// Verify that the value of the last offset array entry fits inside the value slice
548+
let last_offset = offset_size.unpack_usize(value, first_offset_byte, num_elements)?;
549+
if first_value_byte + last_offset > value.len() {
550+
return Err(ArrowError::InvalidArgumentError(format!(
551+
"Last offset value {} at offset {} is outside the value slice of length {}",
552+
last_offset,
553+
first_value_byte,
554+
value.len()
555+
)));
556+
}
557+
558+
Ok(Self {
559+
offset_size,
560+
is_large,
561+
num_elements,
562+
first_offset_byte,
563+
first_value_byte,
564+
})
565+
}
566+
567+
/// Returns the number of elements in this list
568+
pub fn num_elements(&self) -> usize {
569+
self.num_elements
570+
}
571+
572+
/// Returns the offset size in bytes
573+
pub fn offset_size(&self) -> usize {
574+
self.offset_size as _
575+
}
576+
577+
/// Returns whether this is a large list
578+
pub fn is_large(&self) -> bool {
579+
self.is_large
580+
}
581+
582+
/// Returns the byte offset where the offset array starts
583+
pub fn first_offset_byte(&self) -> usize {
584+
self.first_offset_byte
585+
}
586+
587+
/// Returns the byte offset where the values start
588+
pub fn first_value_byte(&self) -> usize {
589+
self.first_value_byte
590+
}
591+
}
592+
593+
// NOTE: We differ from the variant spec and call it "list" instead of "array" in order to be
594+
// consistent with parquet and arrow type naming. Otherwise, the name would conflict with the
595+
// `VariantArray : Array` we must eventually define for variant-typed arrow arrays.
596+
#[derive(Clone, Debug, PartialEq)]
597+
pub struct VariantList<'m, 'v> {
598+
pub metadata: &'m VariantMetadata<'m>,
599+
pub value: &'v [u8],
600+
header: VariantListHeader,
601+
}
602+
603+
impl<'m, 'v> VariantList<'m, 'v> {
604+
pub fn try_new(metadata: &'m VariantMetadata<'m>, value: &'v [u8]) -> Result<Self, ArrowError> {
605+
Ok(Self {
606+
metadata,
607+
value,
608+
header: VariantListHeader::try_new(value)?,
609+
})
610+
}
611+
612+
/// Return the number of elements in this list
613+
pub fn len(&self) -> usize {
614+
self.header.num_elements()
615+
}
616+
617+
/// Returns true if this list contains no elements
618+
pub fn is_empty(&self) -> bool {
619+
self.len() == 0
620+
}
621+
622+
pub fn values(&self) -> Result<impl Iterator<Item = Variant<'m, 'v>>, ArrowError> {
623+
let len = self.len();
624+
let values = (0..len)
625+
.map(move |i| self.get(i))
626+
.collect::<Result<Vec<_>, _>>()?;
627+
Ok(values.into_iter())
628+
}
629+
630+
pub fn get(&self, index: usize) -> Result<Variant<'m, 'v>, ArrowError> {
631+
if index >= self.header.num_elements() {
632+
return Err(ArrowError::InvalidArgumentError(format!(
633+
"Index {} out of bounds for list of length {}",
634+
index,
635+
self.header.num_elements()
636+
)));
637+
}
638+
543639
// Skip header and num_elements bytes to read the offsets
544-
let start_field_offset_from_first_value_byte =
545-
offset_size.unpack_usize(self.value, first_offset_byte, index)?;
546-
let end_field_offset_from_first_value_byte =
547-
offset_size.unpack_usize(self.value, first_offset_byte, index + 1)?;
640+
let start_field_offset_from_first_value_byte = self.header.offset_size.unpack_usize(
641+
self.value,
642+
self.header.first_offset_byte(),
643+
index,
644+
)?;
645+
let end_field_offset_from_first_value_byte = self.header.offset_size.unpack_usize(
646+
self.value,
647+
self.header.first_offset_byte(),
648+
index + 1,
649+
)?;
548650

549651
// Read the value bytes from the offsets
550652
let variant_value_bytes = slice_from_slice(
551653
self.value,
552-
first_value_byte + start_field_offset_from_first_value_byte
553-
..first_value_byte + end_field_offset_from_first_value_byte,
654+
self.header.first_value_byte() + start_field_offset_from_first_value_byte
655+
..self.header.first_value_byte() + end_field_offset_from_first_value_byte,
554656
)?;
555657
let variant = Variant::try_new(self.metadata, variant_value_bytes)?;
556658
Ok(variant)
557659
}
558660
}
559661

560-
// impl<'m, 'v> Index<usize> for VariantArray<'m, 'v> {
561-
// type Output = Variant<'m, 'v>;
562-
//
563-
// }
564-
565662
/// Variant value. May contain references to metadata and value
566663
#[derive(Clone, Debug, PartialEq)]
567664
pub enum Variant<'m, 'v> {
@@ -578,7 +675,7 @@ pub enum Variant<'m, 'v> {
578675

579676
// need both metadata & value
580677
Object(VariantObject<'m, 'v>),
581-
Array(VariantArray<'m, 'v>),
678+
List(VariantList<'m, 'v>),
582679
}
583680

584681
impl<'m, 'v> Variant<'m, 'v> {
@@ -600,7 +697,7 @@ impl<'m, 'v> Variant<'m, 'v> {
600697
Variant::ShortString(decoder::decode_short_string(value)?)
601698
}
602699
VariantBasicType::Object => Variant::Object(VariantObject::try_new(metadata, value)?),
603-
VariantBasicType::Array => Variant::Array(VariantArray { metadata, value }),
700+
VariantBasicType::Array => Variant::List(VariantList::try_new(metadata, value)?),
604701
};
605702
Ok(new_self)
606703
}
@@ -637,7 +734,7 @@ impl<'m, 'v> Variant<'m, 'v> {
637734
pub fn metadata(&self) -> Option<&'m VariantMetadata> {
638735
match self {
639736
Variant::Object(VariantObject { metadata, .. })
640-
| Variant::Array(VariantArray { metadata, .. }) => Some(*metadata),
737+
| Variant::List(VariantList { metadata, .. }) => Some(*metadata),
641738
_ => None,
642739
}
643740
}
@@ -1005,4 +1102,135 @@ mod tests {
10051102
let fields: Vec<_> = variant_obj.fields().unwrap().collect();
10061103
assert_eq!(fields.len(), 0);
10071104
}
1105+
1106+
#[test]
1107+
fn test_variant_list_simple() {
1108+
// Create simple metadata (empty dictionary for this test)
1109+
let metadata_bytes = vec![
1110+
0x01, // header: version=1, sorted=0, offset_size_minus_one=0
1111+
0, // dictionary_size = 0
1112+
0, // offset[0] = 0 (end of dictionary)
1113+
];
1114+
let metadata = VariantMetadata::try_new(&metadata_bytes).unwrap();
1115+
1116+
// Create list value data for: [42, true, "hi"]
1117+
// Header: basic_type=3 (array), field_offset_size_minus_one=0, is_large=0
1118+
// value_header = 0000_0_0_00 = 0x00
1119+
// So header byte = (0x00 << 2) | 3 = 0x03
1120+
let list_value = vec![
1121+
0x03, // header: basic_type=3, value_header=0x00
1122+
3, // num_elements = 3
1123+
// Offsets (1 byte each): 4 offsets total
1124+
0, // offset to first value (int8)
1125+
2, // offset to second value (boolean true)
1126+
3, // offset to third value (short string)
1127+
6, // end offset
1128+
// Values:
1129+
0x0C,
1130+
42, // int8: primitive_header=3, basic_type=0 -> (3 << 2) | 0 = 0x0C, then value 42
1131+
0x04, // boolean true: primitive_header=1, basic_type=0 -> (1 << 2) | 0 = 0x04
1132+
0x09, b'h', b'i', // short string: length=2, basic_type=1 -> (2 << 2) | 1 = 0x09
1133+
];
1134+
1135+
let variant_list = VariantList::try_new(&metadata, &list_value).unwrap();
1136+
1137+
// Test basic properties
1138+
assert_eq!(variant_list.len(), 3);
1139+
assert!(!variant_list.is_empty());
1140+
1141+
// Test individual element access
1142+
let elem0 = variant_list.get(0).unwrap();
1143+
assert_eq!(elem0.as_int8(), Some(42));
1144+
1145+
let elem1 = variant_list.get(1).unwrap();
1146+
assert_eq!(elem1.as_boolean(), Some(true));
1147+
1148+
let elem2 = variant_list.get(2).unwrap();
1149+
assert_eq!(elem2.as_string(), Some("hi"));
1150+
1151+
// Test out of bounds access
1152+
let out_of_bounds = variant_list.get(3);
1153+
assert!(out_of_bounds.is_err());
1154+
assert!(matches!(
1155+
out_of_bounds.unwrap_err(),
1156+
ArrowError::InvalidArgumentError(ref msg) if msg.contains("out of bounds")
1157+
));
1158+
1159+
// Test values iterator
1160+
let values: Vec<_> = variant_list.values().unwrap().collect();
1161+
assert_eq!(values.len(), 3);
1162+
assert_eq!(values[0].as_int8(), Some(42));
1163+
assert_eq!(values[1].as_boolean(), Some(true));
1164+
assert_eq!(values[2].as_string(), Some("hi"));
1165+
}
1166+
1167+
#[test]
1168+
fn test_variant_list_empty() {
1169+
// Create simple metadata (empty dictionary)
1170+
let metadata_bytes = vec![
1171+
0x01, // header: version=1, sorted=0, offset_size_minus_one=0
1172+
0, // dictionary_size = 0
1173+
0, // offset[0] = 0 (end of dictionary)
1174+
];
1175+
let metadata = VariantMetadata::try_new(&metadata_bytes).unwrap();
1176+
1177+
// Create empty list value data: []
1178+
let list_value = vec![
1179+
0x03, // header: basic_type=3, value_header=0x00
1180+
0, // num_elements = 0
1181+
0, // single offset pointing to end
1182+
// No values
1183+
];
1184+
1185+
let variant_list = VariantList::try_new(&metadata, &list_value).unwrap();
1186+
1187+
// Test basic properties
1188+
assert_eq!(variant_list.len(), 0);
1189+
assert!(variant_list.is_empty());
1190+
1191+
// Test out of bounds access on empty list
1192+
let out_of_bounds = variant_list.get(0);
1193+
assert!(out_of_bounds.is_err());
1194+
1195+
// Test values iterator on empty list
1196+
let values: Vec<_> = variant_list.values().unwrap().collect();
1197+
assert_eq!(values.len(), 0);
1198+
}
1199+
1200+
#[test]
1201+
fn test_variant_list_large() {
1202+
// Create simple metadata (empty dictionary)
1203+
let metadata_bytes = vec![
1204+
0x01, // header: version=1, sorted=0, offset_size_minus_one=0
1205+
0, // dictionary_size = 0
1206+
0, // offset[0] = 0 (end of dictionary)
1207+
];
1208+
let metadata = VariantMetadata::try_new(&metadata_bytes).unwrap();
1209+
1210+
// Create large list value data with 2-byte offsets: [null, false]
1211+
// Header: is_large=1, field_offset_size_minus_one=1, basic_type=3 (array)
1212+
let list_bytes = vec![
1213+
0x17, // header = 000_1_01_11 = 0x17
1214+
2, 0, 0, 0, // num_elements = 2 (4 bytes because is_large=1)
1215+
// Offsets (2 bytes each): 3 offsets total
1216+
0x00, 0x00, 0x01, 0x00, // offset[0] = 0 (start of null), offset[1] = 1 (start of boolean false)
1217+
0x02, 0x00, // offset[2] = 2 (end offset)
1218+
// Values:
1219+
0x00, // null: primitive_header=0, basic_type=0 -> (0 << 2) | 0 = 0x00
1220+
0x08, // boolean false: primitive_header=2, basic_type=0 -> (2 << 2) | 0 = 0x08
1221+
];
1222+
1223+
let variant_list = VariantList::try_new(&metadata, &list_bytes).unwrap();
1224+
1225+
// Test basic properties
1226+
assert_eq!(variant_list.len(), 2);
1227+
assert!(!variant_list.is_empty());
1228+
1229+
// Test individual element access
1230+
let elem0 = variant_list.get(0).unwrap();
1231+
assert_eq!(elem0.as_null(), Some(()));
1232+
1233+
let elem1 = variant_list.get(1).unwrap();
1234+
assert_eq!(elem1.as_boolean(), Some(false));
1235+
}
10081236
}

0 commit comments

Comments
 (0)