@@ -346,18 +346,24 @@ impl VariantObjectHeader {
346346 let last_field_offset_byte =
347347 field_offsets_start_byte + ( num_elements + 1 ) * field_offset_size as usize ;
348348 if last_field_offset_byte > value. len ( ) {
349- return Err ( ArrowError :: InvalidArgumentError (
350- "Last field offset array entry is outside the value slice" . to_string ( ) ,
351- ) ) ;
349+ return Err ( ArrowError :: InvalidArgumentError ( format ! (
350+ "Last field offset array entry at offset {} with length {} is outside the value slice of length {}" ,
351+ last_field_offset_byte,
352+ field_offset_size as usize ,
353+ value. len( )
354+ ) ) ) ;
352355 }
353356
354357 // Verify that the value of the last field offset array entry fits inside the value slice
355358 let last_field_offset =
356359 field_offset_size. unpack_usize ( value, field_offsets_start_byte, num_elements) ?;
357360 if values_start_byte + last_field_offset > value. len ( ) {
358- return Err ( ArrowError :: InvalidArgumentError (
359- "Last field offset value is outside the value slice" . to_string ( ) ,
360- ) ) ;
361+ return Err ( ArrowError :: InvalidArgumentError ( format ! (
362+ "Last field offset value {} at offset {} is outside the value slice of length {}" ,
363+ last_field_offset,
364+ values_start_byte,
365+ value. len( )
366+ ) ) ) ;
361367 }
362368 Ok ( Self {
363369 field_offset_size,
@@ -482,46 +488,33 @@ impl<'m, 'v> VariantObject<'m, 'v> {
482488 }
483489}
484490
485- #[ derive( Clone , Copy , Debug , PartialEq ) ]
486- pub struct VariantArray < ' m , ' v > {
487- pub metadata : & ' m VariantMetadata < ' m > ,
488- pub value : & ' v [ u8 ] ,
491+ #[ derive( Clone , Debug , PartialEq ) ]
492+ pub struct VariantListHeader { // Parsed header of a variant list value, built by `try_new`
493+ offset_size : OffsetSizeBytes , // width of each offset-array entry, decoded from the low two bits of the value header
494+ is_large : bool , // value-header bit 0x04; when set, num_elements is stored in 4 bytes instead of 1
495+ num_elements : usize , // element count, read starting at byte 1 (right after the header byte)
496+ first_offset_byte : usize , // byte index where the offset array starts: 1 + size of the num_elements entry
497+ first_value_byte : usize , // byte index where the element values start
489498}
490499
491- impl < ' m , ' v > VariantArray < ' m , ' v > {
492- /// Return the length of this array
493- pub fn len ( & self ) -> usize {
494- todo ! ( )
495- }
496-
497- /// Is the array of zero length
498- pub fn is_empty ( & self ) -> bool {
499- self . len ( ) == 0
500- }
501-
502- pub fn values ( & self ) -> Result < impl Iterator < Item = Variant < ' m , ' v > > , ArrowError > {
503- todo ! ( ) ;
504- #[ allow( unreachable_code) ] // Just to infer the return type
505- Ok ( vec ! [ ] . into_iter ( ) )
506- }
507-
508- pub fn get ( & self , index : usize ) -> Result < Variant < ' m , ' v > , ArrowError > {
500+ impl VariantListHeader {
501+ pub fn try_new ( value : & [ u8 ] ) -> Result < Self , ArrowError > {
509502 // The 6 first bits to the left are the value_header and the 2 bits
510503 // to the right are the basic type, so we shift to get only the value_header
511- let value_header = first_byte_from_slice ( self . value ) ? >> 2 ;
504+ let value_header = first_byte_from_slice ( value) ? >> 2 ;
512505 let is_large = ( value_header & 0x04 ) != 0 ; // 3rd bit from the right
513506 let field_offset_size_minus_one = value_header & 0x03 ; // Last two bits
514507 let offset_size = OffsetSizeBytes :: try_new ( field_offset_size_minus_one) ?;
508+
515509 // The size of the num_elements entry in the array value_data is 4 bytes if
516510 // is_large is true, otherwise 1 byte.
517511 let num_elements_size = match is_large {
518512 true => OffsetSizeBytes :: Four ,
519513 false => OffsetSizeBytes :: One ,
520514 } ;
515+
521516 // Skip the header byte to read the num_elements
522- // The size of the num_elements entry in the array value_data is 4 bytes if
523- // is_large is true, otherwise 1 byte.
524- let num_elements = num_elements_size. unpack_usize ( self . value , 1 , 0 ) ?;
517+ let num_elements = num_elements_size. unpack_usize ( value, 1 , 0 ) ?;
525518 let first_offset_byte = 1 + num_elements_size as usize ;
526519
527520 let overflow =
@@ -540,28 +533,132 @@ impl<'m, 'v> VariantArray<'m, 'v> {
540533 . checked_add ( value_bytes)
541534 . ok_or_else ( overflow) ?;
542535
536+ // Verify that the last offset array entry is inside the value slice
537+ let last_offset_byte = first_offset_byte + n_offsets * offset_size as usize ;
538+ if last_offset_byte > value. len ( ) {
539+ return Err ( ArrowError :: InvalidArgumentError ( format ! (
540+ "Last offset array entry at offset {} with length {} is outside the value slice of length {}" ,
541+ last_offset_byte,
542+ offset_size as usize ,
543+ value. len( )
544+ ) ) ) ;
545+ }
546+
547+ // Verify that the value of the last offset array entry fits inside the value slice
548+ let last_offset = offset_size. unpack_usize ( value, first_offset_byte, num_elements) ?;
549+ if first_value_byte + last_offset > value. len ( ) {
550+ return Err ( ArrowError :: InvalidArgumentError ( format ! (
551+ "Last offset value {} at offset {} is outside the value slice of length {}" ,
552+ last_offset,
553+ first_value_byte,
554+ value. len( )
555+ ) ) ) ;
556+ }
557+
558+ Ok ( Self {
559+ offset_size,
560+ is_large,
561+ num_elements,
562+ first_offset_byte,
563+ first_value_byte,
564+ } )
565+ }
566+
567+ /// Returns the number of elements in this list, as read from the value's num_elements entry when the header was parsed
568+ pub fn num_elements ( & self ) -> usize {
569+ self . num_elements
570+ }
571+
572+ /// Returns the size in bytes of each entry in the offset array (the `OffsetSizeBytes` value cast to `usize`)
573+ pub fn offset_size ( & self ) -> usize {
574+ self . offset_size as _
575+ }
576+
577+ /// Returns whether this is a large list, i.e. whether num_elements is encoded in 4 bytes rather than 1
578+ pub fn is_large ( & self ) -> bool {
579+ self . is_large
580+ }
581+
582+ /// Returns the byte offset within the value slice where the offset array starts (after the header byte and num_elements)
583+ pub fn first_offset_byte ( & self ) -> usize {
584+ self . first_offset_byte
585+ }
586+
587+ /// Returns the byte offset within the value slice where the element values start; offset-array entries are relative to it
588+ pub fn first_value_byte ( & self ) -> usize {
589+ self . first_value_byte
590+ }
591+ }
592+
593+ // NOTE: We differ from the variant spec and call it "list" instead of "array" in order to be
594+ // consistent with parquet and arrow type naming. Otherwise, the name would conflict with the
595+ // `VariantArray : Array` we must eventually define for variant-typed arrow arrays.
596+ #[ derive( Clone , Debug , PartialEq ) ] // `value` is the raw list bytes, `header` the VariantListHeader parsed from them, `metadata` is handed to each decoded element
597+ pub struct VariantList < ' m , ' v > {
598+ pub metadata : & ' m VariantMetadata < ' m > ,
599+ pub value : & ' v [ u8 ] ,
600+ header : VariantListHeader ,
601+ }
602+
603+ impl < ' m , ' v > VariantList < ' m , ' v > {
604+ pub fn try_new ( metadata : & ' m VariantMetadata < ' m > , value : & ' v [ u8 ] ) -> Result < Self , ArrowError > {
605+ Ok ( Self {
606+ metadata,
607+ value,
608+ header : VariantListHeader :: try_new ( value) ?, // parse the list header up front; any header error is propagated, so construction fails
609+ } )
610+ }
611+
612+ /// Returns the number of elements in this list, as recorded in the parsed header
613+ pub fn len ( & self ) -> usize {
614+ self . header . num_elements ( )
615+ }
616+
617+ /// Returns true if this list contains no elements
618+ pub fn is_empty ( & self ) -> bool {
619+ self . len ( ) == 0
620+ }
621+
622+ pub fn values ( & self ) -> Result < impl Iterator < Item = Variant < ' m , ' v > > , ArrowError > {
623+ let len = self . len ( ) ;
624+ let values = ( 0 ..len)
625+ . map ( move |i| self . get ( i) )
626+ . collect :: < Result < Vec < _ > , _ > > ( ) ?; // decodes every element eagerly; the first failing get() is returned as the error
627+ Ok ( values. into_iter ( ) ) // iterates over the already-decoded elements
628+ }
629+
630+ pub fn get ( & self , index : usize ) -> Result < Variant < ' m , ' v > , ArrowError > {
631+ if index >= self . header . num_elements ( ) {
632+ return Err ( ArrowError :: InvalidArgumentError ( format ! (
633+ "Index {} out of bounds for list of length {}" ,
634+ index,
635+ self . header. num_elements( )
636+ ) ) ) ;
637+ }
638+
543639 // Read entries `index` and `index + 1` of the offset array (which starts after the header byte and num_elements): the element's start and end offsets
544- let start_field_offset_from_first_value_byte =
545- offset_size. unpack_usize ( self . value , first_offset_byte, index) ?;
546- let end_field_offset_from_first_value_byte =
547- offset_size. unpack_usize ( self . value , first_offset_byte, index + 1 ) ?;
640+ let start_field_offset_from_first_value_byte = self . header . offset_size . unpack_usize (
641+ self . value ,
642+ self . header . first_offset_byte ( ) ,
643+ index,
644+ ) ?;
645+ let end_field_offset_from_first_value_byte = self . header . offset_size . unpack_usize (
646+ self . value ,
647+ self . header . first_offset_byte ( ) ,
648+ index + 1 ,
649+ ) ?;
548650
549651 // Both offsets are relative to the first value byte; slice out exactly this element's bytes and decode them as a Variant
550652 let variant_value_bytes = slice_from_slice (
551653 self . value ,
552- first_value_byte + start_field_offset_from_first_value_byte
553- ..first_value_byte + end_field_offset_from_first_value_byte,
654+ self . header . first_value_byte ( ) + start_field_offset_from_first_value_byte
655+ ..self . header . first_value_byte ( ) + end_field_offset_from_first_value_byte,
554656 ) ?;
555657 let variant = Variant :: try_new ( self . metadata , variant_value_bytes) ?;
556658 Ok ( variant)
557659 }
558660}
559661
560- // impl<'m, 'v> Index<usize> for VariantArray<'m, 'v> {
561- // type Output = Variant<'m, 'v>;
562- //
563- // }
564-
565662/// Variant value. May contain references to metadata and value
566663#[ derive( Clone , Debug , PartialEq ) ]
567664pub enum Variant < ' m , ' v > {
@@ -578,7 +675,7 @@ pub enum Variant<'m, 'v> {
578675
579676 // need both metadata & value
580677 Object ( VariantObject < ' m , ' v > ) ,
581- Array ( VariantArray < ' m , ' v > ) ,
678+ List ( VariantList < ' m , ' v > ) ,
582679}
583680
584681impl < ' m , ' v > Variant < ' m , ' v > {
@@ -600,7 +697,7 @@ impl<'m, 'v> Variant<'m, 'v> {
600697 Variant :: ShortString ( decoder:: decode_short_string ( value) ?)
601698 }
602699 VariantBasicType :: Object => Variant :: Object ( VariantObject :: try_new ( metadata, value) ?) ,
603- VariantBasicType :: Array => Variant :: Array ( VariantArray { metadata, value } ) ,
700+ VariantBasicType :: Array => Variant :: List ( VariantList :: try_new ( metadata, value) ? ) ,
604701 } ;
605702 Ok ( new_self)
606703 }
@@ -637,7 +734,7 @@ impl<'m, 'v> Variant<'m, 'v> {
637734 pub fn metadata ( & self ) -> Option < & ' m VariantMetadata > {
638735 match self {
639736 Variant :: Object ( VariantObject { metadata, .. } )
640- | Variant :: Array ( VariantArray { metadata, .. } ) => Some ( * metadata) ,
737+ | Variant :: List ( VariantList { metadata, .. } ) => Some ( * metadata) ,
641738 _ => None ,
642739 }
643740 }
@@ -1005,4 +1102,135 @@ mod tests {
10051102 let fields: Vec < _ > = variant_obj. fields ( ) . unwrap ( ) . collect ( ) ;
10061103 assert_eq ! ( fields. len( ) , 0 ) ;
10071104 }
1105+
1106+ #[ test]
1107+ fn test_variant_list_simple ( ) {
1108+ // Create simple metadata (empty dictionary for this test)
1109+ let metadata_bytes = vec ! [
1110+ 0x01 , // header: version=1, sorted=0, offset_size_minus_one=0
1111+ 0 , // dictionary_size = 0
1112+ 0 , // offset[0] = 0 (end of dictionary)
1113+ ] ;
1114+ let metadata = VariantMetadata :: try_new ( & metadata_bytes) . unwrap ( ) ;
1115+
1116+ // Create list value data for: [42, true, "hi"]
1117+ // Header: basic_type=3 (array), field_offset_size_minus_one=0, is_large=0
1118+ // value_header (upper 6 bits) = 000_0_00 (unused, is_large, field_offset_size_minus_one) = 0x00
1119+ // So header byte = (0x00 << 2) | 3 = 0x03
1120+ let list_value = vec ! [
1121+ 0x03 , // header: basic_type=3, value_header=0x00
1122+ 3 , // num_elements = 3
1123+ // Offsets (1 byte each): num_elements + 1 = 4 entries, each relative to the first value byte
1124+ 0 , // offset to first value (int8)
1125+ 2 , // offset to second value (boolean true)
1126+ 3 , // offset to third value (short string)
1127+ 6 , // end offset (total size of the values in bytes)
1128+ // Values:
1129+ 0x0C ,
1130+ 42 , // int8: primitive_header=3, basic_type=0 -> (3 << 2) | 0 = 0x0C, then value 42
1131+ 0x04 , // boolean true: primitive_header=1, basic_type=0 -> (1 << 2) | 0 = 0x04
1132+ 0x09 , b'h' , b'i' , // short string: length=2, basic_type=1 -> (2 << 2) | 1 = 0x09
1133+ ] ;
1134+
1135+ let variant_list = VariantList :: try_new ( & metadata, & list_value) . unwrap ( ) ;
1136+
1137+ // Test basic properties
1138+ assert_eq ! ( variant_list. len( ) , 3 ) ;
1139+ assert ! ( !variant_list. is_empty( ) ) ;
1140+
1141+ // Test individual element access: each element decodes to the expected variant type and value
1142+ let elem0 = variant_list. get ( 0 ) . unwrap ( ) ;
1143+ assert_eq ! ( elem0. as_int8( ) , Some ( 42 ) ) ;
1144+
1145+ let elem1 = variant_list. get ( 1 ) . unwrap ( ) ;
1146+ assert_eq ! ( elem1. as_boolean( ) , Some ( true ) ) ;
1147+
1148+ let elem2 = variant_list. get ( 2 ) . unwrap ( ) ;
1149+ assert_eq ! ( elem2. as_string( ) , Some ( "hi" ) ) ;
1150+
1151+ // Test out of bounds access: index 3 equals len(), so get() must return the out-of-bounds error
1152+ let out_of_bounds = variant_list. get ( 3 ) ;
1153+ assert ! ( out_of_bounds. is_err( ) ) ;
1154+ assert ! ( matches!(
1155+ out_of_bounds. unwrap_err( ) ,
1156+ ArrowError :: InvalidArgumentError ( ref msg) if msg. contains( "out of bounds" )
1157+ ) ) ;
1158+
1159+ // Test values iterator: it must yield the same three elements, in order
1160+ let values: Vec < _ > = variant_list. values ( ) . unwrap ( ) . collect ( ) ;
1161+ assert_eq ! ( values. len( ) , 3 ) ;
1162+ assert_eq ! ( values[ 0 ] . as_int8( ) , Some ( 42 ) ) ;
1163+ assert_eq ! ( values[ 1 ] . as_boolean( ) , Some ( true ) ) ;
1164+ assert_eq ! ( values[ 2 ] . as_string( ) , Some ( "hi" ) ) ;
1165+ }
1166+
1167+ #[ test]
1168+ fn test_variant_list_empty ( ) {
1169+ // Create simple metadata (empty dictionary)
1170+ let metadata_bytes = vec ! [
1171+ 0x01 , // header: version=1, sorted=0, offset_size_minus_one=0
1172+ 0 , // dictionary_size = 0
1173+ 0 , // offset[0] = 0 (end of dictionary)
1174+ ] ;
1175+ let metadata = VariantMetadata :: try_new ( & metadata_bytes) . unwrap ( ) ;
1176+
1177+ // Create empty list value data: []
1178+ let list_value = vec ! [
1179+ 0x03 , // header: basic_type=3, value_header=0x00
1180+ 0 , // num_elements = 0
1181+ 0 , // offset[0] = 0: the only entry (num_elements + 1 = 1), marking the end of the empty value area
1182+ // No values
1183+ ] ;
1184+
1185+ let variant_list = VariantList :: try_new ( & metadata, & list_value) . unwrap ( ) ;
1186+
1187+ // Test basic properties
1188+ assert_eq ! ( variant_list. len( ) , 0 ) ;
1189+ assert ! ( variant_list. is_empty( ) ) ;
1190+
1191+ // Every index is out of bounds on an empty list, so get(0) must fail
1192+ let out_of_bounds = variant_list. get ( 0 ) ;
1193+ assert ! ( out_of_bounds. is_err( ) ) ;
1194+
1195+ // The values iterator on an empty list must succeed and yield nothing
1196+ let values: Vec < _ > = variant_list. values ( ) . unwrap ( ) . collect ( ) ;
1197+ assert_eq ! ( values. len( ) , 0 ) ;
1198+ }
1199+
1200+ #[ test]
1201+ fn test_variant_list_large ( ) {
1202+ // Create simple metadata (empty dictionary)
1203+ let metadata_bytes = vec ! [
1204+ 0x01 , // header: version=1, sorted=0, offset_size_minus_one=0
1205+ 0 , // dictionary_size = 0
1206+ 0 , // offset[0] = 0 (end of dictionary)
1207+ ] ;
1208+ let metadata = VariantMetadata :: try_new ( & metadata_bytes) . unwrap ( ) ;
1209+
1210+ // Create large list value data with 2-byte offsets: [null, false]
1211+ // Header: is_large=1, field_offset_size_minus_one=1, basic_type=3 (array)
1212+ let list_bytes = vec ! [
1213+ 0x17 , // header = 000_1_01_11 (unused, is_large, field_offset_size_minus_one, basic_type) = 0x17
1214+ 2 , 0 , 0 , 0 , // num_elements = 2 (4 bytes because is_large=1)
1215+ // Offsets (2 bytes each): num_elements + 1 = 3 entries, relative to the first value byte
1216+ 0x00 , 0x00 , 0x01 , 0x00 , // offset[0] = 0 (start of null), offset[1] = 1 (start of boolean false)
1217+ 0x02 , 0x00 , // offset[2] = 2 (end offset: total size of the values)
1218+ // Values:
1219+ 0x00 , // null: primitive_header=0, basic_type=0 -> (0 << 2) | 0 = 0x00
1220+ 0x08 , // boolean false: primitive_header=2, basic_type=0 -> (2 << 2) | 0 = 0x08
1221+ ] ;
1222+
1223+ let variant_list = VariantList :: try_new ( & metadata, & list_bytes) . unwrap ( ) ;
1224+
1225+ // Test basic properties
1226+ assert_eq ! ( variant_list. len( ) , 2 ) ;
1227+ assert ! ( !variant_list. is_empty( ) ) ;
1228+
1229+ // Test individual element access with the 4-byte num_elements and 2-byte offsets
1230+ let elem0 = variant_list. get ( 0 ) . unwrap ( ) ;
1231+ assert_eq ! ( elem0. as_null( ) , Some ( ( ) ) ) ;
1232+
1233+ let elem1 = variant_list. get ( 1 ) . unwrap ( ) ;
1234+ assert_eq ! ( elem1. as_boolean( ) , Some ( false ) ) ;
1235+ }
10081236}
0 commit comments