@@ -25,6 +25,7 @@ use arrow_schema::ArrowError;
2525use chrono:: { DateTime , NaiveDate , NaiveDateTime , Utc } ;
2626use std:: num:: TryFromIntError ;
2727
28+ /// The number of bytes used to encode each offset, as used by [`VariantMetadataHeader`] and [`VariantObjectHeader`]
2829#[ derive( Clone , Debug , Copy , PartialEq ) ]
2930enum OffsetSizeBytes {
3031 One = 1 ,
@@ -91,7 +92,7 @@ impl OffsetSizeBytes {
9192 }
9293}
9394
94- /// A parsed version of the variant metadata header byte.
95+ /// Header structure for [`VariantMetadata`]
9596#[ derive( Clone , Debug , Copy , PartialEq ) ]
9697pub ( crate ) struct VariantMetadataHeader {
9798 version : u8 ,
@@ -140,8 +141,12 @@ impl VariantMetadataHeader {
140141 }
141142}
142143
144+ /// [`Variant`] Metadata
145+ ///
146+ /// See the [Variant Spec] file for more information
147+ ///
148+ /// [Variant Spec]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#metadata-encoding
143149#[ derive( Clone , Copy , Debug , PartialEq ) ]
144- /// Encodes the Variant Metadata, see the Variant spec file for more information
145150pub struct VariantMetadata < ' m > {
146151 bytes : & ' m [ u8 ] ,
147152 header : VariantMetadataHeader ,
@@ -238,7 +243,7 @@ impl<'m> VariantMetadata<'m> {
238243 }
239244}
240245
241- /// A parsed version of the variant object value header byte.
246+ /// Header structure for [`VariantObject`]
242247#[ derive( Clone , Debug , PartialEq ) ]
243248pub ( crate ) struct VariantObjectHeader {
244249 field_offset_size : OffsetSizeBytes ,
@@ -262,6 +267,7 @@ impl VariantObjectHeader {
262267 }
263268}
264269
270+ /// A [`Variant`] Object (struct with named fields).
265271#[ derive( Clone , Debug , PartialEq ) ]
266272pub struct VariantObject < ' m , ' v > {
267273 pub metadata : VariantMetadata < ' m > ,
@@ -282,6 +288,7 @@ impl<'m, 'v> VariantObject<'m, 'v> {
282288 /// particular, that all field ids exist in `metadata`, and all offsets are in-bounds and point
283289 /// to valid objects.
284290 // TODO: How to make the validation non-recursive while still making iterators safely infallible??
291+ // See https://github.com/apache/arrow-rs/issues/7711
285292 pub fn try_new ( metadata : VariantMetadata < ' m > , value : & ' v [ u8 ] ) -> Result < Self , ArrowError > {
286293 let header_byte = first_byte_from_slice ( value) ?;
287294 let header = VariantObjectHeader :: try_new ( header_byte) ?;
@@ -420,10 +427,10 @@ impl VariantListHeader {
420427 }
421428}
422429
423- /// Represents a variant array .
430+ /// [`Variant`] Array .
424431///
425432/// NOTE: The "list" naming differs from the variant spec -- which calls it "array" -- in order to be
426- /// consistent with parquet and arrow type naming. Otherwise, the name would conflict with the
433+ /// consistent with Parquet and Arrow type naming. Otherwise, the name would conflict with the
427434/// `VariantArray : Array` we must eventually define for variant-typed arrow arrays.
428435#[ derive( Clone , Debug , PartialEq ) ]
429436pub struct VariantList < ' m , ' v > {
@@ -443,6 +450,7 @@ impl<'m, 'v> VariantList<'m, 'v> {
443450 /// This constructor verifies that `value` points to a valid variant array value. In particular,
444451 /// that all offsets are in-bounds and point to valid objects.
445452 // TODO: How to make the validation non-recursive while still making iterators safely infallible??
453+ // See https://github.com/apache/arrow-rs/issues/7711
446454 pub fn try_new ( metadata : VariantMetadata < ' m > , value : & ' v [ u8 ] ) -> Result < Self , ArrowError > {
447455 let header_byte = first_byte_from_slice ( value) ?;
448456 let header = VariantListHeader :: try_new ( header_byte) ?;
@@ -536,33 +544,134 @@ impl<'m, 'v> VariantList<'m, 'v> {
536544 }
537545}
538546
539- /// Variant value. May contain references to metadata and value
547+ /// Represents a [Parquet Variant]
548+ ///
549+ /// The lifetimes `'m` and `'v` are for metadata and value buffers, respectively.
550+ ///
551+ /// # Background
552+ ///
553+ /// The [specification] says:
554+ ///
555+ /// The Variant Binary Encoding allows representation of semi-structured data
556+ /// (e.g. JSON) in a form that can be efficiently queried by path. The design is
557+ /// intended to allow efficient access to nested data even in the presence of
558+ /// very wide or deep structures.
559+ ///
560+ /// Another motivation for the representation is that (aside from metadata) each
561+ /// nested Variant value is contiguous and self-contained. For example, in a
562+ /// Variant containing an Array of Variant values, the representation of an
563+ /// inner Variant value, when paired with the metadata of the full variant, is
564+ /// itself a valid Variant.
565+ ///
566+ /// When stored in Parquet files, Variant fields can also be *shredded*. Shredding
567+ /// refers to extracting some elements of the variant into separate columns for
568+ /// more efficient extraction/filter pushdown. The [Variant Shredding
569+ /// specification] describes the details of shredding Variant values as typed
570+ /// Parquet columns.
571+ ///
572+ /// A Variant represents a type that contains one of:
573+ ///
574+ /// * Primitive: A type and corresponding value (e.g. INT, STRING)
575+ ///
576+ /// * Array: An ordered list of Variant values
577+ ///
578+ /// * Object: An unordered collection of string/Variant pairs (i.e. key/value
579+ /// pairs). An object may not contain duplicate keys.
580+ ///
581+ /// # Encoding
582+ ///
583+ /// A Variant is encoded with 2 binary values, the value and the metadata. The
584+ /// metadata stores a header and an optional dictionary of field names which are
585+ /// referred to by offset in the value. The value is a binary representation of
586+ /// the actual data, and varies depending on the type.
587+ ///
588+ /// # Design Goals
589+ ///
590+ /// The design goals of the Rust API are as follows:
591+ /// 1. Speed / Zero copy access (no `clone`ing is required)
592+ /// 2. Safety
593+ /// 3. Follow standard Rust conventions
594+ ///
595+ /// [Parquet Variant]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md
596+ /// [specification]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md
597+ /// [Variant Shredding specification]: https://github.com/apache/parquet-format/blob/master/VariantShredding.md
598+ ///
599+ /// # Examples
600+ ///
601+ /// ## Creating `Variant` from Rust Types
602+ /// ```
603+ /// # use parquet_variant::Variant;
604+ /// // variants can be directly constructed
605+ /// let variant = Variant::Int32(123);
606+ /// // or constructed via `From` impls
607+ /// assert_eq!(variant, Variant::from(123i32));
608+ /// ```
609+ /// ## Creating `Variant` from metadata and value
610+ /// ```
611+ /// # use parquet_variant::{Variant, VariantMetadata};
612+ /// let metadata = [0x01, 0x00, 0x00];
613+ /// let value = [0x09, 0x48, 0x49];
614+ /// // parse the variant from its metadata and value buffers
615+ /// assert_eq!(
616+ /// Variant::ShortString("HI"),
617+ /// Variant::try_new(&metadata, &value).unwrap()
618+ /// );
619+ /// ```
620+ ///
621+ /// ## Using `Variant` values
622+ /// ```
623+ /// # use parquet_variant::Variant;
624+ /// # let variant = Variant::Int32(123);
625+ /// // variants can be used in match statements like normal enums
626+ /// match variant {
627+ /// Variant::Int32(i) => println!("Integer: {}", i),
628+ /// Variant::String(s) => println!("String: {}", s),
629+ /// _ => println!("Other variant"),
630+ /// }
631+ /// ```
540632#[ derive( Clone , Debug , PartialEq ) ]
541633pub enum Variant < ' m , ' v > {
542- // TODO: Add types for the rest of the primitive types, once API is agreed upon
634+ /// Primitive type: Null
543635 Null ,
636+ /// Primitive (basic_type=0): INT(8, SIGNED)
544637 Int8 ( i8 ) ,
638+ /// Primitive (basic_type=0): INT(16, SIGNED)
545639 Int16 ( i16 ) ,
640+ /// Primitive (basic_type=0): INT(32, SIGNED)
546641 Int32 ( i32 ) ,
642+ /// Primitive (basic_type=0): INT(64, SIGNED)
547643 Int64 ( i64 ) ,
644+ /// Primitive (basic_type=0): DATE
548645 Date ( NaiveDate ) ,
646+ /// Primitive (basic_type=0): TIMESTAMP(isAdjustedToUTC=true, MICROS)
549647 TimestampMicros ( DateTime < Utc > ) ,
648+ /// Primitive (basic_type=0): TIMESTAMP(isAdjustedToUTC=false, MICROS)
550649 TimestampNtzMicros ( NaiveDateTime ) ,
650+ /// Primitive (basic_type=0): DECIMAL(precision, scale) 32-bits
551651 Decimal4 { integer : i32 , scale : u8 } ,
652+ /// Primitive (basic_type=0): DECIMAL(precision, scale) 64-bits
552653 Decimal8 { integer : i64 , scale : u8 } ,
654+ /// Primitive (basic_type=0): DECIMAL(precision, scale) 128-bits
553655 Decimal16 { integer : i128 , scale : u8 } ,
656+ /// Primitive (basic_type=0): FLOAT
554657 Float ( f32 ) ,
658+ /// Primitive (basic_type=0): DOUBLE
555659 Double ( f64 ) ,
660+ /// Primitive (basic_type=0): BOOLEAN (true)
556661 BooleanTrue ,
662+ /// Primitive (basic_type=0): BOOLEAN (false)
557663 BooleanFalse ,
558-
559- // Note: only need the *value* buffer
664+ // Note: only need the *value* buffer for these types
665+ /// Primitive (basic_type=0): BINARY
560666 Binary ( & ' v [ u8 ] ) ,
667+ /// Primitive (basic_type=0): STRING
561668 String ( & ' v str ) ,
669+ /// Short String (basic_type=1): STRING
562670 ShortString ( & ' v str ) ,
563-
564671 // need both metadata & value
672+ /// Object (basic_type=2): N/A
565673 Object ( VariantObject < ' m , ' v > ) ,
674+ /// Array (basic_type=3): N/A
566675 List ( VariantList < ' m , ' v > ) ,
567676}
568677
@@ -574,6 +683,7 @@ impl<'m, 'v> Variant<'m, 'v> {
574683 /// # use parquet_variant::{Variant, VariantMetadata};
575684 /// let metadata = [0x01, 0x00, 0x00];
576685 /// let value = [0x09, 0x48, 0x49];
686+ /// // parse the variant from its metadata and value buffers
577687 /// assert_eq!(
578688 /// Variant::ShortString("HI"),
579689 /// Variant::try_new(&metadata, &value).unwrap()
@@ -629,7 +739,6 @@ impl<'m, 'v> Variant<'m, 'v> {
629739 }
630740 VariantPrimitiveType :: BooleanTrue => Variant :: BooleanTrue ,
631741 VariantPrimitiveType :: BooleanFalse => Variant :: BooleanFalse ,
632- // TODO: Add types for the rest, once API is agreed upon
633742 VariantPrimitiveType :: Date => Variant :: Date ( decoder:: decode_date ( value_data) ?) ,
634743 VariantPrimitiveType :: TimestampMicros => {
635744 Variant :: TimestampMicros ( decoder:: decode_timestamp_micros ( value_data) ?)
0 commit comments