-
Notifications
You must be signed in to change notification settings - Fork 1.1k
[Variant] Implement new VariantValueArrayBuilder #8360
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
ab77733
428aae1
97f99da
fb455c0
2238b49
474fa31
5959577
f286c52
a476c72
2126c7a
fa2cded
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -22,8 +22,11 @@ use arrow::array::{ArrayRef, BinaryViewArray, BinaryViewBuilder, NullBufferBuild | |
| use arrow_schema::{ArrowError, DataType, Field, Fields}; | ||
| use parquet_variant::{ | ||
| BuilderSpecificState, ListBuilder, MetadataBuilder, ObjectBuilder, Variant, VariantBuilderExt, | ||
| EMPTY_VARIANT_METADATA, | ||
| }; | ||
| use parquet_variant::{ | ||
| ParentState, ReadOnlyMetadataBuilder, ValueBuilder, WritableMetadataBuilder, | ||
| }; | ||
| use parquet_variant::{ParentState, ValueBuilder, WritableMetadataBuilder}; | ||
| use std::sync::Arc; | ||
|
|
||
| /// A builder for [`VariantArray`] | ||
|
|
@@ -205,6 +208,134 @@ impl VariantBuilderExt for VariantArrayBuilder { | |
| } | ||
| } | ||
|
|
||
| /// A builder for creating only the value column of a [`VariantArray`] | ||
| /// | ||
| /// This builder is used when you have existing metadata and only need to build | ||
| /// the value column. It's useful for scenarios like variant unshredding, data | ||
| /// transformation, or filtering where you want to reuse existing metadata. | ||
| /// | ||
| /// The builder produces a [`BinaryViewArray`] that can be combined with existing | ||
| /// metadata to create a complete [`VariantArray`]. | ||
| /// | ||
| /// # Example: | ||
| /// ``` | ||
| /// # use arrow::array::Array; | ||
| /// # use parquet_variant::{Variant, EMPTY_VARIANT_METADATA}; | ||
| /// # use parquet_variant_compute::VariantValueArrayBuilder; | ||
| /// // Create a variant value builder for 10 rows | ||
| /// let mut builder = VariantValueArrayBuilder::new(10); | ||
| /// | ||
| /// // Append some values with their corresponding metadata | ||
| /// // In practice, you should use the existing metadata you have access to. | ||
| /// builder.append_value(Variant::from(42), EMPTY_VARIANT_METADATA).unwrap(); | ||
| /// builder.append_null(); | ||
| /// builder.append_value(Variant::from("hello"), EMPTY_VARIANT_METADATA).unwrap(); | ||
| /// | ||
| /// // Build the final value array | ||
| /// let value_array = builder.build(); | ||
| /// assert_eq!(value_array.len(), 3); | ||
| /// ``` | ||
| #[derive(Debug)] | ||
| #[allow(unused)] | ||
| pub struct VariantValueArrayBuilder { | ||
| value_builder: ValueBuilder, | ||
| value_offsets: Vec<usize>, | ||
| nulls: NullBufferBuilder, | ||
| } | ||
|
|
||
| #[allow(unused)] | ||
| impl VariantValueArrayBuilder { | ||
| /// Create a new `VariantValueArrayBuilder` with the specified row capacity | ||
| pub fn new(row_capacity: usize) -> Self { | ||
| Self { | ||
| value_builder: ValueBuilder::new(), | ||
| value_offsets: Vec::with_capacity(row_capacity), | ||
| nulls: NullBufferBuilder::new(row_capacity), | ||
| } | ||
| } | ||
|
|
||
| /// Build the final value array | ||
| /// | ||
| /// Returns a [`BinaryViewArray`] containing the serialized variant values. | ||
| /// This can be combined with existing metadata to create a complete [`VariantArray`]. | ||
| pub fn build(mut self) -> Result<BinaryViewArray, ArrowError> { | ||
| let value_buffer = self.value_builder.into_inner(); | ||
| let mut array = binary_view_array_from_buffers(value_buffer, self.value_offsets); | ||
| if let Some(nulls) = self.nulls.finish() { | ||
| let (views, buffers, _) = array.into_parts(); | ||
| array = BinaryViewArray::try_new(views, buffers, Some(nulls))?; | ||
| } | ||
| Ok(array) | ||
| } | ||
|
|
||
| /// Append a null row to the builder | ||
| /// | ||
| /// WARNING: It is only safe to call this method when building the `value` field of a shredded | ||
scovich marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| /// variant column (which is nullable). The `value` field of a binary (unshredded) variant | ||
| /// column is non-nullable, and callers should instead invoke [`Self::append_value`] with | ||
| /// `Variant::Null`, passing the appropriate metadata value. | ||
| pub fn append_null(&mut self) { | ||
| self.value_offsets.push(self.value_builder.offset()); | ||
| self.nulls.append_null(); | ||
| } | ||
|
|
||
| /// Append a variant value with its corresponding metadata | ||
| /// | ||
| /// # Arguments | ||
| /// * `value` - The variant value to append | ||
| /// * `metadata` - The metadata dictionary for this variant (used for field name resolution) | ||
| /// | ||
| /// # Returns | ||
| /// * `Ok(())` if the value was successfully appended | ||
| /// * `Err(ArrowError)` if the variant contains field names not found in the metadata | ||
| /// | ||
| /// # Example | ||
| /// ``` | ||
| /// # use parquet_variant::{Variant, EMPTY_VARIANT_METADATA}; | ||
| /// # use parquet_variant_compute::VariantValueArrayBuilder; | ||
| /// let mut builder = VariantValueArrayBuilder::new(10); | ||
| /// builder.append_value(Variant::from(42), EMPTY_VARIANT_METADATA).unwrap(); | ||
| /// ``` | ||
| pub fn append_value(&mut self, value: Variant<'_, '_>) { | ||
| let metadata = value.metadata().cloned().unwrap_or(EMPTY_VARIANT_METADATA); | ||
|
||
| let mut metadata_builder = ReadOnlyMetadataBuilder::new(metadata); | ||
| ValueBuilder::append_variant_bytes(self.parent_state(&mut metadata_builder), value); | ||
| } | ||
|
|
||
| /// Creates a builder-specific parent state | ||
| pub fn parent_state<'a>( | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. NOTE: This is
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. (also see the updated unit test for a realistic example of how to use it) |
||
| &'a mut self, | ||
| metadata_builder: &'a mut dyn MetadataBuilder, | ||
| ) -> ParentState<'a, ValueArrayBuilderState<'a>> { | ||
| let state = ValueArrayBuilderState { | ||
| value_offsets: &mut self.value_offsets, | ||
| nulls: &mut self.nulls, | ||
| }; | ||
|
|
||
| ParentState::new(&mut self.value_builder, metadata_builder, state) | ||
| } | ||
| } | ||
|
|
||
| /// Builder-specific state for array building that manages array-level offsets and nulls. See | ||
| /// [`VariantBuilderExt`] for details. | ||
| #[derive(Debug)] | ||
| pub struct ValueArrayBuilderState<'a> { | ||
| value_offsets: &'a mut Vec<usize>, | ||
| nulls: &'a mut NullBufferBuilder, | ||
| } | ||
|
|
||
| // All changes are pending until finalized | ||
| impl BuilderSpecificState for ValueArrayBuilderState<'_> { | ||
| fn finish( | ||
| &mut self, | ||
| _metadata_builder: &mut dyn MetadataBuilder, | ||
| value_builder: &mut ValueBuilder, | ||
| ) { | ||
| self.value_offsets.push(value_builder.offset()); | ||
| self.nulls.append_non_null(); | ||
| } | ||
| } | ||
|
|
||
| fn binary_view_array_from_buffers(buffer: Vec<u8>, offsets: Vec<usize>) -> BinaryViewArray { | ||
| // All offsets are less than or equal to the buffer length, so we can safely cast all offsets | ||
| // inside the loop below, as long as the buffer length fits in u32. | ||
|
|
@@ -228,6 +359,7 @@ fn binary_view_array_from_buffers(buffer: Vec<u8>, offsets: Vec<usize>) -> Binar | |
| mod test { | ||
| use super::*; | ||
| use arrow::array::Array; | ||
| use parquet_variant::{Variant, VariantBuilder, VariantMetadata}; | ||
|
|
||
| /// Test that both the metadata and value buffers are non nullable | ||
| #[test] | ||
|
|
@@ -288,4 +420,46 @@ mod test { | |
| let list = variant.as_list().expect("variant to be a list"); | ||
| assert_eq!(list.len(), 2); | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_variant_value_array_builder_basic() { | ||
| let mut builder = VariantValueArrayBuilder::new(10); | ||
|
|
||
| // Append some values | ||
| builder.append_value(Variant::from(42i32)); | ||
| builder.append_null(); | ||
| builder.append_value(Variant::from("hello")); | ||
|
|
||
| let value_array = builder.build().unwrap(); | ||
| assert_eq!(value_array.len(), 3); | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_variant_value_array_builder_with_objects() { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nice test! |
||
| // Create metadata with field names | ||
| let mut metadata_builder = WritableMetadataBuilder::default(); | ||
| metadata_builder.upsert_field_name("name"); | ||
| metadata_builder.upsert_field_name("age"); | ||
| metadata_builder.finish(); | ||
| let metadata_bytes = metadata_builder.into_inner(); | ||
| let metadata = VariantMetadata::try_new(&metadata_bytes).unwrap(); | ||
|
|
||
| // Create a variant with an object using the same metadata | ||
| let mut variant_builder = VariantBuilder::new().with_metadata(metadata); | ||
| variant_builder | ||
| .new_object() | ||
| .with_field("name", "Alice") | ||
| .with_field("age", 30i32) | ||
| .finish(); | ||
| let (_, value_bytes) = variant_builder.finish(); | ||
scovich marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| let variant = Variant::try_new(&metadata_bytes, &value_bytes).unwrap(); | ||
|
|
||
| // Now use the value array builder | ||
| let mut builder = VariantValueArrayBuilder::new(10); | ||
| builder.append_value(variant); | ||
| builder.append_null(); | ||
|
|
||
| let value_array = builder.build().unwrap(); | ||
| assert_eq!(value_array.len(), 2); | ||
scovich marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -562,7 +562,7 @@ pub struct WritableMetadataBuilder { | |
|
|
||
| impl WritableMetadataBuilder { | ||
| /// Upsert field name to dictionary, return its ID | ||
| fn upsert_field_name(&mut self, field_name: &str) -> u32 { | ||
| pub fn upsert_field_name(&mut self, field_name: &str) -> u32 { | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It probably should have been public all along, but now it's needed. |
||
| let (id, new_entry) = self.field_names.insert_full(field_name.to_string()); | ||
|
|
||
| if new_entry { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -17,7 +17,7 @@ | |
|
|
||
| pub use self::decimal::{VariantDecimal16, VariantDecimal4, VariantDecimal8}; | ||
| pub use self::list::VariantList; | ||
| pub use self::metadata::VariantMetadata; | ||
| pub use self::metadata::{VariantMetadata, EMPTY_VARIANT_METADATA, EMPTY_VARIANT_METADATA_BYTES}; | ||
| pub use self::object::VariantObject; | ||
| use crate::decoder::{ | ||
| self, get_basic_type, get_primitive_type, VariantBasicType, VariantPrimitiveType, | ||
|
|
@@ -1320,7 +1320,7 @@ impl<'m, 'v> Variant<'m, 'v> { | |
| /// Return the metadata associated with this variant, if any. | ||
| /// | ||
| /// Returns `Some(&VariantMetadata)` for object and list variants, | ||
| pub fn metadata(&self) -> Option<&'m VariantMetadata<'_>> { | ||
| pub fn metadata(&self) -> Option<&VariantMetadata<'m>> { | ||
|
||
| match self { | ||
| Variant::Object(VariantObject { metadata, .. }) | ||
| | Variant::List(VariantList { metadata, .. }) => Some(metadata), | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
makes sense