Skip to content
176 changes: 175 additions & 1 deletion parquet-variant-compute/src/variant_array_builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,11 @@ use arrow::array::{ArrayRef, BinaryViewArray, BinaryViewBuilder, NullBufferBuild
use arrow_schema::{ArrowError, DataType, Field, Fields};
use parquet_variant::{
BuilderSpecificState, ListBuilder, MetadataBuilder, ObjectBuilder, Variant, VariantBuilderExt,
EMPTY_VARIANT_METADATA,
};
use parquet_variant::{
ParentState, ReadOnlyMetadataBuilder, ValueBuilder, WritableMetadataBuilder,
};
use parquet_variant::{ParentState, ValueBuilder, WritableMetadataBuilder};
use std::sync::Arc;

/// A builder for [`VariantArray`]
Expand Down Expand Up @@ -205,6 +208,134 @@ impl VariantBuilderExt for VariantArrayBuilder {
}
}

/// A builder for creating only the value column of a [`VariantArray`]
///
/// This builder is used when you have existing metadata and only need to build
/// the value column. It's useful for scenarios like variant unshredding, data
/// transformation, or filtering where you want to reuse existing metadata.
///
/// The builder produces a [`BinaryViewArray`] that can be combined with existing
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

makes sense

/// metadata to create a complete [`VariantArray`].
///
/// # Example:
/// ```
/// # use arrow::array::Array;
/// # use parquet_variant::Variant;
/// # use parquet_variant_compute::VariantValueArrayBuilder;
/// // Create a variant value builder for 10 rows
/// let mut builder = VariantValueArrayBuilder::new(10);
///
/// // Append some values. Each variant supplies its own metadata; variants that carry none
/// // (such as the primitives below) are appended against the empty metadata dictionary.
/// builder.append_value(Variant::from(42));
/// builder.append_null();
/// builder.append_value(Variant::from("hello"));
///
/// // Build the final value array (building is fallible, so unwrap the result)
/// let value_array = builder.build().unwrap();
/// assert_eq!(value_array.len(), 3);
/// ```
#[derive(Debug)]
#[allow(unused)]
pub struct VariantValueArrayBuilder {
/// Accumulates the serialized value bytes of every appended row in one buffer.
value_builder: ValueBuilder,
/// One entry per appended row: the value buffer's offset captured when the row was recorded.
value_offsets: Vec<usize>,
/// Tracks which rows were appended as null.
nulls: NullBufferBuilder,
}

#[allow(unused)]
impl VariantValueArrayBuilder {
/// Construct an empty `VariantValueArrayBuilder`, sizing the per-row bookkeeping (offsets and
/// null tracking) for `row_capacity` rows.
pub fn new(row_capacity: usize) -> Self {
    let value_builder = ValueBuilder::new();
    let value_offsets = Vec::with_capacity(row_capacity);
    let nulls = NullBufferBuilder::new(row_capacity);
    Self {
        value_builder,
        value_offsets,
        nulls,
    }
}

/// Consume the builder and produce the value column
///
/// Returns a [`BinaryViewArray`] containing the serialized variant values, which can be paired
/// with existing metadata to form a complete [`VariantArray`]. When the null tracker yields a
/// null buffer, the array is rebuilt from its parts with that buffer attached.
///
/// # Errors
/// Propagates any [`ArrowError`] reported by [`BinaryViewArray::try_new`] while attaching the
/// null buffer.
pub fn build(mut self) -> Result<BinaryViewArray, ArrowError> {
    let array =
        binary_view_array_from_buffers(self.value_builder.into_inner(), self.value_offsets);
    match self.nulls.finish() {
        // Swap in the recorded nulls, discarding whatever null buffer the array had.
        Some(null_buffer) => {
            let (views, buffers, _) = array.into_parts();
            BinaryViewArray::try_new(views, buffers, Some(null_buffer))
        }
        None => Ok(array),
    }
}

/// Record a null row
///
/// WARNING: Only call this when building the `value` field of a shredded variant column, which
/// is nullable. The `value` field of a binary (unshredded) variant column is non-nullable; for
/// such columns, call [`Self::append_value`] with `Variant::Null` instead.
pub fn append_null(&mut self) {
    // Nothing is written to the value buffer here, so the row is recorded at the buffer's
    // current offset.
    let current_offset = self.value_builder.offset();
    self.value_offsets.push(current_offset);
    self.nulls.append_null();
}

/// Append a variant value
///
/// The metadata dictionary used for field name resolution is taken from the variant itself;
/// variants that carry no metadata fall back to [`EMPTY_VARIANT_METADATA`].
///
/// # Arguments
/// * `value` - The variant value to append
///
/// # Example
/// ```
/// # use parquet_variant::Variant;
/// # use parquet_variant_compute::VariantValueArrayBuilder;
/// let mut builder = VariantValueArrayBuilder::new(10);
/// builder.append_value(Variant::from(42));
/// ```
pub fn append_value(&mut self, value: Variant<'_, '_>) {
let metadata = value.metadata().cloned().unwrap_or(EMPTY_VARIANT_METADATA);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe not related to the current PR. Currently, `Variant::metadata()` returns `None` if the variant is not an object or list; should it return `EMPTY_VARIANT_METADATA` instead?

Copy link
Contributor Author

@scovich scovich Sep 18, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's a great question!

Until now, that method was only used in tests, so it wasn't clear what the best semantics might be (plus, we didn't have the constant back then). I like the idea a lot tho -- it would eliminate the unwrap_or (which my next PR also has to use).

let mut metadata_builder = ReadOnlyMetadataBuilder::new(metadata);
ValueBuilder::append_variant_bytes(self.parent_state(&mut metadata_builder), value);
}

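/// Creates a builder-specific parent state that borrows this builder's value buffer, offsets,
/// and nulls together with the supplied `metadata_builder`.
///
/// This is public because `VariantValueArrayBuilder` cannot implement [`VariantBuilderExt`]:
/// the metadata builder is created on demand rather than once up front.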
pub fn parent_state<'a>(
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

NOTE: This is pub to compensate for the fact that VariantValueArrayBuilder cannot impl VariantBuilderExt (because the metadata builder is created on-demand instead of once up front). See PR description for details.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(also see the updated unit test for a realistic example of how to use it)

&'a mut self,
metadata_builder: &'a mut dyn MetadataBuilder,
) -> ParentState<'a, ValueArrayBuilderState<'a>> {
let state = ValueArrayBuilderState {
value_offsets: &mut self.value_offsets,
nulls: &mut self.nulls,
};

ParentState::new(&mut self.value_builder, metadata_builder, state)
}
}

/// Builder-specific state for array building that manages array-level offsets and nulls. See
/// [`VariantBuilderExt`] for details.
///
/// Holds mutable borrows of the owning builder's per-row bookkeeping, so a row is only recorded
/// when [`BuilderSpecificState::finish`] runs.
#[derive(Debug)]
pub struct ValueArrayBuilderState<'a> {
/// Per-row offsets into the value buffer; `finish` pushes the buffer's current offset here.
value_offsets: &'a mut Vec<usize>,
/// Null tracking for the array; `finish` marks the completed row as non-null.
nulls: &'a mut NullBufferBuilder,
}

// Nothing is recorded for a row until it is finalized.
impl BuilderSpecificState for ValueArrayBuilderState<'_> {
    /// Commit the completed row: record the value buffer's current offset and mark the row as
    /// non-null. The metadata builder is not touched.
    fn finish(
        &mut self,
        _metadata_builder: &mut dyn MetadataBuilder,
        value_builder: &mut ValueBuilder,
    ) {
        let current_offset = value_builder.offset();
        self.value_offsets.push(current_offset);
        self.nulls.append_non_null();
    }
}

fn binary_view_array_from_buffers(buffer: Vec<u8>, offsets: Vec<usize>) -> BinaryViewArray {
// All offsets are less than or equal to the buffer length, so we can safely cast all offsets
// inside the loop below, as long as the buffer length fits in u32.
Expand All @@ -228,6 +359,7 @@ fn binary_view_array_from_buffers(buffer: Vec<u8>, offsets: Vec<usize>) -> Binar
mod test {
use super::*;
use arrow::array::Array;
use parquet_variant::{Variant, VariantBuilder, VariantMetadata};

/// Test that both the metadata and value buffers are non nullable
#[test]
Expand Down Expand Up @@ -288,4 +420,46 @@ mod test {
let list = variant.as_list().expect("variant to be a list");
assert_eq!(list.len(), 2);
}

/// Appends two primitive values with a null row in between and checks both the row count and
/// the placement of the null.
#[test]
fn test_variant_value_array_builder_basic() {
    let mut builder = VariantValueArrayBuilder::new(10);

    // Two non-null primitive values with a null row in between
    builder.append_value(Variant::from(42i32));
    builder.append_null();
    builder.append_value(Variant::from("hello"));

    let value_array = builder.build().unwrap();
    assert_eq!(value_array.len(), 3);

    // Checking the length alone would not catch a misplaced or missing null: only the row
    // written via `append_null` may be null, and it must stay at index 1.
    assert_eq!(value_array.null_count(), 1);
    assert!(value_array.is_valid(0));
    assert!(value_array.is_null(1));
    assert!(value_array.is_valid(2));
}

#[test]
fn test_variant_value_array_builder_with_objects() {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice test!

// Create metadata with field names
let mut metadata_builder = WritableMetadataBuilder::default();
metadata_builder.upsert_field_name("name");
metadata_builder.upsert_field_name("age");
metadata_builder.finish();
let metadata_bytes = metadata_builder.into_inner();
let metadata = VariantMetadata::try_new(&metadata_bytes).unwrap();

// Create a variant with an object using the same metadata
let mut variant_builder = VariantBuilder::new().with_metadata(metadata);
variant_builder
.new_object()
.with_field("name", "Alice")
.with_field("age", 30i32)
.finish();
let (_, value_bytes) = variant_builder.finish();
let variant = Variant::try_new(&metadata_bytes, &value_bytes).unwrap();

// Now use the value array builder
let mut builder = VariantValueArrayBuilder::new(10);
builder.append_value(variant);
builder.append_null();

let value_array = builder.build().unwrap();
assert_eq!(value_array.len(), 2);
}
}
17 changes: 9 additions & 8 deletions parquet-variant-compute/src/variant_get.rs
Original file line number Diff line number Diff line change
Expand Up @@ -304,7 +304,7 @@ mod test {
use arrow::buffer::NullBuffer;
use arrow::compute::CastOptions;
use arrow_schema::{DataType, Field, FieldRef, Fields};
use parquet_variant::{Variant, VariantPath};
use parquet_variant::{Variant, VariantPath, EMPTY_VARIANT_METADATA_BYTES};

use crate::json_to_variant;
use crate::variant_array::{ShreddedVariantFieldArray, StructArrayBuilder};
Expand Down Expand Up @@ -701,8 +701,10 @@ mod test {
fn $func() -> ArrayRef {
// At the time of writing, the `VariantArrayBuilder` does not support shredding.
// so we must construct the array manually. see https://github.com/apache/arrow-rs/issues/7895
let (metadata, _value) = { parquet_variant::VariantBuilder::new().finish() };
let metadata = BinaryViewArray::from_iter_values(std::iter::repeat_n(&metadata, 3));
let metadata = BinaryViewArray::from_iter_values(std::iter::repeat_n(
EMPTY_VARIANT_METADATA_BYTES,
3,
));
let typed_value = $array_type::from(vec![
Some(<$primitive_type>::try_from(1u8).unwrap()),
Some(<$primitive_type>::try_from(2u8).unwrap()),
Expand Down Expand Up @@ -1032,16 +1034,15 @@ mod test {
/// }
/// ```
fn all_null_variant_array() -> ArrayRef {
let (metadata, _value) = { parquet_variant::VariantBuilder::new().finish() };

let nulls = NullBuffer::from(vec![
false, // row 0 is null
false, // row 1 is null
false, // row 2 is null
]);

// metadata is the same for all rows (though they're all null)
let metadata = BinaryViewArray::from_iter_values(std::iter::repeat_n(&metadata, 3));
let metadata =
BinaryViewArray::from_iter_values(std::iter::repeat_n(EMPTY_VARIANT_METADATA_BYTES, 3));

let struct_array = StructArrayBuilder::new()
.with_field("metadata", Arc::new(metadata), false)
Expand Down Expand Up @@ -2502,8 +2503,8 @@ mod test {
.build();

// Build final VariantArray with top-level nulls
let (metadata, _) = parquet_variant::VariantBuilder::new().finish();
let metadata_array = BinaryViewArray::from_iter_values(std::iter::repeat_n(&metadata, 4));
let metadata_array =
BinaryViewArray::from_iter_values(std::iter::repeat_n(EMPTY_VARIANT_METADATA_BYTES, 4));
let nulls = NullBuffer::from(vec![
true, // row 0: inner struct exists with typed_value=42
true, // row 1: inner field NULL
Expand Down
2 changes: 1 addition & 1 deletion parquet-variant/src/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -562,7 +562,7 @@ pub struct WritableMetadataBuilder {

impl WritableMetadataBuilder {
/// Upsert field name to dictionary, return its ID
fn upsert_field_name(&mut self, field_name: &str) -> u32 {
pub fn upsert_field_name(&mut self, field_name: &str) -> u32 {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It probably should have been public all along, but now it's needed.

let (id, new_entry) = self.field_names.insert_full(field_name.to_string());

if new_entry {
Expand Down
4 changes: 2 additions & 2 deletions parquet-variant/src/variant.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

pub use self::decimal::{VariantDecimal16, VariantDecimal4, VariantDecimal8};
pub use self::list::VariantList;
pub use self::metadata::VariantMetadata;
pub use self::metadata::{VariantMetadata, EMPTY_VARIANT_METADATA, EMPTY_VARIANT_METADATA_BYTES};
pub use self::object::VariantObject;
use crate::decoder::{
self, get_basic_type, get_primitive_type, VariantBasicType, VariantPrimitiveType,
Expand Down Expand Up @@ -1320,7 +1320,7 @@ impl<'m, 'v> Variant<'m, 'v> {
/// Return the metadata associated with this variant, if any.
///
/// Returns `Some(&VariantMetadata)` for object and list variants, and `None` for all other
/// variants.
pub fn metadata(&self) -> Option<&'m VariantMetadata<'_>> {
pub fn metadata(&self) -> Option<&VariantMetadata<'m>> {
Copy link
Contributor Author

@scovich scovich Sep 17, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This had the wrong lifetime specification, which led to weird borrow checker issues in this PR because the resulting VariantMetadata was tied to the lifetime of self instead of 'm -- even if cloned.

match self {
Variant::Object(VariantObject { metadata, .. })
| Variant::List(VariantList { metadata, .. }) => Some(metadata),
Expand Down
33 changes: 33 additions & 0 deletions parquet-variant/src/variant/metadata.rs
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,39 @@ pub struct VariantMetadata<'m> {
// could increase the size of Variant. All those size increases could hurt performance.
const _: () = crate::utils::expect_size_of::<VariantMetadata>(32);

/// The canonical byte slice corresponding to an empty metadata dictionary.
///
/// The three bytes line up with the fields of [`EMPTY_VARIANT_METADATA`]: a header byte,
/// followed by a one-byte dictionary size of 0 and a single one-byte offset of 0.
/// NOTE(review): byte layout read off the Parquet Variant encoding spec; confirm against the
/// decoder.
///
/// ```
/// # use parquet_variant::{EMPTY_VARIANT_METADATA_BYTES, WritableMetadataBuilder};
/// let mut metadata_builder = WritableMetadataBuilder::default();
/// metadata_builder.finish();
/// let metadata_bytes = metadata_builder.into_inner();
/// assert_eq!(&metadata_bytes, EMPTY_VARIANT_METADATA_BYTES);
/// ```
pub const EMPTY_VARIANT_METADATA_BYTES: &[u8] = &[1, 0, 0];

/// The empty metadata dictionary.
///
/// A pre-built [`VariantMetadata`] over [`EMPTY_VARIANT_METADATA_BYTES`], usable as a constant
/// wherever a variant carries no metadata of its own. The doctest below checks that it equals
/// what parsing a freshly finished, empty `WritableMetadataBuilder` produces.
///
/// ```
/// # use parquet_variant::{EMPTY_VARIANT_METADATA, VariantMetadata, WritableMetadataBuilder};
/// let mut metadata_builder = WritableMetadataBuilder::default();
/// metadata_builder.finish();
/// let metadata_bytes = metadata_builder.into_inner();
/// let empty_metadata = VariantMetadata::try_new(&metadata_bytes).unwrap();
/// assert_eq!(empty_metadata, EMPTY_VARIANT_METADATA);
/// ```
pub const EMPTY_VARIANT_METADATA: VariantMetadata = VariantMetadata {
// Backed by the canonical empty-dictionary encoding.
bytes: EMPTY_VARIANT_METADATA_BYTES,
header: VariantMetadataHeader {
version: CORRECT_VERSION_VALUE,
is_sorted: false,
// Dictionary size and offsets are each encoded in a single byte.
offset_size: OffsetSizeBytes::One,
},
// The dictionary holds no field names.
dictionary_size: 0,
// Equals the length of `EMPTY_VARIANT_METADATA_BYTES`.
// NOTE(review): presumably means no string bytes follow the offsets; confirm.
first_value_byte: 3,
// NOTE(review): presumably marks the constant as already fully validated; confirm.
validated: true,
};

impl<'m> VariantMetadata<'m> {
/// Attempts to interpret `bytes` as a variant metadata instance, with full [validation] of all
/// dictionary entries.
Expand Down
Loading