From 9bfc3726f6231c440cb862133662ca1d76bf41ec Mon Sep 17 00:00:00 2001 From: SuperSerious Dev Date: Thu, 14 Aug 2025 20:17:52 -0700 Subject: [PATCH 1/4] [Variant] Support typed access for numeric types in variant_get --- .../src/variant_get/mod.rs | 478 +++++++++++++----- .../src/variant_get/output/variant.rs | 163 ++++-- parquet-variant/Cargo.toml | 1 + parquet-variant/src/variant.rs | 6 + 4 files changed, 490 insertions(+), 158 deletions(-) diff --git a/parquet-variant-compute/src/variant_get/mod.rs b/parquet-variant-compute/src/variant_get/mod.rs index 4460705cba0b..2fb02909ab97 100644 --- a/parquet-variant-compute/src/variant_get/mod.rs +++ b/parquet-variant-compute/src/variant_get/mod.rs @@ -108,8 +108,9 @@ mod test { use std::sync::Arc; use arrow::array::{ - Array, ArrayRef, BinaryViewArray, Int16Array, Int32Array, PrimitiveArray, StringArray, - StructArray, + Array, ArrayRef, BinaryViewArray, Float16Array, Float32Array, Float64Array, Int16Array, + Int32Array, Int64Array, Int8Array, StringArray, StructArray, UInt16Array, UInt32Array, + UInt64Array, UInt8Array, }; use arrow::buffer::NullBuffer; use arrow::compute::CastOptions; @@ -202,29 +203,95 @@ mod test { ); } - /// Shredding: extract a value as a VariantArray - #[test] - fn get_variant_shredded_int32_as_variant() { - let array = shredded_int32_variant_array(); - let options = GetOptions::new(); - let result = variant_get(&array, options).unwrap(); - - // expect the result is a VariantArray - let result: &VariantArray = result.as_any().downcast_ref().unwrap(); - assert_eq!(result.len(), 4); - - // Expect the values are the same as the original values - assert_eq!(result.value(0), Variant::Int32(34)); - assert!(!result.is_valid(1)); - assert_eq!(result.value(2), Variant::from("n/a")); - assert_eq!(result.value(3), Variant::Int32(100)); + /// Partial Shredding: extract a value as a VariantArray + macro_rules! numeric_partially_shredded_test { + ($test:ident, $data_fn:ident, $primitive_type:ty) => { + #[test] + fn $test() { + let array = $data_fn(); + let options = GetOptions::new(); + let result = variant_get(&array, options).unwrap(); + + // expect the result is a VariantArray + let result: &VariantArray = result.as_any().downcast_ref().unwrap(); + assert_eq!(result.len(), 4); + + // Expect the values are the same as the original values + assert_eq!( + result.value(0), + Variant::from(<$primitive_type>::try_from(34u8).unwrap()) + ); + assert!(!result.is_valid(1)); + assert_eq!(result.value(2), Variant::from("n/a")); + assert_eq!( + result.value(3), + Variant::from(<$primitive_type>::try_from(100u8).unwrap()) + ); + } + }; } + numeric_partially_shredded_test!( + get_variant_partially_shredded_int8_as_variant, + partially_shredded_int8_variant_array, + i8 + ); + numeric_partially_shredded_test!( + get_variant_partially_shredded_int16_as_variant, + partially_shredded_int16_variant_array, + i16 + ); + numeric_partially_shredded_test!( + get_variant_partially_shredded_int32_as_variant, + partially_shredded_int32_variant_array, + i32 + ); + numeric_partially_shredded_test!( + get_variant_partially_shredded_int64_as_variant, + partially_shredded_int64_variant_array, + i64 + ); + numeric_partially_shredded_test!( + get_variant_partially_shredded_uint8_as_variant, + partially_shredded_uint8_variant_array, + u8 + ); + numeric_partially_shredded_test!( + get_variant_partially_shredded_uint16_as_variant, + partially_shredded_uint16_variant_array, + u16 + ); + numeric_partially_shredded_test!( + get_variant_partially_shredded_uint32_as_variant, + partially_shredded_uint32_variant_array, + u32 + ); + numeric_partially_shredded_test!( + get_variant_partially_shredded_uint64_as_variant, + partially_shredded_uint64_variant_array, + u64 + ); + numeric_partially_shredded_test!( + get_variant_partially_shredded_float16_as_variant, + partially_shredded_float16_variant_array, + half::f16 + ); + numeric_partially_shredded_test!( + get_variant_partially_shredded_float32_as_variant, + partially_shredded_float32_variant_array, + f32 + ); + numeric_partially_shredded_test!( + get_variant_partially_shredded_float64_as_variant, + partially_shredded_float64_variant_array, + f64 + ); + /// Shredding: extract a value as an Int32Array #[test] fn get_variant_shredded_int32_as_int32_safe_cast() { // Extract the typed value as Int32Array - let array = shredded_int32_variant_array(); + let array = partially_shredded_int32_variant_array(); // specify we want the typed value as Int32 let field = Field::new("typed_value", DataType::Int32, true); let options = GetOptions::new().with_as_type(Some(FieldRef::from(field))); @@ -243,7 +310,7 @@ mod test { #[test] fn get_variant_shredded_int32_as_int32_unsafe_cast() { // Extract the typed value as Int32Array - let array = shredded_int32_variant_array(); + let array = partially_shredded_int32_variant_array(); let field = Field::new("typed_value", DataType::Int32, true); let cast_options = CastOptions { safe: false, // unsafe cast @@ -259,29 +326,96 @@ mod test { } /// Perfect Shredding: extract the typed value as a VariantArray - #[test] - fn get_variant_perfectly_shredded_int32_as_variant() { - let array = - perfectly_shredded_variant_array(Int32Array::from(vec![Some(1), Some(2), Some(3)])); - let options = GetOptions::new(); - let result = variant_get(&array, options).unwrap(); - - // expect the result is a VariantArray - let result: &VariantArray = result.as_any().downcast_ref().unwrap(); - assert_eq!(result.len(), 3); - - // Expect the values are the same as the original values - assert_eq!(result.value(0), Variant::Int32(1)); - assert_eq!(result.value(1), Variant::Int32(2)); - assert_eq!(result.value(2), Variant::Int32(3)); + macro_rules! numeric_perfectly_shredded_test { + ($test:ident, $data_fn:ident, $primitive_type:ty) => { + #[test] + fn $test() { + let array = $data_fn(); + let options = GetOptions::new(); + let result = variant_get(&array, options).unwrap(); + + // expect the result is a VariantArray + let result: &VariantArray = result.as_any().downcast_ref().unwrap(); + assert_eq!(result.len(), 3); + + // Expect the values are the same as the original values + assert_eq!( + result.value(0), + Variant::from(<$primitive_type>::try_from(1u8).unwrap()) + ); + assert_eq!( + result.value(1), + Variant::from(<$primitive_type>::try_from(2u8).unwrap()) + ); + assert_eq!( + result.value(2), + Variant::from(<$primitive_type>::try_from(3u8).unwrap()) + ); + } + }; } + numeric_perfectly_shredded_test!( + get_variant_perfectly_shredded_int8_as_variant, + perfectly_shredded_int8_variant_array, + i8 + ); + numeric_perfectly_shredded_test!( + get_variant_perfectly_shredded_int16_as_variant, + perfectly_shredded_int16_variant_array, + i16 + ); + numeric_perfectly_shredded_test!( + get_variant_perfectly_shredded_int32_as_variant, + perfectly_shredded_int32_variant_array, + i32 + ); + numeric_perfectly_shredded_test!( + get_variant_perfectly_shredded_int64_as_variant, + perfectly_shredded_int64_variant_array, + i64 + ); + numeric_perfectly_shredded_test!( + get_variant_perfectly_shredded_uint8_as_variant, + perfectly_shredded_uint8_variant_array, + u8 + ); + numeric_perfectly_shredded_test!( + get_variant_perfectly_shredded_uint16_as_variant, + perfectly_shredded_uint16_variant_array, + u16 + ); + numeric_perfectly_shredded_test!( + get_variant_perfectly_shredded_uint32_as_variant, + perfectly_shredded_uint32_variant_array, + u32 + ); + numeric_perfectly_shredded_test!( + get_variant_perfectly_shredded_uint64_as_variant, + perfectly_shredded_uint64_variant_array, + u64 + ); + numeric_perfectly_shredded_test!( + get_variant_perfectly_shredded_float16_as_variant, + perfectly_shredded_float16_variant_array, + half::f16 + ); + numeric_perfectly_shredded_test!( + get_variant_perfectly_shredded_float32_as_variant, + perfectly_shredded_float32_variant_array, + f32 + ); + numeric_perfectly_shredded_test!( + get_variant_perfectly_shredded_float64_as_variant, + perfectly_shredded_float64_variant_array, + f64 + ); + /// Shredding: Extract the typed value as Int32Array #[test] fn get_variant_perfectly_shredded_int32_as_int32() { // Extract the typed value as Int32Array - let array = - perfectly_shredded_variant_array(Int32Array::from(vec![Some(1), Some(2), Some(3)])); + let array = perfectly_shredded_int32_variant_array(); // specify we want the typed value as Int32 let field = Field::new("typed_value", DataType::Int32, true); let options = GetOptions::new().with_as_type(Some(FieldRef::from(field))); @@ -324,28 +458,10 @@ mod test { assert_eq!(&result, &expected) } - #[test] - fn get_variant_perfectly_shredded_int16_as_variant() { - let array = - perfectly_shredded_variant_array(Int16Array::from(vec![Some(1), Some(2), Some(3)])); - let options = GetOptions::new(); - let result = variant_get(&array, options).unwrap(); - - // expect the result is a VariantArray - let result: &VariantArray = result.as_any().downcast_ref().unwrap(); - assert_eq!(result.len(), 3); - - // Expect the values are the same as the original values - assert_eq!(result.value(0), Variant::Int16(1)); - assert_eq!(result.value(1), Variant::Int16(2)); - assert_eq!(result.value(2), Variant::Int16(3)); - } - #[test] fn get_variant_perfectly_shredded_int16_as_int16() { // Extract the typed value as Int16Array - let array = - perfectly_shredded_variant_array(Int16Array::from(vec![Some(1), Some(2), Some(3)])); + let array = perfectly_shredded_int16_variant_array(); // specify we want the typed value as Int16 let field = Field::new("typed_value", DataType::Int16, true); let options = GetOptions::new().with_as_type(Some(FieldRef::from(field))); @@ -365,27 +481,88 @@ mod test { /// typed_value: Int32Array, /// } /// ``` - fn perfectly_shredded_variant_array(typed_value: PrimitiveArray) -> ArrayRef - where - T: arrow::datatypes::ArrowPrimitiveType, - { - // At the time of writing, the `VariantArrayBuilder` does not support shredding. - // so we must construct the array manually. see https://github.com/apache/arrow-rs/issues/7895 - let (metadata, _value) = { parquet_variant::VariantBuilder::new().finish() }; - - let metadata = - BinaryViewArray::from_iter_values(std::iter::repeat_n(&metadata, typed_value.len())); - - let struct_array = StructArrayBuilder::new() - .with_field("metadata", Arc::new(metadata)) - .with_field("typed_value", Arc::new(typed_value)) - .build(); - - Arc::new( - VariantArray::try_new(Arc::new(struct_array)).expect("should create variant array"), - ) + macro_rules! numeric_perfectly_shredded_variant_array_fn { + ($func:ident, $array_type:ident, $primitive_type:ty) => { + fn $func() -> ArrayRef { + // At the time of writing, the `VariantArrayBuilder` does not support shredding. + // so we must construct the array manually. see https://github.com/apache/arrow-rs/issues/7895 + let (metadata, _value) = { parquet_variant::VariantBuilder::new().finish() }; + let metadata = BinaryViewArray::from_iter_values(std::iter::repeat_n(&metadata, 3)); + let typed_value = $array_type::from(vec![ + Some(<$primitive_type>::try_from(1u8).unwrap()), + Some(<$primitive_type>::try_from(2u8).unwrap()), + Some(<$primitive_type>::try_from(3u8).unwrap()), + ]); + + let struct_array = StructArrayBuilder::new() + .with_field("metadata", Arc::new(metadata)) + .with_field("typed_value", Arc::new(typed_value)) + .build(); + + Arc::new( + VariantArray::try_new(Arc::new(struct_array)) + .expect("should create variant array"), + ) + } + }; } + numeric_perfectly_shredded_variant_array_fn!( + perfectly_shredded_int8_variant_array, + Int8Array, + i8 + ); + numeric_perfectly_shredded_variant_array_fn!( + perfectly_shredded_int16_variant_array, + Int16Array, + i16 + ); + numeric_perfectly_shredded_variant_array_fn!( + perfectly_shredded_int32_variant_array, + Int32Array, + i32 + ); + numeric_perfectly_shredded_variant_array_fn!( + perfectly_shredded_int64_variant_array, + Int64Array, + i64 + ); + numeric_perfectly_shredded_variant_array_fn!( + perfectly_shredded_uint8_variant_array, + UInt8Array, + u8 + ); + numeric_perfectly_shredded_variant_array_fn!( + perfectly_shredded_uint16_variant_array, + UInt16Array, + u16 + ); + numeric_perfectly_shredded_variant_array_fn!( + perfectly_shredded_uint32_variant_array, + UInt32Array, + u32 + ); + numeric_perfectly_shredded_variant_array_fn!( + perfectly_shredded_uint64_variant_array, + UInt64Array, + u64 + ); + numeric_perfectly_shredded_variant_array_fn!( + perfectly_shredded_float16_variant_array, + Float16Array, + half::f16 + ); + numeric_perfectly_shredded_variant_array_fn!( + perfectly_shredded_float32_variant_array, + Float32Array, + f32 + ); + numeric_perfectly_shredded_variant_array_fn!( + perfectly_shredded_float64_variant_array, + Float64Array, + f64 + ); + /// Return a VariantArray that represents a normal "shredded" variant /// for the following example /// @@ -409,53 +586,114 @@ mod test { /// typed_value: Int32Array, /// } /// ``` - fn shredded_int32_variant_array() -> ArrayRef { - // At the time of writing, the `VariantArrayBuilder` does not support shredding. - // so we must construct the array manually. see https://github.com/apache/arrow-rs/issues/7895 - let (metadata, string_value) = { - let mut builder = parquet_variant::VariantBuilder::new(); - builder.append_value("n/a"); - builder.finish() + macro_rules! numeric_partially_shredded_variant_array_fn { + ($func:ident, $array_type:ident, $primitive_type:ty) => { + fn $func() -> ArrayRef { + // At the time of writing, the `VariantArrayBuilder` does not support shredding. + // so we must construct the array manually. see https://github.com/apache/arrow-rs/issues/7895 + let (metadata, string_value) = { + let mut builder = parquet_variant::VariantBuilder::new(); + builder.append_value("n/a"); + builder.finish() + }; + + let nulls = NullBuffer::from(vec![ + true, // row 0 non null + false, // row 1 is null + true, // row 2 non null + true, // row 3 non null + ]); + + // metadata is the same for all rows + let metadata = BinaryViewArray::from_iter_values(std::iter::repeat_n(&metadata, 4)); + + // See https://docs.google.com/document/d/1pw0AWoMQY3SjD7R4LgbPvMjG_xSCtXp3rZHkVp9jpZ4/edit?disco=AAABml8WQrY + // about why row1 is an empty but non null, value. + let values = BinaryViewArray::from(vec![ + None, // row 0 is shredded, so no value + Some(b"" as &[u8]), // row 1 is null, so empty value (why?) + Some(&string_value), // copy the string value "N/A" + None, // row 3 is shredded, so no value + ]); + + let typed_value = $array_type::from(vec![ + Some(<$primitive_type>::try_from(34u8).unwrap()), // row 0 is shredded, so it has a value + None, // row 1 is null, so no value + None, // row 2 is a string, so no typed value + Some(<$primitive_type>::try_from(100u8).unwrap()), // row 3 is shredded, so it has a value + ]); + + let struct_array = StructArrayBuilder::new() + .with_field("metadata", Arc::new(metadata)) + .with_field("typed_value", Arc::new(typed_value)) + .with_field("value", Arc::new(values)) + .with_nulls(nulls) + .build(); + + Arc::new( + VariantArray::try_new(Arc::new(struct_array)) + .expect("should create variant array"), + ) + } }; - - let nulls = NullBuffer::from(vec![ - true, // row 0 non null - false, // row 1 is null - true, // row 2 non null - true, // row 3 non null - ]); - - // metadata is the same for all rows - let metadata = BinaryViewArray::from_iter_values(std::iter::repeat_n(&metadata, 4)); - - // See https://docs.google.com/document/d/1pw0AWoMQY3SjD7R4LgbPvMjG_xSCtXp3rZHkVp9jpZ4/edit?disco=AAABml8WQrY - // about why row1 is an empty but non null, value. - let values = BinaryViewArray::from(vec![ - None, // row 0 is shredded, so no value - Some(b"" as &[u8]), // row 1 is null, so empty value (why?) - Some(&string_value), // copy the string value "N/A" - None, // row 3 is shredded, so no value - ]); - - let typed_value = Int32Array::from(vec![ - Some(34), // row 0 is shredded, so it has a value - None, // row 1 is null, so no value - None, // row 2 is a string, so no typed value - Some(100), // row 3 is shredded, so it has a value - ]); - - let struct_array = StructArrayBuilder::new() - .with_field("metadata", Arc::new(metadata)) - .with_field("typed_value", Arc::new(typed_value)) - .with_field("value", Arc::new(values)) - .with_nulls(nulls) - .build(); - - Arc::new( - VariantArray::try_new(Arc::new(struct_array)).expect("should create variant array"), - ) } + numeric_partially_shredded_variant_array_fn!( + partially_shredded_int8_variant_array, + Int8Array, + i8 + ); + numeric_partially_shredded_variant_array_fn!( + partially_shredded_int16_variant_array, + Int16Array, + i16 + ); + numeric_partially_shredded_variant_array_fn!( + partially_shredded_int32_variant_array, + Int32Array, + i32 + ); + numeric_partially_shredded_variant_array_fn!( + partially_shredded_int64_variant_array, + Int64Array, + i64 + ); + numeric_partially_shredded_variant_array_fn!( + partially_shredded_uint8_variant_array, + UInt8Array, + u8 + ); + numeric_partially_shredded_variant_array_fn!( + partially_shredded_uint16_variant_array, + UInt16Array, + u16 + ); + numeric_partially_shredded_variant_array_fn!( + partially_shredded_uint32_variant_array, + UInt32Array, + u32 + ); + numeric_partially_shredded_variant_array_fn!( + partially_shredded_uint64_variant_array, + UInt64Array, + u64 + ); + numeric_partially_shredded_variant_array_fn!( + partially_shredded_float16_variant_array, + Float16Array, + half::f16 + ); + numeric_partially_shredded_variant_array_fn!( + partially_shredded_float32_variant_array, + Float32Array, + f32 + ); + numeric_partially_shredded_variant_array_fn!( + partially_shredded_float64_variant_array, + Float64Array, + f64 + ); + /// Builds struct arrays from component fields /// /// TODO: move to arrow crate @@ -500,7 +738,7 @@ mod test { /// /// ```text /// null - /// null + /// null /// null /// ``` /// diff --git a/parquet-variant-compute/src/variant_get/output/variant.rs b/parquet-variant-compute/src/variant_get/output/variant.rs index 203fab233b02..e9d325e6d955 100644 --- a/parquet-variant-compute/src/variant_get/output/variant.rs +++ b/parquet-variant-compute/src/variant_get/output/variant.rs @@ -18,11 +18,36 @@ use crate::variant_get::output::OutputBuilder; use crate::{type_conversion::primitive_conversion_array, VariantArray, VariantArrayBuilder}; use arrow::array::{Array, ArrayRef, AsArray, BinaryViewArray}; -use arrow::datatypes::{Int16Type, Int32Type}; +use arrow::datatypes::{ + Float16Type, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, UInt16Type, + UInt32Type, UInt64Type, UInt8Type, +}; use arrow_schema::{ArrowError, DataType}; use parquet_variant::{Variant, VariantPath}; use std::sync::Arc; +macro_rules! cast_partially_shredded_primitive { + ($typed_value:expr, $variant_array:expr, $arrow_type:ty, $data_type:expr) => {{ + let mut array_builder = VariantArrayBuilder::new($variant_array.len()); + let primitive_array = $typed_value.as_primitive::<$arrow_type>(); + for i in 0..$variant_array.len() { + if $variant_array.is_null(i) { + array_builder.append_null(); + } else if $typed_value.is_null(i) { + // fall back to the value (variant) field + // (TODO could copy the variant bytes directly) + let value = $variant_array.value(i); + array_builder.append_variant(value); + } else { + // otherwise we have a typed value, so we can use it directly + let value = primitive_array.value(i); + array_builder.append_variant(Variant::from(value)); + } + } + Ok(Arc::new(array_builder.build())) + }}; +} + /// Outputs VariantArrays pub(super) struct VariantOutputBuilder<'a> { /// What path to extract @@ -44,40 +69,92 @@ impl OutputBuilder for VariantOutputBuilder<'_> { _value_field: &BinaryViewArray, typed_value: &ArrayRef, ) -> arrow::error::Result { - // in this case dispatch on the typed_value and - // TODO macro'ize this using downcast! to handle all other primitive types // TODO(perf): avoid builders entirely (and write the raw variant directly as we know the metadata is the same) - let mut array_builder = VariantArrayBuilder::new(variant_array.len()); match typed_value.data_type() { - DataType::Int32 => { - let primitive_array = typed_value.as_primitive::(); - for i in 0..variant_array.len() { - if variant_array.is_null(i) { - array_builder.append_null(); - continue; - } - - if typed_value.is_null(i) { - // fall back to the value (variant) field - // (TODO could copy the variant bytes directly) - let value = variant_array.value(i); - array_builder.append_variant(value); - continue; - } - - // otherwise we have a typed value, so we can use it directly - let int_value = primitive_array.value(i); - array_builder.append_variant(Variant::from(int_value)); - } - } + DataType::Int8 => cast_partially_shredded_primitive!( + typed_value, + variant_array, + Int8Type, + DataType::Int8 + ), + + DataType::Int16 => cast_partially_shredded_primitive!( + typed_value, + variant_array, + Int16Type, + DataType::Int16 + ), + + DataType::Int32 => cast_partially_shredded_primitive!( + typed_value, + variant_array, + Int32Type, + DataType::Int32 + ), + + DataType::Int64 => cast_partially_shredded_primitive!( + typed_value, + variant_array, + Int64Type, + DataType::Int64 + ), + + DataType::UInt8 => cast_partially_shredded_primitive!( + typed_value, + variant_array, + UInt8Type, + DataType::UInt8 + ), + + DataType::UInt16 => cast_partially_shredded_primitive!( + typed_value, + variant_array, + UInt16Type, + DataType::Int16 + ), + + DataType::UInt32 => cast_partially_shredded_primitive!( + typed_value, + variant_array, + UInt32Type, + DataType::UInt32 + ), + + DataType::UInt64 => cast_partially_shredded_primitive!( + typed_value, + variant_array, + UInt64Type, + DataType::UInt64 + ), + + DataType::Float16 => cast_partially_shredded_primitive!( + typed_value, + variant_array, + Float16Type, + DataType::Float16 + ), + + DataType::Float32 => cast_partially_shredded_primitive!( + typed_value, + variant_array, + Float32Type, + DataType::Float32 + ), + + DataType::Float64 => cast_partially_shredded_primitive!( + typed_value, + variant_array, + Float64Type, + DataType::Float64 + ), + dt => { // https://github.com/apache/arrow-rs/issues/8086 - return Err(ArrowError::NotYetImplemented(format!( - "variant_get fully_shredded with typed_value={dt} is not implemented yet", - ))); + Err(ArrowError::NotYetImplemented(format!( + "variant_get partially shredded with typed_value={dt} is not implemented yet", + ))) } - }; - Ok(Arc::new(array_builder.build())) + } } fn typed( @@ -87,24 +164,34 @@ impl OutputBuilder for VariantOutputBuilder<'_> { _metadata: &BinaryViewArray, typed_value: &ArrayRef, ) -> arrow::error::Result { - // in this case dispatch on the typed_value and - // TODO macro'ize this using downcast! to handle all other primitive types // TODO(perf): avoid builders entirely (and write the raw variant directly as we know the metadata is the same) let mut array_builder = VariantArrayBuilder::new(variant_array.len()); match typed_value.data_type() { - DataType::Int32 => { - primitive_conversion_array!(Int32Type, typed_value, array_builder); + DataType::Int8 => primitive_conversion_array!(Int8Type, typed_value, array_builder), + DataType::Int16 => primitive_conversion_array!(Int16Type, typed_value, array_builder), + DataType::Int32 => primitive_conversion_array!(Int32Type, typed_value, array_builder), + DataType::Int64 => primitive_conversion_array!(Int64Type, typed_value, array_builder), + DataType::UInt8 => primitive_conversion_array!(UInt8Type, typed_value, array_builder), + DataType::UInt16 => primitive_conversion_array!(UInt16Type, typed_value, array_builder), + DataType::UInt32 => primitive_conversion_array!(UInt32Type, typed_value, array_builder), + DataType::UInt64 => primitive_conversion_array!(UInt64Type, typed_value, array_builder), + DataType::Float16 => { + primitive_conversion_array!(Float16Type, typed_value, array_builder) } - DataType::Int16 => { - primitive_conversion_array!(Int16Type, typed_value, array_builder); + DataType::Float32 => { + primitive_conversion_array!(Float32Type, typed_value, array_builder) } + DataType::Float64 => { + primitive_conversion_array!(Float64Type, typed_value, array_builder) + } + dt => { // https://github.com/apache/arrow-rs/issues/8087 return Err(ArrowError::NotYetImplemented(format!( - "variant_get fully_shredded with typed_value={dt} is not implemented yet", + "variant_get perfectly shredded with typed_value={dt} is not implemented yet", ))); } - }; + } Ok(Arc::new(array_builder.build())) } diff --git a/parquet-variant/Cargo.toml b/parquet-variant/Cargo.toml index a4d4792e09f5..6e88bff6bd3a 100644 --- a/parquet-variant/Cargo.toml +++ b/parquet-variant/Cargo.toml @@ -33,6 +33,7 @@ rust-version = { workspace = true } [dependencies] arrow-schema = { workspace = true } chrono = { workspace = true } +half = { version = "2.1", default-features = false } indexmap = "2.10.0" uuid = { version = "1.18.0", features = ["v4"]} diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index 64458c669eed..a36752bf9ca0 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -1278,6 +1278,12 @@ impl From for Variant<'_, '_> { } } +impl From for Variant<'_, '_> { + fn from(value: half::f16) -> Self { + Variant::Float(value.into()) + } +} + impl From for Variant<'_, '_> { fn from(value: f32) -> Self { Variant::Float(value) From 49390f454def24ea005d384ea3c1b8b946a7fb29 Mon Sep 17 00:00:00 2001 From: SuperSerious Dev Date: Thu, 21 Aug 2025 17:24:16 -0700 Subject: [PATCH 2/4] Fix formatting --- parquet-variant-compute/src/variant_get/output/variant.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/parquet-variant-compute/src/variant_get/output/variant.rs b/parquet-variant-compute/src/variant_get/output/variant.rs index e9d325e6d955..374e2d829df3 100644 --- a/parquet-variant-compute/src/variant_get/output/variant.rs +++ b/parquet-variant-compute/src/variant_get/output/variant.rs @@ -184,7 +184,6 @@ impl OutputBuilder for VariantOutputBuilder<'_> { DataType::Float64 => { primitive_conversion_array!(Float64Type, typed_value, array_builder) } - dt => { // https://github.com/apache/arrow-rs/issues/8087 return Err(ArrowError::NotYetImplemented(format!( From df497605c39200f19c486a0e3acf1868117d6b09 Mon Sep 17 00:00:00 2001 From: SuperSerious Dev Date: Wed, 27 Aug 2025 17:44:19 -0700 Subject: [PATCH 3/4] Remove unnecessary macro arg --- .../src/variant_get/output/variant.rs | 121 +++++++----------- 1 file changed, 44 insertions(+), 77 deletions(-) diff --git a/parquet-variant-compute/src/variant_get/output/variant.rs b/parquet-variant-compute/src/variant_get/output/variant.rs index 374e2d829df3..8a1fe8335fde 100644 --- a/parquet-variant-compute/src/variant_get/output/variant.rs +++ b/parquet-variant-compute/src/variant_get/output/variant.rs @@ -27,7 +27,7 @@ use parquet_variant::{Variant, VariantPath}; use std::sync::Arc; macro_rules! cast_partially_shredded_primitive { - ($typed_value:expr, $variant_array:expr, $arrow_type:ty, $data_type:expr) => {{ + ($typed_value:expr, $variant_array:expr, $arrow_type:ty) => {{ let mut array_builder = VariantArrayBuilder::new($variant_array.len()); let primitive_array = $typed_value.as_primitive::<$arrow_type>(); for i in 0..$variant_array.len() { @@ -71,82 +71,49 @@ impl OutputBuilder for VariantOutputBuilder<'_> { ) -> arrow::error::Result { // TODO(perf): avoid builders entirely (and write the raw variant directly as we know the metadata is the same) match typed_value.data_type() { - DataType::Int8 => cast_partially_shredded_primitive!( - typed_value, - variant_array, - Int8Type, - DataType::Int8 - ), - - DataType::Int16 => cast_partially_shredded_primitive!( - typed_value, - variant_array, - Int16Type, - DataType::Int16 - ), - - DataType::Int32 => cast_partially_shredded_primitive!( - typed_value, - variant_array, - Int32Type, - DataType::Int32 - ), - - DataType::Int64 => cast_partially_shredded_primitive!( - typed_value, - variant_array, - Int64Type, - DataType::Int64 - ), - - DataType::UInt8 => cast_partially_shredded_primitive!( - typed_value, - variant_array, - UInt8Type, - DataType::UInt8 - ), - - DataType::UInt16 => cast_partially_shredded_primitive!( - typed_value, - variant_array, - UInt16Type, - DataType::Int16 - ), - - DataType::UInt32 => cast_partially_shredded_primitive!( - typed_value, - variant_array, - UInt32Type, - DataType::UInt32 - ), - - DataType::UInt64 => cast_partially_shredded_primitive!( - typed_value, - variant_array, - UInt64Type, - DataType::UInt64 - ), - - DataType::Float16 => cast_partially_shredded_primitive!( - typed_value, - variant_array, - Float16Type, - DataType::Float16 - ), - - DataType::Float32 => cast_partially_shredded_primitive!( - typed_value, - variant_array, - Float32Type, - DataType::Float32 - ), - - DataType::Float64 => cast_partially_shredded_primitive!( - typed_value, - variant_array, - Float64Type, - DataType::Float64 - ), + DataType::Int8 => { + cast_partially_shredded_primitive!(typed_value, variant_array, Int8Type) + } + + DataType::Int16 => { + cast_partially_shredded_primitive!(typed_value, variant_array, Int16Type) + } + + DataType::Int32 => { + cast_partially_shredded_primitive!(typed_value, variant_array, Int32Type) + } + + DataType::Int64 => { + cast_partially_shredded_primitive!(typed_value, variant_array, Int64Type) + } + + DataType::UInt8 => { + cast_partially_shredded_primitive!(typed_value, variant_array, UInt8Type) + } + + DataType::UInt16 => { + cast_partially_shredded_primitive!(typed_value, variant_array, UInt16Type) + } + + DataType::UInt32 => { + cast_partially_shredded_primitive!(typed_value, variant_array, UInt32Type) + } + + DataType::UInt64 => { + cast_partially_shredded_primitive!(typed_value, variant_array, UInt64Type) + } + + DataType::Float16 => { + cast_partially_shredded_primitive!(typed_value, variant_array, Float16Type) + } + + DataType::Float32 => { + cast_partially_shredded_primitive!(typed_value, variant_array, Float32Type) + } + + DataType::Float64 => { + cast_partially_shredded_primitive!(typed_value, variant_array, Float64Type) + } dt => { // https://github.com/apache/arrow-rs/issues/8086 From e3fefd1252e1619fc5b0984298178257efca2cad Mon Sep 17 00:00:00 2001 From: SuperSerious Dev Date: Wed, 27 Aug 2025 18:05:03 -0700 Subject: [PATCH 4/4] Improve readability of tests --- .../src/variant_get/mod.rs | 308 +++++++++--------- 1 file changed, 150 insertions(+), 158 deletions(-) diff --git a/parquet-variant-compute/src/variant_get/mod.rs b/parquet-variant-compute/src/variant_get/mod.rs index 2fb02909ab97..585c4462c37b 100644 --- a/parquet-variant-compute/src/variant_get/mod.rs +++ b/parquet-variant-compute/src/variant_get/mod.rs @@ -205,87 +205,83 @@ mod test { /// Partial Shredding: extract a value as a VariantArray macro_rules! numeric_partially_shredded_test { - ($test:ident, $data_fn:ident, $primitive_type:ty) => { - #[test] - fn $test() { - let array = $data_fn(); - let options = GetOptions::new(); - let result = variant_get(&array, options).unwrap(); - - // expect the result is a VariantArray - let result: &VariantArray = result.as_any().downcast_ref().unwrap(); - assert_eq!(result.len(), 4); - - // Expect the values are the same as the original values - assert_eq!( - result.value(0), - Variant::from(<$primitive_type>::try_from(34u8).unwrap()) - ); - assert!(!result.is_valid(1)); - assert_eq!(result.value(2), Variant::from("n/a")); - assert_eq!( - result.value(3), - Variant::from(<$primitive_type>::try_from(100u8).unwrap()) - ); - } + ($primitive_type:ty, $data_fn:ident) => { + let array = $data_fn(); + let options = GetOptions::new(); + let result = variant_get(&array, options).unwrap(); + + // expect the result is a VariantArray + let result: &VariantArray = result.as_any().downcast_ref().unwrap(); + assert_eq!(result.len(), 4); + + // Expect the values are the same as the original values + assert_eq!( + result.value(0), + Variant::from(<$primitive_type>::try_from(34u8).unwrap()) + ); + assert!(!result.is_valid(1)); + assert_eq!(result.value(2), Variant::from("n/a")); + assert_eq!( + result.value(3), + Variant::from(<$primitive_type>::try_from(100u8).unwrap()) + ); }; } - numeric_partially_shredded_test!( - get_variant_partially_shredded_int8_as_variant, - partially_shredded_int8_variant_array, - i8 - ); - numeric_partially_shredded_test!( - get_variant_partially_shredded_int16_as_variant, - partially_shredded_int16_variant_array, - i16 - ); - numeric_partially_shredded_test!( - get_variant_partially_shredded_int32_as_variant, - partially_shredded_int32_variant_array, - i32 - ); - numeric_partially_shredded_test!( - get_variant_partially_shredded_int64_as_variant, - partially_shredded_int64_variant_array, - i64 - ); - numeric_partially_shredded_test!( - get_variant_partially_shredded_uint8_as_variant, - partially_shredded_uint8_variant_array, - u8 - ); - numeric_partially_shredded_test!( - get_variant_partially_shredded_uint16_as_variant, - partially_shredded_uint16_variant_array, - u16 - ); - numeric_partially_shredded_test!( - get_variant_partially_shredded_uint32_as_variant, - partially_shredded_uint32_variant_array, - u32 - ); - numeric_partially_shredded_test!( - get_variant_partially_shredded_uint64_as_variant, - partially_shredded_uint64_variant_array, - u64 - ); - numeric_partially_shredded_test!( - get_variant_partially_shredded_float16_as_variant, - partially_shredded_float16_variant_array, - half::f16 - ); - numeric_partially_shredded_test!( - get_variant_partially_shredded_float32_as_variant, - partially_shredded_float32_variant_array, - f32 - ); - numeric_partially_shredded_test!( - get_variant_partially_shredded_float64_as_variant, - partially_shredded_float64_variant_array, - f64 - ); + #[test] + fn get_variant_partially_shredded_int8_as_variant() { + numeric_partially_shredded_test!(i8, partially_shredded_int8_variant_array); + } + + #[test] + fn get_variant_partially_shredded_int16_as_variant() { + numeric_partially_shredded_test!(i16, partially_shredded_int16_variant_array); + } + + #[test] + fn get_variant_partially_shredded_int32_as_variant() { + numeric_partially_shredded_test!(i32, partially_shredded_int32_variant_array); + } + + #[test] + fn get_variant_partially_shredded_int64_as_variant() { + numeric_partially_shredded_test!(i64, partially_shredded_int64_variant_array); + } + + #[test] + fn get_variant_partially_shredded_uint8_as_variant() { + numeric_partially_shredded_test!(u8, partially_shredded_uint8_variant_array); + } + + #[test] + fn get_variant_partially_shredded_uint16_as_variant() { + numeric_partially_shredded_test!(u16, partially_shredded_uint16_variant_array); + } + + #[test] + fn get_variant_partially_shredded_uint32_as_variant() { + numeric_partially_shredded_test!(u32, partially_shredded_uint32_variant_array); + } + + #[test] + fn get_variant_partially_shredded_uint64_as_variant() { + numeric_partially_shredded_test!(u64, partially_shredded_uint64_variant_array); + } + + #[test] + fn get_variant_partially_shredded_float16_as_variant() { + numeric_partially_shredded_test!(half::f16, partially_shredded_float16_variant_array); + } + + #[test] + fn get_variant_partially_shredded_float32_as_variant() { + numeric_partially_shredded_test!(f32, partially_shredded_float32_variant_array); + } + + #[test] + fn get_variant_partially_shredded_float64_as_variant() { + numeric_partially_shredded_test!(f64, partially_shredded_float64_variant_array); + } /// Shredding: extract a value as an Int32Array #[test] @@ -327,89 +323,85 @@ mod test { /// Perfect Shredding: extract the typed value as a VariantArray macro_rules! numeric_perfectly_shredded_test { - ($test:ident, $data_fn:ident, $primitive_type:ty) => { - #[test] - fn $test() { - let array = $data_fn(); - let options = GetOptions::new(); - let result = variant_get(&array, options).unwrap(); - - // expect the result is a VariantArray - let result: &VariantArray = result.as_any().downcast_ref().unwrap(); - assert_eq!(result.len(), 3); - - // Expect the values are the same as the original values - assert_eq!( - result.value(0), - Variant::from(<$primitive_type>::try_from(1u8).unwrap()) - ); - assert_eq!( - result.value(1), - Variant::from(<$primitive_type>::try_from(2u8).unwrap()) - ); - assert_eq!( - result.value(2), - Variant::from(<$primitive_type>::try_from(3u8).unwrap()) - ); - } + ($primitive_type:ty, $data_fn:ident) => { + let array = $data_fn(); + let options = GetOptions::new(); + let result = variant_get(&array, options).unwrap(); + + // expect the result is a VariantArray + let result: &VariantArray = result.as_any().downcast_ref().unwrap(); + assert_eq!(result.len(), 3); + + // Expect the values are the same as the original values + assert_eq!( + result.value(0), + Variant::from(<$primitive_type>::try_from(1u8).unwrap()) + ); + assert_eq!( + result.value(1), + Variant::from(<$primitive_type>::try_from(2u8).unwrap()) + ); + assert_eq!( + result.value(2), + Variant::from(<$primitive_type>::try_from(3u8).unwrap()) + ); }; } - numeric_perfectly_shredded_test!( - get_variant_perfectly_shredded_int8_as_variant, - perfectly_shredded_int8_variant_array, - i8 - ); - numeric_perfectly_shredded_test!( - get_variant_perfectly_shredded_int16_as_variant, - perfectly_shredded_int16_variant_array, - i16 - ); - numeric_perfectly_shredded_test!( - get_variant_perfectly_shredded_int32_as_variant, - perfectly_shredded_int32_variant_array, - i32 - ); - numeric_perfectly_shredded_test!( - get_variant_perfectly_shredded_int64_as_variant, - perfectly_shredded_int64_variant_array, - i64 - ); - numeric_perfectly_shredded_test!( - get_variant_perfectly_shredded_uint8_as_variant, - perfectly_shredded_uint8_variant_array, - u8 - ); - numeric_perfectly_shredded_test!( - get_variant_perfectly_shredded_uint16_as_variant, - perfectly_shredded_uint16_variant_array, - u16 - ); - numeric_perfectly_shredded_test!( - get_variant_perfectly_shredded_uint32_as_variant, - perfectly_shredded_uint32_variant_array, - u32 - ); - numeric_perfectly_shredded_test!( - get_variant_perfectly_shredded_uint64_as_variant, - perfectly_shredded_uint64_variant_array, - u64 - ); - numeric_perfectly_shredded_test!( - get_variant_perfectly_shredded_float16_as_variant, - perfectly_shredded_float16_variant_array, - half::f16 - ); - numeric_perfectly_shredded_test!( - get_variant_perfectly_shredded_float32_as_variant, - perfectly_shredded_float32_variant_array, - f32 - ); - numeric_perfectly_shredded_test!( - get_variant_perfectly_shredded_float64_as_variant, - perfectly_shredded_float64_variant_array, - f64 - ); + #[test] + fn get_variant_perfectly_shredded_int8_as_variant() { + numeric_perfectly_shredded_test!(i8, perfectly_shredded_int8_variant_array); + } + + #[test] + fn get_variant_perfectly_shredded_int16_as_variant() { + numeric_perfectly_shredded_test!(i16, perfectly_shredded_int16_variant_array); + } + + #[test] + fn get_variant_perfectly_shredded_int32_as_variant() { + numeric_perfectly_shredded_test!(i32, perfectly_shredded_int32_variant_array); + } + + #[test] + fn get_variant_perfectly_shredded_int64_as_variant() { + numeric_perfectly_shredded_test!(i64, perfectly_shredded_int64_variant_array); + } + + #[test] + fn get_variant_perfectly_shredded_uint8_as_variant() { + numeric_perfectly_shredded_test!(u8, perfectly_shredded_uint8_variant_array); + } + + #[test] + fn get_variant_perfectly_shredded_uint16_as_variant() { + numeric_perfectly_shredded_test!(u16, perfectly_shredded_uint16_variant_array); + } + + #[test] + fn get_variant_perfectly_shredded_uint32_as_variant() { + numeric_perfectly_shredded_test!(u32, perfectly_shredded_uint32_variant_array); + } + + #[test] + fn get_variant_perfectly_shredded_uint64_as_variant() { + numeric_perfectly_shredded_test!(u64, perfectly_shredded_uint64_variant_array); + } + + #[test] + fn get_variant_perfectly_shredded_float16_as_variant() { + numeric_perfectly_shredded_test!(half::f16, perfectly_shredded_float16_variant_array); + } + + #[test] + fn get_variant_perfectly_shredded_float32_as_variant() { + numeric_perfectly_shredded_test!(f32, perfectly_shredded_float32_variant_array); + } + + #[test] + fn get_variant_perfectly_shredded_float64_as_variant() { + numeric_perfectly_shredded_test!(f64, perfectly_shredded_float64_variant_array); + } /// Shredding: Extract the typed value as Int32Array #[test]