diff --git a/parquet-variant-compute/src/variant_get/output/primitive.rs b/parquet-variant-compute/src/variant_get/output/primitive.rs deleted file mode 100644 index ff3e58c3c340..000000000000 --- a/parquet-variant-compute/src/variant_get/output/primitive.rs +++ /dev/null @@ -1,184 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use crate::variant_get::output::OutputBuilder; -use crate::VariantArray; -use arrow::error::Result; - -use arrow::array::{ - new_null_array, Array, ArrayRef, ArrowPrimitiveType, AsArray, BinaryViewArray, - NullBufferBuilder, PrimitiveArray, -}; -use arrow::compute::{cast_with_options, CastOptions}; -use arrow::datatypes::{Int16Type, Int32Type}; -use arrow_schema::{ArrowError, FieldRef}; -use parquet_variant::{Variant, VariantPath}; -use std::marker::PhantomData; -use std::sync::Arc; - -/// Trait for Arrow primitive types that can be used in the output builder -/// -/// This just exists to add a generic way to convert from Variant to the primitive type -pub(super) trait ArrowPrimitiveVariant: ArrowPrimitiveType { - /// Try to extract the primitive value from a Variant, returning None if it - /// cannot be converted - /// - /// TODO: figure out how to handle coercion/casting - fn from_variant(variant: &Variant) -> Option; -} - -/// Outputs Primitive arrays -pub(super) struct PrimitiveOutputBuilder<'a, T: ArrowPrimitiveVariant> { - /// What path to extract - path: VariantPath<'a>, - /// Returned output type - as_type: FieldRef, - /// Controls the casting behavior (e.g. error vs substituting null on cast error). - cast_options: CastOptions<'a>, - /// Phantom data for the primitive type - _phantom: PhantomData, -} - -impl<'a, T: ArrowPrimitiveVariant> PrimitiveOutputBuilder<'a, T> { - pub(super) fn new( - path: VariantPath<'a>, - as_type: FieldRef, - cast_options: CastOptions<'a>, - ) -> Self { - Self { - path, - as_type, - cast_options, - _phantom: PhantomData, - } - } -} - -impl OutputBuilder for PrimitiveOutputBuilder<'_, T> { - fn partially_shredded( - &self, - variant_array: &VariantArray, - _metadata: &BinaryViewArray, - _value_field: &BinaryViewArray, - typed_value: &ArrayRef, - ) -> arrow::error::Result { - // build up the output array element by element - let mut nulls = NullBufferBuilder::new(variant_array.len()); - let mut values = Vec::with_capacity(variant_array.len()); - let typed_value = - cast_with_options(typed_value, self.as_type.data_type(), &self.cast_options)?; - // downcast to the primitive array (e.g. Int32Array, Float64Array, etc) - let typed_value = typed_value.as_primitive::(); - - for i in 0..variant_array.len() { - if variant_array.is_null(i) { - nulls.append_null(); - values.push(T::default_value()); // not used, placeholder - continue; - } - - // if the typed value is null, decode the variant and extract the value - if typed_value.is_null(i) { - // TODO follow path - // https://github.com/apache/arrow-rs/issues/8086 - let variant = variant_array.value(i); - let Some(value) = T::from_variant(&variant) else { - if self.cast_options.safe { - // safe mode: append null if we can't convert - nulls.append_null(); - values.push(T::default_value()); // not used, placeholder - continue; - } else { - return Err(ArrowError::CastError(format!( - "Failed to extract primitive of type {} from variant {:?} at path {:?}", - self.as_type.data_type(), - variant, - self.path - ))); - } - }; - - nulls.append_non_null(); - values.push(value) - } else { - // otherwise we have a typed value, so we can use it directly - nulls.append_non_null(); - values.push(typed_value.value(i)); - } - } - - let nulls = nulls.finish(); - let array = PrimitiveArray::::new(values.into(), nulls) - .with_data_type(self.as_type.data_type().clone()); - Ok(Arc::new(array)) - } - - fn typed( - &self, - _variant_array: &VariantArray, - _metadata: &BinaryViewArray, - typed_value: &ArrayRef, - ) -> arrow::error::Result { - // if the types match exactly, we can just return the typed_value - if typed_value.data_type() == self.as_type.data_type() { - Ok(typed_value.clone()) - } else { - // TODO: try to cast the typed_value to the desired type? - // https://github.com/apache/arrow-rs/issues/8086 - Err(ArrowError::NotYetImplemented(format!( - "variant_get fully_shredded as {:?} with typed_value={:?} is not implemented yet", - self.as_type.data_type(), - typed_value.data_type() - ))) - } - } - - fn unshredded( - &self, - _variant_array: &VariantArray, - _metadata: &BinaryViewArray, - _value_field: &BinaryViewArray, - ) -> Result { - Err(ArrowError::NotYetImplemented(String::from( - "variant_get unshredded to primitive types is not implemented yet", - ))) - } - - fn all_null( - &self, - variant_array: &VariantArray, - _metadata: &BinaryViewArray, - ) -> Result { - // For all-null case, create a primitive array with all null values - Ok(Arc::new(new_null_array( - self.as_type.data_type(), - variant_array.len(), - ))) - } -} - -impl ArrowPrimitiveVariant for Int32Type { - fn from_variant(variant: &Variant) -> Option { - variant.as_int32() - } -} - -impl ArrowPrimitiveVariant for Int16Type { - fn from_variant(variant: &Variant) -> Option { - variant.as_int16() - } -} diff --git a/parquet-variant-compute/src/variant_get/output/variant.rs b/parquet-variant-compute/src/variant_get/output/variant.rs deleted file mode 100644 index 8a1fe8335fde..000000000000 --- a/parquet-variant-compute/src/variant_get/output/variant.rs +++ /dev/null @@ -1,208 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use crate::variant_get::output::OutputBuilder; -use crate::{type_conversion::primitive_conversion_array, VariantArray, VariantArrayBuilder}; -use arrow::array::{Array, ArrayRef, AsArray, BinaryViewArray}; -use arrow::datatypes::{ - Float16Type, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, UInt16Type, - UInt32Type, UInt64Type, UInt8Type, -}; -use arrow_schema::{ArrowError, DataType}; -use parquet_variant::{Variant, VariantPath}; -use std::sync::Arc; - -macro_rules! cast_partially_shredded_primitive { - ($typed_value:expr, $variant_array:expr, $arrow_type:ty) => {{ - let mut array_builder = VariantArrayBuilder::new($variant_array.len()); - let primitive_array = $typed_value.as_primitive::<$arrow_type>(); - for i in 0..$variant_array.len() { - if $variant_array.is_null(i) { - array_builder.append_null(); - } else if $typed_value.is_null(i) { - // fall back to the value (variant) field - // (TODO could copy the variant bytes directly) - let value = $variant_array.value(i); - array_builder.append_variant(value); - } else { - // otherwise we have a typed value, so we can use it directly - let value = primitive_array.value(i); - array_builder.append_variant(Variant::from(value)); - } - } - Ok(Arc::new(array_builder.build())) - }}; -} - -/// Outputs VariantArrays -pub(super) struct VariantOutputBuilder<'a> { - /// What path to extract - path: VariantPath<'a>, -} - -impl<'a> VariantOutputBuilder<'a> { - pub(super) fn new(path: VariantPath<'a>) -> Self { - Self { path } - } -} - -impl OutputBuilder for VariantOutputBuilder<'_> { - fn partially_shredded( - &self, - variant_array: &VariantArray, - // TODO(perf): can reuse the metadata field here to avoid re-creating it - _metadata: &BinaryViewArray, - _value_field: &BinaryViewArray, - typed_value: &ArrayRef, - ) -> arrow::error::Result { - // TODO(perf): avoid builders entirely (and write the raw variant directly as we know the metadata is the same) - match typed_value.data_type() { - DataType::Int8 => { - cast_partially_shredded_primitive!(typed_value, variant_array, Int8Type) - } - - DataType::Int16 => { - cast_partially_shredded_primitive!(typed_value, variant_array, Int16Type) - } - - DataType::Int32 => { - cast_partially_shredded_primitive!(typed_value, variant_array, Int32Type) - } - - DataType::Int64 => { - cast_partially_shredded_primitive!(typed_value, variant_array, Int64Type) - } - - DataType::UInt8 => { - cast_partially_shredded_primitive!(typed_value, variant_array, UInt8Type) - } - - DataType::UInt16 => { - cast_partially_shredded_primitive!(typed_value, variant_array, UInt16Type) - } - - DataType::UInt32 => { - cast_partially_shredded_primitive!(typed_value, variant_array, UInt32Type) - } - - DataType::UInt64 => { - cast_partially_shredded_primitive!(typed_value, variant_array, UInt64Type) - } - - DataType::Float16 => { - cast_partially_shredded_primitive!(typed_value, variant_array, Float16Type) - } - - DataType::Float32 => { - cast_partially_shredded_primitive!(typed_value, variant_array, Float32Type) - } - - DataType::Float64 => { - cast_partially_shredded_primitive!(typed_value, variant_array, Float64Type) - } - - dt => { - // https://github.com/apache/arrow-rs/issues/8086 - Err(ArrowError::NotYetImplemented(format!( - "variant_get partially shredded with typed_value={dt} is not implemented yet", - ))) - } - } - } - - fn typed( - &self, - variant_array: &VariantArray, - // TODO(perf): can reuse the metadata field here to avoid re-creating it - _metadata: &BinaryViewArray, - typed_value: &ArrayRef, - ) -> arrow::error::Result { - // TODO(perf): avoid builders entirely (and write the raw variant directly as we know the metadata is the same) - let mut array_builder = VariantArrayBuilder::new(variant_array.len()); - match typed_value.data_type() { - DataType::Int8 => primitive_conversion_array!(Int8Type, typed_value, array_builder), - DataType::Int16 => primitive_conversion_array!(Int16Type, typed_value, array_builder), - DataType::Int32 => primitive_conversion_array!(Int32Type, typed_value, array_builder), - DataType::Int64 => primitive_conversion_array!(Int64Type, typed_value, array_builder), - DataType::UInt8 => primitive_conversion_array!(UInt8Type, typed_value, array_builder), - DataType::UInt16 => primitive_conversion_array!(UInt16Type, typed_value, array_builder), - DataType::UInt32 => primitive_conversion_array!(UInt32Type, typed_value, array_builder), - DataType::UInt64 => primitive_conversion_array!(UInt64Type, typed_value, array_builder), - DataType::Float16 => { - primitive_conversion_array!(Float16Type, typed_value, array_builder) - } - DataType::Float32 => { - primitive_conversion_array!(Float32Type, typed_value, array_builder) - } - DataType::Float64 => { - primitive_conversion_array!(Float64Type, typed_value, array_builder) - } - dt => { - // https://github.com/apache/arrow-rs/issues/8087 - return Err(ArrowError::NotYetImplemented(format!( - "variant_get perfectly shredded with typed_value={dt} is not implemented yet", - ))); - } - } - Ok(Arc::new(array_builder.build())) - } - - fn unshredded( - &self, - variant_array: &VariantArray, - _metadata: &BinaryViewArray, - _value_field: &BinaryViewArray, - ) -> arrow::error::Result { - let mut builder = VariantArrayBuilder::new(variant_array.len()); - for i in 0..variant_array.len() { - let new_variant = variant_array.value(i); - - // TODO: perf? - let Some(new_variant) = new_variant.get_path(&self.path) else { - // path not found, append null - builder.append_null(); - continue; - }; - - // TODO: we're decoding the value and doing a copy into a variant value - // again. This can be much faster by using the _metadata and _value_field - // to avoid decoding the entire variant: - // - // 1) reuse the metadata arrays as is - // - // 2) Create a new BinaryViewArray that uses the same underlying buffers - // that the original variant used, but whose views points to a new - // offset for the new path - builder.append_variant(new_variant); - } - - Ok(Arc::new(builder.build())) - } - - fn all_null( - &self, - variant_array: &VariantArray, - _metadata: &BinaryViewArray, - ) -> arrow::error::Result { - // For all-null case, simply create a VariantArray with all null values - let mut builder = VariantArrayBuilder::new(variant_array.len()); - for _i in 0..variant_array.len() { - builder.append_null(); - } - Ok(Arc::new(builder.build())) - } -}