Skip to content

Commit a9d6e92

Browse files
authored
[Variant] Move ArrayVariantToArrowRowBuilder to variant_to_arrow (apache#9094)
# Which issue does this PR close?

<!-- We generally require a GitHub issue to be filed for all bug fixes and enhancements and this helps us generate change logs for our releases. You can link an issue to this PR using the GitHub syntax. -->

- Part of apache#8082.

# Rationale for this change

<!-- Why are you proposing this change? If this is already explained clearly in the issue then this section is not needed. Explaining clearly why changes are proposed helps reviewers understand your changes and offer better suggestions for fixes. -->

To support `Lists/Array` in `variant_get`, it is better to move `ArrayVariantToArrowRowBuilder` from `shred_variant` to `variant_to_arrow` so that it can be shared with `variant_get`. At the same time, moving some code around within `variant_to_arrow` gives a better overview of the overall implementation.

# What changes are included in this PR?

<!-- There is no need to duplicate the description in the issue here but it is sometimes worth providing a summary of the individual changes in this PR. -->

This PR can be reviewed commit by commit:

- Move `VariantToArrowRowBuilder` and its related impl to the top of `variant_to_arrow`
- Push the `FixedSizeList` check from `make_variant_to_shredded_variant_arrow_row_builder` down to `ArrayVariantToArrowRowBuilder`
- Move `ArrayVariantToArrowRowBuilder` to `variant_to_arrow`

# Are these changes tested?

<!-- We typically require tests for all PRs in order to: 1. Prevent the code from being accidentally broken by subsequent changes 2. Serve as another way to document the expected behavior of the code If tests are not included in your PR, please explain why (for example, are they covered by existing tests)? -->

Covered by existing tests.

# Are there any user-facing changes?

<!-- If there are user-facing changes then we may require documentation to be updated before approving the PR. If there are any breaking changes to public APIs, please call them out. -->

No.
1 parent b8a2c1a commit a9d6e92

2 files changed

Lines changed: 291 additions & 272 deletions

File tree

parquet-variant-compute/src/shred_variant.rs

Lines changed: 24 additions & 179 deletions
Original file line numberDiff line numberDiff line change
@@ -19,19 +19,17 @@
1919
2020
use crate::variant_array::{ShreddedVariantFieldArray, StructArrayBuilder};
2121
use crate::variant_to_arrow::{
22-
PrimitiveVariantToArrowRowBuilder, make_primitive_variant_to_arrow_row_builder,
22+
ArrayVariantToArrowRowBuilder, PrimitiveVariantToArrowRowBuilder,
23+
make_primitive_variant_to_arrow_row_builder,
2324
};
2425
use crate::{VariantArray, VariantValueArrayBuilder};
25-
use arrow::array::{
26-
ArrayRef, BinaryViewArray, GenericListArray, GenericListViewArray, NullBufferBuilder,
27-
OffsetSizeTrait,
28-
};
29-
use arrow::buffer::{NullBuffer, OffsetBuffer, ScalarBuffer};
26+
use arrow::array::{ArrayRef, BinaryViewArray, NullBufferBuilder};
27+
use arrow::buffer::NullBuffer;
3028
use arrow::compute::CastOptions;
31-
use arrow::datatypes::{ArrowNativeTypeOp, DataType, Field, FieldRef, Fields, TimeUnit};
29+
use arrow::datatypes::{DataType, Field, FieldRef, Fields, TimeUnit};
3230
use arrow::error::{ArrowError, Result};
3331
use indexmap::IndexMap;
34-
use parquet_variant::{Variant, VariantBuilderExt, VariantList, VariantPath, VariantPathElement};
32+
use parquet_variant::{Variant, VariantBuilderExt, VariantPath, VariantPathElement};
3533
use std::collections::BTreeMap;
3634
use std::sync::Arc;
3735

@@ -123,19 +121,15 @@ pub(crate) fn make_variant_to_shredded_variant_arrow_row_builder<'a>(
123121
DataType::List(_)
124122
| DataType::LargeList(_)
125123
| DataType::ListView(_)
126-
| DataType::LargeListView(_) => {
124+
| DataType::LargeListView(_)
125+
| DataType::FixedSizeList(..) => {
127126
let typed_value_builder = VariantToShreddedArrayVariantRowBuilder::try_new(
128127
data_type,
129128
cast_options,
130129
capacity,
131130
)?;
132131
VariantToShreddedVariantRowBuilder::Array(typed_value_builder)
133132
}
134-
DataType::FixedSizeList(..) => {
135-
return Err(ArrowError::NotYetImplemented(
136-
"Shredding variant array values as fixed-size lists".to_string(),
137-
));
138-
}
139133
// Supported shredded primitive types, see Variant shredding spec:
140134
// https://github.com/apache/parquet-format/blob/master/VariantShredding.md#shredded-value-types
141135
DataType::Boolean
@@ -312,171 +306,6 @@ impl<'a> VariantToShreddedArrayVariantRowBuilder<'a> {
312306
}
313307
}
314308

315-
enum ArrayVariantToArrowRowBuilder<'a> {
316-
List(VariantToListArrowRowBuilder<'a, i32, false>),
317-
LargeList(VariantToListArrowRowBuilder<'a, i64, false>),
318-
ListView(VariantToListArrowRowBuilder<'a, i32, true>),
319-
LargeListView(VariantToListArrowRowBuilder<'a, i64, true>),
320-
}
321-
322-
impl<'a> ArrayVariantToArrowRowBuilder<'a> {
323-
fn try_new(
324-
data_type: &'a DataType,
325-
cast_options: &'a CastOptions,
326-
capacity: usize,
327-
) -> Result<Self> {
328-
use ArrayVariantToArrowRowBuilder::*;
329-
330-
// Make List/ListView builders without repeating the constructor boilerplate.
331-
macro_rules! make_list_builder {
332-
($variant:ident, $offset:ty, $is_view:expr, $field:ident) => {
333-
$variant(VariantToListArrowRowBuilder::<$offset, $is_view>::try_new(
334-
$field.clone(),
335-
$field.data_type(),
336-
cast_options,
337-
capacity,
338-
)?)
339-
};
340-
}
341-
342-
let builder = match data_type {
343-
DataType::List(field) => make_list_builder!(List, i32, false, field),
344-
DataType::LargeList(field) => make_list_builder!(LargeList, i64, false, field),
345-
DataType::ListView(field) => make_list_builder!(ListView, i32, true, field),
346-
DataType::LargeListView(field) => make_list_builder!(LargeListView, i64, true, field),
347-
other => {
348-
return Err(ArrowError::InvalidArgumentError(format!(
349-
"Casting to {other:?} is not applicable for array Variant types"
350-
)));
351-
}
352-
};
353-
Ok(builder)
354-
}
355-
356-
fn append_null(&mut self) {
357-
match self {
358-
Self::List(builder) => builder.append_null(),
359-
Self::LargeList(builder) => builder.append_null(),
360-
Self::ListView(builder) => builder.append_null(),
361-
Self::LargeListView(builder) => builder.append_null(),
362-
}
363-
}
364-
365-
fn append_value(&mut self, list: VariantList<'_, '_>) -> Result<()> {
366-
match self {
367-
Self::List(builder) => builder.append_value(list),
368-
Self::LargeList(builder) => builder.append_value(list),
369-
Self::ListView(builder) => builder.append_value(list),
370-
Self::LargeListView(builder) => builder.append_value(list),
371-
}
372-
}
373-
374-
fn finish(self) -> Result<ArrayRef> {
375-
match self {
376-
Self::List(builder) => builder.finish(),
377-
Self::LargeList(builder) => builder.finish(),
378-
Self::ListView(builder) => builder.finish(),
379-
Self::LargeListView(builder) => builder.finish(),
380-
}
381-
}
382-
}
383-
384-
struct VariantToListArrowRowBuilder<'a, O, const IS_VIEW: bool>
385-
where
386-
O: OffsetSizeTrait + ArrowNativeTypeOp,
387-
{
388-
field: FieldRef,
389-
offsets: Vec<O>,
390-
element_builder: Box<VariantToShreddedVariantRowBuilder<'a>>,
391-
nulls: NullBufferBuilder,
392-
current_offset: O,
393-
}
394-
395-
impl<'a, O, const IS_VIEW: bool> VariantToListArrowRowBuilder<'a, O, IS_VIEW>
396-
where
397-
O: OffsetSizeTrait + ArrowNativeTypeOp,
398-
{
399-
fn try_new(
400-
field: FieldRef,
401-
element_data_type: &'a DataType,
402-
cast_options: &'a CastOptions,
403-
capacity: usize,
404-
) -> Result<Self> {
405-
if capacity >= isize::MAX as usize {
406-
return Err(ArrowError::ComputeError(
407-
"Capacity exceeds isize::MAX when reserving list offsets".to_string(),
408-
));
409-
}
410-
let mut offsets = Vec::with_capacity(capacity + 1);
411-
offsets.push(O::ZERO);
412-
let element_builder = make_variant_to_shredded_variant_arrow_row_builder(
413-
element_data_type,
414-
cast_options,
415-
capacity,
416-
false,
417-
)?;
418-
Ok(Self {
419-
field,
420-
offsets,
421-
element_builder: Box::new(element_builder),
422-
nulls: NullBufferBuilder::new(capacity),
423-
current_offset: O::ZERO,
424-
})
425-
}
426-
427-
fn append_null(&mut self) {
428-
self.offsets.push(self.current_offset);
429-
self.nulls.append_null();
430-
}
431-
432-
fn append_value(&mut self, list: VariantList<'_, '_>) -> Result<()> {
433-
for element in list.iter() {
434-
self.element_builder.append_value(element)?;
435-
self.current_offset = self.current_offset.add_checked(O::ONE)?;
436-
}
437-
self.offsets.push(self.current_offset);
438-
self.nulls.append_non_null();
439-
Ok(())
440-
}
441-
442-
fn finish(mut self) -> Result<ArrayRef> {
443-
let (value, typed_value, nulls) = self.element_builder.finish()?;
444-
let element_array =
445-
ShreddedVariantFieldArray::from_parts(Some(value), Some(typed_value), nulls);
446-
let field = Arc::new(
447-
self.field
448-
.as_ref()
449-
.clone()
450-
.with_data_type(element_array.data_type().clone()),
451-
);
452-
453-
if IS_VIEW {
454-
// NOTE: `offsets` is never empty (constructor pushes an entry)
455-
let mut sizes = Vec::with_capacity(self.offsets.len() - 1);
456-
for i in 1..self.offsets.len() {
457-
sizes.push(self.offsets[i] - self.offsets[i - 1]);
458-
}
459-
self.offsets.pop();
460-
let list_view_array = GenericListViewArray::<O>::new(
461-
field,
462-
ScalarBuffer::from(self.offsets),
463-
ScalarBuffer::from(sizes),
464-
ArrayRef::from(element_array),
465-
self.nulls.finish(),
466-
);
467-
Ok(Arc::new(list_view_array))
468-
} else {
469-
let list_array = GenericListArray::<O>::new(
470-
field,
471-
OffsetBuffer::<O>::new(ScalarBuffer::from(self.offsets)),
472-
ArrayRef::from(element_array),
473-
self.nulls.finish(),
474-
);
475-
Ok(Arc::new(list_array))
476-
}
477-
}
478-
}
479-
480309
pub(crate) struct VariantToShreddedObjectVariantRowBuilder<'a> {
481310
value_builder: VariantValueArrayBuilder,
482311
typed_value_builders: IndexMap<&'a str, VariantToShreddedVariantRowBuilder<'a>>,
@@ -1513,6 +1342,22 @@ mod tests {
15131342
);
15141343
}
15151344

1345+
#[test]
1346+
fn test_array_shredding_as_fixed_size_list() {
1347+
let input = build_variant_array(vec![VariantRow::List(vec![
1348+
VariantValue::from(1i64),
1349+
VariantValue::from(2i64),
1350+
VariantValue::from(3i64),
1351+
])]);
1352+
let list_schema =
1353+
DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Int64, true)), 2);
1354+
let err = shred_variant(&input, &list_schema).unwrap_err();
1355+
assert_eq!(
1356+
err.to_string(),
1357+
"Not yet implemented: Converting unshredded variant arrays to arrow fixed-size lists"
1358+
);
1359+
}
1360+
15161361
#[test]
15171362
fn test_array_shredding_with_array_elements() {
15181363
let input = build_variant_array(vec![

0 commit comments

Comments
 (0)