Skip to content

Commit 744c829

Browse files
committed
having fun with schema
1 parent 1faac00 commit 744c829

File tree

2 files changed

+65
-79
lines changed

2 files changed

+65
-79
lines changed

crates/iceberg/src/arrow/nan_val_cnt_visitor.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,8 @@ impl NanValueCountVisitor {
156156

157157
/// Compute nan value counts in given schema and record batch
158158
pub fn compute(&mut self, schema: SchemaRef, batch: RecordBatch) -> Result<()> {
159-
let arrow_arr_partner_accessor = ArrowArrayAccessor {};
159+
let arrow_arr_partner_accessor =
160+
ArrowArrayAccessor::new_with_table_schema(schema.as_ref())?;
160161

161162
let struct_arr = Arc::new(StructArray::from(batch)) as ArrayRef;
162163
visit_struct_with_partner(

crates/iceberg/src/arrow/value.rs

Lines changed: 63 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -21,12 +21,12 @@ use arrow_array::{
2121
LargeListArray, LargeStringArray, ListArray, MapArray, StringArray, StructArray,
2222
Time64MicrosecondArray, TimestampMicrosecondArray, TimestampNanosecondArray,
2323
};
24-
use arrow_schema::DataType;
24+
use arrow_schema::{DataType, Schema as ArrowSchema};
2525
use uuid::Uuid;
2626

27-
use super::get_field_id;
27+
use super::{get_field_id, schema_to_arrow_schema};
2828
use crate::spec::{
29-
ListType, Literal, Map, MapType, NestedField, PartnerAccessor, PrimitiveType,
29+
ListType, Literal, Map, MapType, NestedField, PartnerAccessor, PrimitiveType, Schema,
3030
SchemaWithPartnerVisitor, Struct, StructType, visit_struct_with_partner,
3131
};
3232
use crate::{Error, ErrorKind, Result};
@@ -426,7 +426,24 @@ impl SchemaWithPartnerVisitor<ArrayRef> for ArrowArrayToIcebergStructConverter {
426426
}
427427

428428
/// Partner type representing accessing and walking arrow arrays alongside iceberg schema
429-
pub struct ArrowArrayAccessor;
429+
pub struct ArrowArrayAccessor {
430+
arrow_schema: Option<ArrowSchema>,
431+
}
432+
433+
impl ArrowArrayAccessor {
434+
/// Creates a new instance of ArrowArrayAccessor without arrow schema fallback
435+
pub fn new() -> Result<Self> {
436+
Ok(Self { arrow_schema: None })
437+
}
438+
439+
/// Creates a new instance of ArrowArrayAccessor with arrow schema converted from table schema
440+
/// for field ID resolution fallback
441+
pub fn new_with_table_schema(table_schema: &Schema) -> Result<Self> {
442+
Ok(Self {
443+
arrow_schema: Some(schema_to_arrow_schema(table_schema)?),
444+
})
445+
}
446+
}
430447

431448
impl PartnerAccessor<ArrayRef> for ArrowArrayAccessor {
432449
fn struct_partner<'a>(&self, schema_partner: &'a ArrayRef) -> Result<&'a ArrayRef> {
@@ -459,9 +476,13 @@ impl PartnerAccessor<ArrayRef> for ArrowArrayAccessor {
459476
.fields()
460477
.iter()
461478
.position(|arrow_field| {
462-
// match by ID if available, otherwise try matching by name
463-
get_field_id(arrow_field)
464-
.map_or(arrow_field.name() == &field.name, |id| id == field.id)
479+
get_field_id(arrow_field).map_or(false, |id| id == field.id)
480+
|| self
481+
.arrow_schema
482+
.as_ref()
483+
.and_then(|schema| schema.field_with_name(&field.name).ok())
484+
.and_then(|field_from_schema| get_field_id(field_from_schema).ok())
485+
.map_or(false, |id| id == field.id)
465486
})
466487
.ok_or_else(|| {
467488
Error::new(
@@ -559,7 +580,7 @@ pub fn arrow_struct_to_literal(
559580
ty,
560581
struct_array,
561582
&mut ArrowArrayToIcebergStructConverter,
562-
&ArrowArrayAccessor,
583+
&ArrowArrayAccessor::new()?,
563584
)
564585
}
565586

@@ -910,86 +931,50 @@ mod test {
910931
}
911932

912933
#[test]
913-
fn test_field_name_fallback_when_id_unavailable() {
914-
// Create an Arrow struct array with fields that don't have field IDs in metadata
915-
let int32_array = Int32Array::from(vec![Some(1), Some(2), None]);
916-
let string_array = StringArray::from(vec![Some("hello"), Some("world"), None]);
934+
fn test_field_id_fallback_with_arrow_schema() {
935+
// Create an Arrow struct array with a field that doesn't have field ID in metadata
936+
let int32_array = Int32Array::from(vec![Some(42), Some(43), None]);
917937

918-
let struct_array =
919-
Arc::new(StructArray::from(vec![
920-
(
921-
// Field without field ID metadata - should fallback to name matching
922-
Arc::new(Field::new("field_a", DataType::Int32, true)),
923-
Arc::new(int32_array) as ArrayRef,
924-
),
925-
(
926-
// Field with the correct field ID metadata
927-
Arc::new(Field::new("field_b", DataType::Utf8, true).with_metadata(
928-
HashMap::from([(PARQUET_FIELD_ID_META_KEY.to_string(), "2".to_string())]),
929-
)),
930-
Arc::new(string_array) as ArrayRef,
931-
),
932-
])) as ArrayRef;
938+
// Create the struct array with a field that has no field ID metadata
939+
let struct_array = Arc::new(StructArray::from(vec![(
940+
Arc::new(Field::new("field_a", DataType::Int32, true)), // No field ID metadata
941+
Arc::new(int32_array) as ArrayRef,
942+
)])) as ArrayRef;
933943

934-
// Create Iceberg struct type with field IDs that don't match the Arrow metadata
935-
let iceberg_struct_type = StructType::new(vec![
936-
Arc::new(NestedField::optional(
937-
1, // Different ID than what's in Arrow metadata (or no metadata)
938-
"field_a", // Same name as Arrow field
944+
// Create an Iceberg schema with field ID
945+
let iceberg_schema = Schema::builder()
946+
.with_schema_id(1)
947+
.with_fields(vec![Arc::new(NestedField::optional(
948+
100, // Field ID that we'll look for
949+
"field_a",
939950
Type::Primitive(PrimitiveType::Int),
940-
)),
941-
Arc::new(NestedField::optional(
942-
2, // Same ID
943-
"field_b", // Same name as Arrow field
944-
Type::Primitive(PrimitiveType::String),
945-
)),
946-
]);
947-
948-
// This should succeed by falling back to field name matching
949-
let result = arrow_struct_to_literal(&struct_array, &iceberg_struct_type).unwrap();
950-
951-
assert_eq!(result, vec![
952-
Some(Literal::Struct(Struct::from_iter(vec![
953-
Some(Literal::int(1)),
954-
Some(Literal::string("hello".to_string())),
955-
]))),
956-
Some(Literal::Struct(Struct::from_iter(vec![
957-
Some(Literal::int(2)),
958-
Some(Literal::string("world".to_string())),
959-
]))),
960-
Some(Literal::Struct(Struct::from_iter(vec![None, None,]))),
961-
]);
962-
}
951+
))])
952+
.build()
953+
.unwrap();
963954

964-
#[test]
965-
fn test_field_not_found_error() {
966-
// Test that we get an appropriate error when neither field ID nor name matches
955+
// Create an ArrowArrayAccessor with the table schema for fallback
956+
let accessor = ArrowArrayAccessor::new_with_table_schema(&iceberg_schema).unwrap();
967957

968-
let int32_array = Int32Array::from(vec![Some(1), Some(2)]);
958+
// Create a nested field to look up
959+
let field = NestedField::optional(100, "field_a", Type::Primitive(PrimitiveType::Int));
969960

970-
let struct_array = Arc::new(StructArray::from(vec![(
971-
Arc::new(Field::new("arrow_field_name", DataType::Int32, true)),
972-
Arc::new(int32_array) as ArrayRef,
973-
)])) as ArrayRef;
961+
// This should succeed by using the arrow_schema fallback
962+
let result = accessor.field_partner(&struct_array, &field);
974963

975-
// Create Iceberg struct type with field that doesn't match by ID or name
976-
let iceberg_struct_type = StructType::new(vec![Arc::new(NestedField::optional(
977-
10,
978-
"different_field_name", // Different name than Arrow field
979-
Type::Primitive(PrimitiveType::Int),
980-
))]);
964+
// Verify that the field was found
965+
assert!(result.is_ok());
981966

982-
// This should fail with an appropriate error message
983-
let result = arrow_struct_to_literal(&struct_array, &iceberg_struct_type);
967+
// Verify that the field has the expected value
968+
let array_ref = result.unwrap();
969+
let int_array = array_ref.as_any().downcast_ref::<Int32Array>().unwrap();
970+
assert_eq!(int_array.value(0), 42);
971+
assert_eq!(int_array.value(1), 43);
972+
assert!(int_array.is_null(2));
984973

974+
// Now try with an accessor without arrow_schema - this should fail
975+
let accessor_without_schema = ArrowArrayAccessor::new().unwrap();
976+
let result = accessor_without_schema.field_partner(&struct_array, &field);
985977
assert!(result.is_err());
986-
let error = result.unwrap_err();
987-
assert_eq!(error.kind(), ErrorKind::DataInvalid);
988-
assert!(
989-
error.message().contains(
990-
"Field with id=10 or name=different_field_name not found in struct array"
991-
)
992-
);
993978
}
994979

995980
#[test]

0 commit comments

Comments
 (0)