@@ -21,12 +21,12 @@ use arrow_array::{
21
21
LargeListArray , LargeStringArray , ListArray , MapArray , StringArray , StructArray ,
22
22
Time64MicrosecondArray , TimestampMicrosecondArray , TimestampNanosecondArray ,
23
23
} ;
24
- use arrow_schema:: { DataType , FieldRef , Schema as ArrowSchema } ;
24
+ use arrow_schema:: { DataType , FieldRef } ;
25
25
use uuid:: Uuid ;
26
26
27
- use super :: { get_field_id, schema_to_arrow_schema } ;
27
+ use super :: get_field_id;
28
28
use crate :: spec:: {
29
- ListType , Literal , Map , MapType , NestedField , PartnerAccessor , PrimitiveType , Schema ,
29
+ ListType , Literal , Map , MapType , NestedField , PartnerAccessor , PrimitiveType ,
30
30
SchemaWithPartnerVisitor , Struct , StructType , visit_struct_with_partner,
31
31
} ;
32
32
use crate :: { Error , ErrorKind , Result } ;
@@ -425,13 +425,26 @@ impl SchemaWithPartnerVisitor<ArrayRef> for ArrowArrayToIcebergStructConverter {
425
425
}
426
426
}
427
427
428
/// Defines how Arrow fields are matched with Iceberg fields when converting data.
///
/// This enum provides two strategies for matching fields:
/// - `Id`: Match fields by their ID, which is stored in Arrow field metadata.
/// - `Name`: Match fields by their name, ignoring the field ID.
///
/// The ID matching mode is the default and preferred approach as it's more robust
/// against schema evolution where field names might change but IDs remain stable.
/// The name matching mode can be useful in scenarios where field IDs are not available
/// or when working with systems that don't preserve field IDs.
///
/// The type is a plain field-less enum, so it is cheap to copy and can be compared
/// directly with `==` (for example in tests or when branching on a configured mode).
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum FieldMatchMode {
    /// Match fields by their ID stored in Arrow field metadata
    Id,
    /// Match fields by their name, ignoring field IDs
    Name,
}
433
445
434
446
impl FieldMatchMode {
447
+ /// Determines if an Arrow field matches an Iceberg field based on the matching mode.
435
448
pub fn match_field ( & self , arrow_field : & FieldRef , iceberg_field : & NestedField ) -> bool {
436
449
match self {
437
450
FieldMatchMode :: Id => get_field_id ( arrow_field)
@@ -448,15 +461,14 @@ pub struct ArrowArrayAccessor {
448
461
}
449
462
450
463
impl ArrowArrayAccessor {
451
- /// Creates a new instance of ArrowArrayAccessor without arrow schema fallback
464
+ /// Creates a new instance of ArrowArrayAccessor with the default ID matching mode
452
465
pub fn new ( ) -> Self {
453
466
Self {
454
467
match_mode : FieldMatchMode :: Id ,
455
468
}
456
469
}
457
470
458
- /// Creates a new instance of ArrowArrayAccessor with arrow schema converted from table schema
459
- /// for field ID resolution fallback
471
+ /// Creates a new instance of ArrowArrayAccessor with the specified matching mode
460
472
pub fn new_with_match_mode ( match_mode : FieldMatchMode ) -> Self {
461
473
Self { match_mode }
462
474
}
@@ -933,50 +945,180 @@ mod test {
933
945
}
934
946
935
947
#[test]
fn test_find_field_by_id() {
    // Helper: attach a Parquet field ID to an Arrow field through its metadata.
    let with_id = |field: Field, id: &str| {
        field.with_metadata(HashMap::from([(
            PARQUET_FIELD_ID_META_KEY.to_string(),
            id.to_string(),
        )]))
    };

    // Arrow-side field definitions, each tagged with its field ID.
    let arrow_field_a = with_id(Field::new("field_a", DataType::Int32, true), "1");
    let arrow_field_b = with_id(Field::new("field_b", DataType::Utf8, true), "2");
    let arrow_nested = with_id(
        Field::new(
            "nested_struct",
            DataType::Struct(Fields::from(vec![
                arrow_field_a.clone(),
                arrow_field_b.clone(),
            ])),
            true,
        ),
        "3",
    );
    let arrow_field_c = with_id(Field::new("field_c", DataType::Int32, true), "4");

    // Column data backing the fields above.
    let a_values: ArrayRef = Arc::new(Int32Array::from(vec![Some(42), Some(43), None]));
    let b_values: ArrayRef = Arc::new(StringArray::from(vec![
        Some("value1"),
        Some("value2"),
        None,
    ]));
    let c_values: ArrayRef = Arc::new(Int32Array::from(vec![Some(100), Some(200), None]));

    // Inner struct first, then the top-level struct that embeds it.
    let nested_struct_array: ArrayRef = Arc::new(StructArray::from(vec![
        (Arc::new(arrow_field_a), a_values),
        (Arc::new(arrow_field_b), b_values),
    ]));
    let struct_array: ArrayRef = Arc::new(StructArray::from(vec![
        (Arc::new(arrow_nested), nested_struct_array),
        (Arc::new(arrow_field_c), c_values),
    ]));

    // Accessor under test: resolve fields by ID.
    let accessor = ArrowArrayAccessor::new_with_match_mode(FieldMatchMode::Id);

    // Iceberg-side description of the nested struct (ID 3) and its children.
    let nested_field = NestedField::optional(
        3,
        "nested_struct",
        Type::Struct(StructType::new(vec![
            Arc::new(NestedField::optional(
                1,
                "field_a",
                Type::Primitive(PrimitiveType::Int),
            )),
            Arc::new(NestedField::optional(
                2,
                "field_b",
                Type::Primitive(PrimitiveType::String),
            )),
        ])),
    );
    let field_a = NestedField::optional(1, "field_a", Type::Primitive(PrimitiveType::Int));

    // Step down: top-level struct -> nested struct -> field_a.
    let nested_partner = accessor
        .field_partner(&struct_array, &nested_field)
        .unwrap();
    let field_a_partner = accessor.field_partner(nested_partner, &field_a).unwrap();

    // The resolved column must be the Int32 data that was stored under field_a.
    let int_array = field_a_partner
        .as_any()
        .downcast_ref::<Int32Array>()
        .unwrap();
    let observed: Vec<Option<i32>> = int_array.iter().collect();
    assert_eq!(observed, vec![Some(42), Some(43), None]);
}
1046
+
1047
#[test]
fn test_find_field_by_name() {
    // Arrow-side field definitions, deliberately carrying NO field-ID metadata.
    let arrow_field_a = Field::new("field_a", DataType::Int32, true);
    let arrow_field_b = Field::new("field_b", DataType::Utf8, true);
    let arrow_nested = Field::new(
        "nested_struct",
        DataType::Struct(Fields::from(vec![
            arrow_field_a.clone(),
            arrow_field_b.clone(),
        ])),
        true,
    );
    let arrow_field_c = Field::new("field_c", DataType::Int32, true);

    // Column data backing the fields above.
    let a_values: ArrayRef = Arc::new(Int32Array::from(vec![Some(42), Some(43), None]));
    let b_values: ArrayRef = Arc::new(StringArray::from(vec![
        Some("value1"),
        Some("value2"),
        None,
    ]));
    let c_values: ArrayRef = Arc::new(Int32Array::from(vec![Some(100), Some(200), None]));

    // Inner struct first, then the top-level struct that embeds it.
    let nested_struct_array: ArrayRef = Arc::new(StructArray::from(vec![
        (Arc::new(arrow_field_a), a_values),
        (Arc::new(arrow_field_b), b_values),
    ]));
    let struct_array: ArrayRef = Arc::new(StructArray::from(vec![
        (Arc::new(arrow_nested), nested_struct_array),
        (Arc::new(arrow_field_c), c_values),
    ]));

    // Accessor under test: resolve fields by name.
    let accessor = ArrowArrayAccessor::new_with_match_mode(FieldMatchMode::Name);

    // Iceberg-side description of the nested struct and its children.
    let nested_field = NestedField::optional(
        3,
        "nested_struct",
        Type::Struct(StructType::new(vec![
            Arc::new(NestedField::optional(
                1,
                "field_a",
                Type::Primitive(PrimitiveType::Int),
            )),
            Arc::new(NestedField::optional(
                2,
                "field_b",
                Type::Primitive(PrimitiveType::String),
            )),
        ])),
    );
    let field_a = NestedField::optional(1, "field_a", Type::Primitive(PrimitiveType::Int));

    // Step down: top-level struct -> nested struct -> field_a, all located by name.
    let nested_partner = accessor
        .field_partner(&struct_array, &nested_field)
        .unwrap();
    let field_a_partner = accessor.field_partner(nested_partner, &field_a).unwrap();

    // The resolved column must be the Int32 data that was stored under field_a.
    let int_array = field_a_partner
        .as_any()
        .downcast_ref::<Int32Array>()
        .unwrap();
    let observed: Vec<Option<i32>> = int_array.iter().collect();
    assert_eq!(observed, vec![Some(42), Some(43), None]);
}
981
1123
982
1124
#[ test]
0 commit comments