@@ -21,7 +21,7 @@ use arrow_array::{
21
21
LargeListArray , LargeStringArray , ListArray , MapArray , StringArray , StructArray ,
22
22
Time64MicrosecondArray , TimestampMicrosecondArray , TimestampNanosecondArray ,
23
23
} ;
24
- use arrow_schema:: DataType ;
24
+ use arrow_schema:: { DataType , FieldRef } ;
25
25
use uuid:: Uuid ;
26
26
27
27
use super :: get_field_id;
@@ -425,11 +425,63 @@ impl SchemaWithPartnerVisitor<ArrayRef> for ArrowArrayToIcebergStructConverter {
425
425
}
426
426
}
427
427
428
+ /// Defines how Arrow fields are matched with Iceberg fields when converting data.
429
+ ///
430
+ /// This enum provides two strategies for matching fields:
431
+ /// - `Id`: Match fields by their ID, which is stored in Arrow field metadata.
432
+ /// - `Name`: Match fields by their name, ignoring the field ID.
433
+ ///
434
+ /// The ID matching mode is the default and preferred approach as it's more robust
435
+ /// against schema evolution where field names might change but IDs remain stable.
436
+ /// The name matching mode can be useful in scenarios where field IDs are not available
437
+ /// or when working with systems that don't preserve field IDs.
438
+ #[ derive( Clone , Copy , Debug ) ]
439
+ pub enum FieldMatchMode {
440
+ /// Match fields by their ID stored in Arrow field metadata
441
+ Id ,
442
+ /// Match fields by their name, ignoring field IDs
443
+ Name ,
444
+ }
445
+
446
+ impl FieldMatchMode {
447
+ /// Determines if an Arrow field matches an Iceberg field based on the matching mode.
448
+ pub fn match_field ( & self , arrow_field : & FieldRef , iceberg_field : & NestedField ) -> bool {
449
+ match self {
450
+ FieldMatchMode :: Id => get_field_id ( arrow_field)
451
+ . map ( |id| id == iceberg_field. id )
452
+ . unwrap_or ( false ) ,
453
+ FieldMatchMode :: Name => arrow_field. name ( ) == & iceberg_field. name ,
454
+ }
455
+ }
456
+ }
457
+
428
458
/// Partner type representing accessing and walking arrow arrays alongside iceberg schema
429
- pub struct ArrowArrayAccessor ;
459
+ pub struct ArrowArrayAccessor {
460
+ match_mode : FieldMatchMode ,
461
+ }
462
+
463
+ impl ArrowArrayAccessor {
464
+ /// Creates a new instance of ArrowArrayAccessor with the default ID matching mode
465
+ pub fn new ( ) -> Self {
466
+ Self {
467
+ match_mode : FieldMatchMode :: Id ,
468
+ }
469
+ }
470
+
471
+ /// Creates a new instance of ArrowArrayAccessor with the specified matching mode
472
+ pub fn new_with_match_mode ( match_mode : FieldMatchMode ) -> Self {
473
+ Self { match_mode }
474
+ }
475
+ }
476
+
477
+ impl Default for ArrowArrayAccessor {
478
+ fn default ( ) -> Self {
479
+ Self :: new ( )
480
+ }
481
+ }
430
482
431
483
impl PartnerAccessor < ArrayRef > for ArrowArrayAccessor {
432
- fn struct_parner < ' a > ( & self , schema_partner : & ' a ArrayRef ) -> Result < & ' a ArrayRef > {
484
+ fn struct_partner < ' a > ( & self , schema_partner : & ' a ArrayRef ) -> Result < & ' a ArrayRef > {
433
485
if !matches ! ( schema_partner. data_type( ) , DataType :: Struct ( _) ) {
434
486
return Err ( Error :: new (
435
487
ErrorKind :: DataInvalid ,
@@ -451,18 +503,17 @@ impl PartnerAccessor<ArrayRef> for ArrowArrayAccessor {
451
503
. ok_or_else ( || {
452
504
Error :: new (
453
505
ErrorKind :: DataInvalid ,
454
- "The struct partner is not a struct array" ,
506
+ format ! (
507
+ "The struct partner is not a struct array, partner: {:?}" ,
508
+ struct_partner
509
+ ) ,
455
510
)
456
511
} ) ?;
457
512
458
513
let field_pos = struct_array
459
514
. fields ( )
460
515
. iter ( )
461
- . position ( |arrow_field| {
462
- get_field_id ( arrow_field)
463
- . map ( |id| id == field. id )
464
- . unwrap_or ( false )
465
- } )
516
+ . position ( |arrow_field| self . match_mode . match_field ( arrow_field, field) )
466
517
. ok_or_else ( || {
467
518
Error :: new (
468
519
ErrorKind :: DataInvalid ,
@@ -549,7 +600,7 @@ pub fn arrow_struct_to_literal(
549
600
ty,
550
601
struct_array,
551
602
& mut ArrowArrayToIcebergStructConverter ,
552
- & ArrowArrayAccessor ,
603
+ & ArrowArrayAccessor :: new ( ) ,
553
604
)
554
605
}
555
606
@@ -899,6 +950,183 @@ mod test {
899
950
assert_eq ! ( result, vec![ None ; 0 ] ) ;
900
951
}
901
952
953
+ #[ test]
954
+ fn test_find_field_by_id ( ) {
955
+ // Create Arrow arrays for the nested structure
956
+ let field_a_array = Int32Array :: from ( vec ! [ Some ( 42 ) , Some ( 43 ) , None ] ) ;
957
+ let field_b_array = StringArray :: from ( vec ! [ Some ( "value1" ) , Some ( "value2" ) , None ] ) ;
958
+
959
+ // Create the nested struct array with field IDs in metadata
960
+ let nested_struct_array =
961
+ Arc :: new ( StructArray :: from ( vec ! [
962
+ (
963
+ Arc :: new( Field :: new( "field_a" , DataType :: Int32 , true ) . with_metadata(
964
+ HashMap :: from( [ ( PARQUET_FIELD_ID_META_KEY . to_string( ) , "1" . to_string( ) ) ] ) ,
965
+ ) ) ,
966
+ Arc :: new( field_a_array) as ArrayRef ,
967
+ ) ,
968
+ (
969
+ Arc :: new( Field :: new( "field_b" , DataType :: Utf8 , true ) . with_metadata(
970
+ HashMap :: from( [ ( PARQUET_FIELD_ID_META_KEY . to_string( ) , "2" . to_string( ) ) ] ) ,
971
+ ) ) ,
972
+ Arc :: new( field_b_array) as ArrayRef ,
973
+ ) ,
974
+ ] ) ) as ArrayRef ;
975
+
976
+ let field_c_array = Int32Array :: from ( vec ! [ Some ( 100 ) , Some ( 200 ) , None ] ) ;
977
+
978
+ // Create the top-level struct array with field IDs in metadata
979
+ let struct_array = Arc :: new ( StructArray :: from ( vec ! [
980
+ (
981
+ Arc :: new(
982
+ Field :: new(
983
+ "nested_struct" ,
984
+ DataType :: Struct ( Fields :: from( vec![
985
+ Field :: new( "field_a" , DataType :: Int32 , true ) . with_metadata(
986
+ HashMap :: from( [ (
987
+ PARQUET_FIELD_ID_META_KEY . to_string( ) ,
988
+ "1" . to_string( ) ,
989
+ ) ] ) ,
990
+ ) ,
991
+ Field :: new( "field_b" , DataType :: Utf8 , true ) . with_metadata(
992
+ HashMap :: from( [ (
993
+ PARQUET_FIELD_ID_META_KEY . to_string( ) ,
994
+ "2" . to_string( ) ,
995
+ ) ] ) ,
996
+ ) ,
997
+ ] ) ) ,
998
+ true ,
999
+ )
1000
+ . with_metadata( HashMap :: from( [ (
1001
+ PARQUET_FIELD_ID_META_KEY . to_string( ) ,
1002
+ "3" . to_string( ) ,
1003
+ ) ] ) ) ,
1004
+ ) ,
1005
+ nested_struct_array,
1006
+ ) ,
1007
+ (
1008
+ Arc :: new( Field :: new( "field_c" , DataType :: Int32 , true ) . with_metadata(
1009
+ HashMap :: from( [ ( PARQUET_FIELD_ID_META_KEY . to_string( ) , "4" . to_string( ) ) ] ) ,
1010
+ ) ) ,
1011
+ Arc :: new( field_c_array) as ArrayRef ,
1012
+ ) ,
1013
+ ] ) ) as ArrayRef ;
1014
+
1015
+ // Create an ArrowArrayAccessor with ID matching mode
1016
+ let accessor = ArrowArrayAccessor :: new_with_match_mode ( FieldMatchMode :: Id ) ;
1017
+
1018
+ // Test finding fields by ID
1019
+ let nested_field = NestedField :: optional (
1020
+ 3 ,
1021
+ "nested_struct" ,
1022
+ Type :: Struct ( StructType :: new ( vec ! [
1023
+ Arc :: new( NestedField :: optional(
1024
+ 1 ,
1025
+ "field_a" ,
1026
+ Type :: Primitive ( PrimitiveType :: Int ) ,
1027
+ ) ) ,
1028
+ Arc :: new( NestedField :: optional(
1029
+ 2 ,
1030
+ "field_b" ,
1031
+ Type :: Primitive ( PrimitiveType :: String ) ,
1032
+ ) ) ,
1033
+ ] ) ) ,
1034
+ ) ;
1035
+ let nested_partner = accessor
1036
+ . field_partner ( & struct_array, & nested_field)
1037
+ . unwrap ( ) ;
1038
+
1039
+ // Verify we can access the nested field
1040
+ let field_a = NestedField :: optional ( 1 , "field_a" , Type :: Primitive ( PrimitiveType :: Int ) ) ;
1041
+ let field_a_partner = accessor. field_partner ( nested_partner, & field_a) . unwrap ( ) ;
1042
+
1043
+ // Verify the field has the expected value
1044
+ let int_array = field_a_partner
1045
+ . as_any ( )
1046
+ . downcast_ref :: < Int32Array > ( )
1047
+ . unwrap ( ) ;
1048
+ assert_eq ! ( int_array. value( 0 ) , 42 ) ;
1049
+ assert_eq ! ( int_array. value( 1 ) , 43 ) ;
1050
+ assert ! ( int_array. is_null( 2 ) ) ;
1051
+ }
1052
+
1053
+ #[ test]
1054
+ fn test_find_field_by_name ( ) {
1055
+ // Create Arrow arrays for the nested structure
1056
+ let field_a_array = Int32Array :: from ( vec ! [ Some ( 42 ) , Some ( 43 ) , None ] ) ;
1057
+ let field_b_array = StringArray :: from ( vec ! [ Some ( "value1" ) , Some ( "value2" ) , None ] ) ;
1058
+
1059
+ // Create the nested struct array WITHOUT field IDs in metadata
1060
+ let nested_struct_array = Arc :: new ( StructArray :: from ( vec ! [
1061
+ (
1062
+ Arc :: new( Field :: new( "field_a" , DataType :: Int32 , true ) ) ,
1063
+ Arc :: new( field_a_array) as ArrayRef ,
1064
+ ) ,
1065
+ (
1066
+ Arc :: new( Field :: new( "field_b" , DataType :: Utf8 , true ) ) ,
1067
+ Arc :: new( field_b_array) as ArrayRef ,
1068
+ ) ,
1069
+ ] ) ) as ArrayRef ;
1070
+
1071
+ let field_c_array = Int32Array :: from ( vec ! [ Some ( 100 ) , Some ( 200 ) , None ] ) ;
1072
+
1073
+ // Create the top-level struct array WITHOUT field IDs in metadata
1074
+ let struct_array = Arc :: new ( StructArray :: from ( vec ! [
1075
+ (
1076
+ Arc :: new( Field :: new(
1077
+ "nested_struct" ,
1078
+ DataType :: Struct ( Fields :: from( vec![
1079
+ Field :: new( "field_a" , DataType :: Int32 , true ) ,
1080
+ Field :: new( "field_b" , DataType :: Utf8 , true ) ,
1081
+ ] ) ) ,
1082
+ true ,
1083
+ ) ) ,
1084
+ nested_struct_array,
1085
+ ) ,
1086
+ (
1087
+ Arc :: new( Field :: new( "field_c" , DataType :: Int32 , true ) ) ,
1088
+ Arc :: new( field_c_array) as ArrayRef ,
1089
+ ) ,
1090
+ ] ) ) as ArrayRef ;
1091
+
1092
+ // Create an ArrowArrayAccessor with Name matching mode
1093
+ let accessor = ArrowArrayAccessor :: new_with_match_mode ( FieldMatchMode :: Name ) ;
1094
+
1095
+ // Test finding fields by name
1096
+ let nested_field = NestedField :: optional (
1097
+ 3 ,
1098
+ "nested_struct" ,
1099
+ Type :: Struct ( StructType :: new ( vec ! [
1100
+ Arc :: new( NestedField :: optional(
1101
+ 1 ,
1102
+ "field_a" ,
1103
+ Type :: Primitive ( PrimitiveType :: Int ) ,
1104
+ ) ) ,
1105
+ Arc :: new( NestedField :: optional(
1106
+ 2 ,
1107
+ "field_b" ,
1108
+ Type :: Primitive ( PrimitiveType :: String ) ,
1109
+ ) ) ,
1110
+ ] ) ) ,
1111
+ ) ;
1112
+ let nested_partner = accessor
1113
+ . field_partner ( & struct_array, & nested_field)
1114
+ . unwrap ( ) ;
1115
+
1116
+ // Verify we can access the nested field by name
1117
+ let field_a = NestedField :: optional ( 1 , "field_a" , Type :: Primitive ( PrimitiveType :: Int ) ) ;
1118
+ let field_a_partner = accessor. field_partner ( nested_partner, & field_a) . unwrap ( ) ;
1119
+
1120
+ // Verify the field has the expected value
1121
+ let int_array = field_a_partner
1122
+ . as_any ( )
1123
+ . downcast_ref :: < Int32Array > ( )
1124
+ . unwrap ( ) ;
1125
+ assert_eq ! ( int_array. value( 0 ) , 42 ) ;
1126
+ assert_eq ! ( int_array. value( 1 ) , 43 ) ;
1127
+ assert ! ( int_array. is_null( 2 ) ) ;
1128
+ }
1129
+
902
1130
#[ test]
903
1131
fn test_complex_nested ( ) {
904
1132
// complex nested type for test
0 commit comments