@@ -31,13 +31,14 @@ use arrow_schema::{
3131use parquet:: arrow:: PARQUET_FIELD_ID_META_KEY ;
3232
3333use crate :: arrow:: schema_to_arrow_schema;
34- use crate :: metadata_columns:: get_metadata_field;
34+ use crate :: metadata_columns:: { get_metadata_field, metadata_field_primitive_type } ;
3535use crate :: spec:: {
36- Literal , PartitionSpec , PrimitiveLiteral , Schema as IcebergSchema , Struct , Transform ,
36+ Datum , Literal , PartitionSpec , PrimitiveLiteral , PrimitiveType , Schema as IcebergSchema ,
37+ Struct , Transform ,
3738} ;
3839use crate :: { Error , ErrorKind , Result } ;
3940
40- /// Build a map of field ID to constant value for identity-partitioned fields.
41+ /// Build a map of field ID to constant value (as Datum) for identity-partitioned fields.
4142///
4243/// Implements Iceberg spec "Column Projection" rule #1: use partition metadata constants
4344/// only for identity-transformed fields. Non-identity transforms (bucket, truncate, year, etc.)
@@ -54,20 +55,32 @@ use crate::{Error, ErrorKind, Result};
5455fn constants_map (
5556 partition_spec : & PartitionSpec ,
5657 partition_data : & Struct ,
57- ) -> HashMap < i32 , PrimitiveLiteral > {
58+ schema : & IcebergSchema ,
59+ ) -> Result < HashMap < i32 , Datum > > {
5860 let mut constants = HashMap :: new ( ) ;
5961
6062 for ( pos, field) in partition_spec. fields ( ) . iter ( ) . enumerate ( ) {
6163 // Only identity transforms should use constant values from partition metadata
6264 if matches ! ( field. transform, Transform :: Identity ) {
6365 // Get the partition value for this field
6466 if let Some ( Literal :: Primitive ( value) ) = & partition_data[ pos] {
65- constants. insert ( field. source_id , value. clone ( ) ) ;
67+ // Get the field from schema to extract its type
68+ let iceberg_field = schema. field_by_id ( field. source_id ) . ok_or ( Error :: new (
69+ ErrorKind :: Unexpected ,
70+ format ! ( "Field {} not found in schema" , field. source_id) ,
71+ ) ) ?;
72+
73+ // Extract the primitive type from the field
74+ if let crate :: spec:: Type :: Primitive ( prim_type) = & * iceberg_field. field_type {
75+ // Create a Datum from the primitive type and value
76+ let datum = Datum :: new ( prim_type. clone ( ) , value. clone ( ) ) ;
77+ constants. insert ( field. source_id , datum) ;
78+ }
6679 }
6780 }
6881 }
6982
70- constants
83+ Ok ( constants)
7184}
7285
7386/// Indicates how a particular column in a processed RecordBatch should
@@ -153,7 +166,7 @@ enum SchemaComparison {
153166pub ( crate ) struct RecordBatchTransformerBuilder {
154167 snapshot_schema : Arc < IcebergSchema > ,
155168 projected_iceberg_field_ids : Vec < i32 > ,
156- constant_fields : HashMap < i32 , ( DataType , PrimitiveLiteral ) > ,
169+ constant_fields : HashMap < i32 , Datum > ,
157170}
158171
159172impl RecordBatchTransformerBuilder {
@@ -173,11 +186,34 @@ impl RecordBatchTransformerBuilder {
173186 ///
174187 /// # Arguments
175188 /// * `field_id` - The field ID to associate with the constant
176- /// * `value` - The constant value for this field
177- pub ( crate ) fn with_constant ( mut self , field_id : i32 , value : PrimitiveLiteral ) -> Result < Self > {
178- let arrow_type = RecordBatchTransformer :: primitive_literal_to_arrow_type ( & value) ?;
179- self . constant_fields . insert ( field_id, ( arrow_type, value) ) ;
180- Ok ( self )
189+ /// * `datum` - The constant value (with type) for this field
190+ pub ( crate ) fn with_constant ( mut self , field_id : i32 , datum : Datum ) -> Self {
191+ self . constant_fields . insert ( field_id, datum) ;
192+ self
193+ }
194+
195+ /// Add a reserved/metadata field with a constant string value.
196+ /// This is a convenience method for reserved fields like _file that automatically
197+ /// handles type extraction from the field definition.
198+ ///
199+ /// # Arguments
200+ /// * `field_id` - The reserved field ID (e.g., RESERVED_FIELD_ID_FILE)
201+ /// * `value` - The constant string value for this field
202+ ///
203+ /// # Returns
204+ /// Self for method chaining, or an error if the field is not a valid metadata field
205+ pub ( crate ) fn with_reserved_field ( self , field_id : i32 , value : String ) -> Result < Self > {
206+ // Get the Iceberg field definition
207+ let iceberg_field = get_metadata_field ( field_id) ?;
208+
209+ // Extract the primitive type from the field
210+ let prim_type = metadata_field_primitive_type ( & iceberg_field) ?;
211+
212+ // Create a Datum with the extracted type and value
213+ let datum = Datum :: new ( prim_type, PrimitiveLiteral :: String ( value) ) ;
214+
215+ // Add the constant field
216+ Ok ( self . with_constant ( field_id, datum) )
181217 }
182218
183219 /// Set partition spec and data together for identifying identity-transformed partition columns.
@@ -190,13 +226,13 @@ impl RecordBatchTransformerBuilder {
190226 partition_spec : Arc < PartitionSpec > ,
191227 partition_data : Struct ,
192228 ) -> Result < Self > {
193- // Compute partition constants for identity-transformed fields
194- let partition_constants = constants_map ( & partition_spec, & partition_data) ;
229+ // Compute partition constants for identity-transformed fields (already returns Datum)
230+ let partition_constants =
231+ constants_map ( & partition_spec, & partition_data, & self . snapshot_schema ) ?;
195232
196- // Add partition constants to constant_fields (compute REE types from literals)
197- for ( field_id, value) in partition_constants {
198- let arrow_type = RecordBatchTransformer :: primitive_literal_to_arrow_type ( & value) ?;
199- self . constant_fields . insert ( field_id, ( arrow_type, value) ) ;
233+ // Add partition constants to constant_fields
234+ for ( field_id, datum) in partition_constants {
235+ self . constant_fields . insert ( field_id, datum) ;
200236 }
201237
202238 Ok ( self )
@@ -246,10 +282,10 @@ impl RecordBatchTransformerBuilder {
246282pub ( crate ) struct RecordBatchTransformer {
247283 snapshot_schema : Arc < IcebergSchema > ,
248284 projected_iceberg_field_ids : Vec < i32 > ,
249- // Pre-computed constant field information: field_id -> (arrow_type, value)
285+ // Pre-computed constant field information: field_id -> Datum
250286 // Includes both virtual/metadata fields (like _file) and identity-partitioned fields
251- // Avoids type conversions during batch processing
252- constant_fields : HashMap < i32 , ( DataType , PrimitiveLiteral ) > ,
287+ // Datum holds both the Iceberg type and the value
288+ constant_fields : HashMap < i32 , Datum > ,
253289
254290 // BatchTransform gets lazily constructed based on the schema of
255291 // the first RecordBatch we receive from the file
@@ -310,7 +346,7 @@ impl RecordBatchTransformer {
310346 source_schema : & ArrowSchemaRef ,
311347 snapshot_schema : & IcebergSchema ,
312348 projected_iceberg_field_ids : & [ i32 ] ,
313- constant_fields : & HashMap < i32 , ( DataType , PrimitiveLiteral ) > ,
349+ constant_fields : & HashMap < i32 , Datum > ,
314350 ) -> Result < BatchTransform > {
315351 let mapped_unprojected_arrow_schema = Arc :: new ( schema_to_arrow_schema ( snapshot_schema) ?) ;
316352 let field_id_to_mapped_schema_map =
@@ -325,19 +361,28 @@ impl RecordBatchTransformer {
325361 if constant_fields. contains_key ( field_id) {
326362 // For metadata/virtual fields (like _file), get name from metadata_columns
327363 // For partition fields, get name from schema (they exist in schema)
328- if let Ok ( field) = get_metadata_field ( * field_id) {
329- // This is a metadata/virtual field - use the predefined field
330- Ok ( field)
364+ if let Ok ( iceberg_field) = get_metadata_field ( * field_id) {
365+ // This is a metadata/virtual field - convert Iceberg field to Arrow
366+ let arrow_type =
367+ Self :: datum_to_arrow_type ( constant_fields. get ( field_id) . unwrap ( ) ) ;
368+ let arrow_field =
369+ Field :: new ( & iceberg_field. name , arrow_type, !iceberg_field. required )
370+ . with_metadata ( HashMap :: from ( [ (
371+ PARQUET_FIELD_ID_META_KEY . to_string ( ) ,
372+ iceberg_field. id . to_string ( ) ,
373+ ) ] ) ) ;
374+ Ok ( Arc :: new ( arrow_field) )
331375 } else {
332376 // This is a partition constant field (exists in schema but uses constant value)
333377 let field = & field_id_to_mapped_schema_map
334378 . get ( field_id)
335379 . ok_or ( Error :: new ( ErrorKind :: Unexpected , "field not found" ) ) ?
336380 . 0 ;
337- let ( arrow_type, _) = constant_fields. get ( field_id) . unwrap ( ) ;
381+ let datum = constant_fields. get ( field_id) . unwrap ( ) ;
382+ let arrow_type = Self :: datum_to_arrow_type ( datum) ;
338383 // Use the type from constant_fields (REE for constants)
339384 let constant_field =
340- Field :: new ( field. name ( ) , arrow_type. clone ( ) , field. is_nullable ( ) )
385+ Field :: new ( field. name ( ) , arrow_type, field. is_nullable ( ) )
341386 . with_metadata ( field. metadata ( ) . clone ( ) ) ;
342387 Ok ( Arc :: new ( constant_field) )
343388 }
@@ -420,7 +465,7 @@ impl RecordBatchTransformer {
420465 snapshot_schema : & IcebergSchema ,
421466 projected_iceberg_field_ids : & [ i32 ] ,
422467 field_id_to_mapped_schema_map : HashMap < i32 , ( FieldRef , usize ) > ,
423- constant_fields : & HashMap < i32 , ( DataType , PrimitiveLiteral ) > ,
468+ constant_fields : & HashMap < i32 , Datum > ,
424469 ) -> Result < Vec < ColumnSource > > {
425470 let field_id_to_source_schema_map =
426471 Self :: build_field_id_to_arrow_schema_map ( source_schema) ?;
@@ -432,10 +477,11 @@ impl RecordBatchTransformer {
432477 // Constant fields always use their pre-computed constant values, regardless of whether
433478 // they exist in the Parquet file. This is per Iceberg spec rule #1: partition metadata
434479 // is authoritative and should be preferred over file data.
435- if let Some ( ( arrow_type, value) ) = constant_fields. get ( field_id) {
480+ if let Some ( datum) = constant_fields. get ( field_id) {
481+ let arrow_type = Self :: datum_to_arrow_type ( datum) ;
436482 return Ok ( ColumnSource :: Add {
437- value : Some ( value . clone ( ) ) ,
438- target_type : arrow_type. clone ( ) ,
483+ value : Some ( datum . literal ( ) . clone ( ) ) ,
484+ target_type : arrow_type,
439485 } ) ;
440486 }
441487
@@ -791,10 +837,10 @@ impl RecordBatchTransformer {
791837 }
792838 }
793839
794- /// Converts a PrimitiveLiteral to its corresponding Arrow DataType.
795- /// This is used for constant fields to determine the Arrow type.
840+ /// Converts a Datum (Iceberg type + primitive literal) to its corresponding Arrow DataType.
841+ /// Uses the PrimitiveType from the Datum to determine the correct Arrow type.
796842 /// For constant values, we use Run-End Encoding for all types to save memory.
797- fn primitive_literal_to_arrow_type ( literal : & PrimitiveLiteral ) -> Result < DataType > {
843+ fn datum_to_arrow_type ( datum : & Datum ) -> DataType {
798844 // Helper to create REE type with the given values type
799845 // Note: values field is nullable as Arrow expects this when building the
800846 // final Arrow schema with `RunArray::try_new`.
@@ -804,23 +850,27 @@ impl RecordBatchTransformer {
804850 DataType :: RunEndEncoded ( run_ends_field, values_field)
805851 } ;
806852
807- Ok ( match literal {
808- PrimitiveLiteral :: Boolean ( _) => make_ree ( DataType :: Boolean ) ,
809- PrimitiveLiteral :: Int ( _) => make_ree ( DataType :: Int32 ) ,
810- PrimitiveLiteral :: Long ( _) => make_ree ( DataType :: Int64 ) ,
811- PrimitiveLiteral :: Float ( _) => make_ree ( DataType :: Float32 ) ,
812- PrimitiveLiteral :: Double ( _) => make_ree ( DataType :: Float64 ) ,
813- PrimitiveLiteral :: String ( _) => make_ree ( DataType :: Utf8 ) ,
814- PrimitiveLiteral :: Binary ( _) => make_ree ( DataType :: Binary ) ,
815- PrimitiveLiteral :: Int128 ( _) => make_ree ( DataType :: Decimal128 ( 38 , 0 ) ) ,
816- PrimitiveLiteral :: UInt128 ( _) => make_ree ( DataType :: Decimal128 ( 38 , 0 ) ) ,
817- PrimitiveLiteral :: AboveMax | PrimitiveLiteral :: BelowMin => {
818- return Err ( Error :: new (
819- ErrorKind :: Unexpected ,
820- "Cannot create arrow type for AboveMax/BelowMin literal" ,
821- ) ) ;
853+ // Match on the PrimitiveType from the Datum to determine the Arrow type
854+ match datum. data_type ( ) {
855+ PrimitiveType :: Boolean => make_ree ( DataType :: Boolean ) ,
856+ PrimitiveType :: Int => make_ree ( DataType :: Int32 ) ,
857+ PrimitiveType :: Long => make_ree ( DataType :: Int64 ) ,
858+ PrimitiveType :: Float => make_ree ( DataType :: Float32 ) ,
859+ PrimitiveType :: Double => make_ree ( DataType :: Float64 ) ,
860+ PrimitiveType :: Date => make_ree ( DataType :: Date32 ) ,
861+ PrimitiveType :: Time => make_ree ( DataType :: Int64 ) ,
862+ PrimitiveType :: Timestamp => make_ree ( DataType :: Int64 ) ,
863+ PrimitiveType :: Timestamptz => make_ree ( DataType :: Int64 ) ,
864+ PrimitiveType :: TimestampNs => make_ree ( DataType :: Int64 ) ,
865+ PrimitiveType :: TimestamptzNs => make_ree ( DataType :: Int64 ) ,
866+ PrimitiveType :: String => make_ree ( DataType :: Utf8 ) ,
867+ PrimitiveType :: Uuid => make_ree ( DataType :: Binary ) ,
868+ PrimitiveType :: Fixed ( _) => make_ree ( DataType :: Binary ) ,
869+ PrimitiveType :: Binary => make_ree ( DataType :: Binary ) ,
870+ PrimitiveType :: Decimal { precision, scale } => {
871+ make_ree ( DataType :: Decimal128 ( * precision as u8 , * scale as i8 ) )
822872 }
823- } )
873+ }
824874 }
825875}
826876
0 commit comments