Commit edb4719

refactor: simplify RecordBatchProjector and optimize partition calculation
Signed-off-by: Florian Valeye <[email protected]>
1 parent: bdc6aae

File tree: 3 files changed, +72 −78 lines

  crates/iceberg/src/arrow/record_batch_projector.rs
  crates/iceberg/src/transform/mod.rs
  crates/integrations/datafusion/src/physical_plan/project.rs


crates/iceberg/src/arrow/record_batch_projector.rs

Lines changed: 34 additions & 45 deletions
@@ -22,6 +22,7 @@ use arrow_buffer::NullBuffer;
 use arrow_schema::{DataType, Field, FieldRef, Fields, Schema, SchemaRef};
 use parquet::arrow::PARQUET_FIELD_ID_META_KEY;
 
+use crate::arrow::schema::schema_to_arrow_schema;
 use crate::error::Result;
 use crate::spec::Schema as IcebergSchema;
 use crate::{Error, ErrorKind};
@@ -79,22 +80,21 @@ impl RecordBatchProjector {
         })
     }
 
-    /// Create RecordBatchProjector using Iceberg schema for field mapping.
+    /// Create RecordBatchProjector using Iceberg schema.
     ///
-    /// This constructor is more flexible and works with any Arrow schema by using
-    /// the Iceberg schema to map field names to field IDs.
+    /// This constructor converts the Iceberg schema to Arrow schema with field ID metadata,
+    /// then uses the standard field ID lookup for projection.
     ///
     /// # Arguments
-    /// * `original_schema` - The original Arrow schema (doesn't need field ID metadata)
-    /// * `iceberg_schema` - The Iceberg schema for field ID mapping
+    /// * `iceberg_schema` - The Iceberg schema for field ID mapping
     /// * `target_field_ids` - The field IDs to project
-    pub fn from_iceberg_schema_mapping(
-        original_schema: SchemaRef,
+    pub fn from_iceberg_schema(
         iceberg_schema: Arc<IcebergSchema>,
         target_field_ids: &[i32],
     ) -> Result<Self> {
+        let arrow_schema_with_ids = Arc::new(schema_to_arrow_schema(&iceberg_schema)?);
+
         let field_id_fetch_func = |field: &Field| -> Result<Option<i64>> {
-            // First try to get field ID from metadata (Parquet case)
             if let Some(value) = field.metadata().get(PARQUET_FIELD_ID_META_KEY) {
                 let field_id = value.parse::<i32>().map_err(|e| {
                     Error::new(
@@ -104,49 +104,16 @@ impl RecordBatchProjector {
                         .with_context("value", value)
                         .with_source(e)
                 })?;
-                return Ok(Some(field_id as i64));
-            }
-
-            // Fallback: use Iceberg schema's built-in field lookup
-            if let Some(iceberg_field) = iceberg_schema.field_by_name(field.name()) {
-                return Ok(Some(iceberg_field.id as i64));
-            }
-
-            // Additional fallback: for nested fields, we need to search recursively
-            fn find_field_id_in_struct(
-                struct_type: &crate::spec::StructType,
-                field_name: &str,
-            ) -> Option<i32> {
-                for field in struct_type.fields() {
-                    if field.name == field_name {
-                        return Some(field.id);
-                    }
-                    if let crate::spec::Type::Struct(nested_struct) = &*field.field_type {
-                        if let Some(nested_id) = find_field_id_in_struct(nested_struct, field_name)
-                        {
-                            return Some(nested_id);
-                        }
-                    }
-                }
-                None
-            }
-
-            // Search in nested structs
-            for iceberg_field in iceberg_schema.as_struct().fields() {
-                if let crate::spec::Type::Struct(struct_type) = &*iceberg_field.field_type {
-                    if let Some(nested_id) = find_field_id_in_struct(struct_type, field.name()) {
-                        return Ok(Some(nested_id as i64));
-                    }
-                }
+                Ok(Some(field_id as i64))
+            } else {
+                Ok(None)
             }
-
-            Ok(None)
         };
 
         let searchable_field_func = |_field: &Field| -> bool { true };
 
         Self::new(
-            original_schema,
+            arrow_schema_with_ids,
             target_field_ids,
             field_id_fetch_func,
             searchable_field_func,
@@ -242,6 +209,7 @@ mod test {
     use arrow_schema::{DataType, Field, Fields, Schema};
 
     use crate::arrow::record_batch_projector::RecordBatchProjector;
+    use crate::spec::{NestedField, PrimitiveType, Schema as IcebergSchema, Type};
     use crate::{Error, ErrorKind};
 
     #[test]
@@ -369,4 +337,25 @@ mod test {
             RecordBatchProjector::new(schema.clone(), &[3], field_id_fetch_func, |_| true);
         assert!(projector.is_ok());
     }
+
+    #[test]
+    fn test_from_iceberg_schema() {
+        let iceberg_schema = IcebergSchema::builder()
+            .with_schema_id(0)
+            .with_fields(vec![
+                NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(),
+                NestedField::required(2, "name", Type::Primitive(PrimitiveType::String)).into(),
+                NestedField::optional(3, "age", Type::Primitive(PrimitiveType::Int)).into(),
+            ])
+            .build()
+            .unwrap();
+
+        let projector =
+            RecordBatchProjector::from_iceberg_schema(Arc::new(iceberg_schema), &[1, 3]).unwrap();
+
+        assert_eq!(projector.field_indices.len(), 2);
+        assert_eq!(projector.projected_schema_ref().fields().len(), 2);
+        assert_eq!(projector.projected_schema_ref().field(0).name(), "id");
+        assert_eq!(projector.projected_schema_ref().field(1).name(), "age");
+    }
 }
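Note on why the deleted name-based and recursive fallbacks are safe to drop: schema_to_arrow_schema emits an Arrow schema in which every field already carries its Iceberg field ID in the field metadata under PARQUET_FIELD_ID_META_KEY, so the single metadata lookup in the simplified field_id_fetch_func is sufficient. A minimal standalone sketch of that invariant (illustration only, not the crate's implementation; the field name and ID here are made up):

use std::collections::HashMap;

use arrow_schema::{DataType, Field};
use parquet::arrow::PARQUET_FIELD_ID_META_KEY;

fn main() {
    // Hypothetical stand-in for what schema_to_arrow_schema produces: every
    // Arrow field carries its Iceberg field ID in the metadata map.
    let field = Field::new("id", DataType::Int32, false).with_metadata(HashMap::from([(
        PARQUET_FIELD_ID_META_KEY.to_string(),
        "1".to_string(),
    )]));

    // The simplified field_id_fetch_func only has to read that one key;
    // no name-based or recursive struct lookup is needed any more.
    let field_id: Option<i64> = field
        .metadata()
        .get(PARQUET_FIELD_ID_META_KEY)
        .and_then(|v| v.parse::<i64>().ok());

    assert_eq!(field_id, Some(1));
}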

crates/iceberg/src/transform/mod.rs

Lines changed: 1 addition & 1 deletion
@@ -29,7 +29,7 @@ mod truncate;
 mod void;
 
 /// TransformFunction is a trait that defines the interface for all transform functions.
-pub trait TransformFunction: Send + Sync {
+pub trait TransformFunction: Send + Sync + std::fmt::Debug {
     /// transform will take an input array and transform it into a new array.
     /// The implementation of this function will need to check and downcast the input to specific
     /// type.
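Why the std::fmt::Debug supertrait: the DataFusion integration below stores Vec<BoxedTransformFunction> inside a struct that derives Debug, and #[derive(Debug)] requires every field, including boxed trait objects, to implement Debug. A reduced sketch of the constraint, with a hypothetical Transform trait standing in for TransformFunction:

use std::fmt::Debug;

// Without the Debug supertrait here, #[derive(Debug)] on Holder below would
// fail to compile, because Box<dyn Transform> would have no Debug impl.
trait Transform: Send + Sync + Debug {
    fn apply(&self, x: i64) -> i64;
}

#[derive(Debug)]
struct Identity;

impl Transform for Identity {
    fn apply(&self, x: i64) -> i64 {
        x
    }
}

// Mirrors the shape of PartitionValueCalculator, which now stores
// Vec<BoxedTransformFunction> and still derives Debug.
#[derive(Debug)]
struct Holder {
    transforms: Vec<Box<dyn Transform>>,
}

fn main() {
    let holder = Holder {
        transforms: vec![Box::new(Identity)],
    };
    println!("{:?}", holder); // works because dyn Transform: Debug
}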

crates/integrations/datafusion/src/physical_plan/project.rs

Lines changed: 37 additions & 32 deletions
@@ -30,6 +30,7 @@ use datafusion::physical_plan::{ColumnarValue, ExecutionPlan};
 use iceberg::arrow::record_batch_projector::RecordBatchProjector;
 use iceberg::spec::{PartitionSpec, Schema};
 use iceberg::table::Table;
+use iceberg::transform::BoxedTransformFunction;
 
 use crate::to_datafusion_error;
 
@@ -126,7 +127,7 @@ impl PhysicalExpr for PartitionExpr {
     }
 
     fn evaluate(&self, batch: &RecordBatch) -> DFResult<ColumnarValue> {
-        let mut calculator = self
+        let calculator = self
             .calculator
             .lock()
             .map_err(|e| DataFusionError::Internal(format!("Failed to lock calculator: {}", e)))?;
@@ -183,12 +184,12 @@ impl std::hash::Hash for PartitionExpr {
 }
 
 /// Calculator for partition values in Iceberg tables
-#[derive(Debug, Clone)]
+#[derive(Debug)]
 struct PartitionValueCalculator {
     partition_spec: PartitionSpec,
-    table_schema: Schema,
     partition_type: DataType,
-    projector: Option<RecordBatchProjector>,
+    projector: RecordBatchProjector,
+    transform_functions: Vec<BoxedTransformFunction>,
 }
 
 impl PartitionValueCalculator {
@@ -203,35 +204,37 @@ impl PartitionValueCalculator {
             ));
         }
 
+        let transform_functions: Result<Vec<BoxedTransformFunction>, _> = partition_spec
+            .fields()
+            .iter()
+            .map(|pf| iceberg::transform::create_transform_function(&pf.transform))
+            .collect();
+
+        let transform_functions = transform_functions.map_err(to_datafusion_error)?;
+
+        let source_field_ids: Vec<i32> = partition_spec
+            .fields()
+            .iter()
+            .map(|pf| pf.source_id)
+            .collect();
+
+        let projector = RecordBatchProjector::from_iceberg_schema(
+            Arc::new(table_schema.clone()),
+            &source_field_ids,
+        )
+        .map_err(to_datafusion_error)?;
+
         Ok(Self {
             partition_spec,
-            table_schema,
             partition_type,
-            projector: None,
+            projector,
+            transform_functions,
         })
     }
 
-    fn calculate(&mut self, batch: &RecordBatch) -> DFResult<ArrayRef> {
-        if self.projector.is_none() {
-            let source_field_ids: Vec<i32> = self
-                .partition_spec
-                .fields()
-                .iter()
-                .map(|pf| pf.source_id)
-                .collect();
-
-            let projector = RecordBatchProjector::from_iceberg_schema_mapping(
-                batch.schema(),
-                Arc::new(self.table_schema.clone()),
-                &source_field_ids,
-            )
-            .map_err(to_datafusion_error)?;
-
-            self.projector = Some(projector);
-        }
-
-        let projector = self.projector.as_ref().unwrap();
-        let source_columns = projector
+    fn calculate(&self, batch: &RecordBatch) -> DFResult<ArrayRef> {
+        let source_columns = self
+            .projector
             .project_column(batch.columns())
             .map_err(to_datafusion_error)?;
 
@@ -246,10 +249,7 @@ impl PartitionValueCalculator {
 
         let mut partition_values = Vec::with_capacity(self.partition_spec.fields().len());
 
-        for (source_column, pf) in source_columns.iter().zip(self.partition_spec.fields()) {
-            let transform_fn = iceberg::transform::create_transform_function(&pf.transform)
-                .map_err(to_datafusion_error)?;
-
+        for (source_column, transform_fn) in source_columns.iter().zip(&self.transform_functions) {
             let partition_value = transform_fn
                 .transform(source_column.clone())
                 .map_err(to_datafusion_error)?;
@@ -302,6 +302,11 @@ mod tests {
             .build()
             .unwrap();
 
+        let arrow_schema = Arc::new(ArrowSchema::new(vec![
+            Field::new("id", DataType::Int32, false),
+            Field::new("name", DataType::Utf8, false),
+        ]));
+
         let partition_type = build_partition_type(&partition_spec, &table_schema).unwrap();
         let calculator = PartitionValueCalculator::new(
             partition_spec.clone(),
@@ -476,7 +481,7 @@ mod tests {
             .unwrap();
 
         let partition_type = build_partition_type(&partition_spec, &table_schema).unwrap();
-        let mut calculator =
+        let calculator =
             PartitionValueCalculator::new(partition_spec, table_schema, partition_type).unwrap();
         let array = calculator.calculate(&batch).unwrap();
 
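The net effect of the project.rs changes: all fallible, allocation-heavy setup (transform functions and projector) moves into the constructor, so calculate can take &self and the per-batch path is free of lazy-initialization branches. A schematic sketch of the pattern with toy stand-in types, not the real iceberg/DataFusion API:

// Toy stand-ins for the real types; this only illustrates the refactor shape.
#[derive(Debug)]
struct Projector;

impl Projector {
    fn project(&self, batch: &[i64]) -> Vec<i64> {
        batch.to_vec()
    }
}

#[derive(Debug)]
struct Calculator {
    // Built once in new(); previously an Option<_> filled in lazily on the
    // first calculate() call, which forced &mut self and a runtime branch.
    projector: Projector,
}

impl Calculator {
    fn new() -> Self {
        // All construction cost and fallibility is paid up front, once.
        Self { projector: Projector }
    }

    // &self instead of &mut self: the Mutex guard in evaluate() no longer
    // needs mutable access, and the hot loop does no initialization checks.
    fn calculate(&self, batch: &[i64]) -> Vec<i64> {
        self.projector.project(batch)
    }
}

fn main() {
    let calc = Calculator::new();
    assert_eq!(calc.calculate(&[1, 2, 3]), vec![1, 2, 3]);
}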
