Commit 0b869a6

coalesce partitions

1 parent: b28f15b

3 files changed: 67 additions, 76 deletions

crates/integrations/datafusion/src/physical_plan/commit.rs
Lines changed: 56 additions & 64 deletions
@@ -29,7 +29,7 @@ use datafusion::physical_expr::{EquivalenceProperties, Partitioning};
 use datafusion::physical_plan::execution_plan::{Boundedness, EmissionType};
 use datafusion::physical_plan::stream::RecordBatchStreamAdapter;
 use datafusion::physical_plan::{
-    DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties, execute_stream_partitioned,
+    DisplayAs, DisplayFormatType, ExecutionPlan, ExecutionPlanProperties, PlanProperties,
 };
 use futures::StreamExt;
 use iceberg::Catalog;
@@ -42,10 +42,11 @@ use crate::to_datafusion_error;
 
 /// IcebergCommitExec is responsible for collecting results from multiple IcebergWriteExec
 /// instances and using Transaction::fast_append to commit the data files written.
+#[derive(Debug)]
 pub(crate) struct IcebergCommitExec {
     table: Table,
     catalog: Arc<dyn Catalog>,
-    write_plan: Arc<dyn ExecutionPlan>,
+    input: Arc<dyn ExecutionPlan>,
     schema: ArrowSchemaRef,
     count_schema: ArrowSchemaRef,
     plan_properties: PlanProperties,
@@ -55,15 +56,15 @@ impl IcebergCommitExec {
     pub fn new(
         table: Table,
         catalog: Arc<dyn Catalog>,
-        write_plan: Arc<dyn ExecutionPlan>,
+        input: Arc<dyn ExecutionPlan>,
         schema: ArrowSchemaRef,
     ) -> Self {
         let plan_properties = Self::compute_properties(schema.clone());
 
         Self {
             table,
             catalog,
-            write_plan,
+            input,
             schema,
             count_schema: Self::make_count_schema(),
             plan_properties,
@@ -99,12 +100,6 @@ impl IcebergCommitExec {
     }
 }
 
-impl Debug for IcebergCommitExec {
-    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
-        write!(f, "IcebergCommitExec")
-    }
-}
-
 impl DisplayAs for IcebergCommitExec {
     fn fmt_as(&self, t: DisplayFormatType, f: &mut Formatter) -> std::fmt::Result {
         match t {
@@ -140,18 +135,18 @@ impl ExecutionPlan for IcebergCommitExec {
     }
 
     fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
-        vec![&self.write_plan]
+        vec![&self.input]
     }
 
     fn with_new_children(
         self: Arc<Self>,
         children: Vec<Arc<dyn ExecutionPlan>>,
     ) -> DFResult<Arc<dyn ExecutionPlan>> {
         if children.len() != 1 {
-            return Err(DataFusionError::Internal(
-                "IcebergCommitExec expects exactly one child, but provided {children.len()}"
-                    .to_string(),
-            ));
+            return Err(DataFusionError::Internal(format!(
+                "IcebergCommitExec expects exactly one child, but provided {}",
+                children.len()
+            )));
         }
 
         Ok(Arc::new(IcebergCommitExec::new(
@@ -176,7 +171,7 @@ impl ExecutionPlan for IcebergCommitExec {
         }
 
         let table = self.table.clone();
-        let input_plan = self.write_plan.clone();
+        let input_plan = self.input.clone();
         let count_schema = Arc::clone(&self.count_schema);
 
         // todo revisit this
@@ -191,54 +186,51 @@ impl ExecutionPlan for IcebergCommitExec {
         let mut data_files: Vec<DataFile> = Vec::new();
         let mut total_record_count: u64 = 0;
 
-        // Execute and collect results from all partitions of the input plan
-        let batches = execute_stream_partitioned(input_plan, context)?;
-
-        // Collect all data files from this partition's stream
-        for mut batch_stream in batches {
-            while let Some(batch_result) = batch_stream.next().await {
-                let batch = batch_result?;
-
-                let files_array = batch
-                    .column_by_name(DATA_FILES_COL_NAME)
-                    .ok_or_else(|| {
-                        DataFusionError::Internal(
-                            "Expected 'data_files' column in input batch".to_string(),
-                        )
-                    })?
-                    .as_any()
-                    .downcast_ref::<StringArray>()
-                    .ok_or_else(|| {
-                        DataFusionError::Internal(
-                            "Expected 'data_files' column to be StringArray".to_string(),
-                        )
-                    })?;
-
-                // todo remove log
-                println!("files_array to deserialize: {:?}", files_array);
-
-                // Deserialize all data files from the StringArray
-                let batch_files: Vec<DataFile> = files_array
-                    .into_iter()
-                    .flatten()
-                    .map(|f| -> DFResult<DataFile> {
-                        // Parse JSON to DataFileSerde and convert to DataFile
-                        deserialize_data_file_from_json(
-                            f,
-                            spec_id,
-                            &partition_type,
-                            &current_schema,
-                        )
-                        .map_err(to_datafusion_error)
-                    })
-                    .collect::<datafusion::common::Result<_>>()?;
-
-                // add record_counts from the current batch to total record count
-                total_record_count += batch_files.iter().map(|f| f.record_count()).sum::<u64>();
-
-                // Add all deserialized files to our collection
-                data_files.extend(batch_files);
-            }
+        // Execute and collect results from the input coalesced plan
+        let mut batch_stream = input_plan.execute(0, context)?;
+
+        while let Some(batch_result) = batch_stream.next().await {
+            let batch = batch_result?;
+
+            let files_array = batch
+                .column_by_name(DATA_FILES_COL_NAME)
+                .ok_or_else(|| {
+                    DataFusionError::Internal(
+                        "Expected 'data_files' column in input batch".to_string(),
+                    )
+                })?
+                .as_any()
+                .downcast_ref::<StringArray>()
+                .ok_or_else(|| {
+                    DataFusionError::Internal(
+                        "Expected 'data_files' column to be StringArray".to_string(),
+                    )
+                })?;
+
+            // todo remove log
+            println!("files_array to deserialize: {:?}", files_array);
+
+            // Deserialize all data files from the StringArray
+            let batch_files: Vec<DataFile> = files_array
+                .into_iter()
+                .flatten()
+                .map(|f| -> DFResult<DataFile> {
+                    // Parse JSON to DataFileSerde and convert to DataFile
+                    deserialize_data_file_from_json(
+                        f,
+                        spec_id,
+                        &partition_type,
+                        &current_schema,
+                    )
+                    .map_err(to_datafusion_error)
+                })
+                .collect::<datafusion::common::Result<_>>()?;
+
+            // add record_counts from the current batch to total record count
+            total_record_count += batch_files.iter().map(|f| f.record_count()).sum::<u64>();
+
+            // Add all deserialized files to our collection
+            data_files.extend(batch_files);
        }
 
         // If no data files were collected, return an empty result

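Why this is safe: the commit node now drains a single stream via input_plan.execute(0, context) instead of looping over execute_stream_partitioned, which only works because its child is wrapped in CoalescePartitionsExec before the plan is handed over (see table/mod.rs below). The following is a minimal, standalone sketch of that invariant. It is not part of this commit, and it assumes a DataFusion version that still ships datafusion::physical_plan::memory::MemoryExec (newer releases replace it with a DataSourceExec-based source) plus tokio with the macros and rt features:

use std::sync::Arc;

use datafusion::arrow::array::{ArrayRef, Int32Array};
use datafusion::arrow::datatypes::{DataType, Field, Schema};
use datafusion::arrow::record_batch::RecordBatch;
use datafusion::error::Result;
use datafusion::execution::TaskContext;
use datafusion::physical_plan::coalesce_partitions::CoalescePartitionsExec;
use datafusion::physical_plan::memory::MemoryExec;
use datafusion::physical_plan::{common, ExecutionPlan, ExecutionPlanProperties};

#[tokio::main]
async fn main() -> Result<()> {
    let schema = Arc::new(Schema::new(vec![Field::new("v", DataType::Int32, false)]));
    let column: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3]));
    let batch = RecordBatch::try_new(schema.clone(), vec![column])?;

    // A two-partition input, standing in for a parallel IcebergWriteExec.
    let input: Arc<dyn ExecutionPlan> =
        Arc::new(MemoryExec::try_new(&[vec![batch.clone()], vec![batch]], schema, None)?);
    assert_eq!(input.output_partitioning().partition_count(), 2);

    // CoalescePartitionsExec merges all input partitions into one output
    // partition, so partition 0 is guaranteed to carry every row.
    let coalesced: Arc<dyn ExecutionPlan> = Arc::new(CoalescePartitionsExec::new(input));
    assert_eq!(coalesced.output_partitioning().partition_count(), 1);

    let ctx = Arc::new(TaskContext::default());
    let batches = common::collect(coalesced.execute(0, ctx)?).await?;
    let rows: usize = batches.iter().map(|b| b.num_rows()).sum();
    assert_eq!(rows, 6); // rows from both partitions arrive on the single stream
    Ok(())
}

Without the coalesce step, execute(0) would see only the first partition's data files and the commit would silently drop the rest.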
crates/integrations/datafusion/src/physical_plan/write.rs
Lines changed: 7 additions & 11 deletions
@@ -37,8 +37,8 @@ use datafusion::physical_plan::{
 use futures::StreamExt;
 use iceberg::arrow::schema_to_arrow_schema;
 use iceberg::spec::{
-    DataFileFormat, PROPERTY_DEFAULT_FILE_FORMAT,
-    PROPERTY_DEFAULT_FILE_FORMAT_DEFAULT, serialize_data_file_to_json,
+    DataFileFormat, PROPERTY_DEFAULT_FILE_FORMAT, PROPERTY_DEFAULT_FILE_FORMAT_DEFAULT,
+    serialize_data_file_to_json,
 };
 use iceberg::table::Table;
 use iceberg::writer::base_writer::data_file_writer::DataFileWriterBuilder;
@@ -54,6 +54,7 @@ use uuid::Uuid;
 use crate::physical_plan::DATA_FILES_COL_NAME;
 use crate::to_datafusion_error;
 
+#[derive(Debug)]
 pub(crate) struct IcebergWriteExec {
     table: Table,
     input: Arc<dyn ExecutionPlan>,
@@ -104,12 +105,6 @@ impl IcebergWriteExec {
     }
 }
 
-impl Debug for IcebergWriteExec {
-    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
-        write!(f, "IcebergWriteExec")
-    }
-}
-
 impl DisplayAs for IcebergWriteExec {
     fn fmt_as(&self, t: DisplayFormatType, f: &mut Formatter) -> std::fmt::Result {
         match t {
@@ -153,9 +148,10 @@ impl ExecutionPlan for IcebergWriteExec {
         children: Vec<Arc<dyn ExecutionPlan>>,
     ) -> DFResult<Arc<dyn ExecutionPlan>> {
         if children.len() != 1 {
-            return Err(DataFusionError::Internal(
-                "IcebergWriteExec expects exactly one child, but provided {} ".to_string(),
-            ));
+            return Err(DataFusionError::Internal(format!(
+                "IcebergWriteExec expects exactly one child, but provided {}",
+                children.len()
+            )));
         }
 
         Ok(Arc::new(Self::new(

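Beyond the #[derive(Debug)] swap, this file fixes the same latent bug as commit.rs: "{}" inside a plain string literal passed to .to_string() is never interpolated, so the old error message literally ended in "{}" and the actual child count was never reported. Only format! fills the placeholder. A tiny standalone illustration, not from the commit:

fn main() {
    let children_len = 3;

    // Old pattern: "{}" in an ordinary string literal is not a format placeholder.
    let old = "IcebergWriteExec expects exactly one child, but provided {} ".to_string();

    // New pattern: format! interpolates the runtime value into the message.
    let new = format!(
        "IcebergWriteExec expects exactly one child, but provided {}",
        children_len
    );

    assert_eq!(old, "IcebergWriteExec expects exactly one child, but provided {} ");
    assert_eq!(new, "IcebergWriteExec expects exactly one child, but provided 3");
}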
crates/integrations/datafusion/src/table/mod.rs
Lines changed: 4 additions & 1 deletion
@@ -30,6 +30,7 @@ use datafusion::error::Result as DFResult;
 use datafusion::logical_expr::dml::InsertOp;
 use datafusion::logical_expr::{Expr, TableProviderFilterPushDown};
 use datafusion::physical_plan::ExecutionPlan;
+use datafusion::physical_plan::coalesce_partitions::CoalescePartitionsExec;
 use iceberg::arrow::schema_to_arrow_schema;
 use iceberg::inspect::MetadataTableType;
 use iceberg::table::Table;
@@ -198,10 +199,12 @@ impl TableProvider for IcebergTableProvider {
             self.schema.clone(),
         ));
 
+        let coalesce_partitions = Arc::new(CoalescePartitionsExec::new(write_plan));
+
         Ok(Arc::new(IcebergCommitExec::new(
             self.table.clone(),
             catalog,
-            write_plan,
+            coalesce_partitions,
             self.schema.clone(),
         )))
     }
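Taken together, the insert path now builds a three-level physical plan. Below is a hedged sketch of the wiring: the helper coalesce_for_commit is hypothetical (the commit inlines this in insert_into), and the tree in the comment approximates what DataFusion's EXPLAIN would show rather than quoting it:

use std::sync::Arc;

use datafusion::physical_plan::ExecutionPlan;
use datafusion::physical_plan::coalesce_partitions::CoalescePartitionsExec;

// Approximate resulting plan shape, top-down:
//   IcebergCommitExec          -- executes once, reading partition 0
//     CoalescePartitionsExec   -- N partitions in, 1 partition out
//       IcebergWriteExec       -- N partitions, writing data files in parallel
//
// Hypothetical helper mirroring the insert_into change: whatever the write
// plan's partitioning, the commit node always sees a single-partition child.
fn coalesce_for_commit(write_plan: Arc<dyn ExecutionPlan>) -> Arc<dyn ExecutionPlan> {
    Arc::new(CoalescePartitionsExec::new(write_plan))
}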
