 //! Native Iceberg table scan operator using iceberg-rust
 
 use std::any::Any;
-use std::collections::HashMap;
+use std::collections::{HashMap, VecDeque};
 use std::fmt;
 use std::pin::Pin;
 use std::sync::Arc;
+use std::task::{Context, Poll};
 
+use arrow::array::RecordBatch;
 use arrow::datatypes::SchemaRef;
 use datafusion::common::{DataFusionError, Result as DFResult};
-use datafusion::execution::{SendableRecordBatchStream, TaskContext};
+use datafusion::execution::{RecordBatchStream, SendableRecordBatchStream, TaskContext};
 use datafusion::physical_expr::EquivalenceProperties;
 use datafusion::physical_plan::execution_plan::{Boundedness, EmissionType};
 use datafusion::physical_plan::stream::RecordBatchStreamAdapter;
 use datafusion::physical_plan::{
     DisplayAs, DisplayFormatType, ExecutionPlan, Partitioning, PlanProperties,
 };
-use futures::{StreamExt, TryStreamExt};
+use futures::future::BoxFuture;
+use futures::{ready, FutureExt, Stream, StreamExt, TryStreamExt};
 use iceberg::io::FileIO;
 
 use crate::execution::operators::ExecutionError;
@@ -117,14 +120,14 @@ impl ExecutionPlan for IcebergScanExec {
     fn execute(
         &self,
         partition: usize,
-        _context: Arc<TaskContext>,
+        context: Arc<TaskContext>,
     ) -> DFResult<SendableRecordBatchStream> {
         // Execute pre-planned tasks from Scala (planning happens via Iceberg's Java API)
         if let Some(ref task_groups) = self.file_task_groups {
             if partition < task_groups.len() {
                 let tasks = &task_groups[partition];
 
-                return self.execute_with_tasks(tasks.clone());
+                return self.execute_with_tasks(tasks.clone(), context);
             } else {
                 return Err(DataFusionError::Execution(format!(
                     "IcebergScanExec: Partition index {} out of range (only {} task groups available)",
@@ -148,38 +151,26 @@ impl IcebergScanExec {
     fn execute_with_tasks(
         &self,
         tasks: Vec<iceberg::scan::FileScanTask>,
+        context: Arc<TaskContext>,
     ) -> DFResult<SendableRecordBatchStream> {
         let output_schema = Arc::clone(&self.output_schema);
-        let catalog_properties = self.catalog_properties.clone();
-        let metadata_location = self.metadata_location.clone();
 
-        let fut = async move {
-            let file_io = Self::load_file_io(&catalog_properties, &metadata_location)?;
+        // Create FileIO synchronously
+        let file_io = Self::load_file_io(&self.catalog_properties, &self.metadata_location)?;
 
-            let task_stream = futures::stream::iter(tasks.into_iter().map(Ok)).boxed();
+        // Get batch size from context
+        let batch_size = context.session_config().batch_size();
 
-            let reader = iceberg::arrow::ArrowReaderBuilder::new(file_io).build();
+        // Create parallel file stream that overlaps opening next file with reading current file
+        let file_stream =
+            IcebergFileStream::new(tasks, file_io, batch_size, Arc::clone(&output_schema))?;
 
-            // read() is synchronous and returns Result<ArrowRecordBatchStream>
-            let stream = reader.read(task_stream).map_err(|e| {
-                DataFusionError::Execution(format!("Failed to read Iceberg tasks: {}", e))
-            })?;
-
-            let mapped_stream = stream
-                .map_err(|e| DataFusionError::Execution(format!("Iceberg scan error: {}", e)));
-
-            Ok::<_, DataFusionError>(Box::pin(mapped_stream)
-                as Pin<
-                    Box<dyn futures::Stream<Item = DFResult<arrow::array::RecordBatch>> + Send>,
-                >)
-        };
+        // Note: BatchSplitStream adds overhead. Since we're already setting batch_size in
+        // iceberg-rust's ArrowReaderBuilder, it should produce correctly sized batches.
+        // Only use BatchSplitStream as a safety net if needed.
+        // For now, return the file_stream directly to reduce stream nesting overhead.
 
-        let stream = futures::stream::once(fut).try_flatten();
-
-        Ok(Box::pin(RecordBatchStreamAdapter::new(
-            output_schema,
-            stream,
-        )))
+        Ok(Box::pin(file_stream))
     }
 
     fn load_file_io(
@@ -199,6 +190,194 @@ impl IcebergScanExec {
     }
 }
 
+/// State machine for IcebergFileStream
+enum FileStreamState {
+    /// Idle state - need to start opening next file
+    Idle,
+    /// Opening a file
+    Opening {
+        future: BoxFuture<'static, DFResult<SendableRecordBatchStream>>,
+    },
+    /// Reading from current file while potentially opening next file
+    Reading {
+        current: SendableRecordBatchStream,
+        next: Option<BoxFuture<'static, DFResult<SendableRecordBatchStream>>>,
+    },
+    /// Error state
+    Error,
+}
+
+/// Stream that reads Iceberg files with parallel opening optimization.
+/// Opens the next file while reading the current file to overlap IO with compute.
+///
+/// Inspired by DataFusion's [`FileStream`] pattern for overlapping file opening with reading.
+///
+/// [`FileStream`]: https://github.com/apache/datafusion/blob/main/datafusion/datasource/src/file_stream.rs
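+///
+/// State transitions: `Idle` starts the first open; `Opening` waits for it;
+/// `Reading` yields batches from the current file while the next open runs in
+/// the background; the stream ends once the task queue and the current file
+/// are both exhausted.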
+struct IcebergFileStream {
+    schema: SchemaRef,
+    file_io: FileIO,
+    batch_size: usize,
+    tasks: VecDeque<iceberg::scan::FileScanTask>,
+    state: FileStreamState,
+}
+
+impl IcebergFileStream {
+    fn new(
+        tasks: Vec<iceberg::scan::FileScanTask>,
+        file_io: FileIO,
+        batch_size: usize,
+        schema: SchemaRef,
+    ) -> DFResult<Self> {
+        Ok(Self {
+            schema,
+            file_io,
+            batch_size,
+            tasks: tasks.into_iter().collect(),
+            state: FileStreamState::Idle,
+        })
+    }
+
+    /// Start opening the next file
+    fn start_next_file(
+        &mut self,
+    ) -> Option<BoxFuture<'static, DFResult<SendableRecordBatchStream>>> {
+        let task = self.tasks.pop_front()?;
+        let file_io = self.file_io.clone();
+        let batch_size = self.batch_size;
+        let schema = Arc::clone(&self.schema);
+
+        Some(Box::pin(async move {
+            // Create a single-task stream
+            let task_stream = futures::stream::iter(vec![Ok(task)]).boxed();
+
+            // Create reader with optimizations
+            let reader = iceberg::arrow::ArrowReaderBuilder::new(file_io)
+                .with_batch_size(batch_size)
+                .with_row_selection_enabled(true)
+                .build();
+
+            // Read the task
+            let stream = reader.read(task_stream).map_err(|e| {
+                DataFusionError::Execution(format!("Failed to read Iceberg task: {}", e))
+            })?;
+
+            // Map errors and wrap minimally - RecordBatchStreamAdapter is needed to provide schema
+            let mapped_stream = stream
+                .map_err(|e| DataFusionError::Execution(format!("Iceberg scan error: {}", e)));
+
+            Ok(
+                Box::pin(RecordBatchStreamAdapter::new(schema, mapped_stream))
+                    as SendableRecordBatchStream,
+            )
+        }))
+    }
+
+    fn poll_inner(&mut self, cx: &mut Context<'_>) -> Poll<Option<DFResult<RecordBatch>>> {
+        loop {
+            match &mut self.state {
+                FileStreamState::Idle => {
+                    // Start opening the first file
+                    match self.start_next_file() {
+                        Some(future) => {
+                            self.state = FileStreamState::Opening { future };
+                        }
+                        None => return Poll::Ready(None),
+                    }
+                }
+                FileStreamState::Opening { future } => {
+                    // Wait for file to open
+                    match ready!(future.poll_unpin(cx)) {
+                        Ok(stream) => {
+                            // File opened, start reading and open next file in parallel
+                            let next = self.start_next_file();
+                            self.state = FileStreamState::Reading {
+                                current: stream,
+                                next,
+                            };
+                        }
+                        Err(e) => {
+                            self.state = FileStreamState::Error;
+                            return Poll::Ready(Some(Err(e)));
+                        }
+                    }
+                }
+                FileStreamState::Reading { current, next } => {
+                    // Poll next file opening future to drive it forward (background IO)
+                    if let Some(next_future) = next {
+                        if let Poll::Ready(result) = next_future.poll_unpin(cx) {
+                            // Next file is ready, store it
+                            match result {
+                                Ok(stream) => {
+                                    *next = Some(Box::pin(futures::future::ready(Ok(stream))));
+                                }
+                                Err(e) => {
+                                    self.state = FileStreamState::Error;
+                                    return Poll::Ready(Some(Err(e)));
+                                }
+                            }
+                        }
+                    }
+
+                    // Poll current stream for next batch
+                    match ready!(current.poll_next_unpin(cx)) {
+                        Some(result) => {
+                            return Poll::Ready(Some(result));
+                        }
+                        None => {
+                            // Current file is done, move to next file if available
+                            match next.take() {
+                                Some(mut next_future) => {
+                                    // Check if next file is already opened
+                                    match next_future.poll_unpin(cx) {
+                                        Poll::Ready(Ok(stream)) => {
+                                            let next_next = self.start_next_file();
+                                            self.state = FileStreamState::Reading {
+                                                current: stream,
+                                                next: next_next,
+                                            };
+                                        }
+                                        Poll::Ready(Err(e)) => {
+                                            self.state = FileStreamState::Error;
+                                            return Poll::Ready(Some(Err(e)));
+                                        }
+                                        Poll::Pending => {
+                                            // Still opening, wait for it
+                                            self.state = FileStreamState::Opening {
+                                                future: next_future,
+                                            };
+                                        }
+                                    }
+                                }
+                                None => {
+                                    // No more files
+                                    return Poll::Ready(None);
+                                }
+                            }
+                        }
+                    }
+                }
+                FileStreamState::Error => {
+                    return Poll::Ready(None);
+                }
+            }
+        }
+    }
+}
+
+impl Stream for IcebergFileStream {
+    type Item = DFResult<arrow::array::RecordBatch>;
+
+    fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
+        self.poll_inner(cx)
+    }
+}
+
+impl RecordBatchStream for IcebergFileStream {
+    fn schema(&self) -> SchemaRef {
+        Arc::clone(&self.schema)
+    }
+}
+
 impl DisplayAs for IcebergScanExec {
     fn fmt_as(&self, _t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result {
         write!(
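For context, here is a minimal sketch of how a caller might drive this operator end to end. It is illustrative only: the IcebergScanExec constructor is outside this diff, so `scan` is assumed to be an already-built plan node, `collect_partition` is a hypothetical helper, and a Tokio runtime is assumed since iceberg-rust's FileIO reads asynchronously.

    use std::sync::Arc;

    use datafusion::common::Result as DFResult;
    use datafusion::execution::TaskContext;
    use datafusion::physical_plan::ExecutionPlan;
    use futures::TryStreamExt;

    // Hypothetical helper: drain one partition of an already-built scan node.
    async fn collect_partition(scan: Arc<dyn ExecutionPlan>, partition: usize) -> DFResult<usize> {
        // The TaskContext is no longer ignored: execute_with_tasks reads
        // session_config().batch_size() from it to size the Arrow batches
        // produced by iceberg-rust's ArrowReaderBuilder.
        let ctx = Arc::new(TaskContext::default());
        let stream = scan.execute(partition, ctx)?;

        // While the consumer awaits batches from the current file, the
        // IcebergFileStream Reading state keeps polling the future that
        // opens the next file, hiding open latency behind decode work.
        let batches: Vec<_> = stream.try_collect().await?;
        Ok(batches.iter().map(|b| b.num_rows()).sum())
    }

Note that the look-ahead is deliberately one file deep: `next` holds at most a single pending open, which bounds buffered state while still matching the open-next-while-reading-current behavior of DataFusion's FileStream.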