Add with_preserve_order flag

alamb · alamb · commit 96299554b8a2 · 2026-03-19T15:03:31.000-04:00
diff --git a/datafusion/datasource/src/file_stream.rs b/datafusion/datasource/src/file_stream.rs
@@ -117,6 +117,17 @@ pub struct FileStream {
     baseline_metrics: BaselineMetrics,
     /// Describes the behavior of the `FileStream` if file opening or scanning fails
     on_error: OnError,
+    /// Preserve the logical planner/morsel order defined by the
+    /// [`MorselPlan`] API?
+    ///
+    /// If false (the default), morsels will be produced in the order
+    /// that they are ready to be run.
+    ///
+    /// If true, morsels will be produced in the logical order defined by
+    /// [`MorselPlan`].
+    ///
+    /// [`MorselPlan`]: crate::morsel::MorselPlan
+    preserve_order: bool,
     /// Is the stream complete?
     state: StreamState,
 }
@@ -169,6 +180,7 @@ impl FileStream {
             file_stream_metrics: FileStreamMetrics::new(metrics, partition),
             baseline_metrics: BaselineMetrics::new(metrics, partition),
             on_error: OnError::Fail,
+            preserve_order: false,
             state: StreamState::Active,
         })
     }
@@ -181,6 +193,13 @@ impl FileStream {
         self
     }
 
+    /// Specify whether this `FileStream` should preserve the logical output
+    /// order implied by `MorselPlan`s.
+    pub fn with_preserve_order(mut self, preserve_order: bool) -> Self {
+        self.preserve_order = preserve_order;
+        self
+    }
+
     /// Run a planner on CPU until it either needs I/O or fully completes.
     ///
     /// Any morsels produced along the way are appended to `self.morsels`. If
@@ -224,6 +243,18 @@ impl FileStream {
         while (self.waiting_planners.len() + self.ready_planners.len())
             < TARGET_CONCURRENT_PLANNERS
         {
+            // In ordered mode, do not admit later files while there is any
+            // earlier file work still buffered, waiting on I/O, or actively
+            // being scanned. This keeps file-level planning from introducing
+            // later output ahead of earlier files.
+            if self.preserve_order
+                && (self.reader.is_some()
+                    || !self.morsels.is_empty()
+                    || !self.ready_planners.is_empty()
+                    || !self.waiting_planners.is_empty())
+            {
+                break;
+            }
             if self.morsels.len() >= max_buffered_morsels {
                 break;
             }
@@ -307,6 +338,17 @@ impl FileStream {
             // reader is currently active. This avoids starving planner work
             // behind a reader that is itself waiting on I/O.
             while self.morsels.len() < max_buffered_morsels() {
+                // In ordered mode, once an earlier planner has produced a
+                // morsel or is blocked on I/O, do not advance later sibling
+                // planners yet. This preserves the logical `MorselPlan` order:
+                // direct morsels first, then child planners in API order.
+                if self.preserve_order
+                    && (self.reader.is_some()
+                        || !self.morsels.is_empty()
+                        || !self.waiting_planners.is_empty())
+                {
+                    break;
+                }
                 let Some(planner) = self.ready_planners.pop_front() else {
                     break;
                 };
@@ -337,6 +379,14 @@ impl FileStream {
             // planners get CPU time before we consider returning `Pending`.
             if !self.ready_planners.is_empty()
                 && self.morsels.len() < max_buffered_morsels()
+                // In ordered mode, only loop back for more planner CPU when
+                // there is no earlier reader, buffered morsel, or waiting I/O
+                // that should be drained first. Otherwise, drop to
+                // `start_next_morsel()` so output is produced in order.
+                && (!self.preserve_order
+                    || (self.reader.is_none()
+                        && self.morsels.is_empty()
+                        && self.waiting_planners.is_empty()))
             {
                 continue;
             }
@@ -780,10 +830,12 @@ mod tests {
 
     /// Helper for morsel-driven `FileStream` tests that bundles the mock
     /// `Morselizer` setup with the corresponding `FileScanConfig`.
+    #[derive(Clone)]
     struct MorselTest {
         morselizer: MockMorselizer,
         file_names: Vec<String>,
         observer: Option<MorselObserver>,
+        preserve_order: bool,
     }
 
     impl MorselTest {
@@ -792,6 +844,7 @@ mod tests {
                 morselizer: MockMorselizer::new(),
                 file_names: vec![],
                 observer: None,
+                preserve_order: false,
             }
         }
 
@@ -802,16 +855,54 @@ mod tests {
             self
         }
 
+        fn reset_observer(mut self) -> Self {
+            let observer = MorselObserver::new();
+            self.morselizer = self.morselizer.with_observer(observer);
+            self
+        }
+
         fn with_observer(mut self, observer: MorselObserver) -> Self {
             self.morselizer = self.morselizer.with_observer(observer.clone());
             self.observer = Some(observer);
             self
         }
 
+        fn with_preserve_order(mut self, preserve_order: bool) -> Self {
+            self.preserve_order = preserve_order;
+            self
+        }
+
         async fn run(self) -> Result<String> {
             let file_names = self.file_names.iter().map(String::as_str).collect();
             let config = test_config(file_names);
-            let output = run_stream(self.morselizer, config).await?;
+            let metrics_set = ExecutionPlanMetricsSet::new();
+            let mut stream = FileStream::new_with_morselizer(
+                &config,
+                0,
+                Box::new(self.morselizer),
+                &metrics_set,
+            )?
+            .with_preserve_order(self.preserve_order);
+
+            let mut stream_contents = Vec::new();
+            while let Some(result) = stream.next().await {
+                match result {
+                    Ok(batch) => {
+                        // Each batch should have a single int32 column with the
+                        // mocked batch id, which keeps snapshot output compact.
+                        let col = batch.column(0).as_primitive::<Int32Type>();
+                        assert_eq!(col.len(), 1);
+                        assert!(col.is_valid(0));
+                        let batch_id = col.value(0);
+                        stream_contents.push(format!("Batch: {batch_id}"));
+                    }
+                    Err(e) => {
+                        stream_contents.push(format!("Error: {e}"));
+                    }
+                }
+            }
+            stream_contents.push("Done".to_string());
+            let output = stream_contents.join("\n");
 
             // Snapshot both the produced output and the scheduler trace
             // together. This makes scheduler changes much easier to review than
@@ -998,12 +1089,14 @@ mod tests {
         let observer = MorselObserver::new();
         let planner_1 = MockPlanner::builder()
             .with_id(PlannerId(1))
+            // Note: this IO requires 2 polls
             .return_plan(ReturnPlanBuilder::new().with_io(IoFutureId(100), 2))
             .return_morsel(MorselId(11), 41)
             .return_none()
             .build();
         let planner_2 = MockPlanner::builder()
             .with_id(PlannerId(2))
+            // IO only requires 1 poll, so it will resolve before planner 1's IO
             .return_plan(ReturnPlanBuilder::new().with_io(IoFutureId(101), 1)) // IO returns after 1 poll
             .return_morsel(MorselId(12), 42)
             .return_none()
@@ -1023,9 +1116,9 @@ mod tests {
             .with_file("file1.parquet", parent_planner)
             .with_observer(observer.clone());
 
-        // Expect both futures to be polled, but second planner (42) batch to be
+        // Expect both futures to be polled, but second planner's (42) batch to be
         // produced first
-        insta::assert_snapshot!(test.run().await.unwrap(), @r"
+        insta::assert_snapshot!(test.clone().run().await.unwrap(), @r"
         ----- Output Stream -----
         Batch: 42
         Batch: 41
@@ -1062,6 +1155,48 @@ mod tests {
         morsel_stream_finished: MorselId(11)
         ");
 
+        // Run same test using `with_preserve_order(true)`, but expect the first
+        // planner's batch (41) to be produced before the second's (42), even
+        // though the second planner's I/O resolves first.
+        let test = test.reset_observer().with_preserve_order(true);
+
+        insta::assert_snapshot!(test.run().await.unwrap(), @r"
+        ----- Output Stream -----
+        Batch: 41
+        Batch: 42
+        Done
+        ----- File Stream Events -----
+        morselize_file: file1.parquet
+        planner_created: PlannerId(0)
+        planner_called: PlannerId(0)
+        planner_produced_child: PlannerId(0) -> PlannerId(1)
+        planner_produced_child: PlannerId(0) -> PlannerId(2)
+        planner_called: PlannerId(0)
+        planner_called: PlannerId(1)
+        io_future_created: PlannerId(1), IoFutureId(100)
+        planner_called: PlannerId(2)
+        io_future_created: PlannerId(2), IoFutureId(101)
+        io_future_polled: PlannerId(1), IoFutureId(100)
+        io_future_polled: PlannerId(2), IoFutureId(101)
+        io_future_polled: PlannerId(1), IoFutureId(100)
+        io_future_polled: PlannerId(2), IoFutureId(101)
+        io_future_resolved: PlannerId(2), IoFutureId(101)
+        planner_called: PlannerId(2)
+        morsel_produced: PlannerId(2), MorselId(12)
+        planner_called: PlannerId(2)
+        io_future_polled: PlannerId(1), IoFutureId(100)
+        io_future_resolved: PlannerId(1), IoFutureId(100)
+        planner_called: PlannerId(1)
+        morsel_produced: PlannerId(1), MorselId(11)
+        planner_called: PlannerId(1)
+        morsel_stream_started: MorselId(12)
+        morsel_stream_batch_produced: MorselId(12), BatchId(42)
+        morsel_stream_finished: MorselId(12)
+        morsel_stream_started: MorselId(11)
+        morsel_stream_batch_produced: MorselId(11), BatchId(41)
+        morsel_stream_finished: MorselId(11)
+        ");
+
         Ok(())
     }
 
@@ -1146,44 +1281,6 @@ mod tests {
         .build()
     }
 
-    /// Creates a  [`FileStream`] for reading the specified config, reads all
-    /// record batches, and returns a stringified version of the results for
-    /// easy comparison in tests.
-    ///
-    /// If the FileStream returns an error during execution, formats that error
-    /// in the output stream, rather than returning an error
-    async fn run_stream(
-        morselizer: MockMorselizer,
-        config: FileScanConfig,
-    ) -> Result<String> {
-        let metrics_set = ExecutionPlanMetricsSet::new();
-        let mut stream = FileStream::new_with_morselizer(
-            &config,
-            0,
-            Box::new(morselizer),
-            &metrics_set,
-        )?;
-
-        let mut stream_contents = Vec::new();
-        while let Some(result) = stream.next().await {
-            match result {
-                Ok(batch) => {
-                    // each batch should have a single int32 column with batch id
-                    let col = batch.column(0).as_primitive::<Int32Type>();
-                    assert_eq!(col.len(), 1);
-                    assert!(col.is_valid(0));
-                    let batch_id = col.value(0);
-                    stream_contents.push(format!("Batch: {batch_id}"));
-                }
-                Err(e) => {
-                    stream_contents.push(format!("Error: {e}"));
-                }
-            }
-        }
-        stream_contents.push("Done".to_string());
-        Ok(stream_contents.join("\n"))
-    }
-
     #[tokio::test]
     async fn on_error_opening() -> Result<()> {
         let batches = FileStreamTest::new()
diff --git a/datafusion/datasource/src/morsel/mod.rs b/datafusion/datasource/src/morsel/mod.rs
@@ -65,10 +65,12 @@ pub trait Morselizer: Send + Sync + Debug {
     /// single scan of the file. Returning multiple MorselPlanners allows for
     /// multiple concurrent scans of the same file.
     ///
-    /// This may involve CPU work, such as parsing parquet metadata and evaluating pruning predicates.
-    /// It should NOT do any IO work, such as reading from the file. If IO is required, it should
-    /// return a future that the caller can poll to drive the IO work to completion, and once the future
-    /// is complete, the caller can call `morselize` again to get the next morsels.
+    /// This may involve CPU work, such as parsing parquet metadata and
+    /// evaluating pruning predicates. It should NOT do any IO work, such as
+    /// reading from the file. If IO is required, it should return a future that
+    /// the caller can poll to drive the IO work to completion, and once the
+    /// future is complete, the caller can call `morselize` again to get the
+    /// next morsels.
     fn morselize(&self, file: PartitionedFile) -> Result<Vec<Box<dyn MorselPlanner>>>;
 }
 
@@ -103,22 +105,24 @@ pub trait MorselPlanner: Send + Debug {
     ///
     /// Returns `None` if the MorselPlanner has no more work to do (is done).
     ///
-    /// # Notes:
+    /// # Empty Morsel Plans
+    ///
     /// It may return Some(..) with an empty MorselPlan, which means it is ready
     /// for more CPU work and should be called again.
     ///
-    /// See the comments on [`MorselPlan`] for ordering
+    /// # Output Ordering
+    ///
+    /// See the comments on [`MorselPlan`] for the logical output order.
     fn plan(&mut self) -> Result<Option<MorselPlan>>;
 }
 
 /// Return result of [`MorselPlanner::plan`]
 ///
-/// # Ordering
+/// # Logical Ordering
 /// For plans where the output order of rows is maintained, the output order of
 /// a [`MorselPlanner`] is logically defined as follows:
 /// 1. All morsels that are directly produced
 /// 2. (recursively) All morsels produced by the returned `planners`
-///
 #[derive(Default)]
 pub struct MorselPlan {
     /// Any Morsels that are ready for processing.