
Commit e322521

feat: enhance DataFrame streaming and improve robustness, tests, and docs

- Preserve partition order in DataFrame streaming and update related tests
- Add tests for record batch ordering and DataFrame batch iteration
- Improve `drop_stream` to correctly handle PyArrow ownership transfer and null pointers
- Replace `assert` with `debug_assert` for safer ArrowArrayStream validation
- Add documentation for `poll_next_batch` in PyRecordBatchStream
- Refactor tests to use `fail_collect` fixture for DataFrame collect
- Refactor `range_table` return type to `DataFrame` for clearer type hints
- Minor cleanup in SessionContext (remove extra blank line)
1 parent f78e90b commit e322521
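
As an illustration of the behaviour described above, here is a minimal sketch (illustrative only; it uses the `SessionContext.create_dataframe` API exercised by the tests in this commit) of iterating a DataFrame lazily while batches keep their partition order:

    import pyarrow as pa
    from datafusion import SessionContext

    ctx = SessionContext()
    batch1 = pa.record_batch([pa.array([1])], names=["a"])
    batch2 = pa.record_batch([pa.array([2])], names=["a"])
    # Two partitions with one batch each; iteration streams batches lazily
    # and yields them in partition order without ever calling collect().
    df = ctx.create_dataframe([[batch1], [batch2]])
    for batch in df:
        print(batch.num_rows)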

File tree

docs/source/user-guide/dataframe/index.rst
python/datafusion/_testing.py
python/datafusion/context.py
python/datafusion/dataframe.py
python/tests/conftest.py
python/tests/test_dataframe.py
python/tests/test_io.py
src/dataframe.rs
src/record_batch.rs

9 files changed: +135 -63 lines changed

docs/source/user-guide/dataframe/index.rst

Lines changed: 8 additions & 0 deletions

@@ -168,6 +168,14 @@ out-of-memory errors.
     for batch in reader:
         ... # process each batch as it is produced
 
+DataFrames are also iterable, yielding :class:`pyarrow.RecordBatch` objects
+lazily so you can loop over results directly:
+
+.. code-block:: python
+
+    for batch in df:
+        ... # process each batch as it is produced
+
 See :doc:`../io/arrow` for additional details on the Arrow interface.
 
 HTML Rendering

python/datafusion/_testing.py

Lines changed: 7 additions & 3 deletions

@@ -4,20 +4,24 @@
 exposed as part of the public API. Keep the implementation minimal and
 documented so reviewers can easily see it's test-only.
 """
+
 from __future__ import annotations
 
-from typing import Any
+from typing import TYPE_CHECKING
 
 from .context import SessionContext
 
+if TYPE_CHECKING:
+    from datafusion import DataFrame
+
 
 def range_table(
     ctx: SessionContext,
     start: int,
     stop: int | None = None,
     step: int = 1,
     partitions: int | None = None,
-) -> Any:
+) -> DataFrame:
     """Create a DataFrame containing a sequence of numbers using SQL RANGE.
 
     This mirrors the previous ``SessionContext.range`` convenience method but
@@ -38,5 +42,5 @@ def range_table(
         start, stop = 0, start
 
     parts = f", {int(partitions)}" if partitions is not None else ""
-    sql = f"SELECT * FROM range({int(start)}, {int(stop)}, {int(step)}{parts})"
+    sql = f"SELECT * FROM range({int(start)}, {int(stop)}, {int(step)}{parts})"  # noqa: S608
     return ctx.sql(sql)
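
For context, a quick usage sketch of the helper above (a sketch only, assuming the test-only `datafusion._testing` module is importable from a development checkout):

    from datafusion import SessionContext
    from datafusion._testing import range_table

    ctx = SessionContext()
    # Builds "SELECT * FROM range(0, 10, 2)" and returns a DataFrame,
    # mirroring the former SessionContext.range convenience method.
    df = range_table(ctx, 0, 10, 2)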

python/datafusion/context.py

Lines changed: 0 additions & 1 deletion

@@ -731,7 +731,6 @@ def from_polars(self, data: pl.DataFrame, name: str | None = None) -> DataFrame:
         """
         return DataFrame(self.ctx.from_polars(data, name))
 
-
     # https://github.com/apache/datafusion-python/pull/1016#discussion_r1983239116
     # is the discussion on how we arrived at adding register_view
     def register_view(self, name: str, df: DataFrame) -> None:

python/datafusion/dataframe.py

Lines changed: 7 additions & 2 deletions

@@ -290,6 +290,9 @@ def __init__(
 class DataFrame:
     """Two dimensional table representation of data.
 
+    DataFrame objects are iterable; iterating over a DataFrame yields
+    :class:`pyarrow.RecordBatch` instances lazily.
+
     See :ref:`user_guide_concepts` in the online documentation for more information.
     """
 
@@ -1114,7 +1117,8 @@ def __arrow_c_stream__(self, requested_schema: object | None = None) -> object:
             Arrow PyCapsule object representing an ``ArrowArrayStream``.
         """
         # ``DataFrame.__arrow_c_stream__`` in the Rust extension leverages
-        # ``execute_stream`` under the hood to stream batches one at a time.
+        # ``execute_stream_partitioned`` under the hood to stream batches while
+        # preserving the original partition order.
         return self.df.__arrow_c_stream__(requested_schema)
 
     def __iter__(self) -> Iterator[pa.RecordBatch]:
@@ -1123,7 +1127,8 @@ def __iter__(self) -> Iterator[pa.RecordBatch]:
         This implementation streams record batches via the Arrow C Stream
         interface, allowing callers such as :func:`pyarrow.Table.from_batches` to
         consume results lazily. The DataFrame is executed using DataFusion's
-        streaming APIs so ``collect`` is never invoked.
+        partitioned streaming APIs so ``collect`` is never invoked and batch
+        order across partitions is preserved.
         """
         import pyarrow as pa
 
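
As a usage note on the docstrings above: any Arrow C Stream consumer drives the same partition-ordered stream, so plain iteration, `pyarrow.Table.from_batches`, and reader-based consumption all avoid `collect`. A minimal sketch, assuming a recent PyArrow that exposes `RecordBatchReader.from_stream`:

    import pyarrow as pa
    from datafusion import SessionContext

    ctx = SessionContext()
    df = ctx.sql("SELECT 1 AS a")

    # Both paths go through __arrow_c_stream__, so collect() is never called
    # and record batches keep their partition order.
    table = pa.Table.from_batches(df)
    reader = pa.RecordBatchReader.from_stream(df)
    for batch in reader:
        ...  # process each batch lazily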

python/tests/conftest.py

Lines changed: 10 additions & 1 deletion

@@ -17,7 +17,7 @@
 
 import pyarrow as pa
 import pytest
-from datafusion import SessionContext
+from datafusion import DataFrame, SessionContext
 from pyarrow.csv import write_csv
 
 
@@ -49,3 +49,12 @@ def database(ctx, tmp_path):
         delimiter=",",
         schema_infer_max_records=10,
     )
+
+
+@pytest.fixture
+def fail_collect(monkeypatch):
+    def _fail_collect(self, *args, **kwargs):  # pragma: no cover - failure path
+        msg = "collect should not be called"
+        raise AssertionError(msg)
+
+    monkeypatch.setattr(DataFrame, "collect", _fail_collect)
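
A brief usage sketch of the fixture (the test name here is hypothetical; the real tests updated by this commit are shown below): a test opts in simply by listing `fail_collect` as a parameter, and any accidental `DataFrame.collect` call then raises AssertionError.

    import pyarrow as pa
    from datafusion import SessionContext

    def test_streaming_never_collects(fail_collect):  # hypothetical example
        ctx = SessionContext()
        df = ctx.sql("SELECT 1 AS a")
        # Streaming consumption must not fall back to DataFrame.collect().
        table = pa.Table.from_batches(df)
        assert table.num_rows == 1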

python/tests/test_dataframe.py

Lines changed: 32 additions & 8 deletions

@@ -1582,27 +1582,51 @@ def test_empty_to_arrow_table(df):
     assert set(pyarrow_table.column_names) == {"a", "b", "c"}
 
 
-def test_arrow_c_stream_to_table(monkeypatch):
+def test_iter_batches_dataframe(fail_collect):
+    ctx = SessionContext()
+
+    batch1 = pa.record_batch([pa.array([1])], names=["a"])
+    batch2 = pa.record_batch([pa.array([2])], names=["a"])
+    df = ctx.create_dataframe([[batch1], [batch2]])
+
+    expected = [batch1, batch2]
+    for got, exp in zip(df, expected):
+        assert got.equals(exp)
+
+
+def test_arrow_c_stream_to_table(fail_collect):
     ctx = SessionContext()
 
     # Create a DataFrame with two separate record batches
     batch1 = pa.record_batch([pa.array([1])], names=["a"])
     batch2 = pa.record_batch([pa.array([2])], names=["a"])
     df = ctx.create_dataframe([[batch1], [batch2]])
 
-    # Fail if the DataFrame is pre-collected
-    def fail_collect(self):  # pragma: no cover - failure path
-        msg = "collect should not be called"
-        raise AssertionError(msg)
+    table = pa.Table.from_batches(df)
+    batches = table.to_batches()
+
+    assert len(batches) == 2
+    assert batches[0].equals(batch1)
+    assert batches[1].equals(batch2)
+    assert table.schema == df.schema()
+    assert table.column("a").num_chunks == 2
+
+
+def test_arrow_c_stream_order():
+    ctx = SessionContext()
 
-    monkeypatch.setattr(DataFrame, "collect", fail_collect)
+    batch1 = pa.record_batch([pa.array([1])], names=["a"])
+    batch2 = pa.record_batch([pa.array([2])], names=["a"])
+
+    df = ctx.create_dataframe([[batch1, batch2]])
 
     table = pa.Table.from_batches(df)
     expected = pa.Table.from_batches([batch1, batch2])
 
     assert table.equals(expected)
-    assert table.schema == df.schema()
-    assert table.column("a").num_chunks == 2
+    col = table.column("a")
+    assert col.chunk(0)[0].as_py() == 1
+    assert col.chunk(1)[0].as_py() == 2
 
 
 def test_arrow_c_stream_reader(df):

python/tests/test_io.py

Lines changed: 2 additions & 8 deletions

@@ -18,7 +18,7 @@
 
 import pyarrow as pa
 import pytest
-from datafusion import DataFrame, column
+from datafusion import column
 from datafusion._testing import range_table
 from datafusion.io import read_avro, read_csv, read_json, read_parquet
 
@@ -123,15 +123,9 @@ def test_arrow_c_stream_large_dataset(ctx):
     assert current_rss - start_rss < 50 * 1024 * 1024
 
 
-def test_table_from_batches_stream(ctx, monkeypatch):
+def test_table_from_batches_stream(ctx, fail_collect):
     df = range_table(ctx, 0, 10)
 
-    def fail_collect(self):  # pragma: no cover - failure path
-        msg = "collect should not be called"
-        raise AssertionError(msg)
-
-    monkeypatch.setattr(DataFrame, "collect", fail_collect)
-
     table = pa.Table.from_batches(df)
     assert table.shape == (10, 1)
     assert table.column_names == ["value"]

src/dataframe.rs

Lines changed: 68 additions & 40 deletions

@@ -67,11 +67,25 @@ unsafe extern "C" fn drop_stream(capsule: *mut ffi::PyObject) {
     if capsule.is_null() {
         return;
     }
-    let stream_ptr =
-        ffi::PyCapsule_GetPointer(capsule, ARROW_STREAM_NAME.as_ptr()) as *mut FFI_ArrowArrayStream;
-    if !stream_ptr.is_null() {
-        drop(Box::from_raw(stream_ptr));
+
+    // When PyArrow imports this capsule it steals the raw stream pointer and
+    // sets the capsule's internal pointer to NULL. In that case
+    // `PyCapsule_IsValid` returns 0 and this destructor must not drop the
+    // stream as ownership has been transferred to PyArrow. If the capsule was
+    // never imported, the pointer remains valid and we are responsible for
+    // freeing the stream here.
+    if ffi::PyCapsule_IsValid(capsule, ARROW_STREAM_NAME.as_ptr()) == 1 {
+        let stream_ptr = ffi::PyCapsule_GetPointer(capsule, ARROW_STREAM_NAME.as_ptr())
+            as *mut FFI_ArrowArrayStream;
+        if !stream_ptr.is_null() {
+            drop(Box::from_raw(stream_ptr));
+        }
     }
+
+    // `PyCapsule_GetPointer` sets a Python error on failure. Clear it only
+    // after the stream has been released (or determined to be owned
+    // elsewhere).
+    ffi::PyErr_Clear();
 }
 
 // https://github.com/apache/datafusion-python/pull/1016#discussion_r1983239116
@@ -369,50 +383,59 @@ impl PyDataFrame {
         Ok(html_str)
     }
 }
-/// Synchronous wrapper around a [`SendableRecordBatchStream`] used for
-/// the `__arrow_c_stream__` implementation.
+
+/// Synchronous wrapper around partitioned [`SendableRecordBatchStream`]s used
+/// for the `__arrow_c_stream__` implementation.
 ///
-/// It uses `runtime.block_on` to consume the underlying async stream,
-/// providing synchronous iteration. When a `projection` is set, each
-/// batch is converted via `record_batch_into_schema` to apply schema
-/// changes per batch.
-struct DataFrameStreamReader {
-    stream: SendableRecordBatchStream,
+/// It drains each partition's stream sequentially, yielding record batches in
+/// their original partition order. When a `projection` is set, each batch is
+/// converted via `record_batch_into_schema` to apply schema changes per batch.
+struct PartitionedDataFrameStreamReader {
+    streams: Vec<SendableRecordBatchStream>,
     schema: SchemaRef,
     projection: Option<SchemaRef>,
+    current: usize,
 }
 
-impl Iterator for DataFrameStreamReader {
+impl Iterator for PartitionedDataFrameStreamReader {
     type Item = Result<RecordBatch, ArrowError>;
 
     fn next(&mut self) -> Option<Self::Item> {
-        // Use wait_for_future to poll the underlying async stream while
-        // respecting Python signal handling (e.g. ``KeyboardInterrupt``).
-        // This mirrors the behaviour of other synchronous wrappers and
-        // prevents blocking indefinitely when a Python interrupt is raised.
-        let fut = poll_next_batch(&mut self.stream);
-        let result = Python::with_gil(|py| wait_for_future(py, fut));
-
-        match result {
-            Ok(Ok(Some(batch))) => {
-                let batch = if let Some(ref schema) = self.projection {
-                    match record_batch_into_schema(batch, schema.as_ref()) {
-                        Ok(b) => b,
-                        Err(e) => return Some(Err(e)),
-                    }
-                } else {
-                    batch
-                };
-                Some(Ok(batch))
+        while self.current < self.streams.len() {
+            let stream = &mut self.streams[self.current];
+            let fut = poll_next_batch(stream);
+            let result = Python::with_gil(|py| wait_for_future(py, fut));
+
+            match result {
+                Ok(Ok(Some(batch))) => {
+                    let batch = if let Some(ref schema) = self.projection {
+                        match record_batch_into_schema(batch, schema.as_ref()) {
+                            Ok(b) => b,
+                            Err(e) => return Some(Err(e)),
+                        }
+                    } else {
+                        batch
+                    };
+                    return Some(Ok(batch));
+                }
+                Ok(Ok(None)) => {
+                    self.current += 1;
+                    continue;
+                }
+                Ok(Err(e)) => {
+                    return Some(Err(ArrowError::ExternalError(Box::new(e))));
+                }
+                Err(e) => {
+                    return Some(Err(ArrowError::ExternalError(Box::new(e))));
+                }
             }
-            Ok(Ok(None)) => None,
-            Ok(Err(e)) => Some(Err(ArrowError::ExternalError(Box::new(e)))),
-            Err(e) => Some(Err(ArrowError::ExternalError(Box::new(e)))),
         }
+
+        None
     }
 }
 
-impl RecordBatchReader for DataFrameStreamReader {
+impl RecordBatchReader for PartitionedDataFrameStreamReader {
     fn schema(&self) -> SchemaRef {
         self.schema.clone()
     }
@@ -944,7 +967,7 @@ impl PyDataFrame {
         requested_schema: Option<Bound<'py, PyCapsule>>,
     ) -> PyDataFusionResult<Bound<'py, PyCapsule>> {
         let df = self.df.as_ref().clone();
-        let stream = spawn_stream(py, async move { df.execute_stream().await })?;
+        let streams = spawn_streams(py, async move { df.execute_stream_partitioned().await })?;
 
         let mut schema: Schema = self.df.schema().to_owned().into();
         let mut projection: Option<SchemaRef> = None;
@@ -961,19 +984,24 @@
 
         let schema_ref = Arc::new(schema.clone());
 
-        let reader = DataFrameStreamReader {
-            stream,
+        let reader = PartitionedDataFrameStreamReader {
+            streams,
             schema: schema_ref,
             projection,
+            current: 0,
         };
         let reader: Box<dyn RecordBatchReader + Send> = Box::new(reader);
 
         let stream = Box::new(FFI_ArrowArrayStream::new(reader));
         let stream_ptr = Box::into_raw(stream);
-        assert!(
+        debug_assert!(
             !stream_ptr.is_null(),
-            "ArrowArrayStream pointer should never be null"
+            "ArrowArrayStream pointer should never be null",
        );
+        // The returned capsule allows zero-copy hand-off to PyArrow. When
+        // PyArrow imports the capsule it assumes ownership of the stream and
+        // nulls out the capsule's internal pointer so `drop_stream` knows not to
+        // free it.
         let capsule = unsafe {
             ffi::PyCapsule_New(
                 stream_ptr as *mut c_void,
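
To make the ownership contract in `drop_stream` concrete from the Python side, here is a hedged sketch of the two cases the destructor handles (the `RecordBatchReader.from_stream` call assumes a recent PyArrow; the bare-capsule case is purely illustrative):

    import pyarrow as pa
    from datafusion import SessionContext

    ctx = SessionContext()
    df = ctx.sql("SELECT 1 AS a")

    # Consumed capsule: PyArrow steals the stream pointer when it imports the
    # capsule, so drop_stream later sees an invalid capsule and must not free
    # the stream.
    reader = pa.RecordBatchReader.from_stream(df)

    # Unconsumed capsule: created but never imported, so the pointer stays
    # valid and drop_stream frees the stream when the capsule is destroyed,
    # avoiding a leak.
    capsule = df.__arrow_c_stream__()
    del capsule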

src/record_batch.rs

Lines changed: 1 addition & 0 deletions

@@ -84,6 +84,7 @@ impl PyRecordBatchStream {
     }
 }
 
+/// Polls the next batch from a `SendableRecordBatchStream`, converting the `Option<Result<_>>` form.
 pub(crate) async fn poll_next_batch(
     stream: &mut SendableRecordBatchStream,
 ) -> datafusion::error::Result<Option<RecordBatch>> {
