
Commit f78e90b

refactor: improve DataFrame streaming, memory management, and error handling
- Refactor record batch streaming to use `poll_next_batch` for clearer error handling
- Improve `spawn_future`/`spawn_stream` functions for better Python exception integration and code reuse
- Update `datafusion` and `datafusion-ffi` dependencies to 49.0.2
- Fix PyArrow `RecordBatchReader` import to use `_import_from_c_capsule` for safer memory handling
- Refactor `ArrowArrayStream` handling to use `PyCapsule` with destructor for improved memory management
- Refactor projection initialization in `PyDataFrame` for clarity
- Move `range` functionality into `_testing.py` helper
- Rename test column in `test_table_from_batches_stream` for accuracy
- Add tests for `RecordBatchReader` and enhance DataFrame stream handling
1 parent 91ccd1e commit f78e90b

File tree: 10 files changed, +157 −63 lines


docs/source/user-guide/dataframe/index.rst

Lines changed: 1 addition & 1 deletion
@@ -164,7 +164,7 @@ out-of-memory errors.
     import pyarrow as pa

     # Create a PyArrow RecordBatchReader without materializing all batches
-    reader = pa.RecordBatchReader._import_from_c(df.__arrow_c_stream__())
+    reader = pa.RecordBatchReader._import_from_c_capsule(df.__arrow_c_stream__())
     for batch in reader:
         ...  # process each batch as it is produced

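For readers of the updated snippet, a minimal end-to-end sketch of the documented pattern; the `SessionContext` setup and the range query are illustrative assumptions, only the capsule import mirrors the doc change:

    import pyarrow as pa
    from datafusion import SessionContext

    ctx = SessionContext()
    df = ctx.sql("SELECT * FROM range(0, 1000000)")  # illustrative query, not from the diff

    # __arrow_c_stream__() hands back a PyCapsule; _import_from_c_capsule takes ownership
    # of it, so batches are pulled lazily instead of being collected up front
    reader = pa.RecordBatchReader._import_from_c_capsule(df.__arrow_c_stream__())
    total_rows = 0
    for batch in reader:
        total_rows += batch.num_rows
    print(total_rows)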
examples/datafusion-ffi-example/Cargo.toml

Lines changed: 2 additions & 2 deletions
@@ -21,8 +21,8 @@ version = "0.2.0"
 edition = "2021"

 [dependencies]
-datafusion = { version = "49.0.1" }
-datafusion-ffi = { version = "49.0.1" }
+datafusion = { version = "49.0.2" }
+datafusion-ffi = { version = "49.0.2" }
 pyo3 = { version = "0.23", features = ["extension-module", "abi3", "abi3-py39"] }
 arrow = { version = "55.0.0" }
 arrow-array = { version = "55.0.0" }

python/datafusion/_testing.py

Lines changed: 42 additions & 0 deletions
@@ -0,0 +1,42 @@
+"""Testing-only helpers for datafusion-python.
+
+This module contains utilities used by the test-suite that should not be
+exposed as part of the public API. Keep the implementation minimal and
+documented so reviewers can easily see it's test-only.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+from .context import SessionContext
+
+
+def range_table(
+    ctx: SessionContext,
+    start: int,
+    stop: int | None = None,
+    step: int = 1,
+    partitions: int | None = None,
+) -> Any:
+    """Create a DataFrame containing a sequence of numbers using SQL RANGE.
+
+    This mirrors the previous ``SessionContext.range`` convenience method but
+    lives in a testing-only module so it doesn't expand the public surface.
+
+    Args:
+        ctx: SessionContext instance to run the SQL against.
+        start: Starting value for the sequence or exclusive stop when ``stop``
+            is ``None``.
+        stop: Exclusive upper bound of the sequence.
+        step: Increment between successive values.
+        partitions: Optional number of partitions for the generated data.
+
+    Returns:
+        DataFrame produced by the range table function.
+    """
+    if stop is None:
+        start, stop = 0, start
+
+    parts = f", {int(partitions)}" if partitions is not None else ""
+    sql = f"SELECT * FROM range({int(start)}, {int(stop)}, {int(step)}{parts})"
+    return ctx.sql(sql)

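A short usage sketch of the new helper as the tests exercise it; the `value` column name comes from DataFusion's range table function, the rest is assumed setup:

    import pyarrow as pa
    from datafusion import SessionContext
    from datafusion._testing import range_table

    ctx = SessionContext()
    df = range_table(ctx, 5)  # single-argument form behaves like range(0, 5, 1)
    table = pa.Table.from_batches(df.collect())
    assert table.column("value").to_pylist() == [0, 1, 2, 3, 4]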
python/datafusion/context.py

Lines changed: 0 additions & 30 deletions
@@ -731,36 +731,6 @@ def from_polars(self, data: pl.DataFrame, name: str | None = None) -> DataFrame:
         """
         return DataFrame(self.ctx.from_polars(data, name))

-    def range(
-        self,
-        start: int,
-        stop: int | None = None,
-        step: int = 1,
-        partitions: int | None = None,
-    ) -> DataFrame:
-        """Create a DataFrame containing a sequence of numbers.
-
-        This is backed by DataFusion's ``range`` table function, which generates
-        values lazily and therefore does not materialize the full range in
-        memory. When ``stop`` is omitted, ``start`` is treated as the stop value
-        and the sequence begins at zero.
-
-        Args:
-            start: Starting value for the sequence or the exclusive stop if
-                ``stop`` is ``None``.
-            stop: Exclusive upper bound of the sequence.
-            step: Increment between successive values.
-            partitions: Optional number of partitions for the generated data.
-
-        Returns:
-            DataFrame yielding the requested range of values.
-        """
-        if stop is None:
-            start, stop = 0, start
-
-        parts = f", {int(partitions)}" if partitions is not None else ""
-        sql = f"SELECT * FROM range({int(start)}, {int(stop)}, {int(step)}{parts})"  # noqa: S608
-        return self.sql(sql)

     # https://github.com/apache/datafusion-python/pull/1016#discussion_r1983239116
     # is the discussion on how we arrived at adding register_view

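Since the convenience method is removed from the public API, a hedged before/after for downstream code; the replacement is exactly the SQL the removed method generated:

    # before this commit (no longer available):
    # df = ctx.range(0, 100, 2)

    # after: call the range table function directly through SQL
    df = ctx.sql("SELECT * FROM range(0, 100, 2)")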
python/datafusion/dataframe.py

Lines changed: 1 addition & 1 deletion
@@ -1127,7 +1127,7 @@ def __iter__(self) -> Iterator[pa.RecordBatch]:
         """
         import pyarrow as pa

-        reader = pa.RecordBatchReader._import_from_c(self.__arrow_c_stream__())
+        reader = pa.RecordBatchReader._import_from_c_capsule(self.__arrow_c_stream__())
         yield from reader

     def transform(self, func: Callable[..., DataFrame], *args: Any) -> DataFrame:

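Because ``__iter__`` now goes through the same capsule import, iterating a DataFrame streams batches without a ``collect()``; a small sketch, assuming a DataFrame ``df`` already exists:

    total_rows = 0
    for batch in df:  # each item is a pyarrow.RecordBatch pulled from the stream
        total_rows += batch.num_rows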
python/tests/test_dataframe.py

Lines changed: 9 additions & 1 deletion
@@ -1605,6 +1605,14 @@ def fail_collect(self): # pragma: no cover - failure path
     assert table.column("a").num_chunks == 2


+def test_arrow_c_stream_reader(df):
+    reader = pa.RecordBatchReader._import_from_c_capsule(df.__arrow_c_stream__())
+    assert isinstance(reader, pa.RecordBatchReader)
+    table = pa.Table.from_batches(reader)
+    expected = pa.Table.from_batches(df.collect())
+    assert table.equals(expected)
+
+
 def test_to_pylist(df):
     # Convert datafusion dataframe to Python list
     pylist = df.to_pylist()
@@ -2743,7 +2751,7 @@ def test_arrow_c_stream_interrupted():
         """
     )

-    reader = pa.RecordBatchReader._import_from_c(df.__arrow_c_stream__())
+    reader = pa.RecordBatchReader._import_from_c_capsule(df.__arrow_c_stream__())

     interrupted = False
     interrupt_error = None

python/tests/test_io.py

Lines changed: 18 additions & 3 deletions
@@ -18,7 +18,8 @@

 import pyarrow as pa
 import pytest
-from datafusion import column
+from datafusion import DataFrame, column
+from datafusion._testing import range_table
 from datafusion.io import read_avro, read_csv, read_json, read_parquet


@@ -104,9 +105,9 @@ def test_arrow_c_stream_large_dataset(ctx):
     handful of batches should not exhaust process memory.
     """
     # Create a very large DataFrame using range; this would be terabytes if collected
-    df = ctx.range(0, 1 << 40)
+    df = range_table(ctx, 0, 1 << 40)

-    reader = pa.RecordBatchReader._import_from_c(df.__arrow_c_stream__())
+    reader = pa.RecordBatchReader._import_from_c_capsule(df.__arrow_c_stream__())

     # Track RSS before consuming batches
     psutil = pytest.importorskip("psutil")
@@ -120,3 +121,17 @@ def test_arrow_c_stream_large_dataset(ctx):
     current_rss = process.memory_info().rss
     # Ensure memory usage hasn't grown substantially (>50MB)
     assert current_rss - start_rss < 50 * 1024 * 1024
+
+
+def test_table_from_batches_stream(ctx, monkeypatch):
+    df = range_table(ctx, 0, 10)
+
+    def fail_collect(self):  # pragma: no cover - failure path
+        msg = "collect should not be called"
+        raise AssertionError(msg)
+
+    monkeypatch.setattr(DataFrame, "collect", fail_collect)
+
+    table = pa.Table.from_batches(df)
+    assert table.shape == (10, 1)
+    assert table.column_names == ["value"]

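The new test relies on ``pa.Table.from_batches`` pulling batches straight from the DataFrame iterator, while the large-dataset test relies on consuming only part of the stream. A sketch of that partial-consumption pattern (one batch only, then releasing the stream; not something the tests themselves assert):

    reader = pa.RecordBatchReader._import_from_c_capsule(df.__arrow_c_stream__())
    first = reader.read_next_batch()  # materializes a single batch
    reader.close()                    # drop the stream without draining the remaining rows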
src/dataframe.rs

Lines changed: 43 additions & 10 deletions
@@ -16,7 +16,7 @@
 // under the License.

 use std::collections::HashMap;
-use std::ffi::CString;
+use std::ffi::{c_void, CStr, CString};
 use std::sync::Arc;

 use arrow::array::{new_null_array, RecordBatch, RecordBatchReader};
@@ -39,6 +39,7 @@ use datafusion::prelude::*;
 use datafusion_ffi::table_provider::FFI_TableProvider;
 use futures::{StreamExt, TryStreamExt};
 use pyo3::exceptions::PyValueError;
+use pyo3::ffi;
 use pyo3::prelude::*;
 use pyo3::pybacked::PyBackedStr;
 use pyo3::types::{PyCapsule, PyList, PyTuple, PyTupleMethods};
@@ -47,7 +48,7 @@ use crate::catalog::PyTable;
 use crate::errors::{py_datafusion_err, PyDataFusionError};
 use crate::expr::sort_expr::to_sort_expressions;
 use crate::physical_plan::PyExecutionPlan;
-use crate::record_batch::PyRecordBatchStream;
+use crate::record_batch::{poll_next_batch, PyRecordBatchStream};
 use crate::sql::logical::PyLogicalPlan;
 use crate::utils::{
     get_tokio_runtime, is_ipython_env, py_obj_to_scalar_value, spawn_stream, spawn_streams,
@@ -58,6 +59,21 @@ use crate::{
     expr::{sort_expr::PySortExpr, PyExpr},
 };

+#[allow(clippy::manual_c_str_literals)]
+static ARROW_STREAM_NAME: &CStr =
+    unsafe { CStr::from_bytes_with_nul_unchecked(b"arrow_array_stream\0") };
+
+unsafe extern "C" fn drop_stream(capsule: *mut ffi::PyObject) {
+    if capsule.is_null() {
+        return;
+    }
+    let stream_ptr =
+        ffi::PyCapsule_GetPointer(capsule, ARROW_STREAM_NAME.as_ptr()) as *mut FFI_ArrowArrayStream;
+    if !stream_ptr.is_null() {
+        drop(Box::from_raw(stream_ptr));
+    }
+}
+
 // https://github.com/apache/datafusion-python/pull/1016#discussion_r1983239116
 // - we have not decided on the table_provider approach yet
 // this is an interim implementation
@@ -374,11 +390,11 @@ impl Iterator for DataFrameStreamReader {
         // respecting Python signal handling (e.g. ``KeyboardInterrupt``).
         // This mirrors the behaviour of other synchronous wrappers and
         // prevents blocking indefinitely when a Python interrupt is raised.
-        let fut = self.stream.next();
+        let fut = poll_next_batch(&mut self.stream);
         let result = Python::with_gil(|py| wait_for_future(py, fut));

         match result {
-            Ok(Some(Ok(batch))) => {
+            Ok(Ok(Some(batch))) => {
                 let batch = if let Some(ref schema) = self.projection {
                     match record_batch_into_schema(batch, schema.as_ref()) {
                         Ok(b) => b,
@@ -389,8 +405,8 @@ impl Iterator for DataFrameStreamReader {
                 };
                 Some(Ok(batch))
             }
-            Ok(Some(Err(e))) => Some(Err(ArrowError::ExternalError(Box::new(e)))),
-            Ok(None) => None,
+            Ok(Ok(None)) => None,
+            Ok(Err(e)) => Some(Err(ArrowError::ExternalError(Box::new(e)))),
             Err(e) => Some(Err(ArrowError::ExternalError(Box::new(e)))),
         }
     }
@@ -943,7 +959,7 @@ impl PyDataFrame {
             projection = Some(Arc::new(schema.clone()));
         }

-        let schema_ref = projection.clone().unwrap_or_else(|| Arc::new(schema));
+        let schema_ref = Arc::new(schema.clone());

         let reader = DataFrameStreamReader {
             stream,
@@ -952,9 +968,26 @@ impl PyDataFrame {
         };
         let reader: Box<dyn RecordBatchReader + Send> = Box::new(reader);

-        let ffi_stream = FFI_ArrowArrayStream::new(reader);
-        let stream_capsule_name = CString::new("arrow_array_stream").unwrap();
-        PyCapsule::new(py, ffi_stream, Some(stream_capsule_name)).map_err(PyDataFusionError::from)
+        let stream = Box::new(FFI_ArrowArrayStream::new(reader));
+        let stream_ptr = Box::into_raw(stream);
+        assert!(
+            !stream_ptr.is_null(),
+            "ArrowArrayStream pointer should never be null"
+        );
+        let capsule = unsafe {
+            ffi::PyCapsule_New(
+                stream_ptr as *mut c_void,
+                ARROW_STREAM_NAME.as_ptr(),
+                Some(drop_stream),
+            )
+        };
+        if capsule.is_null() {
+            unsafe { drop(Box::from_raw(stream_ptr)) };
+            Err(PyErr::fetch(py).into())
+        } else {
+            let any = unsafe { Bound::from_owned_ptr(py, capsule) };
+            Ok(any.downcast_into::<PyCapsule>().unwrap())
+        }
     }

     fn execute_stream(&self, py: Python) -> PyDataFusionResult<PyRecordBatchStream> {

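One consequence of attaching ``drop_stream`` as the capsule destructor: a stream handed to Python but never imported is still freed when the capsule is garbage collected. A Python-side illustration of that case (purely illustrative, not a test from this commit):

    capsule = df.__arrow_c_stream__()  # PyCapsule named "arrow_array_stream"
    # never passed to pa.RecordBatchReader._import_from_c_capsule(...)
    del capsule                        # destructor runs and drops the FFI_ArrowArrayStream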
src/record_batch.rs

Lines changed: 10 additions & 4 deletions
@@ -84,15 +84,20 @@ impl PyRecordBatchStream {
     }
 }

+pub(crate) async fn poll_next_batch(
+    stream: &mut SendableRecordBatchStream,
+) -> datafusion::error::Result<Option<RecordBatch>> {
+    stream.next().await.transpose()
+}
+
 async fn next_stream(
     stream: Arc<Mutex<SendableRecordBatchStream>>,
     sync: bool,
 ) -> PyResult<PyRecordBatch> {
     let mut stream = stream.lock().await;
-    match stream.next().await {
-        Some(Ok(batch)) => Ok(batch.into()),
-        Some(Err(e)) => Err(PyDataFusionError::from(e))?,
-        None => {
+    match poll_next_batch(&mut stream).await {
+        Ok(Some(batch)) => Ok(batch.into()),
+        Ok(None) => {
             // Depending on whether the iteration is sync or not, we raise either a
             // StopIteration or a StopAsyncIteration
             if sync {
@@ -101,5 +106,6 @@ async fn next_stream(
                 Err(PyStopAsyncIteration::new_err("stream exhausted"))
             }
         }
+        Err(e) => Err(PyDataFusionError::from(e))?,
     }
 }

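``next_stream`` is what ultimately raises ``StopIteration``/``StopAsyncIteration`` for the stream returned by ``execute_stream()``; a usage sketch of the synchronous side (the ``to_pyarrow()`` call is assumed from the existing ``RecordBatch`` wrapper API, not shown in this diff):

    rows = 0
    stream = df.execute_stream()
    for rb in stream:                  # StopIteration once poll_next_batch returns Ok(None)
        rows += rb.to_pyarrow().num_rows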
src/utils.rs

Lines changed: 31 additions & 11 deletions
@@ -85,17 +85,42 @@
     })
 }

+/// Spawn a [`Future`] on the Tokio runtime and wait for completion
+/// while respecting Python signal handling.
+pub(crate) fn spawn_future<F, T>(py: Python, fut: F) -> PyDataFusionResult<T>
+where
+    F: Future<Output = datafusion::common::Result<T>> + Send + 'static,
+    T: Send + 'static,
+{
+    let rt = &get_tokio_runtime().0;
+    let handle: JoinHandle<datafusion::common::Result<T>> = rt.spawn(fut);
+    // Wait for the join handle while respecting Python signal handling.
+    // We handle errors in two steps so `?` maps the error types correctly:
+    // 1) convert any Python-related error from `wait_for_future` into `PyDataFusionError`
+    // 2) convert any DataFusion error (inner result) into `PyDataFusionError`
+    let inner_result = wait_for_future(py, async {
+        // handle.await yields `Result<datafusion::common::Result<T>, JoinError>`
+        // map JoinError into a DataFusion error so the async block returns
+        // `datafusion::common::Result<T>` (i.e. Result<T, DataFusionError>)
+        match handle.await {
+            Ok(inner) => inner,
+            Err(join_err) => Err(to_datafusion_err(join_err)),
+        }
+    })?; // converts PyErr -> PyDataFusionError
+
+    // `inner_result` is `datafusion::common::Result<T>`; use `?` to convert
+    // the inner DataFusion error into `PyDataFusionError` via `From` and
+    // return the inner `T` on success.
+    Ok(inner_result?)
+}
+
 /// Spawn a [`SendableRecordBatchStream`] on the Tokio runtime and wait for completion
 /// while respecting Python signal handling.
 pub(crate) fn spawn_stream<F>(py: Python, fut: F) -> PyDataFusionResult<SendableRecordBatchStream>
 where
     F: Future<Output = datafusion::common::Result<SendableRecordBatchStream>> + Send + 'static,
 {
-    let rt = &get_tokio_runtime().0;
-    let handle: JoinHandle<datafusion::common::Result<SendableRecordBatchStream>> = rt.spawn(fut);
-    Ok(wait_for_future(py, async {
-        handle.await.map_err(to_datafusion_err)
-    })???)
+    spawn_future(py, fut)
 }

 /// Spawn a partitioned [`SendableRecordBatchStream`] on the Tokio runtime and wait for completion
@@ -107,12 +132,7 @@ pub(crate) fn spawn_streams<F>(
 where
     F: Future<Output = datafusion::common::Result<Vec<SendableRecordBatchStream>>> + Send + 'static,
 {
-    let rt = &get_tokio_runtime().0;
-    let handle: JoinHandle<datafusion::common::Result<Vec<SendableRecordBatchStream>>> =
-        rt.spawn(fut);
-    Ok(wait_for_future(py, async {
-        handle.await.map_err(to_datafusion_err)
-    })???)
+    spawn_future(py, fut)
 }

 pub(crate) fn parse_volatility(value: &str) -> PyDataFusionResult<Volatility> {

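The net effect of routing both helpers through ``spawn_future`` is that blocking calls keep honouring Python signals; a hedged illustration of the user-visible behaviour (assuming a long-running query, not a test from this commit):

    try:
        batches = df.collect()     # runs on the Tokio runtime under the hood
    except KeyboardInterrupt:
        print("query cancelled")   # wait_for_future surfaces the interrupt instead of blocking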