feat: add streaming utilities, range support, and improve async handling in DataFrame

kosiew · kosiew · commit 91ccd1ed4496 · 2025-09-02T16:35:08.000+08:00
- Add `range` method to SessionContext and iterator support to DataFrame
- Introduce `spawn_stream` utility and refactor async execution for
  better signal handling
- Add tests for `KeyboardInterrupt` in `__arrow_c_stream__` and
  incremental DataFrame streaming
- Improve memory usage tracking in tests with psutil
- Update DataFrame docs with PyArrow streaming section and enhance
  `__arrow_c_stream__` documentation
- Replace Tokio runtime creation with `spawn_stream` in PySessionContext
- Set datafusion packages to 49.0.1 (the FFI example moves down from 49.0.2) and update dependencies
- Remove unused imports and restore main Cargo.toml
diff --git a/docs/source/user-guide/dataframe/index.rst b/docs/source/user-guide/dataframe/index.rst
@@ -145,10 +145,31 @@ To materialize the results of your DataFrame operations:
     
     # Display results
     df.show()                         # Print tabular format to console
-    
+
     # Count rows
     count = df.count()
 
+PyArrow Streaming
+-----------------
+
+DataFusion DataFrames implement the ``__arrow_c_stream__`` protocol, enabling
+zero-copy streaming into libraries like `PyArrow <https://arrow.apache.org/docs/python/>`_.
+Earlier versions eagerly converted the entire DataFrame when exporting to
+PyArrow, which could exhaust memory on large datasets. With streaming, batches
+are produced lazily so you can process arbitrarily large results without
+out-of-memory errors.
+
+.. code-block:: python
+
+    import pyarrow as pa
+
+    # Create a PyArrow RecordBatchReader without materializing all batches
+    reader = pa.RecordBatchReader.from_stream(df)
+    for batch in reader:
+        ...  # process each batch as it is produced
+
+See :doc:`../io/arrow` for additional details on the Arrow interface.
+
 HTML Rendering
 --------------
 
diff --git a/examples/datafusion-ffi-example/Cargo.toml b/examples/datafusion-ffi-example/Cargo.toml
@@ -21,8 +21,8 @@ version = "0.2.0"
 edition = "2021"
 
 [dependencies]
-datafusion = { version = "49.0.2" }
-datafusion-ffi = { version = "49.0.2" }
+datafusion = { version = "49.0.1" }
+datafusion-ffi = { version = "49.0.1" }
 pyo3 = { version = "0.23", features = ["extension-module", "abi3", "abi3-py39"] }
 arrow = { version = "55.0.0" }
 arrow-array = { version = "55.0.0" }
diff --git a/python/datafusion/context.py b/python/datafusion/context.py
@@ -731,6 +731,37 @@ def from_polars(self, data: pl.DataFrame, name: str | None = None) -> DataFrame:
         """
         return DataFrame(self.ctx.from_polars(data, name))
 
+    def range(
+        self,
+        start: int,
+        stop: int | None = None,
+        step: int = 1,
+        partitions: int | None = None,
+    ) -> DataFrame:
+        """Build a DataFrame holding a sequence of integers.
+
+        The sequence is produced by issuing a ``SELECT * FROM range(...)`` query,
+        so the values come from DataFusion's ``range`` table function rather than
+        from a Python-side list. Calling the method with a single bound, e.g.
+        ``ctx.range(5)``, counts from zero up to (but excluding) that bound.
+
+        Args:
+            start: First value of the sequence, or the exclusive stop when
+                ``stop`` is ``None``.
+            stop: Exclusive upper bound of the sequence.
+            step: Distance between consecutive values.
+            partitions: Optional partition count, appended to the call as a
+                fourth argument when given.
+
+        Returns:
+            DataFrame yielding the requested range of values.
+        """
+        # Every argument is coerced with ``int`` before being interpolated, so
+        # only integer literals ever reach the SQL text.
+        extra = [] if partitions is None else [int(partitions)]
+        lower, upper = (0, start) if stop is None else (start, stop)
+        args = [int(lower), int(upper), int(step), *extra]
+        query = f"SELECT * FROM range({', '.join(map(str, args))})"  # noqa: S608
+        return self.sql(query)
+
     # https://github.com/apache/datafusion-python/pull/1016#discussion_r1983239116
     # is the discussion on how we arrived at adding register_view
     def register_view(self, name: str, df: DataFrame) -> None:
diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py
@@ -26,6 +26,7 @@
     TYPE_CHECKING,
     Any,
     Iterable,
+    Iterator,
     Literal,
     Optional,
     Union,
@@ -1098,21 +1099,37 @@ def unnest_columns(self, *columns: str, preserve_nulls: bool = True) -> DataFram
         return DataFrame(self.df.unnest_columns(columns, preserve_nulls=preserve_nulls))
 
     def __arrow_c_stream__(self, requested_schema: object | None = None) -> object:
-        """Export an Arrow PyCapsule Stream.
+        """Export the DataFrame as an Arrow C Stream.
 
-        This will execute and collect the DataFrame. We will attempt to respect the
-        requested schema, but only trivial transformations will be applied such as only
-        returning the fields listed in the requested schema if their data types match
-        those in the DataFrame.
+        The call is forwarded to the wrapped Rust DataFrame, which exposes the
+        results via Arrow's C Stream interface. Record batches are intended to be
+        produced incrementally, so the export does not need to collect the full
+        result set first. When ``requested_schema`` is provided, only
+        straightforward projections such as column selection or reordering are
+        applied.
 
         Args:
             requested_schema: Attempt to provide the DataFrame using this schema.
 
         Returns:
-            Arrow PyCapsule object.
+            Arrow PyCapsule object representing an ``ArrowArrayStream``.
         """
+        # NOTE(review): the Rust side is expected to use ``execute_stream`` to
+        # stream batches one at a time -- confirm against ``src/dataframe.rs``.
         return self.df.__arrow_c_stream__(requested_schema)
 
+    def __iter__(self) -> Iterator[pa.RecordBatch]:
+        """Yield record batches from the DataFrame without materializing results.
+
+        The batches are pulled through the Arrow C Stream interface that this
+        class exposes via ``__arrow_c_stream__``, so callers such as
+        :func:`pyarrow.Table.from_batches` can consume them one at a time and
+        ``collect`` is never invoked.
+
+        Yields:
+            Each :class:`pyarrow.RecordBatch` produced by the stream.
+        """
+        import pyarrow as pa
+
+        # ``from_stream`` is PyArrow's public entry point for any object that
+        # implements ``__arrow_c_stream__``; it unwraps the PyCapsule itself. The
+        # private ``_import_from_c`` helper is documented to take a raw pointer
+        # address rather than a capsule.
+        reader = pa.RecordBatchReader.from_stream(self)
+        yield from reader
+
     def transform(self, func: Callable[..., DataFrame], *args: Any) -> DataFrame:
         """Apply a function to the current DataFrame which returns another DataFrame.
 
diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py
@@ -1582,6 +1582,29 @@ def test_empty_to_arrow_table(df):
     assert set(pyarrow_table.column_names) == {"a", "b", "c"}
 
 
+def test_arrow_c_stream_to_table(monkeypatch):
+    """Iterating a DataFrame feeds ``pa.Table.from_batches`` without ``collect``."""
+    ctx = SessionContext()
+
+    # Create a DataFrame with two separate record batches, each passed in its
+    # own inner list (presumably one list per partition -- confirm).
+    batch1 = pa.record_batch([pa.array([1])], names=["a"])
+    batch2 = pa.record_batch([pa.array([2])], names=["a"])
+    df = ctx.create_dataframe([[batch1], [batch2]])
+
+    # Fail if the DataFrame is pre-collected: any call to ``collect`` while the
+    # table is being built raises instead of silently materializing the data.
+    def fail_collect(self):  # pragma: no cover - failure path
+        msg = "collect should not be called"
+        raise AssertionError(msg)
+
+    monkeypatch.setattr(DataFrame, "collect", fail_collect)
+
+    # ``from_batches`` iterates ``df``, which exercises ``DataFrame.__iter__``.
+    table = pa.Table.from_batches(df)
+    expected = pa.Table.from_batches([batch1, batch2])
+
+    assert table.equals(expected)
+    assert table.schema == df.schema()
+    # Two chunks means the batches arrived separately rather than concatenated.
+    assert table.column("a").num_chunks == 2
+
+
 def test_to_pylist(df):
     # Convert datafusion dataframe to Python list
     pylist = df.to_pylist()
@@ -2666,6 +2689,110 @@ def trigger_interrupt():
     interrupt_thread.join(timeout=1.0)
 
 
+def test_arrow_c_stream_interrupted():
+    """__arrow_c_stream__ responds to ``KeyboardInterrupt`` signals.
+
+    Similar to ``test_collect_interrupted`` this test issues a long running
+    query, but consumes the results via ``__arrow_c_stream__``. A helper thread
+    raises ``KeyboardInterrupt`` in the main thread and the test verifies that
+    reading the stream stops with that exception.
+    """
+
+    ctx = SessionContext()
+
+    # Build ten 1000-row batches with an integer and a string column.
+    batches = []
+    for i in range(10):
+        batch = pa.RecordBatch.from_arrays(
+            [
+                pa.array(list(range(i * 1000, (i + 1) * 1000))),
+                pa.array([f"value_{j}" for j in range(i * 1000, (i + 1) * 1000)]),
+            ],
+            names=["a", "b"],
+        )
+        batches.append(batch)
+
+    ctx.register_record_batches("t1", [batches])
+    ctx.register_record_batches("t2", [batches])
+
+    # The cross joins and the modulo join inflate the result so that reading
+    # the stream takes long enough for the interrupt to arrive.
+    df = ctx.sql(
+        """
+        WITH t1_expanded AS (
+            SELECT
+                a,
+                b,
+                CAST(a AS DOUBLE) / 1.5 AS c,
+                CAST(a AS DOUBLE) * CAST(a AS DOUBLE) AS d
+            FROM t1
+            CROSS JOIN (SELECT 1 AS dummy FROM t1 LIMIT 5)
+        ),
+        t2_expanded AS (
+            SELECT
+                a,
+                b,
+                CAST(a AS DOUBLE) * 2.5 AS e,
+                CAST(a AS DOUBLE) * CAST(a AS DOUBLE) * CAST(a AS DOUBLE) AS f
+            FROM t2
+            CROSS JOIN (SELECT 1 AS dummy FROM t2 LIMIT 5)
+        )
+        SELECT
+            t1.a, t1.b, t1.c, t1.d,
+            t2.a AS a2, t2.b AS b2, t2.e, t2.f
+        FROM t1_expanded t1
+        JOIN t2_expanded t2 ON t1.a % 100 = t2.a % 100
+        WHERE t1.a > 100 AND t2.a > 100
+        """
+    )
+
+    # ``from_stream`` is the public PyArrow API for objects implementing
+    # ``__arrow_c_stream__``; the private ``_import_from_c`` helper is
+    # documented to take a raw pointer address, not the exported PyCapsule.
+    reader = pa.RecordBatchReader.from_stream(df)
+
+    interrupted = False
+    interrupt_error = None
+    query_started = threading.Event()
+    max_wait_time = 5.0
+
+    def trigger_interrupt():
+        # Wait (bounded by ``max_wait_time``) until the main thread signals
+        # that it is about to start reading the stream.
+        start_time = time.time()
+        while not query_started.is_set():
+            time.sleep(0.1)
+            if time.time() - start_time > max_wait_time:
+                msg = f"Query did not start within {max_wait_time} seconds"
+                raise RuntimeError(msg)
+
+        thread_id = threading.main_thread().ident
+        if thread_id is None:
+            msg = "Cannot get main thread ID"
+            raise RuntimeError(msg)
+
+        # Schedule a KeyboardInterrupt in the main thread.
+        exception = ctypes.py_object(KeyboardInterrupt)
+        res = ctypes.pythonapi.PyThreadState_SetAsyncExc(
+            ctypes.c_long(thread_id), exception
+        )
+        if res != 1:
+            # Anything other than exactly one affected thread is a failure;
+            # clear the pending exception again before reporting it.
+            ctypes.pythonapi.PyThreadState_SetAsyncExc(
+                ctypes.c_long(thread_id), ctypes.py_object(0)
+            )
+            msg = "Failed to raise KeyboardInterrupt in main thread"
+            raise RuntimeError(msg)
+
+    interrupt_thread = threading.Thread(target=trigger_interrupt)
+    interrupt_thread.daemon = True
+    interrupt_thread.start()
+
+    try:
+        query_started.set()
+        # consume the reader which should block and be interrupted
+        reader.read_all()
+    except KeyboardInterrupt:
+        interrupted = True
+    except Exception as e:  # pragma: no cover - unexpected errors
+        interrupt_error = e
+
+    if not interrupted:
+        pytest.fail(f"Stream was not interrupted; got error: {interrupt_error}")
+
+    interrupt_thread.join(timeout=1.0)
+
+
 def test_show_select_where_no_rows(capsys) -> None:
     ctx = SessionContext()
     df = ctx.sql("SELECT 1 WHERE 1=0")
diff --git a/python/tests/test_io.py b/python/tests/test_io.py
@@ -17,6 +17,7 @@
 from pathlib import Path
 
 import pyarrow as pa
+import pytest
 from datafusion import column
 from datafusion.io import read_avro, read_csv, read_json, read_parquet
 
@@ -92,3 +93,30 @@ def test_read_avro():
     path = Path.cwd() / "testing/data/avro/alltypes_plain.avro"
     avro_df = read_avro(path=path)
     assert avro_df is not None
+
+
+def test_arrow_c_stream_large_dataset(ctx):
+    """DataFrame.__arrow_c_stream__ yields batches incrementally.
+
+    This test constructs a DataFrame that would be far larger than available
+    memory if materialized. The ``__arrow_c_stream__`` method should expose a
+    stream of record batches without collecting the full dataset, so reading a
+    handful of batches should not exhaust process memory.
+    """
+    # Skip up front when psutil is missing, before any query is set up.
+    psutil = pytest.importorskip("psutil")
+
+    # Create a very large DataFrame using range (2**40 rows); this would be
+    # terabytes if collected
+    df = ctx.range(0, 1 << 40)
+
+    # ``from_stream`` is the public PyArrow API for objects implementing
+    # ``__arrow_c_stream__``; the private ``_import_from_c`` helper is
+    # documented to take a raw pointer address, not the exported PyCapsule.
+    reader = pa.RecordBatchReader.from_stream(df)
+
+    # Track RSS before consuming batches
+    process = psutil.Process()
+    start_rss = process.memory_info().rss
+
+    for _ in range(5):
+        batch = reader.read_next_batch()
+        assert batch is not None
+        assert len(batch) > 0
+        current_rss = process.memory_info().rss
+        # Ensure memory usage hasn't grown substantially (>50MB)
+        assert current_rss - start_rss < 50 * 1024 * 1024
diff --git a/src/context.rs b/src/context.rs
@@ -34,7 +34,7 @@ use pyo3::prelude::*;
 use crate::catalog::{PyCatalog, PyTable, RustWrappedPyCatalogProvider};
 use crate::dataframe::PyDataFrame;
 use crate::dataset::Dataset;
-use crate::errors::{py_datafusion_err, to_datafusion_err, PyDataFusionResult};
+use crate::errors::{py_datafusion_err, PyDataFusionResult};
 use crate::expr::sort_expr::PySortExpr;
 use crate::physical_plan::PyExecutionPlan;
 use crate::record_batch::PyRecordBatchStream;
@@ -45,7 +45,7 @@ use crate::udaf::PyAggregateUDF;
 use crate::udf::PyScalarUDF;
 use crate::udtf::PyTableFunction;
 use crate::udwf::PyWindowUDF;
-use crate::utils::{get_global_ctx, get_tokio_runtime, validate_pycapsule, wait_for_future};
+use crate::utils::{get_global_ctx, spawn_stream, validate_pycapsule, wait_for_future};
 use datafusion::arrow::datatypes::{DataType, Schema, SchemaRef};
 use datafusion::arrow::pyarrow::PyArrowType;
 use datafusion::arrow::record_batch::RecordBatch;
@@ -66,15 +66,13 @@ use datafusion::execution::disk_manager::DiskManagerMode;
 use datafusion::execution::memory_pool::{FairSpillPool, GreedyMemoryPool, UnboundedMemoryPool};
 use datafusion::execution::options::ReadOptions;
 use datafusion::execution::runtime_env::RuntimeEnvBuilder;
-use datafusion::physical_plan::SendableRecordBatchStream;
 use datafusion::prelude::{
     AvroReadOptions, CsvReadOptions, DataFrame, NdJsonReadOptions, ParquetReadOptions,
 };
 use datafusion_ffi::catalog_provider::{FFI_CatalogProvider, ForeignCatalogProvider};
 use datafusion_ffi::table_provider::{FFI_TableProvider, ForeignTableProvider};
 use pyo3::types::{PyCapsule, PyDict, PyList, PyTuple, PyType};
 use pyo3::IntoPyObjectExt;
-use tokio::task::JoinHandle;
 
 /// Configuration options for a SessionContext
 #[pyclass(name = "SessionConfig", module = "datafusion", subclass)]
@@ -1132,12 +1130,8 @@ impl PySessionContext {
         py: Python,
     ) -> PyDataFusionResult<PyRecordBatchStream> {
         let ctx: TaskContext = TaskContext::from(&self.ctx.state());
-        // create a Tokio runtime to run the async code
-        let rt = &get_tokio_runtime().0;
         let plan = plan.plan.clone();
-        let fut: JoinHandle<datafusion::common::Result<SendableRecordBatchStream>> =
-            rt.spawn(async move { plan.execute(part, Arc::new(ctx)) });
-        let stream = wait_for_future(py, async { fut.await.map_err(to_datafusion_err) })???;
+        let stream = spawn_stream(py, async move { plan.execute(part, Arc::new(ctx)) })?;
         Ok(PyRecordBatchStream::new(stream))
     }
 }
diff --git a/src/dataframe.rs b/src/dataframe.rs
diff --git a/src/utils.rs b/src/utils.rs