@@ -67,11 +67,25 @@ unsafe extern "C" fn drop_stream(capsule: *mut ffi::PyObject) {
     if capsule.is_null() {
         return;
     }
-    let stream_ptr =
-        ffi::PyCapsule_GetPointer(capsule, ARROW_STREAM_NAME.as_ptr()) as *mut FFI_ArrowArrayStream;
-    if !stream_ptr.is_null() {
-        drop(Box::from_raw(stream_ptr));
+
+    // When PyArrow imports this capsule it steals the raw stream pointer and
+    // sets the capsule's internal pointer to NULL. In that case
+    // `PyCapsule_IsValid` returns 0 and this destructor must not drop the
+    // stream, as ownership has been transferred to PyArrow. If the capsule
+    // was never imported, the pointer remains valid and we are responsible
+    // for freeing the stream here.
+    if ffi::PyCapsule_IsValid(capsule, ARROW_STREAM_NAME.as_ptr()) == 1 {
+        let stream_ptr = ffi::PyCapsule_GetPointer(capsule, ARROW_STREAM_NAME.as_ptr())
+            as *mut FFI_ArrowArrayStream;
+        if !stream_ptr.is_null() {
+            drop(Box::from_raw(stream_ptr));
+        }
     }
+
+    // `PyCapsule_GetPointer` sets a Python error on failure. Clear it only
+    // after the stream has been released (or determined to be owned
+    // elsewhere).
+    ffi::PyErr_Clear();
 }
 
 // https://github.com/apache/datafusion-python/pull/1016#discussion_r1983239116
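For readers unfamiliar with the capsule protocol: the destructor above frees the stream only when no consumer has taken ownership of it. Below is a minimal, self-contained sketch of the same pattern using pyo3's raw FFI bindings. `MyPayload`, `CAPSULE_NAME`, `drop_payload`, and `make_capsule` are hypothetical names, not part of this PR, and the sketch assumes an initialized Python interpreter with the GIL held.

```rust
use std::ffi::c_void;
use std::os::raw::c_char;

use pyo3::ffi;

struct MyPayload; // hypothetical stand-in for FFI_ArrowArrayStream

const CAPSULE_NAME: &[u8] = b"my_payload\0"; // hypothetical capsule name, NUL-terminated

unsafe extern "C" fn drop_payload(capsule: *mut ffi::PyObject) {
    // A consumer that imported the capsule is expected to have invalidated
    // it, so only a still-valid capsule leaves us owning the allocation.
    if ffi::PyCapsule_IsValid(capsule, CAPSULE_NAME.as_ptr() as *const c_char) == 1 {
        let ptr = ffi::PyCapsule_GetPointer(capsule, CAPSULE_NAME.as_ptr() as *const c_char)
            as *mut MyPayload;
        if !ptr.is_null() {
            drop(Box::from_raw(ptr));
        }
    }
    // `PyCapsule_GetPointer` raises on a name mismatch or NULL pointer; a
    // capsule destructor must not leave a pending Python exception behind.
    ffi::PyErr_Clear();
}

// Producer side: heap-allocate the payload and hand the raw pointer to the
// capsule, registering `drop_payload` as its destructor.
fn make_capsule() -> *mut ffi::PyObject {
    let payload = Box::into_raw(Box::new(MyPayload));
    unsafe {
        ffi::PyCapsule_New(
            payload as *mut c_void,
            CAPSULE_NAME.as_ptr() as *const c_char,
            Some(drop_payload),
        )
    }
}
```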
@@ -369,50 +383,59 @@ impl PyDataFrame {
         Ok(html_str)
     }
 }
-/// Synchronous wrapper around a [`SendableRecordBatchStream`] used for
-/// the `__arrow_c_stream__` implementation.
+
+/// Synchronous wrapper around partitioned [`SendableRecordBatchStream`]s used
+/// for the `__arrow_c_stream__` implementation.
 ///
-/// It uses `runtime.block_on` to consume the underlying async stream,
-/// providing synchronous iteration. When a `projection` is set, each
-/// batch is converted via `record_batch_into_schema` to apply schema
-/// changes per batch.
-struct DataFrameStreamReader {
-    stream: SendableRecordBatchStream,
+/// It drains each partition's stream sequentially, yielding record batches in
+/// their original partition order. When a `projection` is set, each batch is
+/// converted via `record_batch_into_schema` to apply schema changes per batch.
+struct PartitionedDataFrameStreamReader {
+    streams: Vec<SendableRecordBatchStream>,
     schema: SchemaRef,
     projection: Option<SchemaRef>,
+    current: usize,
 }
 
-impl Iterator for DataFrameStreamReader {
+impl Iterator for PartitionedDataFrameStreamReader {
     type Item = Result<RecordBatch, ArrowError>;
 
     fn next(&mut self) -> Option<Self::Item> {
-        // Use wait_for_future to poll the underlying async stream while
-        // respecting Python signal handling (e.g. ``KeyboardInterrupt``).
-        // This mirrors the behaviour of other synchronous wrappers and
-        // prevents blocking indefinitely when a Python interrupt is raised.
-        let fut = poll_next_batch(&mut self.stream);
-        let result = Python::with_gil(|py| wait_for_future(py, fut));
-
-        match result {
-            Ok(Ok(Some(batch))) => {
-                let batch = if let Some(ref schema) = self.projection {
-                    match record_batch_into_schema(batch, schema.as_ref()) {
-                        Ok(b) => b,
-                        Err(e) => return Some(Err(e)),
-                    }
-                } else {
-                    batch
-                };
-                Some(Ok(batch))
+        while self.current < self.streams.len() {
+            let stream = &mut self.streams[self.current];
+            let fut = poll_next_batch(stream);
+            let result = Python::with_gil(|py| wait_for_future(py, fut));
+
+            match result {
+                Ok(Ok(Some(batch))) => {
+                    let batch = if let Some(ref schema) = self.projection {
+                        match record_batch_into_schema(batch, schema.as_ref()) {
+                            Ok(b) => b,
+                            Err(e) => return Some(Err(e)),
+                        }
+                    } else {
+                        batch
+                    };
+                    return Some(Ok(batch));
+                }
+                Ok(Ok(None)) => {
+                    self.current += 1;
+                    continue;
+                }
+                Ok(Err(e)) => {
+                    return Some(Err(ArrowError::ExternalError(Box::new(e))));
+                }
+                Err(e) => {
+                    return Some(Err(ArrowError::ExternalError(Box::new(e))));
+                }
             }
-            Ok(Ok(None)) => None,
-            Ok(Err(e)) => Some(Err(ArrowError::ExternalError(Box::new(e)))),
-            Err(e) => Some(Err(ArrowError::ExternalError(Box::new(e)))),
         }
+
+        None
     }
 }
 
-impl RecordBatchReader for DataFrameStreamReader {
+impl RecordBatchReader for PartitionedDataFrameStreamReader {
     fn schema(&self) -> SchemaRef {
         self.schema.clone()
     }
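Stripped of the async polling, GIL handling, and projection logic, the iterator above reduces to draining a `Vec` of sources in order, advancing `current` only once a source is exhausted. A minimal sketch of that control flow with plain iterators standing in for `SendableRecordBatchStream` (`ChainedPartitions` is a hypothetical name, not the PR's type):

```rust
/// Drains each partition before moving to the next, preserving partition order.
struct ChainedPartitions<I> {
    partitions: Vec<I>,
    current: usize,
}

impl<I: Iterator> Iterator for ChainedPartitions<I> {
    type Item = I::Item;

    fn next(&mut self) -> Option<Self::Item> {
        while self.current < self.partitions.len() {
            // Keep pulling from the current partition; only advance when it
            // reports exhaustion, mirroring the `Ok(Ok(None))` arm above.
            if let Some(item) = self.partitions[self.current].next() {
                return Some(item);
            }
            self.current += 1;
        }
        None
    }
}

fn main() {
    let reader = ChainedPartitions {
        partitions: vec![vec![1, 2].into_iter(), Vec::new().into_iter(), vec![3].into_iter()],
        current: 0,
    };
    // The empty middle partition is skipped transparently.
    assert_eq!(reader.collect::<Vec<_>>(), vec![1, 2, 3]);
}
```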
@@ -944,7 +967,7 @@ impl PyDataFrame {
         requested_schema: Option<Bound<'py, PyCapsule>>,
     ) -> PyDataFusionResult<Bound<'py, PyCapsule>> {
         let df = self.df.as_ref().clone();
-        let stream = spawn_stream(py, async move { df.execute_stream().await })?;
+        let streams = spawn_streams(py, async move { df.execute_stream_partitioned().await })?;
 
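The `Vec<SendableRecordBatchStream>` produced here is ultimately exported through `FFI_ArrowArrayStream`, as the next hunk shows. That export/import hand-off can be exercised in isolation with arrow-rs's public API. A sketch (independent of this PR, and in-process rather than across the Python capsule boundary):

```rust
use std::sync::Arc;

use arrow::array::{ArrayRef, Int32Array, RecordBatchIterator, RecordBatchReader};
use arrow::datatypes::{DataType, Field, Schema};
use arrow::error::ArrowError;
use arrow::ffi_stream::{ArrowArrayStreamReader, FFI_ArrowArrayStream};
use arrow::record_batch::RecordBatch;

fn main() -> Result<(), ArrowError> {
    let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]));
    let batch = RecordBatch::try_new(
        schema.clone(),
        vec![Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef],
    )?;
    let reader = RecordBatchIterator::new([Ok(batch)], schema);

    // Export: move the reader behind the C stream interface, the same role
    // `FFI_ArrowArrayStream::new(reader)` plays in the diff below.
    let stream = FFI_ArrowArrayStream::new(Box::new(reader));

    // Import: a consumer takes ownership of the stream and drives it to
    // completion; after import, the exporter must no longer free it.
    let mut imported = ArrowArrayStreamReader::try_new(stream)?;
    assert_eq!(imported.schema().fields().len(), 1);
    assert_eq!(imported.next().unwrap()?.num_rows(), 3);
    Ok(())
}
```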
949972 let mut schema: Schema = self . df . schema ( ) . to_owned ( ) . into ( ) ;
950973 let mut projection: Option < SchemaRef > = None ;
@@ -961,19 +984,24 @@ impl PyDataFrame {
 
         let schema_ref = Arc::new(schema.clone());
 
-        let reader = DataFrameStreamReader {
-            stream,
+        let reader = PartitionedDataFrameStreamReader {
+            streams,
             schema: schema_ref,
             projection,
+            current: 0,
         };
         let reader: Box<dyn RecordBatchReader + Send> = Box::new(reader);
 
         let stream = Box::new(FFI_ArrowArrayStream::new(reader));
         let stream_ptr = Box::into_raw(stream);
-        assert!(
+        debug_assert!(
             !stream_ptr.is_null(),
-            "ArrowArrayStream pointer should never be null"
+            "ArrowArrayStream pointer should never be null",
         );
+        // The returned capsule allows zero-copy hand-off to PyArrow. When
+        // PyArrow imports the capsule it assumes ownership of the stream and
+        // nulls out the capsule's internal pointer so `drop_stream` knows not
+        // to free it.
         let capsule = unsafe {
             ffi::PyCapsule_New(
                 stream_ptr as *mut c_void,