Skip to content

Commit be24c77

Browse files
refactor: simplify PyCapsule interface using pyo3-arrow (#364)
* Simplify pycapsule interface using pyo3-arrow
* docs: improve docstring
* chore: remove useless clones

---------

Co-authored-by: Eric Jolibois <em.jolibois@gmail.com>
1 parent 48fd483 commit be24c77

File tree

8 files changed

+586
-344
lines changed

8 files changed

+586
-344
lines changed

Cargo.lock

Lines changed: 459 additions & 152 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ calamine = { version = "^0.30.0", features = ["dates"] }
3232
chrono = { version = "^0.4.41", default-features = false }
3333
log = "0.4.27"
3434
pyo3 = { version = "^0.25", features = ["abi3-py39"] }
35+
pyo3-arrow = { version = "0.11", default-features = false }
3536
pyo3-log = "^0.12.4"
3637

3738
[dev-dependencies]

python/fastexcel/__init__.py

Lines changed: 24 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -142,18 +142,24 @@ def to_polars(self) -> "pl.DataFrame":
142142
return pl.DataFrame(self)
143143

144144
def __arrow_c_schema__(self) -> object:
145-
"""Arrow PyCapsule Interface: Export schema as a PyCapsule.
145+
"""Export the schema as an `ArrowSchema` `PyCapsule`.
146146
147-
This method allows zero-copy data exchange with Arrow-compatible libraries
148-
like Polars without requiring PyArrow as a dependency.
147+
https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowschema-export
148+
149+
The Arrow PyCapsule Interface enables zero-copy data exchange with
150+
Arrow-compatible libraries without requiring PyArrow as a dependency.
149151
"""
150152
return self._sheet.__arrow_c_schema__()
151153

152154
def __arrow_c_array__(self, requested_schema: object | None = None) -> tuple[object, object]:
153-
"""Arrow PyCapsule Interface: Export array and schema as PyCapsules.
155+
"""Export the schema and data as a pair of `ArrowSchema` and `ArrowArray` `PyCapsules`.
156+
157+
The optional `requested_schema` parameter allows for potential schema conversion.
158+
159+
https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowarray-export
154160
155-
Returns data as a tuple of (schema_capsule, array_capsule) for zero-copy
156-
data exchange with Arrow-compatible libraries.
161+
The Arrow PyCapsule Interface enables zero-copy data exchange with
162+
Arrow-compatible libraries without requiring PyArrow as a dependency.
157163
"""
158164
return self._sheet.__arrow_c_array__(requested_schema)
159165

@@ -240,18 +246,24 @@ def to_polars(self) -> "pl.DataFrame":
240246
return pl.DataFrame(self)
241247

242248
def __arrow_c_schema__(self) -> object:
243-
"""Arrow PyCapsule Interface: Export schema as a PyCapsule.
249+
"""Export the schema as an `ArrowSchema` `PyCapsule`.
244250
245-
This method allows zero-copy data exchange with Arrow-compatible libraries
246-
like Polars without requiring PyArrow as a dependency.
251+
https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowschema-export
252+
253+
The Arrow PyCapsule Interface enables zero-copy data exchange with
254+
Arrow-compatible libraries without requiring PyArrow as a dependency.
247255
"""
248256
return self._table.__arrow_c_schema__()
249257

250258
def __arrow_c_array__(self, requested_schema: object | None = None) -> tuple[object, object]:
251-
"""Arrow PyCapsule Interface: Export array and schema as PyCapsules.
259+
"""Export the schema and data as a pair of `ArrowSchema` and `ArrowArray` `PyCapsules`.
260+
261+
The optional `requested_schema` parameter allows for potential schema conversion.
262+
263+
https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowarray-export
252264
253-
Returns data as a tuple of (schema_capsule, array_capsule) for zero-copy
254-
data exchange with Arrow-compatible libraries.
265+
The Arrow PyCapsule Interface enables zero-copy data exchange with
266+
Arrow-compatible libraries without requiring PyArrow as a dependency.
255267
"""
256268
return self._table.__arrow_c_array__(requested_schema)
257269

python/fastexcel/_fastexcel.pyi

Lines changed: 33 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -92,9 +92,23 @@ class _ExcelSheet:
9292
therefore converted to None.
9393
"""
9494
def __arrow_c_schema__(self) -> object:
95-
"""Arrow PyCapsule Interface: Export schema as a PyCapsule"""
95+
"""Export the schema as an `ArrowSchema` `PyCapsule`.
96+
97+
https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowschema-export
98+
99+
The Arrow PyCapsule Interface enables zero-copy data exchange with
100+
Arrow-compatible libraries without requiring PyArrow as a dependency.
101+
"""
96102
def __arrow_c_array__(self, requested_schema: object = None) -> tuple[object, object]:
97-
"""Arrow PyCapsule Interface: Export array and schema as PyCapsules"""
103+
"""Export the schema and data as a pair of `ArrowSchema` and `ArrowArray` `PyCapsules`.
104+
105+
The optional `requested_schema` parameter allows for potential schema conversion.
106+
107+
https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowarray-export
108+
109+
The Arrow PyCapsule Interface enables zero-copy data exchange with
110+
Arrow-compatible libraries without requiring PyArrow as a dependency.
111+
"""
98112

99113
class _ExcelTable:
100114
@property
@@ -126,9 +140,24 @@ class _ExcelTable:
126140
def to_arrow(self) -> "pa.RecordBatch":
127141
"""Converts the table to a pyarrow `RecordBatch`"""
128142
def __arrow_c_schema__(self) -> object:
129-
"""Arrow PyCapsule Interface: Export schema as a PyCapsule"""
143+
"""Export the schema as an `ArrowSchema` `PyCapsule`.
144+
145+
https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowschema-export
146+
147+
The Arrow PyCapsule Interface enables zero-copy data exchange with
148+
Arrow-compatible libraries without requiring PyArrow as a dependency.
149+
"""
150+
130151
def __arrow_c_array__(self, requested_schema: object = None) -> tuple[object, object]:
131-
"""Arrow PyCapsule Interface: Export array and schema as PyCapsules"""
152+
"""Export the schema and data as a pair of `ArrowSchema` and `ArrowArray` `PyCapsules`.
153+
154+
The optional `requested_schema` parameter allows for potential schema conversion.
155+
156+
https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowarray-export
157+
158+
The Arrow PyCapsule Interface enables zero-copy data exchange with
159+
Arrow-compatible libraries without requiring PyArrow as a dependency.
160+
"""
132161

133162
class _ExcelReader:
134163
"""A class representing an open Excel file and allowing to read its sheets"""

src/arrow_capsule.rs

Lines changed: 0 additions & 123 deletions
This file was deleted.

src/lib.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
mod arrow_capsule;
21
mod data;
32
mod error;
43
mod types;

src/types/python/excelsheet/mod.rs

Lines changed: 36 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,15 @@
11
pub(crate) mod column_info;
22
pub(crate) mod table;
33

4+
use arrow_schema::Field;
45
use calamine::{CellType, Range, Sheet as CalamineSheet, SheetVisible as CalamineSheetVisible};
56
use column_info::{AvailableColumns, ColumnInfoNoDtype};
7+
use pyo3::types::{PyCapsule, PyTuple};
8+
use pyo3_arrow::ffi::{to_array_pycapsules, to_schema_pycapsule};
9+
use std::sync::Arc;
610
use std::{cmp, collections::HashSet, fmt::Debug, str::FromStr};
711

8-
use arrow_array::RecordBatch;
12+
use arrow_array::{RecordBatch, StructArray};
913
#[cfg(feature = "pyarrow")]
1014
use arrow_pyarrow::ToPyArrow;
1115

@@ -15,7 +19,7 @@ use pyo3::{
1519
types::{PyList, PyString},
1620
};
1721

18-
use crate::arrow_capsule;
22+
use crate::data::selected_columns_to_schema;
1923
use crate::{
2024
data::{
2125
ExcelSheetData, record_batch_from_data_and_columns,
@@ -647,29 +651,31 @@ impl ExcelSheet {
647651
(rb, errors).into_bound_py_any(py)
648652
}
649653

650-
/// Arrow PyCapsule Interface: __arrow_c_schema__
651-
pub fn __arrow_c_schema__<'py>(&self, py: Python<'py>) -> PyResult<Bound<'py, PyAny>> {
652-
let record_batch = RecordBatch::try_from(self)
653-
.with_context(|| {
654-
format!(
655-
"could not create RecordBatch from sheet \"{}\"",
656-
self.name()
657-
)
658-
})
659-
.into_pyresult()?;
660-
661-
arrow_capsule::schema_to_pycapsule(py, record_batch.schema().as_ref())
662-
.map(|capsule| capsule.into_any())
663-
}
664-
665-
/// Arrow PyCapsule Interface: __arrow_c_array__
654+
/// Export the schema as an [`ArrowSchema`] [`PyCapsule`].
655+
///
656+
/// <https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowschema-export>
657+
///
658+
/// [`ArrowSchema`]: arrow_array::ffi::FFI_ArrowSchema
659+
/// [`PyCapsule`]: pyo3::types::PyCapsule
660+
pub fn __arrow_c_schema__<'py>(&self, py: Python<'py>) -> PyResult<Bound<'py, PyCapsule>> {
661+
let schema = selected_columns_to_schema(&self.selected_columns);
662+
Ok(to_schema_pycapsule(py, &schema)?)
663+
}
664+
665+
/// Export the schema and data as a pair of [`ArrowSchema`] and [`ArrowArray`] [`PyCapsules`]
666+
///
667+
/// The optional `requested_schema` parameter allows for potential schema conversion.
668+
///
669+
/// <https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowarray-export>
670+
///
671+
/// [`ArrowSchema`]: arrow_array::ffi::FFI_ArrowSchema
672+
/// [`ArrowArray`]: arrow_array::ffi::FFI_ArrowArray
673+
/// [`PyCapsules`]: pyo3::types::PyCapsule
666674
pub fn __arrow_c_array__<'py>(
667675
&self,
668676
py: Python<'py>,
669-
requested_schema: Option<&Bound<'py, PyAny>>,
670-
) -> PyResult<Bound<'py, PyAny>> {
671-
let _ = requested_schema; // TODO: Support schema conversion if needed
672-
677+
requested_schema: Option<Bound<'py, PyCapsule>>,
678+
) -> PyResult<Bound<'py, PyTuple>> {
673679
let record_batch = RecordBatch::try_from(self)
674680
.with_context(|| {
675681
format!(
@@ -679,13 +685,14 @@ impl ExcelSheet {
679685
})
680686
.into_pyresult()?;
681687

682-
let (schema_capsule, array_capsule) =
683-
arrow_capsule::record_batch_to_pycapsules(py, &record_batch)?;
684-
685-
Ok(
686-
pyo3::types::PyTuple::new(py, [schema_capsule.into_any(), array_capsule.into_any()])?
687-
.into_any(),
688-
)
688+
let field = Field::new_struct("", record_batch.schema_ref().fields().clone(), false);
689+
let array = Arc::new(StructArray::from(record_batch));
690+
Ok(to_array_pycapsules(
691+
py,
692+
field.into(),
693+
array.as_ref(),
694+
requested_schema,
695+
)?)
689696
}
690697

691698
pub fn __repr__(&self) -> String {

0 commit comments

Comments
 (0)