Skip to content

Commit 9fa70cc

Browse files
authored
feat: implement Arrow PyCapsule Interface (#361)
* feat: implement Arrow PyCapsule Interface * chore: feedback review
1 parent 0799785 commit 9fa70cc

File tree

12 files changed

+554
-49
lines changed

12 files changed

+554
-49
lines changed

Cargo.lock

Lines changed: 2 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -26,9 +26,7 @@ crate-type = ["cdylib"]
2626

2727
[dependencies]
2828
# There's a lot of stuff we don't want here, such as serde support
29-
arrow = { version = "^55.2.0", default-features = false, features = [
30-
"pyarrow",
31-
] }
29+
arrow = { version = "^55.2.0", default-features = false, features = ["ffi"] }
3230
calamine = { version = "^0.30.0", features = ["dates"] }
3331
chrono = { version = "^0.4.41", default-features = false }
3432
log = "0.4.27"
@@ -42,9 +40,10 @@ rstest = { version = "^0.26.1", default-features = false }
4240
# NOTE: This is a hack to bypass pyo3 limitations when testing:
4341
# https://pyo3.rs/v0.22.3/faq.html#i-cant-run-cargo-test-or-i-cant-build-in-a-cargo-workspace-im-having-linker-issues-like-symbol-not-found-or-undefined-reference-to-_pyexc_systemerror
4442
[features]
43+
default = ["extension-module", "pyarrow"]
4544
extension-module = ["pyo3/extension-module"]
46-
default = ["extension-module"]
45+
pyarrow = ["arrow/pyarrow"]
4746
# feature for tests only. This makes Python::with_gil auto-initialize Python
4847
# interpreters, which allows us to instantiate Python objects in tests
4948
# (see https://pyo3.rs/v0.22.3/features#auto-initialize)
50-
tests = ["pyo3/auto-initialize"]
49+
tests = ["pyo3/auto-initialize", "pyarrow"]

README.md

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,91 @@ Based on [`calamine`](https://github.com/tafia/calamine) and [Apache Arrow](http
66

77
Docs available [here](https://fastexcel.toucantoco.dev/).
88

9+
## Installation
10+
11+
```bash
12+
# Lightweight installation (no pyarrow dependency)
13+
pip install fastexcel
14+
15+
# With Polars support only (no pyarrow needed)
16+
pip install fastexcel[polars]
17+
18+
# With pandas support (includes pyarrow)
19+
pip install fastexcel[pandas]
20+
21+
# With pyarrow support
22+
pip install fastexcel[pyarrow]
23+
24+
# With all integrations
25+
pip install fastexcel[pandas,polars]
26+
```
27+
28+
## Quick Start
29+
30+
### Modern usage (recommended)
31+
32+
FastExcel supports the [Arrow PyCapsule Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html) for zero-copy data exchange with libraries like Polars, without requiring pyarrow as a dependency.
33+
Sheets and tables both expose this interface, so any Arrow-compatible library can consume fastexcel data directly, with pyarrow remaining optional.
34+
35+
```python
36+
import fastexcel
37+
38+
# Load an Excel file
39+
reader = fastexcel.read_excel("data.xlsx")
40+
sheet = reader.load_sheet(0) # Load first sheet
41+
42+
# Use with Polars (zero-copy, no pyarrow needed)
43+
import polars as pl
44+
df = pl.DataFrame(sheet) # Direct PyCapsule interface
45+
print(df)
46+
47+
# Or use the to_polars() method (also via PyCapsule)
48+
df = sheet.to_polars()
49+
print(df)
50+
51+
# Or access the raw Arrow data via the PyCapsule interface (__arrow_c_array__ returns a (schema, array) pair of capsules)
52+
schema = sheet.__arrow_c_schema__()
53+
array_data = sheet.__arrow_c_array__()
54+
```
55+
56+
### Traditional usage (with pandas/pyarrow)
57+
58+
```python
59+
import fastexcel
60+
61+
reader = fastexcel.read_excel("data.xlsx")
62+
sheet = reader.load_sheet(0)
63+
64+
# Convert to pandas (requires `pandas` extra)
65+
df = sheet.to_pandas()
66+
67+
# Or get pyarrow RecordBatch directly
68+
record_batch = sheet.to_arrow()
69+
```
70+
71+
### Working with tables
72+
73+
```python
74+
reader = fastexcel.read_excel("data.xlsx")
75+
76+
# List available tables
77+
tables = reader.table_names()
78+
print(f"Available tables: {tables}")
79+
80+
# Load a specific table
81+
table = reader.load_table("MyTable")
82+
df = pl.DataFrame(table) # Zero-copy via PyCapsule, no pyarrow needed
83+
```
84+
85+
## Key Features
86+
87+
- **Zero-copy data exchange** via [Arrow PyCapsule Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html)
88+
- **Flexible dependencies** - use with Polars (no PyArrow needed) or Pandas (includes PyArrow)
89+
- **Seamless Polars integration** - `pl.DataFrame(sheet)` and `sheet.to_polars()` work without PyArrow via PyCapsule interface
90+
- **High performance** - written in Rust with [calamine](https://github.com/tafia/calamine) and [Apache Arrow](https://arrow.apache.org/)
91+
- **Memory efficient** - lazy loading and optional eager evaluation
92+
- **Type safety** - automatic type inference with manual override options
93+
994
## Dev setup
1095

1196
### Prerequisites

pyproject.toml

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -23,16 +23,17 @@ classifiers = [
2323
"Programming Language :: Python :: 3.13",
2424
"Programming Language :: Python :: Implementation :: CPython",
2525
]
26-
dependencies = [
27-
"pyarrow>=8.0.0",
28-
"typing-extensions>=4.0.0; python_version<'3.10'",
29-
]
26+
dependencies = ["typing-extensions>=4.0.0; python_version<'3.10'"]
3027
dynamic = ["version"]
3128

3229
[project.optional-dependencies]
33-
pandas = ["pandas>=1.4.4"]
30+
pyarrow = ["pyarrow>=8.0.0"]
31+
pandas = ["pandas>=1.4.4", "pyarrow>=8.0.0"]
3432
polars = ["polars>=0.16.14"]
3533

34+
[dependency-groups]
35+
test = ["pytest>=7.0.0", "pyarrow>=8.0.0", "pandas>=1.4.4", "polars>=0.16.14"]
36+
3637
[project.urls]
3738
"Source Code" = "https://github.com/ToucanToco/fastexcel"
3839
Issues = "https://github.com/ToucanToco/fastexcel"

python/fastexcel/__init__.py

Lines changed: 72 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,18 @@
1212
if TYPE_CHECKING:
1313
import pandas as pd
1414
import polars as pl
15+
import pyarrow as pa
1516

1617
from os.path import expanduser
1718
from pathlib import Path
1819

19-
import pyarrow as pa
20+
try:
21+
import pyarrow as pa
22+
23+
_PYARROW_AVAILABLE = True
24+
except ImportError:
25+
pa = None
26+
_PYARROW_AVAILABLE = False
2027

2128
from ._fastexcel import (
2229
ArrowError,
@@ -46,14 +53,6 @@
4653
SheetVisible: TypeAlias = Literal["visible", "hidden", "veryhidden"]
4754

4855

49-
def _recordbatch_to_polars(rb: pa.RecordBatch) -> pl.DataFrame:
50-
import polars as pl
51-
52-
df = pl.from_arrow(data=rb)
53-
assert isinstance(df, pl.DataFrame)
54-
return df
55-
56-
5756
class ExcelSheet:
5857
"""A class representing a single sheet in an Excel File"""
5958

@@ -99,16 +98,24 @@ def visible(self) -> SheetVisible:
9998
"""The visibility of the sheet"""
10099
return self._sheet.visible
101100

102-
def to_arrow(self) -> pa.RecordBatch:
101+
def to_arrow(self) -> "pa.RecordBatch":
103102
"""Converts the sheet to a pyarrow `RecordBatch`"""
103+
if not _PYARROW_AVAILABLE:
104+
raise ImportError(
105+
"pyarrow is required for to_arrow(). Install with: pip install 'fastexcel[pyarrow]'"
106+
)
104107
return self._sheet.to_arrow()
105108

106-
def to_arrow_with_errors(self) -> tuple[pa.RecordBatch, CellErrors | None]:
109+
def to_arrow_with_errors(self) -> "tuple[pa.RecordBatch, CellErrors | None]":
107110
"""Converts the sheet to a pyarrow `RecordBatch` with error information.
108111
109112
Stores the positions of any values that cannot be parsed as the specified type and were
110113
therefore converted to None.
111114
"""
115+
if not _PYARROW_AVAILABLE:
116+
raise ImportError(
117+
"pyarrow is required for to_arrow_with_errors(). Install with: pip install 'fastexcel[pyarrow]'" # noqa: E501
118+
)
112119
rb, cell_errors = self._sheet.to_arrow_with_errors()
113120
if not cell_errors.errors:
114121
return (rb, None)
@@ -119,15 +126,36 @@ def to_pandas(self) -> "pd.DataFrame":
119126
120127
Requires the `pandas` extra to be installed.
121128
"""
122-
# We know for sure that the sheet will yield exactly one RecordBatch
129+
# Note: pandas' interchange entry point (from_dataframe) requires __dataframe__ or __arrow_c_stream__
130+
# which we don't implement. Using pyarrow conversion for now.
131+
# (see https://pandas.pydata.org/docs/reference/api/pandas.api.interchange.from_dataframe.html)
123132
return self.to_arrow().to_pandas()
124133

125134
def to_polars(self) -> "pl.DataFrame":
126135
"""Converts the sheet to a Polars `DataFrame`.
127136
137+
Uses the Arrow PyCapsule Interface for zero-copy data exchange.
128138
Requires the `polars` extra to be installed.
129139
"""
130-
return _recordbatch_to_polars(self.to_arrow())
140+
import polars as pl
141+
142+
return pl.DataFrame(self)
143+
144+
def __arrow_c_schema__(self) -> object:
145+
"""Arrow PyCapsule Interface: Export schema as a PyCapsule.
146+
147+
This method allows zero-copy data exchange with Arrow-compatible libraries
148+
like Polars without requiring PyArrow as a dependency.
149+
"""
150+
return self._sheet.__arrow_c_schema__()
151+
152+
def __arrow_c_array__(self, requested_schema: object | None = None) -> tuple[object, object]:
153+
"""Arrow PyCapsule Interface: Export array and schema as PyCapsules.
154+
155+
Returns data as a tuple of (schema_capsule, array_capsule) for zero-copy
156+
data exchange with Arrow-compatible libraries.
157+
"""
158+
return self._sheet.__arrow_c_array__(requested_schema)
131159

132160
def __repr__(self) -> str:
133161
return self._sheet.__repr__()
@@ -183,24 +211,49 @@ def specified_dtypes(self) -> DTypeMap | None:
183211
"""The dtypes specified for the table"""
184212
return self._table.specified_dtypes
185213

186-
def to_arrow(self) -> pa.RecordBatch:
214+
def to_arrow(self) -> "pa.RecordBatch":
187215
"""Converts the table to a pyarrow `RecordBatch`"""
216+
if not _PYARROW_AVAILABLE:
217+
raise ImportError(
218+
"pyarrow is required for to_arrow(). Install with: pip install 'fastexcel[pyarrow]'"
219+
)
188220
return self._table.to_arrow()
189221

190222
def to_pandas(self) -> "pd.DataFrame":
191223
"""Converts the table to a Pandas `DataFrame`.
192224
193225
Requires the `pandas` extra to be installed.
194226
"""
195-
# We know for sure that the table will yield exactly one RecordBatch
227+
# Note: pandas' interchange entry point (from_dataframe) requires __dataframe__ or __arrow_c_stream__
228+
# which we don't implement. Using pyarrow conversion for now.
229+
# (see https://pandas.pydata.org/docs/reference/api/pandas.api.interchange.from_dataframe.html)
196230
return self.to_arrow().to_pandas()
197231

198232
def to_polars(self) -> "pl.DataFrame":
199233
"""Converts the table to a Polars `DataFrame`.
200234
235+
Uses the Arrow PyCapsule Interface for zero-copy data exchange.
201236
Requires the `polars` extra to be installed.
202237
"""
203-
return _recordbatch_to_polars(self.to_arrow())
238+
import polars as pl
239+
240+
return pl.DataFrame(self)
241+
242+
def __arrow_c_schema__(self) -> object:
243+
"""Arrow PyCapsule Interface: Export schema as a PyCapsule.
244+
245+
This method allows zero-copy data exchange with Arrow-compatible libraries
246+
like Polars without requiring PyArrow as a dependency.
247+
"""
248+
return self._table.__arrow_c_schema__()
249+
250+
def __arrow_c_array__(self, requested_schema: object | None = None) -> tuple[object, object]:
251+
"""Arrow PyCapsule Interface: Export array and schema as PyCapsules.
252+
253+
Returns data as a tuple of (schema_capsule, array_capsule) for zero-copy
254+
data exchange with Arrow-compatible libraries.
255+
"""
256+
return self._table.__arrow_c_array__(requested_schema)
204257

205258

206259
class ExcelReader:
@@ -331,7 +384,7 @@ def load_table(
331384
| None = None,
332385
dtypes: DType | DTypeMap | None = None,
333386
eager: Literal[True] = ...,
334-
) -> pa.RecordBatch: ...
387+
) -> "pa.RecordBatch": ...
335388
def load_table(
336389
self,
337390
name: str,
@@ -349,7 +402,7 @@ def load_table(
349402
| None = None,
350403
dtypes: DType | DTypeMap | None = None,
351404
eager: bool = False,
352-
) -> ExcelTable | pa.RecordBatch:
405+
) -> "ExcelTable | pa.RecordBatch":
353406
"""Loads a table by name.
354407
355408
:param name: The name of the table to load.
@@ -413,7 +466,7 @@ def load_sheet_eager(
413466
dtype_coercion: Literal["coerce", "strict"] = "coerce",
414467
use_columns: list[str] | list[int] | str | None = None,
415468
dtypes: DType | DTypeMap | None = None,
416-
) -> pa.RecordBatch:
469+
) -> "pa.RecordBatch":
417470
"""Loads a sheet eagerly by index or name.
418471
419472
For xlsx files, this will be faster and more memory-efficient, as it will use

python/fastexcel/_fastexcel.pyi

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
11
from __future__ import annotations
22

33
import typing
4-
from typing import Callable, Literal
4+
from typing import TYPE_CHECKING, Callable, Literal
55

6-
import pyarrow as pa
6+
if TYPE_CHECKING:
7+
import pyarrow as pa
78

89
DType = Literal["null", "int", "float", "string", "boolean", "datetime", "date", "duration"]
910
DTypeMap = dict[str | int, DType]
@@ -82,14 +83,18 @@ class _ExcelSheet:
8283
@property
8384
def visible(self) -> SheetVisible:
8485
"""The visibility of the sheet"""
85-
def to_arrow(self) -> pa.RecordBatch:
86+
def to_arrow(self) -> "pa.RecordBatch":
8687
"""Converts the sheet to a pyarrow `RecordBatch`"""
87-
def to_arrow_with_errors(self) -> tuple[pa.RecordBatch, CellErrors]:
88+
def to_arrow_with_errors(self) -> "tuple[pa.RecordBatch, CellErrors]":
8889
"""Converts the sheet to a pyarrow `RecordBatch` with error information.
8990
9091
Stores the positions of any values that cannot be parsed as the specified type and were
9192
therefore converted to None.
9293
"""
94+
def __arrow_c_schema__(self) -> object:
95+
"""Arrow PyCapsule Interface: Export schema as a PyCapsule"""
96+
def __arrow_c_array__(self, requested_schema: object = None) -> tuple[object, object]:
97+
"""Arrow PyCapsule Interface: Export array and schema as PyCapsules"""
9398

9499
class _ExcelTable:
95100
@property
@@ -118,8 +123,12 @@ class _ExcelTable:
118123
@property
119124
def specified_dtypes(self) -> DTypeMap | None:
120125
"""The dtypes specified for the table"""
121-
def to_arrow(self) -> pa.RecordBatch:
126+
def to_arrow(self) -> "pa.RecordBatch":
122127
"""Converts the table to a pyarrow `RecordBatch`"""
128+
def __arrow_c_schema__(self) -> object:
129+
"""Arrow PyCapsule Interface: Export schema as a PyCapsule"""
130+
def __arrow_c_array__(self, requested_schema: object = None) -> tuple[object, object]:
131+
"""Arrow PyCapsule Interface: Export array and schema as PyCapsules"""
123132

124133
class _ExcelReader:
125134
"""A class representing an open Excel file and allowing to read its sheets"""

0 commit comments

Comments
 (0)