Skip to content

Commit 08c0b78

Browse files
ENH: add basic DataFrame.from_arrow class method for importing through Arrow PyCapsule interface (#59696)
Co-authored-by: Matthew Roeschke <[email protected]>
1 parent 53c5e30 commit 08c0b78

File tree

4 files changed

+139
-0
lines changed

4 files changed

+139
-0
lines changed

doc/source/whatsnew/v3.0.0.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -233,6 +233,9 @@ Other enhancements
233233
- Support reading Stata 102-format (Stata 1) dta files (:issue:`58978`)
234234
- Support reading Stata 110-format (Stata 7) dta files (:issue:`47176`)
235235
- Switched wheel upload to **PyPI Trusted Publishing** (OIDC) for release-tag pushes in ``wheels.yml``. (:issue:`61718`)
236+
- Added a new :meth:`DataFrame.from_arrow` method to import any Arrow-compatible
237+
tabular data object into a pandas :class:`DataFrame` through the
238+
`Arrow PyCapsule Protocol <https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html>`__ (:issue:`59631`)
236239

237240
.. ---------------------------------------------------------------------------
238241
.. _whatsnew_300.notable_bug_fixes:

pandas/_typing.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -533,4 +533,44 @@ def closed(self) -> bool:
533533

534534
SliceType: TypeAlias = Hashable | None
535535

536+
537+
# Arrow PyCapsule Interface
538+
# from https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#protocol-typehints
539+
540+
541+
class ArrowArrayExportable(Protocol):
    """
    Structural type for objects that define ``__arrow_c_array__``.

    Defining this method marks an object as Arrow-compatible: it implements
    the `Arrow PyCapsule Protocol`_, which exposes the
    `Arrow C Data Interface`_ in Python so that Arrow data can be exchanged
    between libraries without copying.

    .. _Arrow PyCapsule Protocol: https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html
    .. _Arrow C Data Interface: https://arrow.apache.org/docs/format/CDataInterface.html

    """

    # Stub only: the signature mirrors the protocol type hints published in
    # the Arrow PyCapsule Interface documentation.
    def __arrow_c_array__(
        self,
        requested_schema: object | None = None,
    ) -> tuple[object, object]:
        ...
557+
558+
559+
class ArrowStreamExportable(Protocol):
    """
    An object with an ``__arrow_c_stream__`` method.

    This method indicates the object is an Arrow-compatible object implementing
    the `Arrow PyCapsule Protocol`_ (exposing the `Arrow C Stream Interface`_
    in Python), enabling zero-copy Arrow data interchange across libraries.

    .. _Arrow PyCapsule Protocol: https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html
    .. _Arrow C Stream Interface: https://arrow.apache.org/docs/format/CStreamInterface.html

    """

    # Stub only: the signature mirrors the protocol type hints published in
    # the Arrow PyCapsule Interface documentation.
    def __arrow_c_stream__(self, requested_schema: object | None = None) -> object: ...
574+
575+
536576
__all__ = ["type_t"]

pandas/core/frame.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,8 @@
214214
AnyAll,
215215
AnyArrayLike,
216216
ArrayLike,
217+
ArrowArrayExportable,
218+
ArrowStreamExportable,
217219
Axes,
218220
Axis,
219221
AxisInt,
@@ -1840,6 +1842,56 @@ def __rmatmul__(self, other) -> DataFrame:
18401842
# ----------------------------------------------------------------------
18411843
# IO methods (to / from other formats)
18421844

1845+
@classmethod
def from_arrow(
    cls, data: ArrowArrayExportable | ArrowStreamExportable
) -> DataFrame:
    """
    Construct a DataFrame from a tabular Arrow object.

    This function accepts any Arrow-compatible tabular object implementing
    the `Arrow PyCapsule Protocol`_ (i.e. having an ``__arrow_c_array__``
    or ``__arrow_c_stream__`` method).

    This function currently relies on ``pyarrow`` to convert the tabular
    object in Arrow format to pandas.

    .. _Arrow PyCapsule Protocol: https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html

    .. versionadded:: 3.0

    Parameters
    ----------
    data : pyarrow.Table or Arrow-compatible table
        Any tabular object implementing the Arrow PyCapsule Protocol
        (i.e. has an ``__arrow_c_array__`` or ``__arrow_c_stream__``
        method).

    Returns
    -------
    DataFrame

    Raises
    ------
    TypeError
        If ``data`` is not a ``pyarrow.Table`` and has neither an
        ``__arrow_c_array__`` nor an ``__arrow_c_stream__`` method.

    Notes
    -----
    The result is whatever ``pyarrow.Table.to_pandas`` returns for the
    imported table; ``cls`` is not used to construct the result.

    Examples
    --------
    >>> import pyarrow as pa
    >>> table = pa.table({"a": [1, 2, 3], "b": ["a", "b", "c"]})
    >>> pd.DataFrame.from_arrow(table)
       a  b
    0  1  a
    1  2  b
    2  3  c
    """
    pa = import_optional_dependency("pyarrow", min_version="14.0.0")
    if not isinstance(data, pa.Table):
        if not (
            hasattr(data, "__arrow_c_array__")
            or hasattr(data, "__arrow_c_stream__")
        ):
            # explicitly test this, because otherwise we would accept various
            # other input types through the pa.table(..) call
            raise TypeError(
                "Expected an Arrow-compatible tabular object (i.e. having an "
                "'__arrow_c_array__' or '__arrow_c_stream__' method), got "
                f"'{type(data).__name__}' instead."
            )
        # Import through the PyCapsule interface into a pyarrow Table.
        pa_table = pa.table(data)
    else:
        # Already a pyarrow Table: no import step needed.
        pa_table = data

    df = pa_table.to_pandas()
    return df
1894+
18431895
@classmethod
18441896
def from_dict(
18451897
cls,

pandas/tests/frame/test_arrow_interface.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import pandas.util._test_decorators as td
66

77
import pandas as pd
8+
import pandas._testing as tm
89

910
pa = pytest.importorskip("pyarrow")
1011

@@ -45,3 +46,46 @@ def test_dataframe_to_arrow(using_infer_string):
4546
table = pa.RecordBatchReader.from_stream(df, schema=schema).read_all()
4647
expected = expected.cast(schema)
4748
assert table.equals(expected)
49+
50+
51+
class ArrowArrayWrapper:
    """Non-pyarrow stand-in that only exposes ``__arrow_c_array__``."""

    def __init__(self, batch):
        # Keep a reference to the wrapped object; all export calls are
        # forwarded to it.
        self.array = batch

    def __arrow_c_array__(self, requested_schema=None):
        """Forward the export request to the wrapped object unchanged."""
        wrapped = self.array
        return wrapped.__arrow_c_array__(requested_schema)
57+
58+
59+
class ArrowStreamWrapper:
    """Non-pyarrow stand-in that only exposes ``__arrow_c_stream__``."""

    def __init__(self, table):
        # Keep a reference to the wrapped object; all export calls are
        # forwarded to it.
        self.stream = table

    def __arrow_c_stream__(self, requested_schema=None):
        """Forward the export request to the wrapped object unchanged."""
        wrapped = self.stream
        return wrapped.__arrow_c_stream__(requested_schema)
65+
66+
67+
@td.skip_if_no("pyarrow", min_version="14.0")
def test_dataframe_from_arrow():
    """Check DataFrame.from_arrow for stream and array exporters, and bad input."""
    # objects with __arrow_c_stream__
    table = pa.table({"a": [1, 2, 3], "b": ["a", "b", "c"]})

    result = pd.DataFrame.from_arrow(table)
    expected = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
    tm.assert_frame_equal(result, expected)

    # non-pyarrow objects are supported as well
    result = pd.DataFrame.from_arrow(ArrowStreamWrapper(table))
    tm.assert_frame_equal(result, expected)

    # objects with __arrow_c_array__
    batch = pa.record_batch([[1, 2, 3], ["a", "b", "c"]], names=["a", "b"])

    # pass the record batch itself (not the table again) so the
    # non-Table pyarrow input path is actually exercised
    result = pd.DataFrame.from_arrow(batch)
    tm.assert_frame_equal(result, expected)

    result = pd.DataFrame.from_arrow(ArrowArrayWrapper(batch))
    tm.assert_frame_equal(result, expected)

    # only accept actual Arrow objects
    with pytest.raises(TypeError, match="Expected an Arrow-compatible tabular object"):
        pd.DataFrame.from_arrow({"a": [1, 2, 3], "b": ["a", "b", "c"]})

0 commit comments

Comments
 (0)