Skip to content

Commit 555459b

Browse files
Add tests and fix bugs
1 parent f33778c commit 555459b

File tree

5 files changed

+111
-53
lines changed

5 files changed

+111
-53
lines changed

doc/source/whatsnew/v3.0.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,7 @@ Other enhancements
8888
- Support passing a :class:`Iterable[Hashable]` input to :meth:`DataFrame.drop_duplicates` (:issue:`59237`)
8989
- Support reading Stata 102-format (Stata 1) dta files (:issue:`58978`)
9090
- Support reading Stata 110-format (Stata 7) dta files (:issue:`47176`)
91+
- Third-party packages can now register engines that can be used in pandas I/O operations :func:`read_iceberg` and :meth:`DataFrame.to_iceberg` (:issue:`61584`)
9192

9293
.. ---------------------------------------------------------------------------
9394
.. _whatsnew_300.notable_bug_fixes:

pandas/core/frame.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -188,7 +188,10 @@
188188
nargsort,
189189
)
190190

191-
from pandas.io.common import get_handle
191+
from pandas.io.common import (
192+
allow_third_party_engines,
193+
get_handle,
194+
)
192195
from pandas.io.formats import (
193196
console,
194197
format as fmt,
@@ -3547,6 +3550,7 @@ def to_xml(
35473550

35483551
return xml_formatter.write_output()
35493552

3553+
@allow_third_party_engines
35503554
def to_iceberg(
35513555
self,
35523556
table_identifier: str,
@@ -3556,6 +3560,7 @@ def to_iceberg(
35563560
location: str | None = None,
35573561
append: bool = False,
35583562
snapshot_properties: dict[str, str] | None = None,
3563+
engine: str | None = None,
35593564
) -> None:
35603565
"""
35613566
Write a DataFrame to an Apache Iceberg table.
@@ -3580,6 +3585,10 @@ def to_iceberg(
35803585
If ``True``, append data to the table, instead of replacing the content.
35813586
snapshot_properties : dict of {str: str}, optional
35823587
Custom properties to be added to the snapshot summary
3588+
engine : str, optional
3589+
The engine to use. Engines can be installed via third-party packages. For an
3590+
updated list of existing pandas I/O engines check the I/O engines section of
3591+
our Ecosystem page.
35833592
35843593
See Also
35853594
--------

pandas/io/common.py

Lines changed: 51 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,15 @@
99
import codecs
1010
from collections import defaultdict
1111
from collections.abc import (
12+
Callable,
1213
Hashable,
1314
Mapping,
1415
Sequence,
1516
)
1617
import dataclasses
1718
import functools
1819
import gzip
20+
from importlib.metadata import entry_points
1921
from io import (
2022
BufferedIOBase,
2123
BytesIO,
@@ -51,8 +53,6 @@
5153
import warnings
5254
import zipfile
5355

54-
import pkg_resources
55-
5656
from pandas._typing import (
5757
BaseBuffer,
5858
ReadCsvBuffer,
@@ -1290,9 +1290,9 @@ def dedup_names(
12901290
return names
12911291

12921292

1293-
def _engine_func(format_name: str, engine_name: str, is_writer: bool):
1293+
def _get_io_engine(name: str):
12941294
"""
1295-
Return the engine function for a given format and operation.
1295+
Return an I/O engine by its name.
12961296
12971297
pandas I/O engines can be registered via entry points. The first time this
12981298
function is called it will register all the entry points of the "pandas.io_engine"
@@ -1304,13 +1304,8 @@ def _engine_func(format_name: str, engine_name: str, is_writer: bool):
13041304
13051305
Parameters
13061306
----------
1307-
format_name : str
1308-
The format such as 'csv', 'parquet', 'json', 'html', etc.
1309-
engine_name : str
1307+
name : str
13101308
The engine name provided by the user in `engine=<value>`.
1311-
is_writer : bool
1312-
`True` to return the `to_<format>` function, `False` to return the
1313-
`read_<format>` one.
13141309
13151310
Examples
13161311
--------
@@ -1330,59 +1325,57 @@ def _engine_func(format_name: str, engine_name: str, is_writer: bool):
13301325
13311326
```
13321327
1333-
Then the `read_csv` method of the engine can be retrieved with:
1328+
Then the `read_csv` method of the engine can be used with:
13341329
1335-
>>> func = _engine_func(format_name="csv", engine_name="dummy", is_writer=False)
1330+
>>> _get_io_engine(name="dummy").read_csv("myfile.csv") # doctest: +SKIP
13361331
13371332
This is used internally to dispatch the next pandas call to the engine caller:
13381333
1339-
>>> df = read_csv("myfile.csv", engine="dummy")
1334+
>>> df = read_csv("myfile.csv", engine="dummy") # doctest: +SKIP
13401335
"""
13411336
global _io_engines
13421337

13431338
if _io_engines is None:
13441339
_io_engines = {}
1345-
for entry_point in pkg_resources.iter_entry_points(group="pandas.io_engine"):
1346-
_io_engines[entry_point.name] = entry_point.load()
1340+
for entry_point in entry_points().select(group="pandas.io_engine"):
1341+
package_name = entry_point.dist.metadata["Name"]
1342+
if entry_point.name in _io_engines:
1343+
_io_engines[entry_point.name]._other_providers.append(package_name)
1344+
else:
1345+
_io_engines[entry_point.name] = entry_point.load()
1346+
_io_engines[entry_point.name]._provider_name = package_name
1347+
_io_engines[entry_point.name]._other_providers = []
13471348

13481349
try:
1349-
engine_class = _io_engines[engine_name]
1350+
engine = _io_engines[name]
13501351
except KeyError as err:
13511352
raise ValueError(
1352-
f"'{engine_name}' is not a known engine. Some engines are only available "
1353+
f"'{name}' is not a known engine. Some engines are only available "
13531354
"after installing the package that provides them."
13541355
) from err
13551356

1356-
func_name = f"to_{format_name}" if is_writer else f"read_{format_name}"
1357-
try:
1358-
engine_method = getattr(engine_class, func_name)
1359-
except AttributeError as err:
1360-
raise ValueError(
1361-
f"The engine '{engine_name}' does not provide a '{func_name}' function"
1362-
) from err
1363-
else:
1364-
return engine_method
1365-
1366-
1367-
def _extract_io_function_info(func_name):
1368-
"""
1369-
Return the format and if it's a reader or writer from a function name like read_csv.
1370-
"""
1371-
op_type, format_name = func_name.split("_", maxsplit=1)
1372-
if op_type == "read":
1373-
is_writer = False
1374-
elif op_type == "to":
1375-
is_writer = True
1376-
else:
1377-
raise ValueError(
1378-
"Unable to extract info from the function name '{func_name}'. "
1379-
"The expected format is `read_<format> or `to_<format>`."
1357+
if engine._other_providers:
1358+
msg = (
1359+
f"The engine '{name}' has been registered by the package "
1360+
f"'{engine._provider_name}' and will be used. "
13801361
)
1362+
if len(engine._other_providers) == 1:
1363+
msg += (
1364+
f"The package '{engine._other_providers[0]}' also tried to register "
1365+
"the engine, but it couldn't because it was already registered."
1366+
)
1367+
else:
1368+
msg += (
1369+
"Other packages that tried to register the engine, but they couldn't "
1370+
"because it was already registered are: "
1371+
f"{str(engine._other_providers)[1:-1]}."
1372+
)
1373+
warnings.warn(msg, RuntimeWarning, stacklevel=find_stack_level())
13811374

1382-
return format_name, is_writer
1375+
return engine
13831376

13841377

1385-
def allow_third_party_engines(skip_engines: list[str] | None = None):
1378+
def allow_third_party_engines(skip_engines: list[str] | Callable | None = None):
13861379
"""
13871380
Decorator to avoid boilerplate code when allowing readers and writers to use
13881381
third-party engines.
@@ -1415,14 +1408,21 @@ def allow_third_party_engines(skip_engines: list[str] | None = None):
14151408
def decorator(func):
14161409
@functools.wraps(func)
14171410
def wrapper(*args, **kwargs):
1418-
if "engine" in kwargs and kwargs["engine"] not in skip_engines:
1419-
format_name, is_writer = _extract_io_function_info(func.__name__)
1420-
engine_func = _engine_func(
1421-
format_name=format_name,
1422-
engine_name=kwargs.pop("engine"),
1423-
is_writer=is_writer,
1424-
)
1425-
return engine_func(*args, **kwargs)
1411+
if callable(skip_engines):
1412+
skip_engine = False
1413+
else:
1414+
skip_engine = kwargs.get("engine") in skip_engines
1415+
1416+
if "engine" in kwargs and not skip_engine:
1417+
engine_name = kwargs.pop("engine")
1418+
engine = _get_io_engine(engine_name)
1419+
try:
1420+
return getattr(engine, func.__name__)(*args, **kwargs)
1421+
except AttributeError as err:
1422+
raise ValueError(
1423+
f"The engine '{engine_name}' does not provide a "
1424+
f"'{func.__name__}' function"
1425+
) from err
14261426
else:
14271427
return func(*args, **kwargs)
14281428

pandas/io/iceberg.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
from pandas.io.common import allow_third_party_engines
1010

1111

12-
@allow_third_party_engines()
12+
@allow_third_party_engines
1313
def read_iceberg(
1414
table_identifier: str,
1515
catalog_name: str | None = None,

pandas/tests/io/test_io_engines.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
import pytest
2+
3+
from pandas.io import common
4+
5+
6+
@pytest.fixture
def patch_engine(monkeypatch):
    """
    Replace ``common._get_io_engine`` with a stub that returns a mock engine.

    The mock engine only implements ``read_foo``, which returns the string
    ``"third-party"`` so a test can tell which implementation handled a call.
    """

    class MockIoEngine:
        @classmethod
        def read_foo(cls, fname):
            return "third-party"

    def fake_get_io_engine(name):
        # The requested engine name is ignored; the mock is always returned.
        return MockIoEngine

    monkeypatch.setattr(common, "_get_io_engine", fake_get_io_engine)
14+
15+
16+
class TestIoEngines:
    """Tests for the ``common.allow_third_party_engines`` decorator."""

    def test_decorator_with_no_engine(self, patch_engine):
        # No ``engine=`` keyword: the decorated function itself is expected
        # to run.
        @common.allow_third_party_engines
        def read_foo(fname, engine=None):
            return "default"

        assert read_foo("myfile.foo") == "default"

    def test_decorator_with_skipped_engine(self, patch_engine):
        # An engine listed in ``skip_engines`` is expected to be handled by
        # the decorated function.
        @common.allow_third_party_engines(skip_engines=["c"])
        def read_foo(fname, engine=None):
            return "default"

        assert read_foo("myfile.foo", engine="c") == "default"

    def test_decorator_with_third_party_engine(self, patch_engine):
        # An engine that is not skipped is expected to be served by the
        # engine's ``read_foo``.
        @common.allow_third_party_engines
        def read_foo(fname, engine=None):
            return "default"

        assert read_foo("myfile.foo", engine="third-party") == "third-party"

    def test_decorator_with_third_party_engine_but_no_method(self, patch_engine):
        # The mock engine has no ``read_bar``, so a ValueError is expected.
        @common.allow_third_party_engines
        def read_bar(fname, engine=None):
            return "default"

        expected_error = "'third-party' does not provide a 'read_bar'"
        with pytest.raises(ValueError, match=expected_error):
            read_bar("myfile.foo", engine="third-party")

0 commit comments

Comments
 (0)