Skip to content

Commit f33778c

Browse files
New third-party IO engines
1 parent c5457f6 commit f33778c

File tree

2 files changed

+157
-0
lines changed

2 files changed

+157
-0
lines changed

pandas/io/common.py

Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,8 @@
5151
import warnings
5252
import zipfile
5353

54+
import pkg_resources
55+
5456
from pandas._typing import (
5557
BaseBuffer,
5658
ReadCsvBuffer,
@@ -90,6 +92,10 @@
9092

9193
from pandas import MultiIndex
9294

95+
# registry of I/O engines. It is populated the first time a non-core
96+
# pandas engine is used
97+
_io_engines = None
98+
9399

94100
@dataclasses.dataclass
95101
class IOArgs:
@@ -1282,3 +1288,146 @@ def dedup_names(
12821288
counts[col] = cur_count + 1
12831289

12841290
return names
1291+
1292+
1293+
def _engine_func(format_name: str, engine_name: str, is_writer: bool):
1294+
"""
1295+
Return the engine function for a given format and operation.
1296+
1297+
pandas I/O engines can be registered via entry points. The first time this
1298+
function is called it will register all the entry points of the "pandas.io_engine"
1299+
group and cache them in the global `_io_engines` variable.
1300+
1301+
Engines are implemented as classes with the `read_<format>` and `to_<format>`
1302+
methods (classmethods) for the formats they wish to provide. This function will
1303+
return the method from the engine and format being requested.
1304+
1305+
Parameters
1306+
----------
1307+
format_name : str
1308+
The format such as 'csv', 'parquet', 'json', 'html', etc.
1309+
engine_name : str
1310+
The engine name provided by the user in `engine=<value>`.
1311+
is_writer : bool
1312+
`True` to return the `to_<format>` function, `False` to return the
1313+
`read_<format>` one.
1314+
1315+
Examples
1316+
--------
1317+
An engine is implemented with a class like:
1318+
1319+
>>> class DummyEngine:
1320+
... @classmethod
1321+
... def read_csv(cls, filepath_or_buffer, **kwargs):
1322+
... # the engine signature must match the pandas method signature
1323+
... return pd.DataFrame()
1324+
1325+
It must be registered as an entry point with the engine name:
1326+
1327+
```
1328+
[project.entry-points."pandas.io_engine"]
1329+
dummy = "pandas:io.dummy.DummyEngine"
1330+
1331+
```
1332+
1333+
Then the `read_csv` method of the engine can be retrieved with:
1334+
1335+
>>> func = _engine_func(format_name="csv", engine_name="dummy", is_writer=False)
1336+
1337+
This is used internally to dispatch the next pandas call to the engine caller:
1338+
1339+
>>> df = read_csv("myfile.csv", engine="dummy")
1340+
"""
1341+
global _io_engines
1342+
1343+
if _io_engines is None:
1344+
_io_engines = {}
1345+
for entry_point in pkg_resources.iter_entry_points(group="pandas.io_engine"):
1346+
_io_engines[entry_point.name] = entry_point.load()
1347+
1348+
try:
1349+
engine_class = _io_engines[engine_name]
1350+
except KeyError as err:
1351+
raise ValueError(
1352+
f"'{engine_name}' is not a known engine. Some engines are only available "
1353+
"after installing the package that provides them."
1354+
) from err
1355+
1356+
func_name = f"to_{format_name}" if is_writer else f"read_{format_name}"
1357+
try:
1358+
engine_method = getattr(engine_class, func_name)
1359+
except AttributeError as err:
1360+
raise ValueError(
1361+
f"The engine '{engine_name}' does not provide a '{func_name}' function"
1362+
) from err
1363+
else:
1364+
return engine_method
1365+
1366+
1367+
def _extract_io_function_info(func_name):
1368+
"""
1369+
Return the format and if it's a reader or writer from a function name like read_csv.
1370+
"""
1371+
op_type, format_name = func_name.split("_", maxsplit=1)
1372+
if op_type == "read":
1373+
is_writer = False
1374+
elif op_type == "to":
1375+
is_writer = True
1376+
else:
1377+
raise ValueError(
1378+
"Unable to extract info from the function name '{func_name}'. "
1379+
"The expected format is `read_<format> or `to_<format>`."
1380+
)
1381+
1382+
return format_name, is_writer
1383+
1384+
1385+
def allow_third_party_engines(skip_engines: list[str] | None = None):
1386+
"""
1387+
Decorator to avoid boilerplate code when allowing readers and writers to use
1388+
third-party engines.
1389+
1390+
The decorator will introspect the function to know which format should be obtained,
1391+
and to know if it's a reader or a writer. Then it will check if the engine has been
1392+
registered, and if it has, it will dispatch the execution to the engine with the
1393+
arguments provided by the user.
1394+
1395+
Parameters
1396+
----------
1397+
skip_engines : list of str, optional
1398+
For engines that are implemented in pandas, we want to skip them for this engine
1399+
dispatching system. They should be specified in this parameter.
1400+
1401+
Examples
1402+
--------
1403+
The decorator works both with the `skip_engines` parameter, or without:
1404+
1405+
>>> class DataFrame:
1406+
... @allow_third_party_engines(["python", "c", "pyarrow"])
1407+
... def read_csv(filepath_or_buffer, **kwargs):
1408+
... pass
1409+
...
1410+
... @allow_third_party_engines
1411+
... def read_sas(filepath_or_buffer, **kwargs):
1412+
... pass
1413+
"""
1414+
1415+
def decorator(func):
1416+
@functools.wraps(func)
1417+
def wrapper(*args, **kwargs):
1418+
if "engine" in kwargs and kwargs["engine"] not in skip_engines:
1419+
format_name, is_writer = _extract_io_function_info(func.__name__)
1420+
engine_func = _engine_func(
1421+
format_name=format_name,
1422+
engine_name=kwargs.pop("engine"),
1423+
is_writer=is_writer,
1424+
)
1425+
return engine_func(*args, **kwargs)
1426+
else:
1427+
return func(*args, **kwargs)
1428+
1429+
return wrapper
1430+
1431+
if callable(skip_engines):
1432+
return decorator(skip_engines)
1433+
return decorator

pandas/io/iceberg.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,10 @@
66

77
from pandas import DataFrame
88

9+
from pandas.io.common import allow_third_party_engines
910

11+
12+
@allow_third_party_engines()
1013
def read_iceberg(
1114
table_identifier: str,
1215
catalog_name: str | None = None,
@@ -18,6 +21,7 @@ def read_iceberg(
1821
snapshot_id: int | None = None,
1922
limit: int | None = None,
2023
scan_properties: dict[str, Any] | None = None,
24+
engine: str | None = None,
2125
) -> DataFrame:
2226
"""
2327
Read an Apache Iceberg table into a pandas DataFrame.
@@ -52,6 +56,10 @@ def read_iceberg(
5256
scan_properties : dict of {str: obj}, optional
5357
Additional Table properties as a dictionary of string key value pairs to use
5458
for this scan.
59+
engine : str, optional
60+
The engine to use. Engines can be installed via third-party packages. For an
61+
updated list of existing pandas I/O engines check the I/O engines section of
62+
our Ecosystem page.
5563
5664
Returns
5765
-------

0 commit comments

Comments
 (0)