|
51 | 51 | import warnings |
52 | 52 | import zipfile |
53 | 53 |
|
| 54 | +import pkg_resources |
| 55 | + |
54 | 56 | from pandas._typing import ( |
55 | 57 | BaseBuffer, |
56 | 58 | ReadCsvBuffer, |
|
90 | 92 |
|
91 | 93 | from pandas import MultiIndex |
92 | 94 |
|
| 95 | +# registry of I/O engines. It is populated the first time a non-core |
| 96 | +# pandas engine is used |
| 97 | +_io_engines = None |
| 98 | + |
93 | 99 |
|
94 | 100 | @dataclasses.dataclass |
95 | 101 | class IOArgs: |
@@ -1282,3 +1288,146 @@ def dedup_names( |
1282 | 1288 | counts[col] = cur_count + 1 |
1283 | 1289 |
|
1284 | 1290 | return names |
| 1291 | + |
| 1292 | + |
| 1293 | +def _engine_func(format_name: str, engine_name: str, is_writer: bool): |
| 1294 | + """ |
| 1295 | + Return the engine function for a given format and operation. |
| 1296 | +
|
| 1297 | + pandas I/O engines can be registered via entry points. The first time this |
| 1298 | + function is called it will register all the entry points of the "pandas.io_engine" |
| 1299 | + group and cache them in the global `_io_engines` variable. |
| 1300 | +
|
| 1301 | + Engines are implemented as classes with the `read_<format>` and `to_<format>` |
| 1302 | + methods (classmethods) for the formats they wish to provide. This function will |
| 1303 | + return the method from the engine and format being requested. |
| 1304 | +
|
| 1305 | + Parameters |
| 1306 | + ---------- |
| 1307 | + format_name : str |
| 1308 | + The format such as 'csv', 'parquet', 'json', 'html', etc. |
| 1309 | + engine_name : str |
| 1310 | + The engine name provided by the user in `engine=<value>`. |
| 1311 | + is_writer : bool |
| 1312 | + `True` to return the `to_<format>` function, `False` to return the |
| 1313 | + `read_<format>` one. |
| 1314 | +
|
| 1315 | + Examples |
| 1316 | + -------- |
| 1317 | + An engine is implemented with a class like: |
| 1318 | +
|
| 1319 | + >>> class DummyEngine: |
| 1320 | + ... @classmethod |
| 1321 | + ... def read_csv(cls, filepath_or_buffer, **kwargs): |
| 1322 | + ... # the engine signature must match the pandas method signature |
| 1323 | + ... return pd.DataFrame() |
| 1324 | +
|
| 1325 | + It must be registered as an entry point with the engine name: |
| 1326 | +
|
| 1327 | + ``` |
| 1328 | + [project.entry-points."pandas.io_engine"] |
| 1329 | + dummy = "pandas:io.dummy.DummyEngine" |
| 1330 | +
|
| 1331 | + ``` |
| 1332 | +
|
| 1333 | + Then the `read_csv` method of the engine can be retrieved with: |
| 1334 | +
|
| 1335 | + >>> func = _engine_func(format_name="csv", engine_name="dummy", is_writer=False) |
| 1336 | +
|
| 1337 | + This is used internally to dispatch the next pandas call to the engine caller: |
| 1338 | +
|
| 1339 | + >>> df = read_csv("myfile.csv", engine="dummy") |
| 1340 | + """ |
| 1341 | + global _io_engines |
| 1342 | + |
| 1343 | + if _io_engines is None: |
| 1344 | + _io_engines = {} |
| 1345 | + for entry_point in pkg_resources.iter_entry_points(group="pandas.io_engine"): |
| 1346 | + _io_engines[entry_point.name] = entry_point.load() |
| 1347 | + |
| 1348 | + try: |
| 1349 | + engine_class = _io_engines[engine_name] |
| 1350 | + except KeyError as err: |
| 1351 | + raise ValueError( |
| 1352 | + f"'{engine_name}' is not a known engine. Some engines are only available " |
| 1353 | + "after installing the package that provides them." |
| 1354 | + ) from err |
| 1355 | + |
| 1356 | + func_name = f"to_{format_name}" if is_writer else f"read_{format_name}" |
| 1357 | + try: |
| 1358 | + engine_method = getattr(engine_class, func_name) |
| 1359 | + except AttributeError as err: |
| 1360 | + raise ValueError( |
| 1361 | + f"The engine '{engine_name}' does not provide a '{func_name}' function" |
| 1362 | + ) from err |
| 1363 | + else: |
| 1364 | + return engine_method |
| 1365 | + |
| 1366 | + |
| 1367 | +def _extract_io_function_info(func_name): |
| 1368 | + """ |
| 1369 | + Return the format and if it's a reader or writer from a function name like read_csv. |
| 1370 | + """ |
| 1371 | + op_type, format_name = func_name.split("_", maxsplit=1) |
| 1372 | + if op_type == "read": |
| 1373 | + is_writer = False |
| 1374 | + elif op_type == "to": |
| 1375 | + is_writer = True |
| 1376 | + else: |
| 1377 | + raise ValueError( |
| 1378 | + "Unable to extract info from the function name '{func_name}'. " |
| 1379 | + "The expected format is `read_<format> or `to_<format>`." |
| 1380 | + ) |
| 1381 | + |
| 1382 | + return format_name, is_writer |
| 1383 | + |
| 1384 | + |
| 1385 | +def allow_third_party_engines(skip_engines: list[str] | None = None): |
| 1386 | + """ |
| 1387 | + Decorator to avoid boilerplate code when allowing readers and writers to use |
| 1388 | + third-party engines. |
| 1389 | +
|
| 1390 | + The decorator will introspect the function to know which format should be obtained, |
| 1391 | + and to know if it's a reader or a writer. Then it will check if the engine has been |
| 1392 | + registered, and if it has, it will dispatch the execution to the engine with the |
| 1393 | + arguments provided by the user. |
| 1394 | +
|
| 1395 | + Parameters |
| 1396 | + ---------- |
| 1397 | + skip_engines : list of str, optional |
| 1398 | + For engines that are implemented in pandas, we want to skip them for this engine |
| 1399 | + dispatching system. They should be specified in this parameter. |
| 1400 | +
|
| 1401 | + Examples |
| 1402 | + -------- |
| 1403 | + The decorator works both with the `skip_engines` parameter, or without: |
| 1404 | +
|
| 1405 | + >>> class DataFrame: |
| 1406 | + ... @allow_third_party_engines(["python", "c", "pyarrow"]) |
| 1407 | + ... def read_csv(filepath_or_buffer, **kwargs): |
| 1408 | + ... pass |
| 1409 | + ... |
| 1410 | + ... @allow_third_party_engines |
| 1411 | + ... def read_sas(filepath_or_buffer, **kwargs): |
| 1412 | + ... pass |
| 1413 | + """ |
| 1414 | + |
| 1415 | + def decorator(func): |
| 1416 | + @functools.wraps(func) |
| 1417 | + def wrapper(*args, **kwargs): |
| 1418 | + if "engine" in kwargs and kwargs["engine"] not in skip_engines: |
| 1419 | + format_name, is_writer = _extract_io_function_info(func.__name__) |
| 1420 | + engine_func = _engine_func( |
| 1421 | + format_name=format_name, |
| 1422 | + engine_name=kwargs.pop("engine"), |
| 1423 | + is_writer=is_writer, |
| 1424 | + ) |
| 1425 | + return engine_func(*args, **kwargs) |
| 1426 | + else: |
| 1427 | + return func(*args, **kwargs) |
| 1428 | + |
| 1429 | + return wrapper |
| 1430 | + |
| 1431 | + if callable(skip_engines): |
| 1432 | + return decorator(skip_engines) |
| 1433 | + return decorator |
0 commit comments