Skip to content

Commit e71d7c0

Browse files
feat(python): support eager argument in load_sheet (#385)
* refactor: Combine `load_sheet` and `load_sheet_eager` functions into one function
* fix tests
* satisfy ruff lint
* Satisfy mypy static type check
* undo deletion of `load_sheet_eager` function
* remove if/else branch and adapt typing

Signed-off-by: Luka Peschke <luka.peschke@toucantoco.com>

* chore: undo changes on unit tests

Signed-off-by: Luka Peschke <luka.peschke@toucantoco.com>

* update uv.lock

Signed-off-by: Luka Peschke <luka.peschke@toucantoco.com>

* chore: undo changes on unit tests

Signed-off-by: Luka Peschke <luka.peschke@toucantoco.com>

---------

Signed-off-by: Luka Peschke <luka.peschke@toucantoco.com>
Co-authored-by: Luka Peschke <luka.peschke@toucantoco.com>
1 parent 76c0d83 commit e71d7c0

File tree

4 files changed

+86
-23
lines changed

4 files changed

+86
-23
lines changed

python/fastexcel/__init__.py

Lines changed: 60 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -289,6 +289,7 @@ def sheet_names(self) -> list[str]:
289289
"""The list of sheet names"""
290290
return self._reader.sheet_names
291291

292+
@typing.overload
292293
def load_sheet(
293294
self,
294295
idx_or_name: int | str,
@@ -305,8 +306,48 @@ def load_sheet(
305306
| Callable[[ColumnInfoNoDtype], bool]
306307
| None = None,
307308
dtypes: DType | DTypeMap | None = None,
308-
) -> ExcelSheet:
309-
"""Loads a sheet lazily by index or name.
309+
eager: Literal[False] = ...,
310+
) -> ExcelSheet: ...
311+
312+
@typing.overload
313+
def load_sheet(
314+
self,
315+
idx_or_name: int | str,
316+
*,
317+
header_row: int | None = 0,
318+
column_names: list[str] | None = None,
319+
skip_rows: int | list[int] | Callable[[int], bool] | None = None,
320+
n_rows: int | None = None,
321+
schema_sample_rows: int | None = 1_000,
322+
dtype_coercion: Literal["coerce", "strict"] = "coerce",
323+
use_columns: list[str]
324+
| list[int]
325+
| str
326+
| Callable[[ColumnInfoNoDtype], bool]
327+
| None = None,
328+
dtypes: DType | DTypeMap | None = None,
329+
eager: Literal[True] = ...,
330+
) -> "pa.RecordBatch": ...
331+
332+
def load_sheet(
333+
self,
334+
idx_or_name: int | str,
335+
*,
336+
header_row: int | None = 0,
337+
column_names: list[str] | None = None,
338+
skip_rows: int | list[int] | Callable[[int], bool] | None = None,
339+
n_rows: int | None = None,
340+
schema_sample_rows: int | None = 1_000,
341+
dtype_coercion: Literal["coerce", "strict"] = "coerce",
342+
use_columns: list[str]
343+
| list[int]
344+
| str
345+
| Callable[[ColumnInfoNoDtype], bool]
346+
| None = None,
347+
dtypes: DType | DTypeMap | None = None,
348+
eager: bool = False,
349+
) -> "ExcelSheet | pa.RecordBatch":
350+
"""Loads a sheet by index or name.
310351
311352
:param idx_or_name: The index (starting at 0) or the name of the sheet to load.
312353
:param header_row: The index of the row containing the column labels, default index is 0.
@@ -349,21 +390,25 @@ def load_sheet(
349390
indicating whether the column should be used
350391
:param dtypes: An optional dtype (for all columns)
351392
or dict of dtypes with keys as column indices or names.
393+
:param eager: Specifies whether the sheet should be loaded eagerly.
394+
`False` (default) will load the sheet lazily using the `PyCapsule` interface,
395+
whereas `True` will load it eagerly via `pyarrow`.
396+
397+
Eager loading requires the `pyarrow` extra to be installed.
352398
"""
353-
return ExcelSheet(
354-
self._reader.load_sheet(
355-
idx_or_name=idx_or_name,
356-
header_row=header_row,
357-
column_names=column_names,
358-
skip_rows=skip_rows,
359-
n_rows=n_rows,
360-
schema_sample_rows=schema_sample_rows,
361-
dtype_coercion=dtype_coercion,
362-
use_columns=use_columns,
363-
dtypes=dtypes,
364-
eager=False,
365-
)
399+
sheet_or_rb = self._reader.load_sheet(
400+
idx_or_name=idx_or_name,
401+
header_row=header_row,
402+
column_names=column_names,
403+
skip_rows=skip_rows,
404+
n_rows=n_rows,
405+
schema_sample_rows=schema_sample_rows,
406+
dtype_coercion=dtype_coercion,
407+
use_columns=use_columns,
408+
dtypes=dtypes,
409+
eager=eager,
366410
)
411+
return sheet_or_rb if eager else ExcelSheet(sheet_or_rb)
367412

368413
def table_names(self, sheet_name: str | None = None) -> list[str]:
369414
"""The list of table names.

python/fastexcel/_fastexcel.pyi

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -209,6 +209,25 @@ class _ExcelReader:
209209
eager: Literal[True] = ...,
210210
) -> pa.RecordBatch: ...
211211
@typing.overload
212+
def load_sheet(
213+
self,
214+
idx_or_name: str | int,
215+
*,
216+
header_row: int | None = 0,
217+
column_names: list[str] | None = None,
218+
skip_rows: int | list[int] | Callable[[int], bool] | None = None,
219+
n_rows: int | None = None,
220+
schema_sample_rows: int | None = 1_000,
221+
dtype_coercion: Literal["coerce", "strict"] = "coerce",
222+
use_columns: list[str]
223+
| list[int]
224+
| str
225+
| Callable[[ColumnInfoNoDtype], bool]
226+
| None = None,
227+
dtypes: DType | DTypeMap | None = None,
228+
eager: bool = False,
229+
) -> pa.RecordBatch: ...
230+
@typing.overload
212231
def load_table(
213232
self,
214233
name: str,

python/tests/test_dtypes.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -205,17 +205,16 @@ def test_dtype_coercion_behavior__coerce(
205205
excel_reader = fastexcel.read_excel(path_for_fixture("fixture-multi-dtypes-columns.xlsx"))
206206

207207
kwargs = {"dtype_coercion": dtype_coercion} if dtype_coercion else {}
208-
sheet = (
209-
excel_reader.load_sheet_eager(0, **kwargs) # type:ignore[arg-type]
210-
if eager
211-
else excel_reader.load_sheet(0, **kwargs).to_arrow() # type:ignore[arg-type]
208+
sheet_or_rb = (
209+
excel_reader.load_sheet(0, eager=eager, **kwargs) # type:ignore[call-overload]
212210
)
211+
rb = sheet_or_rb if eager else sheet_or_rb.to_arrow()
213212

214-
pd_df = sheet.to_pandas()
213+
pd_df = rb.to_pandas()
215214
assert pd_df["Mixed dates"].dtype == "object"
216215
assert pd_df["Mixed dates"].to_list() == ["2023-07-21 00:00:00"] * 6 + ["July 23rd"] * 3
217216

218-
pl_df = pl.from_arrow(data=sheet)
217+
pl_df = pl.from_arrow(data=rb)
219218
assert isinstance(pl_df, pl.DataFrame)
220219
assert pl_df["Mixed dates"].dtype == pl.Utf8
221220
assert pl_df["Mixed dates"].to_list() == ["2023-07-21 00:00:00"] * 6 + ["July 23rd"] * 3

uv.lock

Lines changed: 2 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)