feat(python): support eager argument in load_sheet (#385)

arnabanimesh · lukapeschke · web-flow · commit e71d7c0d97d0 · 2025-09-11T12:37:21.000Z
* refactor: Combine `load_sheet` and `load_sheet_eager` functions into one function

* fix tests

* satisfy ruff lint

* Satisfy mypy static type check

* undo deletion of `load_sheet_eager` function

* remove if/else branch and adapt typing

Signed-off-by: Luka Peschke <luka.peschke@toucantoco.com>

* chore: undo changes on unit tests

Signed-off-by: Luka Peschke <luka.peschke@toucantoco.com>

* update uv.lock

Signed-off-by: Luka Peschke <luka.peschke@toucantoco.com>

* chore: undo changes on unit tests

Signed-off-by: Luka Peschke <luka.peschke@toucantoco.com>

---------

Signed-off-by: Luka Peschke <luka.peschke@toucantoco.com>
Co-authored-by: Luka Peschke <luka.peschke@toucantoco.com>
diff --git a/python/fastexcel/__init__.py b/python/fastexcel/__init__.py
@@ -289,6 +289,7 @@ def sheet_names(self) -> list[str]:
         """The list of sheet names"""
         return self._reader.sheet_names
 
+    @typing.overload
     def load_sheet(
         self,
         idx_or_name: int | str,
@@ -305,8 +306,48 @@ def load_sheet(
         | Callable[[ColumnInfoNoDtype], bool]
         | None = None,
         dtypes: DType | DTypeMap | None = None,
-    ) -> ExcelSheet:
-        """Loads a sheet lazily by index or name.
+        eager: Literal[False] = ...,
+    ) -> ExcelSheet: ...
+
+    @typing.overload
+    def load_sheet(
+        self,
+        idx_or_name: int | str,
+        *,
+        header_row: int | None = 0,
+        column_names: list[str] | None = None,
+        skip_rows: int | list[int] | Callable[[int], bool] | None = None,
+        n_rows: int | None = None,
+        schema_sample_rows: int | None = 1_000,
+        dtype_coercion: Literal["coerce", "strict"] = "coerce",
+        use_columns: list[str]
+        | list[int]
+        | str
+        | Callable[[ColumnInfoNoDtype], bool]
+        | None = None,
+        dtypes: DType | DTypeMap | None = None,
+        eager: Literal[True] = ...,  # literal eager=True: the result is a pyarrow RecordBatch
+    ) -> "pa.RecordBatch": ...
+
+    def load_sheet(
+        self,
+        idx_or_name: int | str,
+        *,
+        header_row: int | None = 0,
+        column_names: list[str] | None = None,
+        skip_rows: int | list[int] | Callable[[int], bool] | None = None,
+        n_rows: int | None = None,
+        schema_sample_rows: int | None = 1_000,
+        dtype_coercion: Literal["coerce", "strict"] = "coerce",
+        use_columns: list[str]
+        | list[int]
+        | str
+        | Callable[[ColumnInfoNoDtype], bool]
+        | None = None,
+        dtypes: DType | DTypeMap | None = None,
+        eager: bool = False,
+    ) -> "ExcelSheet | pa.RecordBatch":
+        """Loads a sheet by index or name.
 
         :param idx_or_name: The index (starting at 0) or the name of the sheet to load.
         :param header_row: The index of the row containing the column labels, default index is 0.
@@ -349,21 +390,25 @@ def load_sheet(
                               indicating whether the column should be used
         :param dtypes: An optional dtype (for all columns)
                        or dict of dtypes with keys as column indices or names.
+        :param eager: Specifies whether the sheet should be loaded eagerly.
+                      `False` (default) will load the sheet lazily using the `PyCapsule` interface,
+                      whereas `True` will load it eagerly via `pyarrow`.
+
+                      Eager loading requires the `pyarrow` extra to be installed.
         """
-        return ExcelSheet(
-            self._reader.load_sheet(
-                idx_or_name=idx_or_name,
-                header_row=header_row,
-                column_names=column_names,
-                skip_rows=skip_rows,
-                n_rows=n_rows,
-                schema_sample_rows=schema_sample_rows,
-                dtype_coercion=dtype_coercion,
-                use_columns=use_columns,
-                dtypes=dtypes,
-                eager=False,
-            )
+        sheet_or_rb = self._reader.load_sheet(
+            idx_or_name=idx_or_name,
+            header_row=header_row,
+            column_names=column_names,
+            skip_rows=skip_rows,
+            n_rows=n_rows,
+            schema_sample_rows=schema_sample_rows,
+            dtype_coercion=dtype_coercion,
+            use_columns=use_columns,
+            dtypes=dtypes,
+            eager=eager,
         )
+        return sheet_or_rb if eager else ExcelSheet(sheet_or_rb)
 
     def table_names(self, sheet_name: str | None = None) -> list[str]:
         """The list of table names.
diff --git a/python/fastexcel/_fastexcel.pyi b/python/fastexcel/_fastexcel.pyi
@@ -209,6 +209,25 @@ class _ExcelReader:
         eager: Literal[True] = ...,
     ) -> pa.RecordBatch: ...
     @typing.overload
+    def load_sheet(
+        self,
+        idx_or_name: str | int,
+        *,
+        header_row: int | None = 0,
+        column_names: list[str] | None = None,
+        skip_rows: int | list[int] | Callable[[int], bool] | None = None,
+        n_rows: int | None = None,
+        schema_sample_rows: int | None = 1_000,
+        dtype_coercion: Literal["coerce", "strict"] = "coerce",
+        use_columns: list[str]
+        | list[int]
+        | str
+        | Callable[[ColumnInfoNoDtype], bool]
+        | None = None,
+        dtypes: DType | DTypeMap | None = None,
+        eager: bool = False,  # plain bool: the result kind is not known statically
+    ) -> pa.RecordBatch | _ExcelSheet: ...
+    @typing.overload
     def load_table(
         self,
         name: str,
diff --git a/python/tests/test_dtypes.py b/python/tests/test_dtypes.py
@@ -205,17 +205,16 @@ def test_dtype_coercion_behavior__coerce(
     excel_reader = fastexcel.read_excel(path_for_fixture("fixture-multi-dtypes-columns.xlsx"))
 
     kwargs = {"dtype_coercion": dtype_coercion} if dtype_coercion else {}
-    sheet = (
-        excel_reader.load_sheet_eager(0, **kwargs)  # type:ignore[arg-type]
-        if eager
-        else excel_reader.load_sheet(0, **kwargs).to_arrow()  # type:ignore[arg-type]
+    sheet_or_rb = (
+        excel_reader.load_sheet(0, eager=eager, **kwargs)  # type:ignore[call-overload]
     )
+    rb = sheet_or_rb if eager else sheet_or_rb.to_arrow()
 
-    pd_df = sheet.to_pandas()
+    pd_df = rb.to_pandas()
     assert pd_df["Mixed dates"].dtype == "object"
     assert pd_df["Mixed dates"].to_list() == ["2023-07-21 00:00:00"] * 6 + ["July 23rd"] * 3
 
-    pl_df = pl.from_arrow(data=sheet)
+    pl_df = pl.from_arrow(data=rb)
     assert isinstance(pl_df, pl.DataFrame)
     assert pl_df["Mixed dates"].dtype == pl.Utf8
     assert pl_df["Mixed dates"].to_list() == ["2023-07-21 00:00:00"] * 6 + ["July 23rd"] * 3
diff --git a/uv.lock b/uv.lock