Skip to content

Commit f010a44

Browse files
authored
feat: allow list of ints or callable for skip_rows (#367)
1 parent 1364f30 commit f010a44

File tree

9 files changed

+357
-182
lines changed

9 files changed

+357
-182
lines changed

README.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,16 +9,16 @@ Docs available [here](https://fastexcel.toucantoco.dev/).
99
## Installation
1010

1111
```bash
12-
# Lightweight installation (no pyarrow dependency)
12+
# Lightweight installation (no PyArrow dependency)
1313
pip install fastexcel
1414

15-
# With Polars support only (no pyarrow needed)
15+
# With Polars support only (no PyArrow needed)
1616
pip install fastexcel[polars]
1717

18-
# With pandas support (includes pyarrow)
18+
# With Pandas support (includes PyArrow)
1919
pip install fastexcel[pandas]
2020

21-
# With pyarrow support
21+
# With PyArrow support
2222
pip install fastexcel[pyarrow]
2323

2424
# With all integrations

python/fastexcel/__init__.py

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -285,7 +285,7 @@ def load_sheet(
285285
*,
286286
header_row: int | None = 0,
287287
column_names: list[str] | None = None,
288-
skip_rows: int | None = None,
288+
skip_rows: int | list[int] | Callable[[int], bool] | None = None,
289289
n_rows: int | None = None,
290290
schema_sample_rows: int | None = 1_000,
291291
dtype_coercion: Literal["coerce", "strict"] = "coerce",
@@ -306,13 +306,15 @@ def load_sheet(
306306
If `column_names` is used, `header_row` will be ignored.
307307
:param n_rows: Specifies how many rows should be loaded.
308308
If `None`, all rows are loaded
309-
:param skip_rows: Specifies how many rows should be skipped after the `header_row`.
309+
:param skip_rows: Specifies which rows should be skipped after the `header_row`.
310310
Any rows before the `header_row` are automatically skipped.
311-
If `header_row` is `None`:
312-
- if `skip_rows` is `None` (default): it skips all empty rows
313-
at the beginning of the sheet.
314-
- if `skip_rows` is a number, it skips the specified number
315-
of rows from the start of the sheet.
311+
This means row indices are relative to the data rows, not to the sheet.
312+
Can be one of:
313+
- `int`: Skip this many rows after the header row
314+
- `list[int]`: Skip specific row indices (0-based relative to data rows)
315+
- `Callable[[int], bool]`: Function that receives the row index (0-based
316+
relative to data rows) and returns True to skip the row
317+
- `None`: If `header_row` is `None`, skips empty rows at the beginning of the sheet
316318
:param schema_sample_rows: Specifies how many rows should be used to determine
317319
the dtype of a column. Cannot be 0. A specific dtype can be
318320
enforced for some or all columns through the `dtypes` parameter.
@@ -382,6 +384,7 @@ def load_table(
382384
dtypes: DType | DTypeMap | None = None,
383385
eager: Literal[False] = ...,
384386
) -> ExcelTable: ...
387+
385388
@typing.overload
386389
def load_table(
387390
self,
@@ -401,6 +404,7 @@ def load_table(
401404
dtypes: DType | DTypeMap | None = None,
402405
eager: Literal[True] = ...,
403406
) -> "pa.RecordBatch": ...
407+
404408
def load_table(
405409
self,
406410
name: str,
@@ -493,7 +497,7 @@ def load_sheet_eager(
493497
*,
494498
header_row: int | None = 0,
495499
column_names: list[str] | None = None,
496-
skip_rows: int | None = None,
500+
skip_rows: int | list[int] | Callable[[int], bool] | None = None,
497501
n_rows: int | None = None,
498502
schema_sample_rows: int | None = 1_000,
499503
dtype_coercion: Literal["coerce", "strict"] = "coerce",
@@ -601,11 +605,11 @@ def read_excel(source: Path | str | bytes) -> ExcelReader:
601605

602606

603607
__all__ = (
604-
## version
608+
# version
605609
"__version__",
606-
## main entrypoint
610+
# main entrypoint
607611
"read_excel",
608-
## Python types
612+
# Python types
609613
"DType",
610614
"DTypeMap",
611615
# Excel reader

python/fastexcel/_fastexcel.pyi

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -169,7 +169,7 @@ class _ExcelReader:
169169
*,
170170
header_row: int | None = 0,
171171
column_names: list[str] | None = None,
172-
skip_rows: int | None = None,
172+
skip_rows: int | list[int] | Callable[[int], bool] | None = None,
173173
n_rows: int | None = None,
174174
schema_sample_rows: int | None = 1_000,
175175
dtype_coercion: Literal["coerce", "strict"] = "coerce",
@@ -188,7 +188,7 @@ class _ExcelReader:
188188
*,
189189
header_row: int | None = 0,
190190
column_names: list[str] | None = None,
191-
skip_rows: int | None = None,
191+
skip_rows: int | list[int] | Callable[[int], bool] | None = None,
192192
n_rows: int | None = None,
193193
schema_sample_rows: int | None = 1_000,
194194
dtype_coercion: Literal["coerce", "strict"] = "coerce",
@@ -207,7 +207,7 @@ class _ExcelReader:
207207
*,
208208
header_row: int | None = None,
209209
column_names: list[str] | None = None,
210-
skip_rows: int = 0,
210+
skip_rows: int | list[int] | Callable[[int], bool] = 0,
211211
n_rows: int | None = None,
212212
schema_sample_rows: int | None = 1_000,
213213
dtype_coercion: Literal["coerce", "strict"] = "coerce",
@@ -226,7 +226,7 @@ class _ExcelReader:
226226
*,
227227
header_row: int | None = None,
228228
column_names: list[str] | None = None,
229-
skip_rows: int = 0,
229+
skip_rows: int | list[int] | Callable[[int], bool] = 0,
230230
n_rows: int | None = None,
231231
schema_sample_rows: int | None = 1_000,
232232
dtype_coercion: Literal["coerce", "strict"] = "coerce",
1019 Bytes
Binary file not shown.

python/tests/test_fastexcel.py

Lines changed: 73 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -566,17 +566,79 @@ def test_sheet_with_decimal_numbers() -> None:
566566
@pytest.mark.parametrize(
567567
"header_row, skip_rows, expected",
568568
[
569-
(0, None, {"a": ["b"], "0": [1.0]}), # default
570-
(None, 0, {"__UNNAMED__0": [None, None, "a", "b"], "__UNNAMED__1": [None, None, 0.0, 1.0]}),
571-
(None, None, {"__UNNAMED__0": ["a", "b"], "__UNNAMED__1": [0.0, 1.0]}),
572-
(0, 0, {"__UNNAMED__0": [None, "a", "b"], "__UNNAMED__1": [None, 0.0, 1.0]}),
573-
(0, 1, {"__UNNAMED__0": ["a", "b"], "__UNNAMED__1": [0.0, 1.0]}),
574-
(None, 2, {"__UNNAMED__0": ["a", "b"], "__UNNAMED__1": [0.0, 1.0]}),
575-
(None, 3, {"__UNNAMED__0": ["b"], "__UNNAMED__1": [1.0]}),
576-
(1, 0, {"__UNNAMED__0": ["a", "b"], "__UNNAMED__1": [0.0, 1.0]}),
577-
(2, 0, {"a": ["b"], "0": [1.0]}),
578-
(2, None, {"a": ["b"], "0": [1.0]}),
579-
(2, 1, {"a": [], "0": []}),
569+
(0, None, {"a": ["b", "c", "d", "e", "f"], "0": [1.0, 2.0, 3.0, 4.0, 5.0]}), # default
570+
(
571+
None,
572+
0,
573+
{
574+
"__UNNAMED__0": [None, None, "a", "b", "c", "d", "e", "f"],
575+
"__UNNAMED__1": [None, None, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0],
576+
},
577+
),
578+
(
579+
None,
580+
None,
581+
{
582+
"__UNNAMED__0": ["a", "b", "c", "d", "e", "f"],
583+
"__UNNAMED__1": [0.0, 1.0, 2.0, 3.0, 4.0, 5.0],
584+
},
585+
),
586+
(
587+
0,
588+
0,
589+
{
590+
"__UNNAMED__0": [None, "a", "b", "c", "d", "e", "f"],
591+
"__UNNAMED__1": [None, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0],
592+
},
593+
),
594+
(
595+
0,
596+
1,
597+
{
598+
"__UNNAMED__0": ["a", "b", "c", "d", "e", "f"],
599+
"__UNNAMED__1": [0.0, 1.0, 2.0, 3.0, 4.0, 5.0],
600+
},
601+
),
602+
(
603+
None,
604+
2,
605+
{
606+
"__UNNAMED__0": ["a", "b", "c", "d", "e", "f"],
607+
"__UNNAMED__1": [0.0, 1.0, 2.0, 3.0, 4.0, 5.0],
608+
},
609+
),
610+
(
611+
None,
612+
3,
613+
{"__UNNAMED__0": ["b", "c", "d", "e", "f"], "__UNNAMED__1": [1.0, 2.0, 3.0, 4.0, 5.0]},
614+
),
615+
(
616+
1,
617+
0,
618+
{
619+
"__UNNAMED__0": ["a", "b", "c", "d", "e", "f"],
620+
"__UNNAMED__1": [0.0, 1.0, 2.0, 3.0, 4.0, 5.0],
621+
},
622+
),
623+
(2, 0, {"a": ["b", "c", "d", "e", "f"], "0": [1.0, 2.0, 3.0, 4.0, 5.0]}),
624+
(2, None, {"a": ["b", "c", "d", "e", "f"], "0": [1.0, 2.0, 3.0, 4.0, 5.0]}),
625+
(2, 1, {"a": ["c", "d", "e", "f"], "0": [2.0, 3.0, 4.0, 5.0]}),
626+
(2, [1, 3], {"a": ["b", "d", "f"], "0": [1.0, 3.0, 5.0]}),
627+
(2, [0], {"a": ["c", "d", "e", "f"], "0": [2.0, 3.0, 4.0, 5.0]}),
628+
(
629+
None,
630+
[2, 4],
631+
{
632+
"__UNNAMED__0": [None, None, "b", "d", "e", "f"],
633+
"__UNNAMED__1": [None, None, 1.0, 3.0, 4.0, 5.0],
634+
},
635+
),
636+
(2, [], {"a": ["b", "c", "d", "e", "f"], "0": [1.0, 2.0, 3.0, 4.0, 5.0]}),
637+
(2, [0, 1, 2, 3], {"a": ["f"], "0": [5.0]}),
638+
(2, lambda x: x % 2 == 0, {"a": ["c", "e"], "0": [2.0, 4.0]}),
639+
(2, lambda x: x in [0, 4], {"a": ["c", "d", "e"], "0": [2.0, 3.0, 4.0]}),
640+
(2, lambda x: False, {"a": ["b", "c", "d", "e", "f"], "0": [1.0, 2.0, 3.0, 4.0, 5.0]}),
641+
(2, lambda x: x != 2, {"a": ["d"], "0": [3.0]}),
580642
],
581643
)
582644
def test_header_row_and_skip_rows(

0 commit comments

Comments
 (0)