Skip to content

Commit 399a27d

Browse files
noctuid and lukapeschke authored
feat(excelsheet): add to_arrow_with_errors (#346)
* Add to_arrow_with_errors

  Add a to_arrow_with_errors method that returns both the data and the locations of cells where parsing failed, in order to differentiate them from cells that were simply empty. Fixes #308.

  There is currently strange behavior in how some types are coerced (noted in the new tests). There are some related issues already open. Calamine should probably be updated in the future to allow customizable strictness, but for now I did not change any of the logic.

* fixup! Add to_arrow_with_errors

---------

Co-authored-by: Luka Peschke <luka.peschke@toucantoco.com>
1 parent 1dd7dc3 commit 399a27d

File tree

7 files changed

+565
-2
lines changed

7 files changed

+565
-2
lines changed

python/fastexcel/__init__.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@
2323
CalamineCellError,
2424
CalamineError,
2525
CannotRetrieveCellDataError,
26+
CellError,
27+
CellErrors,
2628
ColumnInfo,
2729
ColumnInfoNoDtype,
2830
ColumnNotFoundError,
@@ -101,6 +103,17 @@ def to_arrow(self) -> pa.RecordBatch:
101103
"""Converts the sheet to a pyarrow `RecordBatch`"""
102104
return self._sheet.to_arrow()
103105

106+
def to_arrow_with_errors(self) -> tuple[pa.RecordBatch, CellErrors | None]:
107+
"""Converts the sheet to a pyarrow `RecordBatch` with error information.
108+
109+
Stores the positions of any values that cannot be parsed as the specified type and were
110+
therefore converted to None.
111+
"""
112+
rb, cell_errors = self._sheet.to_arrow_with_errors()
113+
if not cell_errors.errors:
114+
return (rb, None)
115+
return (rb, cell_errors)
116+
104117
def to_pandas(self) -> "pd.DataFrame":
105118
"""Converts the sheet to a Pandas `DataFrame`.
106119
@@ -517,6 +530,9 @@ def read_excel(source: Path | str | bytes) -> ExcelReader:
517530
"DTypeFrom",
518531
"ColumnNameFrom",
519532
"ColumnInfo",
533+
# Parse error information
534+
"CellError",
535+
"CellErrors",
520536
# Exceptions
521537
"FastExcelError",
522538
"CannotRetrieveCellDataError",

python/fastexcel/_fastexcel.pyi

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,20 @@ class ColumnInfo:
4141
@property
4242
def dtype_from(self) -> DTypeFrom: ...
4343

44+
class CellError:
45+
@property
46+
def position(self) -> tuple[int, int]: ...
47+
@property
48+
def row_offset(self) -> int: ...
49+
@property
50+
def offset_position(self) -> tuple[int, int]: ...
51+
@property
52+
def detail(self) -> str: ...
53+
54+
class CellErrors:
55+
@property
56+
def errors(self) -> list[CellError]: ...
57+
4458
class _ExcelSheet:
4559
@property
4660
def name(self) -> str:
@@ -70,6 +84,12 @@ class _ExcelSheet:
7084
"""The visibility of the sheet"""
7185
def to_arrow(self) -> pa.RecordBatch:
7286
"""Converts the sheet to a pyarrow `RecordBatch`"""
87+
def to_arrow_with_errors(self) -> tuple[pa.RecordBatch, CellErrors]:
88+
"""Converts the sheet to a pyarrow `RecordBatch` with error information.
89+
90+
Stores the positions of any values that cannot be parsed as the specified type and were
91+
therefore converted to None.
92+
"""
7393

7494
class _ExcelTable:
7595
@property
10.6 KB
Binary file not shown.

python/tests/test_dtypes.py

Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -354,3 +354,156 @@ def test_fallback_infer_dtypes(mocker: MockerFixture) -> None:
354354
),
355355
]
356356
assert sheet.to_polars().dtypes == [pl.Float64, pl.String]
357+
358+
359+
@pytest.mark.parametrize(
360+
("dtype", "expected_data"),
361+
[
362+
(
363+
"int",
364+
[None] * 2
365+
+ [-1.0, 0.0, 1.0, 0.0, 1.0, 1.0, -1.0, 0.0, 1.0, None, 1.0, 0.0]
366+
+ [None] * 7
367+
+ [0.0],
368+
),
369+
(
370+
"float",
371+
[None] * 2
372+
+ [-1.0, 0.0, 1.0, 0.0, 1.0, 1.1, -1.0, 0.0, 1.0, 1.1, 1.0, 0.0]
373+
+ [None] * 7
374+
+ [0.1],
375+
),
376+
(
377+
"string",
378+
[
379+
None,
380+
"foo",
381+
"-1",
382+
"0",
383+
"1",
384+
"0",
385+
"1",
386+
"1.1",
387+
"-1",
388+
"0",
389+
"1",
390+
"1.1",
391+
"true",
392+
"false",
393+
"2023-07-21 00:00:00",
394+
"2023-07-21 12:20:00",
395+
# calamine reads a time as a datetime here, which seems wrong
396+
"1899-12-31 12:20:00",
397+
"07/21/2023",
398+
"7/21/2023 12:20:00 PM",
399+
"July 23rd",
400+
"12:20:00",
401+
"0.1",
402+
],
403+
),
404+
(
405+
"boolean",
406+
[None] * 2
407+
+ [True, False, True, False, True, True]
408+
+ [None] * 4
409+
+ [True, False]
410+
+ [None] * 7
411+
+ [True],
412+
),
413+
(
414+
"datetime",
415+
[pd.NaT] * 2
416+
+ [
417+
pd.Timestamp("1899-12-30 00:00:00"),
418+
pd.Timestamp("1899-12-31 00:00:00"),
419+
pd.Timestamp("1900-01-01 00:00:00"),
420+
pd.Timestamp("1899-12-31 00:00:00"),
421+
pd.Timestamp("1900-01-01 00:00:00"),
422+
pd.Timestamp("1900-01-01 02:24:00"),
423+
]
424+
+ [pd.NaT] * 6
425+
+ [
426+
pd.Timestamp("2023-7-21 00:00:00"),
427+
pd.Timestamp("2023-7-21 12:20:00"),
428+
# calamine currently adds a date to a time, which is
429+
# questionable
430+
pd.Timestamp("1899-12-31 12:20:00"),
431+
]
432+
+ [pd.NaT] * 4
433+
+ [
434+
# calamine converts percentages to datetimes (since it does not
435+
# distinguish from floats), which seems questionable
436+
pd.Timestamp("1899-12-31 02:24:00")
437+
],
438+
),
439+
(
440+
"date",
441+
[None] * 2
442+
+ [
443+
pd.Timestamp("1899-12-30").date(),
444+
pd.Timestamp("1899-12-31").date(),
445+
pd.Timestamp("1900-01-01").date(),
446+
pd.Timestamp("1899-12-31").date(),
447+
pd.Timestamp("1900-01-01").date(),
448+
pd.Timestamp("1900-01-01").date(),
449+
]
450+
+ [None] * 6
451+
+ [
452+
pd.Timestamp("2023-7-21").date(),
453+
pd.Timestamp("2023-7-21").date(),
454+
# calamine converts any time to 1899-12-31, which is
455+
# questionable
456+
pd.Timestamp("1899-12-31").date(),
457+
]
458+
+ [None] * 4
459+
+ [
460+
# calamine converts percentages to dates (since it does not
461+
# distinguish from floats), which seems questionable
462+
pd.Timestamp("1899-12-31").date()
463+
],
464+
),
465+
(
466+
"duration",
467+
[pd.NaT] * 14
468+
+ [
469+
# dates/datetimes are converted to durations, which seems
470+
# questionable
471+
pd.Timedelta(datetime(2023, 7, 21 + 1) - datetime(1899, 12, 31)),
472+
pd.Timedelta(datetime(2023, 7, 21 + 1, 12, 20, 0) - datetime(1899, 12, 31)),
473+
pd.Timedelta(hours=12, minutes=20),
474+
]
475+
+ [pd.NaT] * 5,
476+
),
477+
],
478+
)
479+
def test_to_arrow_with_errors(
480+
dtype: fastexcel.DType,
481+
expected_data: list[Any],
482+
):
483+
excel_reader = fastexcel.read_excel(path_for_fixture("fixture-type-errors.xlsx"))
484+
rb, cell_errors = excel_reader.load_sheet(0, dtypes={"Column": dtype}).to_arrow_with_errors()
485+
486+
pd_df = rb.to_pandas()
487+
assert pd_df["Column"].replace(np.nan, None).to_list() == expected_data
488+
489+
def item_to_polars(item: Any):
490+
if isinstance(item, pd.Timestamp):
491+
return item.to_pydatetime()
492+
if pd.isna(item):
493+
return None
494+
return item
495+
496+
pl_df = pl.from_arrow(rb)
497+
assert isinstance(pl_df, pl.DataFrame)
498+
pl_expected_data = list(map(item_to_polars, expected_data))
499+
assert pl_df["Column"].to_list() == pl_expected_data
500+
501+
# the only empty cell is (0, 0), so all other cells that were read as None
502+
# should be errors
503+
expected_error_positions = [
504+
(i, 0) for i in range(1, len(expected_data)) if expected_data[i] in {None, pd.NaT}
505+
]
506+
if expected_error_positions:
507+
assert cell_errors is not None
508+
error_positions = [err.offset_position for err in cell_errors.errors]
509+
assert error_positions == expected_error_positions

0 commit comments

Comments
 (0)