ToucanToco
diff --git a/‎python/fastexcel/__init__.py‎
Lines changed: 16 additions & 0 deletions b/‎python/fastexcel/__init__.py‎
Lines changed: 16 additions & 0 deletions
diff --git a/‎python/fastexcel/_fastexcel.pyi‎
Lines changed: 20 additions & 0 deletions b/‎python/fastexcel/_fastexcel.pyi‎
Lines changed: 20 additions & 0 deletions
diff --git a/‎python/tests/fixtures/fixture-type-errors.xlsx‎
10.6 KB b/‎python/tests/fixtures/fixture-type-errors.xlsx‎
10.6 KB
diff --git a/‎python/tests/test_dtypes.py‎
Lines changed: 153 additions & 0 deletions b/‎python/tests/test_dtypes.py‎
Lines changed: 153 additions & 0 deletions
@@ -23,6 +23,8 @@
     CalamineCellError,
     CalamineError,
     CannotRetrieveCellDataError,
+    CellError,
+    CellErrors,
     ColumnInfo,
     ColumnInfoNoDtype,
     ColumnNotFoundError,
@@ -101,6 +103,17 @@ def to_arrow(self) -> pa.RecordBatch:
         """Converts the sheet to a pyarrow `RecordBatch`"""
         return self._sheet.to_arrow()
 
+    def to_arrow_with_errors(self) -> tuple[pa.RecordBatch, CellErrors | None]:
+        """Converts the sheet to a pyarrow `RecordBatch` with error information.
+
+        Stores the positions of any values that cannot be parsed as the specified type and were
+        therefore converted to None.
+        """
+        rb, cell_errors = self._sheet.to_arrow_with_errors()
+        if not cell_errors.errors:
+            return (rb, None)
+        return (rb, cell_errors)
+
     def to_pandas(self) -> "pd.DataFrame":
         """Converts the sheet to a Pandas `DataFrame`.
 
@@ -517,6 +530,9 @@ def read_excel(source: Path | str | bytes) -> ExcelReader:
     "DTypeFrom",
     "ColumnNameFrom",
     "ColumnInfo",
+    # Parse error information
+    "CellError",
+    "CellErrors",
     # Exceptions
     "FastExcelError",
     "CannotRetrieveCellDataError",
 
@@ -41,6 +41,20 @@ class ColumnInfo:
     @property
     def dtype_from(self) -> DTypeFrom: ...
 
+class CellError:
+    """A single cell error, as listed by `CellErrors.errors`."""
+    # Position of the cell as a pair of integers.
+    # NOTE(review): presumably the absolute (row, column) coordinates in the sheet, i.e. not
+    # adjusted by `row_offset` - confirm against the native implementation
+    @property
+    def position(self) -> tuple[int, int]: ...
+    # NOTE(review): presumably the number of rows preceding the data (header/skipped rows) -
+    # confirm against the native implementation
+    @property
+    def row_offset(self) -> int: ...
+    # Position of the cell as a pair of integers.
+    # NOTE(review): presumably `position` with `row_offset` subtracted from the row, so that
+    # it lines up with row indices of the loaded data - confirm against the native
+    # implementation
+    @property
+    def offset_position(self) -> tuple[int, int]: ...
+    # NOTE(review): presumably a human-readable description of the error - confirm
+    @property
+    def detail(self) -> str: ...
+
+class CellErrors:
+    """Container for the `CellError`s returned by `_ExcelSheet.to_arrow_with_errors`."""
+    # The individual errors, one `CellError` per entry.
+    # NOTE(review): presumably empty when every value could be parsed - confirm
+    @property
+    def errors(self) -> list[CellError]: ...
+
 class _ExcelSheet:
     @property
     def name(self) -> str:
@@ -70,6 +84,12 @@ class _ExcelSheet:
         """The visibility of the sheet"""
     def to_arrow(self) -> pa.RecordBatch:
         """Converts the sheet to a pyarrow `RecordBatch`"""
+    def to_arrow_with_errors(self) -> tuple[pa.RecordBatch, CellErrors]:
+        """Converts the sheet to a pyarrow `RecordBatch` with error information.
+
+        Returns the record batch together with the positions of any values that cannot be
+        parsed as the specified type and were therefore converted to None.
+
+        The `CellErrors` element of the tuple is never None at this level.
+        """
 
 class _ExcelTable:
     @property
 
@@ -354,3 +354,156 @@ def test_fallback_infer_dtypes(mocker: MockerFixture) -> None:
         ),
     ]
     assert sheet.to_polars().dtypes == [pl.Float64, pl.String]
+
+
+@pytest.mark.parametrize(
+    ("dtype", "expected_data"),
+    [
+        (
+            "int",
+            [None] * 2
+            + [-1.0, 0.0, 1.0, 0.0, 1.0, 1.0, -1.0, 0.0, 1.0, None, 1.0, 0.0]
+            + [None] * 7
+            + [0.0],
+        ),
+        (
+            "float",
+            [None] * 2
+            + [-1.0, 0.0, 1.0, 0.0, 1.0, 1.1, -1.0, 0.0, 1.0, 1.1, 1.0, 0.0]
+            + [None] * 7
+            + [0.1],
+        ),
+        (
+            "string",
+            [
+                None,
+                "foo",
+                "-1",
+                "0",
+                "1",
+                "0",
+                "1",
+                "1.1",
+                "-1",
+                "0",
+                "1",
+                "1.1",
+                "true",
+                "false",
+                "2023-07-21 00:00:00",
+                "2023-07-21 12:20:00",
+                # calamine reads a time as datetimes here, which seems wrong
+                "1899-12-31 12:20:00",
+                "07/21/2023",
+                "7/21/2023  12:20:00 PM",
+                "July 23rd",
+                "12:20:00",
+                "0.1",
+            ],
+        ),
+        (
+            "boolean",
+            [None] * 2
+            + [True, False, True, False, True, True]
+            + [None] * 4
+            + [True, False]
+            + [None] * 7
+            + [True],
+        ),
+        (
+            "datetime",
+            [pd.NaT] * 2
+            + [
+                pd.Timestamp("1899-12-30 00:00:00"),
+                pd.Timestamp("1899-12-31 00:00:00"),
+                pd.Timestamp("1900-01-01 00:00:00"),
+                pd.Timestamp("1899-12-31 00:00:00"),
+                pd.Timestamp("1900-01-01 00:00:00"),
+                pd.Timestamp("1900-01-01 02:24:00"),
+            ]
+            + [pd.NaT] * 6
+            + [
+                pd.Timestamp("2023-7-21 00:00:00"),
+                pd.Timestamp("2023-7-21 12:20:00"),
+                # calamine currently adds a date to a time, which is
+                # questionable
+                pd.Timestamp("1899-12-31 12:20:00"),
+            ]
+            + [pd.NaT] * 4
+            + [
+                # calamine converts percentages to datetimes (since it does not
+                # distinguish from floats), which seems questionable
+                pd.Timestamp("1899-12-31 02:24:00")
+            ],
+        ),
+        (
+            "date",
+            [None] * 2
+            + [
+                pd.Timestamp("1899-12-30").date(),
+                pd.Timestamp("1899-12-31").date(),
+                pd.Timestamp("1900-01-01").date(),
+                pd.Timestamp("1899-12-31").date(),
+                pd.Timestamp("1900-01-01").date(),
+                pd.Timestamp("1900-01-01").date(),
+            ]
+            + [None] * 6
+            + [
+                pd.Timestamp("2023-7-21").date(),
+                pd.Timestamp("2023-7-21").date(),
+                # calamine converts any time to 1899-12-31, which is
+                # questionable
+                pd.Timestamp("1899-12-31").date(),
+            ]
+            + [None] * 4
+            + [
+                # calamine converts percentages to dates (since it does not
+                # distinguish from floats), which seems questionable
+                pd.Timestamp("1899-12-31").date()
+            ],
+        ),
+        (
+            "duration",
+            [pd.NaT] * 14
+            + [
+                # dates/datetimes are converted to durations, which seems
+                # questionable
+                pd.Timedelta(datetime(2023, 7, 21 + 1) - datetime(1899, 12, 31)),
+                pd.Timedelta(datetime(2023, 7, 21 + 1, 12, 20, 0) - datetime(1899, 12, 31)),
+                pd.Timedelta(hours=12, minutes=20),
+            ]
+            + [pd.NaT] * 5,
+        ),
+    ],
+)
+def test_to_arrow_with_errors(
+    dtype: fastexcel.DType,
+    expected_data: list[Any],
+):
+    excel_reader = fastexcel.read_excel(path_for_fixture("fixture-type-errors.xlsx"))
+    rb, cell_errors = excel_reader.load_sheet(0, dtypes={"Column": dtype}).to_arrow_with_errors()
+
+    pd_df = rb.to_pandas()
+    assert pd_df["Column"].replace(np.nan, None).to_list() == expected_data
+
+    def item_to_polars(item: Any):
+        if isinstance(item, pd.Timestamp):
+            return item.to_pydatetime()
+        if pd.isna(item):
+            return None
+        return item
+
+    pl_df = pl.from_arrow(rb)
+    assert isinstance(pl_df, pl.DataFrame)
+    pl_expected_data = list(map(item_to_polars, expected_data))
+    assert pl_df["Column"].to_list() == pl_expected_data
+
+    # the only empty cell is (0, 0), so all other cells that were read as None
+    # should be errors
+    expected_error_positions = [
+        (i, 0) for i in range(1, len(expected_data)) if expected_data[i] in {None, pd.NaT}
+    ]
+    if expected_error_positions:
+        assert cell_errors is not None
+        error_positions = [err.offset_position for err in cell_errors.errors]
+        assert error_positions == expected_error_positions