feat(tests): support pandas 3 (#454)

lukapeschke · web-flow · commit 72855f07da8a · 2026-02-13T06:56:44.000+01:00
* chore: bump python dependencies
  Signed-off-by: Luka Peschke <luka.peschke@toucantoco.com>
* chore(deps): bump pandas to v3
  Signed-off-by: Luka Peschke <luka.peschke@toucantoco.com>
* feat(tests): support pandas 3
  closes #452
  Signed-off-by: Luka Peschke <luka.peschke@toucantoco.com>
* fix: run tests on pandas<3 as well
  Signed-off-by: Luka Peschke <luka.peschke@toucantoco.com>
* fix: restore bump to pandas 3 that was discarded in merge
  Signed-off-by: Luka Peschke <luka.peschke@toucantoco.com>
---------
Signed-off-by: Luka Peschke <luka.peschke@toucantoco.com>
diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
@@ -80,6 +80,11 @@ jobs:
           make install
           make test
 
+      - name: Test with pandas<3
+        run: |
+          uv pip install "pandas<3"
+          make test-python
+
   check-wheel-build:
     runs-on: ${{ matrix.os }}
     strategy:
diff --git a/pyproject.toml b/pyproject.toml
@@ -29,7 +29,7 @@ dynamic = ["version"]
 
 [project.optional-dependencies]
 pyarrow = ["pyarrow>=8.0.0"]
-pandas = ["pandas>=1.4.4,<3", "pyarrow>=8.0.0"]
+pandas = ["pandas>=1.4.4", "pyarrow>=8.0.0"]
 polars = ["polars>=1"]
 
 [dependency-groups]
diff --git a/python/tests/test_column_selection.py b/python/tests/test_column_selection.py
@@ -5,6 +5,7 @@
 from typing import Any
 
 import fastexcel
+import numpy as np
 import pandas as pd
 import polars as pl
 import pytest
@@ -1045,12 +1046,16 @@ def test_use_column_range_with_offset_with_sheet_and_specified_dtypes() -> None:
         "__UNNAMED__2": pl.Series([None, None, None], dtype=pl.String),
         "Column at K10": [7.0, 8.0, 9.0],
     }
+    # In pandas 3, string columns use nan instead of None for missing values
+    pd_version = tuple(int(x) for x in pd.__version__.split(".")[:2])
+    na_value = np.nan if pd_version >= (3, 0) else None
+
     expected_data_pandas = {
         # Dtype should be int, looked up by index
         "Column at H10": [1, 2, 3],
         # Dtype should be string, looked up by name
         "Column at I10": ["4", "5", "6"],
-        "__UNNAMED__2": [None, None, None],
+        "__UNNAMED__2": [na_value, na_value, na_value],
         "Column at K10": [7.0, 8.0, 9.0],
     }
     expected_column_info = [
@@ -1101,7 +1106,7 @@ def test_use_column_range_with_offset_with_sheet_and_specified_dtypes() -> None:
     pl_assert_frame_equal(pl_df_open_ended, expected_pl_df)
 
     pd_df_closed = sheet_closed.to_pandas()
-    pd_assert_frame_equal(pd_df_closed, expected_pd_df)
+    pd_assert_frame_equal(pd_df_closed, expected_pd_df, check_dtype=False)
 
     pd_df_open_ended = sheet_open_ended.to_pandas()
-    pd_assert_frame_equal(pd_df_open_ended, expected_pd_df)
+    pd_assert_frame_equal(pd_df_open_ended, expected_pd_df, check_dtype=False)
diff --git a/python/tests/test_dtypes.py b/python/tests/test_dtypes.py
@@ -12,7 +12,7 @@
 from pandas.testing import assert_frame_equal as pd_assert_frame_equal
 from polars.testing import assert_frame_equal as pl_assert_frame_equal
 
-from .utils import path_for_fixture
+from .utils import get_expected_pandas_dtype, path_for_fixture
 
 
 @pytest.fixture
@@ -123,33 +123,30 @@ def test_sheet_with_mixed_dtypes_and_sample_rows(expected_data: dict[str, list[A
 
 @pytest.mark.parametrize("dtype_by_index", (True, False))
 @pytest.mark.parametrize(
-    "dtype,expected_data,expected_pd_dtype,expected_pl_dtype",
+    "dtype,expected_data,expected_pl_dtype",
     [
-        ("int", [123456, 44333, 44333, 87878, 87878], "int64", pl.Int64),
-        ("float", [123456.0, 44333.0, 44333.0, 87878.0, 87878.0], "float64", pl.Float64),
-        ("string", ["123456", "44333", "44333", "87878", "87878"], "object", pl.Utf8),
-        ("boolean", [True] * 5, "bool", pl.Boolean),
+        ("int", [123456, 44333, 44333, 87878, 87878], pl.Int64),
+        ("float", [123456.0, 44333.0, 44333.0, 87878.0, 87878.0], pl.Float64),
+        ("string", ["123456", "44333", "44333", "87878", "87878"], pl.Utf8),
+        ("boolean", [True] * 5, pl.Boolean),
         (
             "datetime",
             [datetime(2238, 1, 3)] + [datetime(2021, 5, 17)] * 2 + [datetime(2140, 8, 6)] * 2,
-            "datetime64[ms]",
             pl.Datetime,
         ),
         (
             "date",
             [date(2238, 1, 3)] + [date(2021, 5, 17)] * 2 + [date(2140, 8, 6)] * 2,
-            "object",
             pl.Date,
         ),
         #  conversion to duration not supported yet
-        ("duration", [pd.NaT] * 5, "timedelta64[ms]", pl.Duration),
+        ("duration", [pd.NaT] * 5, pl.Duration),
     ],
 )
 def test_sheet_with_mixed_dtypes_specify_dtypes(
     dtype_by_index: bool,
     dtype: fastexcel.DType,
     expected_data: list[Any],
-    expected_pd_dtype: str,
     expected_pl_dtype: pl.DataType,
 ) -> None:
     dtypes: fastexcel.DTypeMap = {0: dtype} if dtype_by_index else {"Employee ID": dtype}
@@ -158,6 +155,7 @@ def test_sheet_with_mixed_dtypes_specify_dtypes(
     assert sheet.specified_dtypes == dtypes
 
     pd_df = sheet.to_pandas()
+    expected_pd_dtype = get_expected_pandas_dtype(dtype)
     assert pd_df["Employee ID"].dtype == expected_pd_dtype
     assert pd_df["Employee ID"].to_list() == expected_data
 
@@ -167,28 +165,29 @@ def test_sheet_with_mixed_dtypes_specify_dtypes(
 
 
 @pytest.mark.parametrize(
-    "dtypes,expected,expected_pd_dtype,expected_pl_dtype",
+    "dtypes,expected,fastexcel_dtype,expected_pl_dtype",
     [
-        (None, datetime(2023, 7, 21), "datetime64[ms]", pl.Datetime),
-        ({"Date": "datetime"}, datetime(2023, 7, 21), "datetime64[ms]", pl.Datetime),
-        ({"Date": "date"}, date(2023, 7, 21), "object", pl.Date),
-        ({"Date": "string"}, "2023-07-21 00:00:00", "object", pl.Utf8),
-        ({2: "datetime"}, datetime(2023, 7, 21), "datetime64[ms]", pl.Datetime),
-        ({2: "date"}, date(2023, 7, 21), "object", pl.Date),
-        ({2: "string"}, "2023-07-21 00:00:00", "object", pl.Utf8),
+        (None, datetime(2023, 7, 21), "datetime", pl.Datetime),
+        ({"Date": "datetime"}, datetime(2023, 7, 21), "datetime", pl.Datetime),
+        ({"Date": "date"}, date(2023, 7, 21), "date", pl.Date),
+        ({"Date": "string"}, "2023-07-21 00:00:00", "string", pl.Utf8),
+        ({2: "datetime"}, datetime(2023, 7, 21), "datetime", pl.Datetime),
+        ({2: "date"}, date(2023, 7, 21), "date", pl.Date),
+        ({2: "string"}, "2023-07-21 00:00:00", "string", pl.Utf8),
     ],
 )
 def test_sheet_datetime_conversion(
     dtypes: fastexcel.DTypeMap | None,
     expected: Any,
-    expected_pd_dtype: str,
+    fastexcel_dtype: str,
     expected_pl_dtype: pl.DataType,
 ) -> None:
     excel_reader = fastexcel.read_excel(path_for_fixture("fixture-multi-dtypes-columns.xlsx"))
 
     sheet = excel_reader.load_sheet(0, dtypes=dtypes)
     assert sheet.specified_dtypes == dtypes
     pd_df = sheet.to_pandas()
+    expected_pd_dtype = get_expected_pandas_dtype(fastexcel_dtype)
     assert pd_df["Date"].dtype == expected_pd_dtype
     assert pd_df["Date"].to_list() == [expected] * 9
 
@@ -211,7 +210,8 @@ def test_dtype_coercion_behavior__coerce(
     rb = sheet_or_rb if eager else sheet_or_rb.to_arrow()
 
     pd_df = rb.to_pandas()
-    assert pd_df["Mixed dates"].dtype == "object"
+    expected_pd_dtype = get_expected_pandas_dtype("string")
+    assert pd_df["Mixed dates"].dtype == expected_pd_dtype
     assert pd_df["Mixed dates"].to_list() == ["2023-07-21 00:00:00"] * 6 + ["July 23rd"] * 3
 
     pl_df = pl.from_arrow(data=rb)
@@ -487,7 +487,12 @@ def test_to_arrow_with_errors(
     rb, cell_errors = excel_reader.load_sheet(0, dtypes={"Column": dtype}).to_arrow_with_errors()
 
     pd_df = rb.to_pandas()
-    assert pd_df["Column"].replace(np.nan, None).to_list() == expected_data
+    # For string columns in pandas 3, replace pd.NA with None for comparison
+    if dtype == "string":
+        column_values = pd_df["Column"].replace([np.nan, pd.NA], None).to_list()
+    else:
+        column_values = pd_df["Column"].replace(np.nan, None).to_list()
+    assert column_values == expected_data
 
     def item_to_polars(item: Any):
         if isinstance(item, pd.Timestamp):
diff --git a/python/tests/test_durations.py b/python/tests/test_durations.py
@@ -14,7 +14,7 @@
 from polars.datatypes import Utf8 as PlUtf8
 from polars.testing import assert_frame_equal as pl_assert_frame_equal
 
-from .utils import path_for_fixture
+from .utils import get_expected_pandas_dtype, path_for_fixture
 
 
 def test_sheet_with_different_time_types() -> None:
@@ -25,13 +25,10 @@ def test_sheet_with_different_time_types() -> None:
     pl_df = sheet.to_polars()
 
     ## dtypes
-    assert pd_df.dtypes.to_dict() == {
-        # the dtype for a date is object
-        "date": np.dtype("object"),
-        "datestr": np.dtype("object"),
-        "time": np.dtype("timedelta64[ms]"),
-        "datetime": np.dtype("datetime64[ms]"),
-    }
+    assert pd_df["date"].dtype == np.dtype("object")
+    assert pd_df["datestr"].dtype == get_expected_pandas_dtype("string")
+    assert pd_df["time"].dtype == np.dtype("timedelta64[ms]")
+    assert pd_df["datetime"].dtype == np.dtype("datetime64[ms]")
     expected_pl_dtypes: dict[str, PolarsDataType] = {
         "date": PlDate(),
         "datestr": PlUtf8(),
diff --git a/python/tests/utils.py b/python/tests/utils.py
@@ -1,5 +1,59 @@
+from __future__ import annotations
+
 from pathlib import Path
+from typing import Any
+
+import numpy as np
+import pandas as pd
 
 
 def path_for_fixture(fixture_file: str) -> str:
     return str(Path(__file__).parent.parent.parent / "tests" / "fixtures" / fixture_file)
+
+
+def get_expected_pandas_dtype(fastexcel_dtype: str) -> Any:
+    """Get the expected pandas dtype for a given fastexcel dtype, accounting for pandas version.
+
+    In pandas < 3.0, string columns use object dtype.
+    In pandas >= 3.0, string columns use StringDtype (with na_value=nan when from Arrow).
+
+    Args:
+        fastexcel_dtype: A fastexcel dtype string. Supported values are "int", "float",
+            "boolean", "datetime", "duration", "string" and "date".
+
+    Returns:
+        The numpy dtype or pandas extension dtype to compare a column's dtype against.
+
+    Raises:
+        ValueError: If ``fastexcel_dtype`` is not one of the supported values.
+    """
+    # Keep only (major, minor) as ints so the version check is a plain tuple comparison
+    pd_version = tuple(int(x) for x in pd.__version__.split(".")[:2])
+
+    # These dtypes are returned without looking at the installed pandas version
+    dtype_map = {
+        "int": np.dtype("int64"),
+        "float": np.dtype("float64"),
+        "boolean": np.dtype("bool"),
+        "datetime": np.dtype("datetime64[ms]"),
+        "duration": np.dtype("timedelta64[ms]"),
+    }
+
+    if fastexcel_dtype in dtype_map:
+        return dtype_map[fastexcel_dtype]
+
+    # "string" is the only dtype whose expected value depends on the pandas version
+    if fastexcel_dtype == "string":
+        if pd_version >= (3, 0):
+            # When converting from Arrow, pandas uses nan as na_value
+            return pd.StringDtype(na_value=np.nan)
+        else:
+            return np.dtype("object")
+
+    if fastexcel_dtype == "date":
+        # Date columns are always object dtype
+        return np.dtype("object")
+
+    # Fail loudly instead of returning a dtype that would make an assertion pass by accident
+    raise ValueError(f"Unknown fastexcel dtype: {fastexcel_dtype}")
+
+
+def assert_pandas_dtypes(df: pd.DataFrame, expected_dtypes: dict[str, str]) -> None:
+    """Assert that a pandas DataFrame has the expected dtypes for each column.
+
+    Only the columns listed in ``expected_dtypes`` are checked.
+
+    Args:
+        df: The pandas DataFrame to check
+        expected_dtypes: A dict mapping column names to fastexcel dtype strings
+
+    Raises:
+        AssertionError: If a listed column's dtype differs from the expected one.
+        ValueError: If a fastexcel dtype string is not known to get_expected_pandas_dtype.
+    """
+    for col_name, fastexcel_dtype in expected_dtypes.items():
+        # Translate the fastexcel dtype string into the pandas dtype to compare against
+        expected_dtype = get_expected_pandas_dtype(fastexcel_dtype)
+        actual_dtype = df[col_name].dtype
+        # The message names the column so a failure points at the offending column directly
+        assert actual_dtype == expected_dtype, (
+            f"Column '{col_name}': expected dtype {expected_dtype}, got {actual_dtype}"
+        )
diff --git a/uv.lock b/uv.lock