Skip to content

Commit 72855f0

Browse files
authored
feat(tests): support pandas 3 (#454)
* chore: bump python dependencies

  Signed-off-by: Luka Peschke <luka.peschke@toucantoco.com>

* chore(deps): bump pandas to v3

  Signed-off-by: Luka Peschke <luka.peschke@toucantoco.com>

* feat(tests): support pandas 3

  closes #452

  Signed-off-by: Luka Peschke <luka.peschke@toucantoco.com>

* fix: run tests on pandas<3 as well

  Signed-off-by: Luka Peschke <luka.peschke@toucantoco.com>

* fix: restore bump to pandas 3 that was discarded in merge

  Signed-off-by: Luka Peschke <luka.peschke@toucantoco.com>

---------

Signed-off-by: Luka Peschke <luka.peschke@toucantoco.com>
1 parent ad82fcc commit 72855f0

File tree

7 files changed

+201
-45
lines changed

7 files changed

+201
-45
lines changed

.github/workflows/CI.yml

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -80,6 +80,11 @@ jobs:
8080
make install
8181
make test
8282
83+
- name: Test with pandas<3
84+
run: |
85+
uv pip install "pandas<3"
86+
make test-python
87+
8388
check-wheel-build:
8489
runs-on: ${{ matrix.os }}
8590
strategy:

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -29,7 +29,7 @@ dynamic = ["version"]
2929

3030
[project.optional-dependencies]
3131
pyarrow = ["pyarrow>=8.0.0"]
32-
pandas = ["pandas>=1.4.4,<3", "pyarrow>=8.0.0"]
32+
pandas = ["pandas>=1.4.4", "pyarrow>=8.0.0"]
3333
polars = ["polars>=1"]
3434

3535
[dependency-groups]

python/tests/test_column_selection.py

Lines changed: 8 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,7 @@
55
from typing import Any
66

77
import fastexcel
8+
import numpy as np
89
import pandas as pd
910
import polars as pl
1011
import pytest
@@ -1045,12 +1046,16 @@ def test_use_column_range_with_offset_with_sheet_and_specified_dtypes() -> None:
10451046
"__UNNAMED__2": pl.Series([None, None, None], dtype=pl.String),
10461047
"Column at K10": [7.0, 8.0, 9.0],
10471048
}
1049+
# In pandas 3, string columns use nan instead of None for missing values
1050+
pd_version = tuple(int(x) for x in pd.__version__.split(".")[:2])
1051+
na_value = np.nan if pd_version >= (3, 0) else None
1052+
10481053
expected_data_pandas = {
10491054
# Dtype should be int, looked up by index
10501055
"Column at H10": [1, 2, 3],
10511056
# Dtype should be string, looked up by name
10521057
"Column at I10": ["4", "5", "6"],
1053-
"__UNNAMED__2": [None, None, None],
1058+
"__UNNAMED__2": [na_value, na_value, na_value],
10541059
"Column at K10": [7.0, 8.0, 9.0],
10551060
}
10561061
expected_column_info = [
@@ -1101,7 +1106,7 @@ def test_use_column_range_with_offset_with_sheet_and_specified_dtypes() -> None:
11011106
pl_assert_frame_equal(pl_df_open_ended, expected_pl_df)
11021107

11031108
pd_df_closed = sheet_closed.to_pandas()
1104-
pd_assert_frame_equal(pd_df_closed, expected_pd_df)
1109+
pd_assert_frame_equal(pd_df_closed, expected_pd_df, check_dtype=False)
11051110

11061111
pd_df_open_ended = sheet_open_ended.to_pandas()
1107-
pd_assert_frame_equal(pd_df_open_ended, expected_pd_df)
1112+
pd_assert_frame_equal(pd_df_open_ended, expected_pd_df, check_dtype=False)

python/tests/test_dtypes.py

Lines changed: 26 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,7 @@
1212
from pandas.testing import assert_frame_equal as pd_assert_frame_equal
1313
from polars.testing import assert_frame_equal as pl_assert_frame_equal
1414

15-
from .utils import path_for_fixture
15+
from .utils import get_expected_pandas_dtype, path_for_fixture
1616

1717

1818
@pytest.fixture
@@ -123,33 +123,30 @@ def test_sheet_with_mixed_dtypes_and_sample_rows(expected_data: dict[str, list[A
123123

124124
@pytest.mark.parametrize("dtype_by_index", (True, False))
125125
@pytest.mark.parametrize(
126-
"dtype,expected_data,expected_pd_dtype,expected_pl_dtype",
126+
"dtype,expected_data,expected_pl_dtype",
127127
[
128-
("int", [123456, 44333, 44333, 87878, 87878], "int64", pl.Int64),
129-
("float", [123456.0, 44333.0, 44333.0, 87878.0, 87878.0], "float64", pl.Float64),
130-
("string", ["123456", "44333", "44333", "87878", "87878"], "object", pl.Utf8),
131-
("boolean", [True] * 5, "bool", pl.Boolean),
128+
("int", [123456, 44333, 44333, 87878, 87878], pl.Int64),
129+
("float", [123456.0, 44333.0, 44333.0, 87878.0, 87878.0], pl.Float64),
130+
("string", ["123456", "44333", "44333", "87878", "87878"], pl.Utf8),
131+
("boolean", [True] * 5, pl.Boolean),
132132
(
133133
"datetime",
134134
[datetime(2238, 1, 3)] + [datetime(2021, 5, 17)] * 2 + [datetime(2140, 8, 6)] * 2,
135-
"datetime64[ms]",
136135
pl.Datetime,
137136
),
138137
(
139138
"date",
140139
[date(2238, 1, 3)] + [date(2021, 5, 17)] * 2 + [date(2140, 8, 6)] * 2,
141-
"object",
142140
pl.Date,
143141
),
144142
# conversion to duration not supported yet
145-
("duration", [pd.NaT] * 5, "timedelta64[ms]", pl.Duration),
143+
("duration", [pd.NaT] * 5, pl.Duration),
146144
],
147145
)
148146
def test_sheet_with_mixed_dtypes_specify_dtypes(
149147
dtype_by_index: bool,
150148
dtype: fastexcel.DType,
151149
expected_data: list[Any],
152-
expected_pd_dtype: str,
153150
expected_pl_dtype: pl.DataType,
154151
) -> None:
155152
dtypes: fastexcel.DTypeMap = {0: dtype} if dtype_by_index else {"Employee ID": dtype}
@@ -158,6 +155,7 @@ def test_sheet_with_mixed_dtypes_specify_dtypes(
158155
assert sheet.specified_dtypes == dtypes
159156

160157
pd_df = sheet.to_pandas()
158+
expected_pd_dtype = get_expected_pandas_dtype(dtype)
161159
assert pd_df["Employee ID"].dtype == expected_pd_dtype
162160
assert pd_df["Employee ID"].to_list() == expected_data
163161

@@ -167,28 +165,29 @@ def test_sheet_with_mixed_dtypes_specify_dtypes(
167165

168166

169167
@pytest.mark.parametrize(
170-
"dtypes,expected,expected_pd_dtype,expected_pl_dtype",
168+
"dtypes,expected,fastexcel_dtype,expected_pl_dtype",
171169
[
172-
(None, datetime(2023, 7, 21), "datetime64[ms]", pl.Datetime),
173-
({"Date": "datetime"}, datetime(2023, 7, 21), "datetime64[ms]", pl.Datetime),
174-
({"Date": "date"}, date(2023, 7, 21), "object", pl.Date),
175-
({"Date": "string"}, "2023-07-21 00:00:00", "object", pl.Utf8),
176-
({2: "datetime"}, datetime(2023, 7, 21), "datetime64[ms]", pl.Datetime),
177-
({2: "date"}, date(2023, 7, 21), "object", pl.Date),
178-
({2: "string"}, "2023-07-21 00:00:00", "object", pl.Utf8),
170+
(None, datetime(2023, 7, 21), "datetime", pl.Datetime),
171+
({"Date": "datetime"}, datetime(2023, 7, 21), "datetime", pl.Datetime),
172+
({"Date": "date"}, date(2023, 7, 21), "date", pl.Date),
173+
({"Date": "string"}, "2023-07-21 00:00:00", "string", pl.Utf8),
174+
({2: "datetime"}, datetime(2023, 7, 21), "datetime", pl.Datetime),
175+
({2: "date"}, date(2023, 7, 21), "date", pl.Date),
176+
({2: "string"}, "2023-07-21 00:00:00", "string", pl.Utf8),
179177
],
180178
)
181179
def test_sheet_datetime_conversion(
182180
dtypes: fastexcel.DTypeMap | None,
183181
expected: Any,
184-
expected_pd_dtype: str,
182+
fastexcel_dtype: str,
185183
expected_pl_dtype: pl.DataType,
186184
) -> None:
187185
excel_reader = fastexcel.read_excel(path_for_fixture("fixture-multi-dtypes-columns.xlsx"))
188186

189187
sheet = excel_reader.load_sheet(0, dtypes=dtypes)
190188
assert sheet.specified_dtypes == dtypes
191189
pd_df = sheet.to_pandas()
190+
expected_pd_dtype = get_expected_pandas_dtype(fastexcel_dtype)
192191
assert pd_df["Date"].dtype == expected_pd_dtype
193192
assert pd_df["Date"].to_list() == [expected] * 9
194193

@@ -211,7 +210,8 @@ def test_dtype_coercion_behavior__coerce(
211210
rb = sheet_or_rb if eager else sheet_or_rb.to_arrow()
212211

213212
pd_df = rb.to_pandas()
214-
assert pd_df["Mixed dates"].dtype == "object"
213+
expected_pd_dtype = get_expected_pandas_dtype("string")
214+
assert pd_df["Mixed dates"].dtype == expected_pd_dtype
215215
assert pd_df["Mixed dates"].to_list() == ["2023-07-21 00:00:00"] * 6 + ["July 23rd"] * 3
216216

217217
pl_df = pl.from_arrow(data=rb)
@@ -487,7 +487,12 @@ def test_to_arrow_with_errors(
487487
rb, cell_errors = excel_reader.load_sheet(0, dtypes={"Column": dtype}).to_arrow_with_errors()
488488

489489
pd_df = rb.to_pandas()
490-
assert pd_df["Column"].replace(np.nan, None).to_list() == expected_data
490+
# For string columns in pandas 3, replace pd.NA with None for comparison
491+
if dtype == "string":
492+
column_values = pd_df["Column"].replace([np.nan, pd.NA], None).to_list()
493+
else:
494+
column_values = pd_df["Column"].replace(np.nan, None).to_list()
495+
assert column_values == expected_data
491496

492497
def item_to_polars(item: Any):
493498
if isinstance(item, pd.Timestamp):

python/tests/test_durations.py

Lines changed: 5 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -14,7 +14,7 @@
1414
from polars.datatypes import Utf8 as PlUtf8
1515
from polars.testing import assert_frame_equal as pl_assert_frame_equal
1616

17-
from .utils import path_for_fixture
17+
from .utils import get_expected_pandas_dtype, path_for_fixture
1818

1919

2020
def test_sheet_with_different_time_types() -> None:
@@ -25,13 +25,10 @@ def test_sheet_with_different_time_types() -> None:
2525
pl_df = sheet.to_polars()
2626

2727
## dtypes
28-
assert pd_df.dtypes.to_dict() == {
29-
# the dtype for a date is object
30-
"date": np.dtype("object"),
31-
"datestr": np.dtype("object"),
32-
"time": np.dtype("timedelta64[ms]"),
33-
"datetime": np.dtype("datetime64[ms]"),
34-
}
28+
assert pd_df["date"].dtype == np.dtype("object")
29+
assert pd_df["datestr"].dtype == get_expected_pandas_dtype("string")
30+
assert pd_df["time"].dtype == np.dtype("timedelta64[ms]")
31+
assert pd_df["datetime"].dtype == np.dtype("datetime64[ms]")
3532
expected_pl_dtypes: dict[str, PolarsDataType] = {
3633
"date": PlDate(),
3734
"datestr": PlUtf8(),

python/tests/utils.py

Lines changed: 54 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,59 @@
1+
from __future__ import annotations
2+
13
from pathlib import Path
4+
from typing import Any
5+
6+
import numpy as np
7+
import pandas as pd
28

39

410
def path_for_fixture(fixture_file: str) -> str:
    """Return the path of a fixture file as a string.

    The path is built relative to this module: three directory levels above
    this file, then ``tests/fixtures/<fixture_file>``.

    Args:
        fixture_file: File name of the fixture (e.g. ``"my-fixture.xlsx"``).

    Returns:
        The fixture path rendered with ``str()``. No check is made that the
        file actually exists.
    """
    base_dir = Path(__file__).parent.parent.parent
    return str(base_dir / "tests" / "fixtures" / fixture_file)
12+
13+
14+
def _pandas_major_minor() -> tuple[int, int]:
    """Return the installed pandas version as a ``(major, minor)`` tuple.

    Only the leading digits of each component are used, so a suffixed
    component such as ``"0rc1"`` parses as ``0`` instead of raising. A missing
    minor component is padded with ``0`` so that comparisons against ``(3, 0)``
    are not skewed by tuple length (``(3,) < (3, 0)``).
    """
    numbers: list[int] = []
    for component in pd.__version__.split(".")[:2]:
        digits = ""
        for char in component:
            if not char.isdigit():
                break
            digits += char
        numbers.append(int(digits) if digits else 0)
    while len(numbers) < 2:
        numbers.append(0)
    return numbers[0], numbers[1]


def get_expected_pandas_dtype(fastexcel_dtype: str) -> Any:
    """Get the expected pandas dtype for a given fastexcel dtype, accounting for pandas version.

    In pandas < 3.0, string columns use object dtype.
    In pandas >= 3.0, string columns use StringDtype (with na_value=nan when from Arrow).

    Args:
        fastexcel_dtype: One of ``"int"``, ``"float"``, ``"boolean"``,
            ``"datetime"``, ``"duration"``, ``"date"`` or ``"string"``.

    Returns:
        A numpy dtype, or a ``pd.StringDtype`` for ``"string"`` on pandas >= 3.0.

    Raises:
        ValueError: If ``fastexcel_dtype`` is not one of the names listed above.
    """
    # Dtypes that do not depend on the installed pandas version.
    version_independent = {
        "int": np.dtype("int64"),
        "float": np.dtype("float64"),
        "boolean": np.dtype("bool"),
        "datetime": np.dtype("datetime64[ms]"),
        "duration": np.dtype("timedelta64[ms]"),
        # Date columns are always object dtype
        "date": np.dtype("object"),
    }
    if fastexcel_dtype in version_independent:
        return version_independent[fastexcel_dtype]

    if fastexcel_dtype == "string":
        if _pandas_major_minor() >= (3, 0):
            # When converting from Arrow, pandas uses nan as na_value
            return pd.StringDtype(na_value=np.nan)
        return np.dtype("object")

    raise ValueError(f"Unknown fastexcel dtype: {fastexcel_dtype}")
45+
46+
47+
def assert_pandas_dtypes(df: pd.DataFrame, expected_dtypes: dict[str, str]) -> None:
    """Assert that a pandas DataFrame has the expected dtypes for each column.

    Every listed column is checked before asserting, so a failure reports all
    mismatching columns at once rather than only the first one.

    Args:
        df: The pandas DataFrame to check
        expected_dtypes: A dict mapping column names to fastexcel dtype strings

    Raises:
        AssertionError: If at least one column's dtype differs from the dtype
            returned by ``get_expected_pandas_dtype`` for its fastexcel dtype.
    """
    mismatches = []
    for col_name, fastexcel_dtype in expected_dtypes.items():
        expected_dtype = get_expected_pandas_dtype(fastexcel_dtype)
        actual_dtype = df[col_name].dtype
        # Negate ``==`` (as the original assertion did) rather than using ``!=``.
        if not (actual_dtype == expected_dtype):
            mismatches.append(
                f"Column '{col_name}': expected dtype {expected_dtype}, got {actual_dtype}"
            )
    assert not mismatches, "\n".join(mismatches)

0 commit comments

Comments
 (0)