Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .github/workflows/CI.yml
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,11 @@ jobs:
make install
make test
- name: Test with pandas<3
run: |
uv pip install "pandas<3"
make test-python
check-wheel-build:
runs-on: ${{ matrix.os }}
strategy:
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ dynamic = ["version"]

[project.optional-dependencies]
pyarrow = ["pyarrow>=8.0.0"]
pandas = ["pandas>=1.4.4,<3", "pyarrow>=8.0.0"]
pandas = ["pandas>=1.4.4", "pyarrow>=8.0.0"]
polars = ["polars>=1"]

[dependency-groups]
Expand Down
11 changes: 8 additions & 3 deletions python/tests/test_column_selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from typing import Any

import fastexcel
import numpy as np
import pandas as pd
import polars as pl
import pytest
Expand Down Expand Up @@ -1045,12 +1046,16 @@ def test_use_column_range_with_offset_with_sheet_and_specified_dtypes() -> None:
"__UNNAMED__2": pl.Series([None, None, None], dtype=pl.String),
"Column at K10": [7.0, 8.0, 9.0],
}
# In pandas 3, string columns use nan instead of None for missing values
pd_version = tuple(int(x) for x in pd.__version__.split(".")[:2])
na_value = np.nan if pd_version >= (3, 0) else None

expected_data_pandas = {
# Dtype should be int, looked up by index
"Column at H10": [1, 2, 3],
# Dtype should be string, looked up by name
"Column at I10": ["4", "5", "6"],
"__UNNAMED__2": [None, None, None],
"__UNNAMED__2": [na_value, na_value, na_value],
"Column at K10": [7.0, 8.0, 9.0],
}
expected_column_info = [
Expand Down Expand Up @@ -1101,7 +1106,7 @@ def test_use_column_range_with_offset_with_sheet_and_specified_dtypes() -> None:
pl_assert_frame_equal(pl_df_open_ended, expected_pl_df)

pd_df_closed = sheet_closed.to_pandas()
pd_assert_frame_equal(pd_df_closed, expected_pd_df)
pd_assert_frame_equal(pd_df_closed, expected_pd_df, check_dtype=False)

pd_df_open_ended = sheet_open_ended.to_pandas()
pd_assert_frame_equal(pd_df_open_ended, expected_pd_df)
pd_assert_frame_equal(pd_df_open_ended, expected_pd_df, check_dtype=False)
47 changes: 26 additions & 21 deletions python/tests/test_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from pandas.testing import assert_frame_equal as pd_assert_frame_equal
from polars.testing import assert_frame_equal as pl_assert_frame_equal

from .utils import path_for_fixture
from .utils import get_expected_pandas_dtype, path_for_fixture


@pytest.fixture
Expand Down Expand Up @@ -123,33 +123,30 @@ def test_sheet_with_mixed_dtypes_and_sample_rows(expected_data: dict[str, list[A

@pytest.mark.parametrize("dtype_by_index", (True, False))
@pytest.mark.parametrize(
"dtype,expected_data,expected_pd_dtype,expected_pl_dtype",
"dtype,expected_data,expected_pl_dtype",
[
("int", [123456, 44333, 44333, 87878, 87878], "int64", pl.Int64),
("float", [123456.0, 44333.0, 44333.0, 87878.0, 87878.0], "float64", pl.Float64),
("string", ["123456", "44333", "44333", "87878", "87878"], "object", pl.Utf8),
("boolean", [True] * 5, "bool", pl.Boolean),
("int", [123456, 44333, 44333, 87878, 87878], pl.Int64),
("float", [123456.0, 44333.0, 44333.0, 87878.0, 87878.0], pl.Float64),
("string", ["123456", "44333", "44333", "87878", "87878"], pl.Utf8),
("boolean", [True] * 5, pl.Boolean),
(
"datetime",
[datetime(2238, 1, 3)] + [datetime(2021, 5, 17)] * 2 + [datetime(2140, 8, 6)] * 2,
"datetime64[ms]",
pl.Datetime,
),
(
"date",
[date(2238, 1, 3)] + [date(2021, 5, 17)] * 2 + [date(2140, 8, 6)] * 2,
"object",
pl.Date,
),
# conversion to duration not supported yet
("duration", [pd.NaT] * 5, "timedelta64[ms]", pl.Duration),
("duration", [pd.NaT] * 5, pl.Duration),
],
)
def test_sheet_with_mixed_dtypes_specify_dtypes(
dtype_by_index: bool,
dtype: fastexcel.DType,
expected_data: list[Any],
expected_pd_dtype: str,
expected_pl_dtype: pl.DataType,
) -> None:
dtypes: fastexcel.DTypeMap = {0: dtype} if dtype_by_index else {"Employee ID": dtype}
Expand All @@ -158,6 +155,7 @@ def test_sheet_with_mixed_dtypes_specify_dtypes(
assert sheet.specified_dtypes == dtypes

pd_df = sheet.to_pandas()
expected_pd_dtype = get_expected_pandas_dtype(dtype)
assert pd_df["Employee ID"].dtype == expected_pd_dtype
assert pd_df["Employee ID"].to_list() == expected_data

Expand All @@ -167,28 +165,29 @@ def test_sheet_with_mixed_dtypes_specify_dtypes(


@pytest.mark.parametrize(
"dtypes,expected,expected_pd_dtype,expected_pl_dtype",
"dtypes,expected,fastexcel_dtype,expected_pl_dtype",
[
(None, datetime(2023, 7, 21), "datetime64[ms]", pl.Datetime),
({"Date": "datetime"}, datetime(2023, 7, 21), "datetime64[ms]", pl.Datetime),
({"Date": "date"}, date(2023, 7, 21), "object", pl.Date),
({"Date": "string"}, "2023-07-21 00:00:00", "object", pl.Utf8),
({2: "datetime"}, datetime(2023, 7, 21), "datetime64[ms]", pl.Datetime),
({2: "date"}, date(2023, 7, 21), "object", pl.Date),
({2: "string"}, "2023-07-21 00:00:00", "object", pl.Utf8),
(None, datetime(2023, 7, 21), "datetime", pl.Datetime),
({"Date": "datetime"}, datetime(2023, 7, 21), "datetime", pl.Datetime),
({"Date": "date"}, date(2023, 7, 21), "date", pl.Date),
({"Date": "string"}, "2023-07-21 00:00:00", "string", pl.Utf8),
({2: "datetime"}, datetime(2023, 7, 21), "datetime", pl.Datetime),
({2: "date"}, date(2023, 7, 21), "date", pl.Date),
({2: "string"}, "2023-07-21 00:00:00", "string", pl.Utf8),
],
)
def test_sheet_datetime_conversion(
dtypes: fastexcel.DTypeMap | None,
expected: Any,
expected_pd_dtype: str,
fastexcel_dtype: str,
expected_pl_dtype: pl.DataType,
) -> None:
excel_reader = fastexcel.read_excel(path_for_fixture("fixture-multi-dtypes-columns.xlsx"))

sheet = excel_reader.load_sheet(0, dtypes=dtypes)
assert sheet.specified_dtypes == dtypes
pd_df = sheet.to_pandas()
expected_pd_dtype = get_expected_pandas_dtype(fastexcel_dtype)
assert pd_df["Date"].dtype == expected_pd_dtype
assert pd_df["Date"].to_list() == [expected] * 9

Expand All @@ -211,7 +210,8 @@ def test_dtype_coercion_behavior__coerce(
rb = sheet_or_rb if eager else sheet_or_rb.to_arrow()

pd_df = rb.to_pandas()
assert pd_df["Mixed dates"].dtype == "object"
expected_pd_dtype = get_expected_pandas_dtype("string")
assert pd_df["Mixed dates"].dtype == expected_pd_dtype
assert pd_df["Mixed dates"].to_list() == ["2023-07-21 00:00:00"] * 6 + ["July 23rd"] * 3

pl_df = pl.from_arrow(data=rb)
Expand Down Expand Up @@ -487,7 +487,12 @@ def test_to_arrow_with_errors(
rb, cell_errors = excel_reader.load_sheet(0, dtypes={"Column": dtype}).to_arrow_with_errors()

pd_df = rb.to_pandas()
assert pd_df["Column"].replace(np.nan, None).to_list() == expected_data
# For string columns in pandas 3, replace pd.NA with None for comparison
if dtype == "string":
column_values = pd_df["Column"].replace([np.nan, pd.NA], None).to_list()
else:
column_values = pd_df["Column"].replace(np.nan, None).to_list()
assert column_values == expected_data

def item_to_polars(item: Any):
if isinstance(item, pd.Timestamp):
Expand Down
13 changes: 5 additions & 8 deletions python/tests/test_durations.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from polars.datatypes import Utf8 as PlUtf8
from polars.testing import assert_frame_equal as pl_assert_frame_equal

from .utils import path_for_fixture
from .utils import get_expected_pandas_dtype, path_for_fixture


def test_sheet_with_different_time_types() -> None:
Expand All @@ -25,13 +25,10 @@ def test_sheet_with_different_time_types() -> None:
pl_df = sheet.to_polars()

## dtypes
assert pd_df.dtypes.to_dict() == {
# the dtype for a date is object
"date": np.dtype("object"),
"datestr": np.dtype("object"),
"time": np.dtype("timedelta64[ms]"),
"datetime": np.dtype("datetime64[ms]"),
}
assert pd_df["date"].dtype == np.dtype("object")
assert pd_df["datestr"].dtype == get_expected_pandas_dtype("string")
assert pd_df["time"].dtype == np.dtype("timedelta64[ms]")
assert pd_df["datetime"].dtype == np.dtype("datetime64[ms]")
expected_pl_dtypes: dict[str, PolarsDataType] = {
"date": PlDate(),
"datestr": PlUtf8(),
Expand Down
54 changes: 54 additions & 0 deletions python/tests/utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,59 @@
from __future__ import annotations

from pathlib import Path
from typing import Any

import numpy as np
import pandas as pd


def path_for_fixture(fixture_file: str) -> str:
    """Return the path of ``fixture_file`` inside the ``tests/fixtures`` directory.

    The directory is resolved relative to this module: three levels up from
    the file, then ``tests/fixtures``.

    Args:
        fixture_file: File name of the fixture.

    Returns:
        The fixture path as a plain string.
    """
    base_dir = Path(__file__).parent.parent.parent
    fixtures_dir = base_dir / "tests" / "fixtures"
    return str(fixtures_dir / fixture_file)


def get_expected_pandas_dtype(fastexcel_dtype: str) -> Any:
    """Get the expected pandas dtype for a fastexcel dtype, accounting for pandas version.

    In pandas < 3.0, string columns use object dtype.
    In pandas >= 3.0, string columns use StringDtype (with na_value=nan when
    converted from Arrow).
    Date columns are always object dtype.

    Args:
        fastexcel_dtype: One of "int", "float", "boolean", "datetime",
            "duration", "string" or "date".

    Returns:
        The numpy dtype, or the pandas ``StringDtype``, to compare a column's
        dtype against.

    Raises:
        ValueError: If ``fastexcel_dtype`` is not one of the names above.
    """
    if fastexcel_dtype == "string":
        # Only this branch depends on the installed pandas, so the version is
        # parsed here rather than on every call. The major component alone is
        # enough to decide ">= 3.0".
        pandas_major = int(pd.__version__.split(".", 1)[0])
        if pandas_major >= 3:
            # When converting from Arrow, pandas uses nan as na_value
            return pd.StringDtype(na_value=np.nan)
        return np.dtype("object")

    # Every other supported dtype maps to the same numpy dtype on all pandas versions.
    numpy_dtype_names = {
        "int": "int64",
        "float": "float64",
        "boolean": "bool",
        "datetime": "datetime64[ms]",
        "duration": "timedelta64[ms]",
        "date": "object",
    }
    dtype_name = numpy_dtype_names.get(fastexcel_dtype)
    if dtype_name is None:
        raise ValueError(f"Unknown fastexcel dtype: {fastexcel_dtype}")
    return np.dtype(dtype_name)


def assert_pandas_dtypes(df: pd.DataFrame, expected_dtypes: dict[str, str]) -> None:
    """Check that each listed column of ``df`` has the expected pandas dtype.

    Args:
        df: The pandas DataFrame to check.
        expected_dtypes: Mapping of column name to fastexcel dtype string.
            Each string is translated with ``get_expected_pandas_dtype``
            before comparison.

    Raises:
        AssertionError: On the first column whose dtype differs from the
            expected one.
    """
    for column, dtype_name in expected_dtypes.items():
        wanted = get_expected_pandas_dtype(dtype_name)
        found = df[column].dtype
        assert found == wanted, f"Column '{column}': expected dtype {wanted}, got {found}"
Loading
Loading