Skip to content

Commit 9254a10

Browse files
committed
BUG(string dtype): groupby/resampler.min/max returns float on all NA strings
1 parent f46d853 commit 9254a10

File tree

8 files changed

+168
-19
lines changed

8 files changed

+168
-19
lines changed

doc/source/whatsnew/v2.3.0.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,12 +118,14 @@ Conversion
118118

119119
Strings
120120
^^^^^^^
121+
- Bug in :meth:`DataFrame.sum` with ``axis=1``, :meth:`.DataFrameGroupBy.sum` or :meth:`.SeriesGroupBy.sum` with ``skipna=True``, and :meth:`.Resampler.sum` on :class:`StringDtype` with all NA values resulted in ``0`` and is now the empty string ``""`` (:issue:`60229`)
121122
- Bug in :meth:`Series.__pos__` and :meth:`DataFrame.__pos__` did not raise for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`60710`)
122123
- Bug in :meth:`Series.rank` for :class:`StringDtype` with ``storage="pyarrow"`` incorrectly returning integer results in case of ``method="average"`` and raising an error if it would truncate results (:issue:`59768`)
123124
- Bug in :meth:`Series.replace` with :class:`StringDtype` when replacing with a non-string value was not upcasting to ``object`` dtype (:issue:`60282`)
124125
- Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`59628`)
125126
- Bug in ``ser.str.slice`` with negative ``step`` with :class:`ArrowDtype` and :class:`StringDtype` with ``storage="pyarrow"`` giving incorrect results (:issue:`59710`)
126127
- Bug in the ``center`` method on :class:`Series` and :class:`Index` object ``str`` accessors with pyarrow-backed dtype not matching the python behavior in corner cases with an odd number of fill characters (:issue:`54792`)
128+
- Bug in :meth:`.DataFrameGroupBy.min`, :meth:`.DataFrameGroupBy.max`, :meth:`.Resampler.min`, :meth:`.Resampler.max` where all NA values of :class:`StringDtype` would return float dtype instead of string dtype
127129

128130
Interval
129131
^^^^^^^^

pandas/core/arrays/base.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2628,7 +2628,15 @@ def _groupby_op(
26282628
if op.how not in ["any", "all"]:
26292629
# Fail early to avoid conversion to object
26302630
op._get_cython_function(op.kind, op.how, np.dtype(object), False)
2631-
npvalues = self.to_numpy(object, na_value=np.nan)
2631+
2632+
arr = self
2633+
if op.how == "sum":
2634+
# https://github.com/pandas-dev/pandas/issues/60229
2635+
# All NA should result in the empty string.
2636+
assert "skipna" in kwargs
2637+
if kwargs["skipna"] and min_count == 0:
2638+
arr = arr.fillna("")
2639+
npvalues = arr.to_numpy(object, na_value=np.nan)
26322640
else:
26332641
raise NotImplementedError(
26342642
f"function is not implemented for this dtype: {self.dtype}"

pandas/core/groupby/groupby.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,7 @@ class providing the base-class of operations.
8181
is_numeric_dtype,
8282
is_object_dtype,
8383
is_scalar,
84+
is_string_dtype,
8485
needs_i8_conversion,
8586
pandas_dtype,
8687
)
@@ -1725,6 +1726,10 @@ def _agg_py_fallback(
17251726

17261727
if ser.dtype == object:
17271728
res_values = res_values.astype(object, copy=False)
1729+
elif is_string_dtype(ser.dtype) and how in ["min", "max"]:
1730+
dtype = ser.dtype
1731+
string_array_cls = dtype.construct_array_type()
1732+
res_values = string_array_cls._from_sequence(res_values, dtype=dtype)
17281733

17291734
# If we are DataFrameGroupBy and went through a SeriesGroupByPath
17301735
# then we need to reshape

pandas/core/series.py

Lines changed: 4 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -4651,7 +4651,7 @@ def rename(
46514651
inplace: Literal[True],
46524652
level: Level | None = ...,
46534653
errors: IgnoreRaise = ...,
4654-
) -> None: ...
4654+
) -> Series | None: ...
46554655

46564656
@overload
46574657
def rename(
@@ -4665,18 +4665,6 @@ def rename(
46654665
errors: IgnoreRaise = ...,
46664666
) -> Series: ...
46674667

4668-
@overload
4669-
def rename(
4670-
self,
4671-
index: Renamer | Hashable | None = ...,
4672-
*,
4673-
axis: Axis | None = ...,
4674-
copy: bool | lib.NoDefault = ...,
4675-
inplace: bool = ...,
4676-
level: Level | None = ...,
4677-
errors: IgnoreRaise = ...,
4678-
) -> Series | None: ...
4679-
46804668
def rename(
46814669
self,
46824670
index: Renamer | Hashable | None = None,
@@ -4734,8 +4722,9 @@ def rename(
47344722
47354723
Returns
47364724
-------
4737-
Series or None
4738-
Series with index labels or name altered or None if ``inplace=True``.
4725+
Series
4726+
A shallow copy with index labels or name altered. If ``inplace=True``,
4727+
the same object is returned when ``index`` is not a dict or callable; otherwise None.
47394728
47404729
See Also
47414730
--------

pandas/tests/frame/test_reductions.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -835,6 +835,16 @@ def test_axis_1_empty(self, all_reductions, index):
835835
expected = Series([], index=index, dtype=expected_dtype)
836836
tm.assert_series_equal(result, expected)
837837

838+
@pytest.mark.parametrize("min_count", [0, 1])
839+
def test_axis_1_sum_na(self, string_dtype_no_object, skipna, min_count):
840+
# https://github.com/pandas-dev/pandas/issues/60229
841+
dtype = string_dtype_no_object
842+
df = DataFrame({"a": [pd.NA]}, dtype=dtype)
843+
result = df.sum(axis=1, skipna=skipna, min_count=min_count)
844+
value = "" if skipna and min_count == 0 else pd.NA
845+
expected = Series([value], dtype=dtype)
846+
tm.assert_series_equal(result, expected)
847+
838848
@pytest.mark.parametrize("method, unit", [("sum", 0), ("prod", 1)])
839849
@pytest.mark.parametrize("numeric_only", [None, True, False])
840850
def test_sum_prod_nanops(self, method, unit, numeric_only):

pandas/tests/groupby/test_reductions.py

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
isna,
2121
)
2222
import pandas._testing as tm
23+
from pandas.tests.groupby import get_groupby_method_args
2324
from pandas.util import _test_decorators as td
2425

2526

@@ -955,6 +956,98 @@ def test_min_empty_string_dtype(func, string_dtype_no_object):
955956
tm.assert_frame_equal(result, expected)
956957

957958

959+
@pytest.mark.parametrize("min_count", [0, 1])
960+
@pytest.mark.parametrize("test_series", [True, False])
961+
def test_string_dtype_all_na(
962+
string_dtype_no_object, reduction_func, skipna, min_count, test_series
963+
):
964+
# https://github.com/pandas-dev/pandas/issues/60229
965+
if reduction_func == "corrwith":
966+
# corrwith is deprecated.
967+
return
968+
969+
dtype = string_dtype_no_object
970+
971+
if reduction_func in [
972+
"any",
973+
"all",
974+
"idxmin",
975+
"idxmax",
976+
"mean",
977+
"median",
978+
"std",
979+
"var",
980+
]:
981+
kwargs = {"skipna": skipna}
982+
elif reduction_func in ["kurt"]:
983+
kwargs = {"min_count": min_count}
984+
elif reduction_func in ["count", "nunique", "quantile", "sem", "size"]:
985+
kwargs = {}
986+
else:
987+
kwargs = {"skipna": skipna, "min_count": min_count}
988+
989+
expected_dtype, expected_value = dtype, pd.NA
990+
if reduction_func in ["all", "any"]:
991+
expected_dtype = "bool"
992+
# TODO: For skipna=False, bool(pd.NA) raises; should groupby?
993+
expected_value = not skipna if reduction_func == "any" else True
994+
elif reduction_func in ["count", "nunique", "size"]:
995+
# TODO: Should be more consistent - return Int64 when dtype.na_value is pd.NA?
996+
if (
997+
test_series
998+
and reduction_func == "size"
999+
and dtype.storage == "pyarrow"
1000+
and dtype.na_value is pd.NA
1001+
):
1002+
expected_dtype = "Int64"
1003+
else:
1004+
expected_dtype = "int64"
1005+
expected_value = 1 if reduction_func == "size" else 0
1006+
elif reduction_func in ["idxmin", "idxmax"]:
1007+
expected_dtype, expected_value = "float64", np.nan
1008+
elif not skipna or min_count > 0:
1009+
expected_value = pd.NA
1010+
elif reduction_func == "sum":
1011+
# https://github.com/pandas-dev/pandas/pull/60936
1012+
expected_value = ""
1013+
1014+
df = DataFrame({"a": ["x"], "b": [pd.NA]}, dtype=dtype)
1015+
obj = df["b"] if test_series else df
1016+
args = get_groupby_method_args(reduction_func, obj)
1017+
gb = obj.groupby(df["a"])
1018+
method = getattr(gb, reduction_func)
1019+
1020+
if reduction_func in [
1021+
"mean",
1022+
"median",
1023+
"kurt",
1024+
"prod",
1025+
"quantile",
1026+
"sem",
1027+
"skew",
1028+
"std",
1029+
"var",
1030+
]:
1031+
msg = f"dtype '{dtype}' does not support operation '{reduction_func}'"
1032+
with pytest.raises(TypeError, match=msg):
1033+
method(*args, **kwargs)
1034+
return
1035+
elif reduction_func in ["idxmin", "idxmax"] and not skipna:
1036+
msg = f"{reduction_func} with skipna=False encountered an NA value."
1037+
with pytest.raises(ValueError, match=msg):
1038+
method(*args, **kwargs)
1039+
return
1040+
1041+
result = method(*args, **kwargs)
1042+
index = pd.Index(["x"], name="a", dtype=dtype)
1043+
if test_series or reduction_func == "size":
1044+
name = None if not test_series and reduction_func == "size" else "b"
1045+
expected = Series(expected_value, index=index, dtype=expected_dtype, name=name)
1046+
else:
1047+
expected = DataFrame({"b": expected_value}, index=index, dtype=expected_dtype)
1048+
tm.assert_equal(result, expected)
1049+
1050+
9581051
def test_max_nan_bug():
9591052
df = DataFrame(
9601053
{

pandas/tests/resample/test_base.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,31 @@ def test_resample_empty_series(freq, index, resample_method):
223223
assert result.index.freq == expected.index.freq
224224

225225

226+
@pytest.mark.parametrize("min_count", [0, 1])
227+
def test_resample_empty_sum_string(string_dtype_no_object, min_count):
228+
# https://github.com/pandas-dev/pandas/issues/60229
229+
dtype = string_dtype_no_object
230+
ser = Series(
231+
pd.NA,
232+
index=DatetimeIndex(
233+
[
234+
"2000-01-01 00:00:00",
235+
"2000-01-01 00:00:10",
236+
"2000-01-01 00:00:20",
237+
"2000-01-01 00:00:30",
238+
]
239+
),
240+
dtype=dtype,
241+
)
242+
rs = ser.resample("20s")
243+
result = rs.sum(min_count=min_count)
244+
245+
value = "" if min_count == 0 else pd.NA
246+
index = date_range(start="2000-01-01", freq="20s", periods=2, unit="s")
247+
expected = Series(value, index=index, dtype=dtype)
248+
tm.assert_series_equal(result, expected)
249+
250+
226251
@pytest.mark.parametrize(
227252
"freq",
228253
[

pandas/tests/resample/test_resampler_grouper.py

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,6 @@
33
import numpy as np
44
import pytest
55

6-
from pandas._config import using_string_dtype
7-
86
from pandas.compat import is_platform_windows
97

108
import pandas as pd
@@ -462,7 +460,6 @@ def test_empty(keys):
462460
tm.assert_frame_equal(result, expected)
463461

464462

465-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
466463
@pytest.mark.parametrize("consolidate", [True, False])
467464
def test_resample_groupby_agg_object_dtype_all_nan(consolidate):
468465
# https://github.com/pandas-dev/pandas/issues/39329
@@ -494,6 +491,26 @@ def test_resample_groupby_agg_object_dtype_all_nan(consolidate):
494491
tm.assert_frame_equal(result, expected)
495492

496493

494+
@pytest.mark.parametrize("min_count", [0, 1])
495+
def test_groupby_resample_empty_sum_string(
496+
string_dtype_no_object, test_frame, min_count
497+
):
498+
# https://github.com/pandas-dev/pandas/issues/60229
499+
dtype = string_dtype_no_object
500+
test_frame = test_frame.assign(B=pd.array([pd.NA] * len(test_frame), dtype=dtype))
501+
gbrs = test_frame.groupby("A").resample("40s")
502+
result = gbrs.sum(min_count=min_count)
503+
504+
index = pd.MultiIndex(
505+
levels=[[1, 2, 3], [pd.to_datetime("2000-01-01", unit="ns")]],
506+
codes=[[0, 1, 2], [0, 0, 0]],
507+
names=["A", None],
508+
)
509+
value = "" if min_count == 0 else pd.NA
510+
expected = DataFrame({"B": value}, index=index, dtype=dtype)
511+
tm.assert_frame_equal(result, expected)
512+
513+
497514
def test_groupby_resample_with_list_of_keys():
498515
# GH 47362
499516
df = DataFrame(

0 commit comments

Comments
 (0)