BUG(string dtype): groupby/resampler.min/max returns float on all NA strings

rhshadrach · rhshadrach · commit 9254a10bb7b1 · 2025-02-22T10:29:14.000-05:00
diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst
@@ -118,12 +118,14 @@ Conversion
 
 Strings
 ^^^^^^^
+- Bug in :meth:`DataFrame.sum` with ``axis=1``, :meth:`.DataFrameGroupBy.sum` or :meth:`.SeriesGroupBy.sum` with ``skipna=True``, and :meth:`.Resampler.sum` on :class:`StringDtype` with all NA values resulted in ``0`` and is now the empty string ``""`` (:issue:`60229`)
 - Bug in :meth:`Series.__pos__` and :meth:`DataFrame.__pos__` did not raise for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`60710`)
 - Bug in :meth:`Series.rank` for :class:`StringDtype` with ``storage="pyarrow"`` incorrectly returning integer results in case of ``method="average"`` and raising an error if it would truncate results (:issue:`59768`)
 - Bug in :meth:`Series.replace` with :class:`StringDtype` when replacing with a non-string value was not upcasting to ``object`` dtype (:issue:`60282`)
 - Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`59628`)
 - Bug in ``ser.str.slice`` with negative ``step`` with :class:`ArrowDtype` and :class:`StringDtype` with ``storage="pyarrow"`` giving incorrect results (:issue:`59710`)
 - Bug in the ``center`` method on :class:`Series` and :class:`Index` object ``str`` accessors with pyarrow-backed dtype not matching the python behavior in corner cases with an odd number of fill characters (:issue:`54792`)
+- Bug in :meth:`.DataFrameGroupBy.min`, :meth:`.DataFrameGroupBy.max`, :meth:`.Resampler.min`, :meth:`.Resampler.max`,
 
 Interval
 ^^^^^^^^
diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
@@ -2628,7 +2628,15 @@ def _groupby_op(
             if op.how not in ["any", "all"]:
                 # Fail early to avoid conversion to object
                 op._get_cython_function(op.kind, op.how, np.dtype(object), False)
-            npvalues = self.to_numpy(object, na_value=np.nan)
+
+            arr = self
+            if op.how == "sum":
+                # https://github.com/pandas-dev/pandas/issues/60229
+                # All NA should result in the empty string.
+                assert "skipna" in kwargs
+                if kwargs["skipna"] and min_count == 0:
+                    arr = arr.fillna("")
+            npvalues = arr.to_numpy(object, na_value=np.nan)
         else:
             raise NotImplementedError(
                 f"function is not implemented for this dtype: {self.dtype}"
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
@@ -81,6 +81,7 @@ class providing the base-class of operations.
     is_numeric_dtype,
     is_object_dtype,
     is_scalar,
+    is_string_dtype,
     needs_i8_conversion,
     pandas_dtype,
 )
@@ -1725,6 +1726,10 @@ def _agg_py_fallback(
 
         if ser.dtype == object:
             res_values = res_values.astype(object, copy=False)
+        elif is_string_dtype(ser.dtype) and how in ["min", "max"]:
+            dtype = ser.dtype
+            string_array_cls = dtype.construct_array_type()
+            res_values = string_array_cls._from_sequence(res_values, dtype=dtype)
 
         # If we are DataFrameGroupBy and went through a SeriesGroupByPath
         # then we need to reshape
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -4651,7 +4651,7 @@ def rename(
         inplace: Literal[True],
         level: Level | None = ...,
         errors: IgnoreRaise = ...,
-    ) -> None: ...
+    ) -> Series | None: ...
 
     @overload
     def rename(
@@ -4665,18 +4665,6 @@ def rename(
         errors: IgnoreRaise = ...,
     ) -> Series: ...
 
-    @overload
-    def rename(
-        self,
-        index: Renamer | Hashable | None = ...,
-        *,
-        axis: Axis | None = ...,
-        copy: bool | lib.NoDefault = ...,
-        inplace: bool = ...,
-        level: Level | None = ...,
-        errors: IgnoreRaise = ...,
-    ) -> Series | None: ...
-
     def rename(
         self,
         index: Renamer | Hashable | None = None,
@@ -4734,8 +4722,9 @@ def rename(
 
         Returns
         -------
-        Series or None
-            Series with index labels or name altered or None if ``inplace=True``.
+        Series
+            A shallow copy with index labels or name altered, or the same object
+            if ``inplace=True`` and index is not a dict or callable else None.
 
         See Also
         --------
diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py
@@ -835,6 +835,16 @@ def test_axis_1_empty(self, all_reductions, index):
         expected = Series([], index=index, dtype=expected_dtype)
         tm.assert_series_equal(result, expected)
 
+    @pytest.mark.parametrize("min_count", [0, 1])
+    def test_axis_1_sum_na(self, string_dtype_no_object, skipna, min_count):
+        # https://github.com/pandas-dev/pandas/issues/60229
+        dtype = string_dtype_no_object
+        df = DataFrame({"a": [pd.NA]}, dtype=dtype)
+        result = df.sum(axis=1, skipna=skipna, min_count=min_count)
+        value = "" if skipna and min_count == 0 else pd.NA
+        expected = Series([value], dtype=dtype)
+        tm.assert_series_equal(result, expected)
+
     @pytest.mark.parametrize("method, unit", [("sum", 0), ("prod", 1)])
     @pytest.mark.parametrize("numeric_only", [None, True, False])
     def test_sum_prod_nanops(self, method, unit, numeric_only):
diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py
@@ -20,6 +20,7 @@
     isna,
 )
 import pandas._testing as tm
+from pandas.tests.groupby import get_groupby_method_args
 from pandas.util import _test_decorators as td
 
 
@@ -955,6 +956,98 @@ def test_min_empty_string_dtype(func, string_dtype_no_object):
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.parametrize("min_count", [0, 1])
+@pytest.mark.parametrize("test_series", [True, False])
+def test_string_dtype_all_na(
+    string_dtype_no_object, reduction_func, skipna, min_count, test_series
+):
+    # https://github.com/pandas-dev/pandas/issues/60229
+    if reduction_func == "corrwith":
+        # corrwith is deprecated.
+        return
+
+    dtype = string_dtype_no_object
+
+    if reduction_func in [
+        "any",
+        "all",
+        "idxmin",
+        "idxmax",
+        "mean",
+        "median",
+        "std",
+        "var",
+    ]:
+        kwargs = {"skipna": skipna}
+    elif reduction_func in ["kurt"]:
+        kwargs = {"min_count": min_count}
+    elif reduction_func in ["count", "nunique", "quantile", "sem", "size"]:
+        kwargs = {}
+    else:
+        kwargs = {"skipna": skipna, "min_count": min_count}
+
+    expected_dtype, expected_value = dtype, pd.NA
+    if reduction_func in ["all", "any"]:
+        expected_dtype = "bool"
+        # TODO: For skipna=False, bool(pd.NA) raises; should groupby?
+        expected_value = not skipna if reduction_func == "any" else True
+    elif reduction_func in ["count", "nunique", "size"]:
+        # TODO: Should be more consistent - return Int64 when dtype.na_value is pd.NA?
+        if (
+            test_series
+            and reduction_func == "size"
+            and dtype.storage == "pyarrow"
+            and dtype.na_value is pd.NA
+        ):
+            expected_dtype = "Int64"
+        else:
+            expected_dtype = "int64"
+        expected_value = 1 if reduction_func == "size" else 0
+    elif reduction_func in ["idxmin", "idxmax"]:
+        expected_dtype, expected_value = "float64", np.nan
+    elif not skipna or min_count > 0:
+        expected_value = pd.NA
+    elif reduction_func == "sum":
+        # https://github.com/pandas-dev/pandas/pull/60936
+        expected_value = ""
+
+    df = DataFrame({"a": ["x"], "b": [pd.NA]}, dtype=dtype)
+    obj = df["b"] if test_series else df
+    args = get_groupby_method_args(reduction_func, obj)
+    gb = obj.groupby(df["a"])
+    method = getattr(gb, reduction_func)
+
+    if reduction_func in [
+        "mean",
+        "median",
+        "kurt",
+        "prod",
+        "quantile",
+        "sem",
+        "skew",
+        "std",
+        "var",
+    ]:
+        msg = f"dtype '{dtype}' does not support operation '{reduction_func}'"
+        with pytest.raises(TypeError, match=msg):
+            method(*args, **kwargs)
+        return
+    elif reduction_func in ["idxmin", "idxmax"] and not skipna:
+        msg = f"{reduction_func} with skipna=False encountered an NA value."
+        with pytest.raises(ValueError, match=msg):
+            method(*args, **kwargs)
+        return
+
+    result = method(*args, **kwargs)
+    index = pd.Index(["x"], name="a", dtype=dtype)
+    if test_series or reduction_func == "size":
+        name = None if not test_series and reduction_func == "size" else "b"
+        expected = Series(expected_value, index=index, dtype=expected_dtype, name=name)
+    else:
+        expected = DataFrame({"b": expected_value}, index=index, dtype=expected_dtype)
+    tm.assert_equal(result, expected)
+
+
 def test_max_nan_bug():
     df = DataFrame(
         {
diff --git a/pandas/tests/resample/test_base.py b/pandas/tests/resample/test_base.py
@@ -223,6 +223,31 @@ def test_resample_empty_series(freq, index, resample_method):
     assert result.index.freq == expected.index.freq
 
 
+@pytest.mark.parametrize("min_count", [0, 1])
+def test_resample_empty_sum_string(string_dtype_no_object, min_count):
+    # https://github.com/pandas-dev/pandas/issues/60229
+    dtype = string_dtype_no_object
+    ser = Series(
+        pd.NA,
+        index=DatetimeIndex(
+            [
+                "2000-01-01 00:00:00",
+                "2000-01-01 00:00:10",
+                "2000-01-01 00:00:20",
+                "2000-01-01 00:00:30",
+            ]
+        ),
+        dtype=dtype,
+    )
+    rs = ser.resample("20s")
+    result = rs.sum(min_count=min_count)
+
+    value = "" if min_count == 0 else pd.NA
+    index = date_range(start="2000-01-01", freq="20s", periods=2, unit="s")
+    expected = Series(value, index=index, dtype=dtype)
+    tm.assert_series_equal(result, expected)
+
+
 @pytest.mark.parametrize(
     "freq",
     [
diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py
@@ -3,8 +3,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas.compat import is_platform_windows
 
 import pandas as pd
@@ -462,7 +460,6 @@ def test_empty(keys):
     tm.assert_frame_equal(result, expected)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 @pytest.mark.parametrize("consolidate", [True, False])
 def test_resample_groupby_agg_object_dtype_all_nan(consolidate):
     # https://github.com/pandas-dev/pandas/issues/39329
@@ -494,6 +491,26 @@ def test_resample_groupby_agg_object_dtype_all_nan(consolidate):
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.parametrize("min_count", [0, 1])
+def test_groupby_resample_empty_sum_string(
+    string_dtype_no_object, test_frame, min_count
+):
+    # https://github.com/pandas-dev/pandas/issues/60229
+    dtype = string_dtype_no_object
+    test_frame = test_frame.assign(B=pd.array([pd.NA] * len(test_frame), dtype=dtype))
+    gbrs = test_frame.groupby("A").resample("40s")
+    result = gbrs.sum(min_count=min_count)
+
+    index = pd.MultiIndex(
+        levels=[[1, 2, 3], [pd.to_datetime("2000-01-01", unit="ns")]],
+        codes=[[0, 1, 2], [0, 0, 0]],
+        names=["A", None],
+    )
+    value = "" if min_count == 0 else pd.NA
+    expected = DataFrame({"B": value}, index=index, dtype=dtype)
+    tm.assert_frame_equal(result, expected)
+
+
 def test_groupby_resample_with_list_of_keys():
     # GH 47362
     df = DataFrame(