fix object-dtype implementation + update tests

jorisvandenbossche · jorisvandenbossche · commit ab30d871eb10 · 2024-09-21T10:16:54.000+02:00
diff --git a/pandas/core/array_algos/masked_reductions.py b/pandas/core/array_algos/masked_reductions.py
@@ -62,6 +62,9 @@ def _reductions(
         ):
             return libmissing.NA
 
+        if values.dtype == np.dtype(object):
+            values = values[~mask]
+            return func(values, axis=axis, **kwargs)
         return func(values, where=~mask, axis=axis, **kwargs)
 
 
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
@@ -803,15 +803,12 @@ def _reduce(
             else:
                 return nanops.nanall(self._ndarray, skipna=skipna)
 
-        if name in ["min", "max"]:
-            result = getattr(self, name)(skipna=skipna, axis=axis)
+        if name in ["min", "max", "sum"]:
+            result = getattr(self, name)(skipna=skipna, axis=axis, **kwargs)
             if keepdims:
                 return self._from_sequence([result], dtype=self.dtype)
             return result
 
-        if name == "sum":
-            return nanops.nansum(self._ndarray, skipna=skipna, **kwargs)
-
         raise TypeError(f"Cannot perform reduction '{name}' with string dtype")
 
     def _wrap_reduction_result(self, axis: AxisInt | None, result) -> Any:
@@ -834,6 +831,39 @@ def max(self, axis=None, skipna: bool = True, **kwargs) -> Scalar:
         )
         return self._wrap_reduction_result(axis, result)
 
+    def sum(
+        self,
+        *,
+        axis: AxisInt | None = None,
+        skipna: bool = True,
+        min_count: int = 0,
+        **kwargs,
+    ) -> Scalar:
+        """
+        Return the sum (i.e. the concatenation) of the strings in the array.
+
+        Parameters
+        ----------
+        axis : int or None, default None
+            Only used when wrapping the result; it is not forwarded to the
+            masked reduction.
+        skipna : bool, default True
+            Forwarded to ``masked_reductions.sum`` together with the
+            ``isna()`` mask of this array.
+        min_count : int, default 0
+            Forwarded to ``masked_reductions.sum`` so it is no longer ignored.
+        **kwargs
+            Only validated through ``nv.validate_sum``; not used otherwise.
+        """
+        nv.validate_sum((), kwargs)
+        result = masked_reductions.sum(
+            values=self._ndarray,
+            mask=self.isna(),
+            skipna=skipna,
+            min_count=min_count,
+        )
+        return self._wrap_reduction_result(axis, result)
+
     def value_counts(self, dropna: bool = True) -> Series:
         from pandas.core.algorithms import value_counts_internal as value_counts
 
diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
@@ -444,14 +444,12 @@ def test_astype_float(dtype, any_float_dtype):
     tm.assert_series_equal(result, expected)
 
 
-@pytest.mark.xfail(reason="Not implemented StringArray.sum")
 def test_reduce(skipna, dtype):
     arr = pd.Series(["a", "b", "c"], dtype=dtype)
     result = arr.sum(skipna=skipna)
     assert result == "abc"
 
 
-@pytest.mark.xfail(reason="Not implemented StringArray.sum")
 def test_reduce_missing(skipna, dtype):
     arr = pd.Series([None, "a", None, "b", "c", None], dtype=dtype)
     result = arr.sum(skipna=skipna)
diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py
@@ -480,10 +480,11 @@ def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool:
                 pass
             else:
                 return False
+        elif pa.types.is_binary(pa_dtype) and op_name == "sum":
+            return False
         elif (
             pa.types.is_string(pa_dtype) or pa.types.is_binary(pa_dtype)
         ) and op_name in [
-            "sum",
             "mean",
             "median",
             "prod",
@@ -582,6 +583,8 @@ def _get_expected_reduction_dtype(self, arr, op_name: str, skipna: bool):
             cmp_dtype = "float64[pyarrow]"
         elif op_name in ["sum", "prod"] and pa.types.is_boolean(pa_type):
             cmp_dtype = "uint64[pyarrow]"
+        elif op_name == "sum" and pa.types.is_string(pa_type):
+            cmp_dtype = arr.dtype
         else:
             cmp_dtype = {
                 "i": "int64[pyarrow]",
@@ -613,26 +616,6 @@ def test_median_not_approximate(self, typ):
         result = pd.Series([1, 2], dtype=f"{typ}[pyarrow]").median()
         assert result == 1.5
 
-    def test_in_numeric_groupby(self, data_for_grouping):
-        dtype = data_for_grouping.dtype
-        if is_string_dtype(dtype):
-            df = pd.DataFrame(
-                {
-                    "A": [1, 1, 2, 2, 3, 3, 1, 4],
-                    "B": data_for_grouping,
-                    "C": [1, 1, 1, 1, 1, 1, 1, 1],
-                }
-            )
-
-            expected = pd.Index(["C"])
-            msg = re.escape(f"agg function failed [how->sum,dtype->{dtype}")
-            with pytest.raises(TypeError, match=msg):
-                df.groupby("A").sum()
-            result = df.groupby("A").sum(numeric_only=True).columns
-            tm.assert_index_equal(result, expected)
-        else:
-            super().test_in_numeric_groupby(data_for_grouping)
-
     def test_construct_from_string_own_name(self, dtype, request):
         pa_dtype = dtype.pyarrow_dtype
         if pa.types.is_decimal(pa_dtype):
diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py
@@ -188,7 +188,7 @@ def _get_expected_exception(
 
     def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool:
         return (
-            op_name in ["min", "max"]
+            op_name in ["min", "max", "sum"]
             or ser.dtype.na_value is np.nan  # type: ignore[union-attr]
             and op_name in ("any", "all")
         )