ENH(string dtype): Implement cumsum for Python-backed strings

rhshadrach · rhshadrach · commit 51b363ec3f2e · 2025-02-15T18:06:25.000-05:00
diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst
@@ -37,7 +37,7 @@ Other enhancements
   updated to work correctly with NumPy >= 2 (:issue:`57739`)
 - :meth:`Series.str.decode` result now has ``StringDtype`` when ``future.infer_string`` is True (:issue:`60709`)
 - :meth:`~Series.to_hdf` and :meth:`~DataFrame.to_hdf` now round-trip with ``StringDtype``  (:issue:`60663`)
-- The :meth:`~Series.cumsum`, :meth:`~Series.cummin`, and :meth:`~Series.cummax` reductions are now implemented for ``StringDtype`` columns when backed by PyArrow (:issue:`60633`)
+- The :meth:`~Series.cumsum`, :meth:`~Series.cummin`, and :meth:`~Series.cummax` reductions are now implemented for ``StringDtype`` columns (:issue:`60633`, :issue:`???`)
 - The :meth:`~Series.sum` reduction is now implemented for ``StringDtype`` columns (:issue:`59853`)
 
 .. ---------------------------------------------------------------------------
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
@@ -870,6 +870,100 @@ def _reduce(
 
         raise TypeError(f"Cannot perform reduction '{name}' with string dtype")
 
+    def _accumulate(self, name: str, *, skipna: bool = True, **kwargs) -> StringArray:
+        """
+        Return an ExtensionArray performing an accumulation operation.
+
+        The underlying data type might change.
+
+        Parameters
+        ----------
+        name : str
+            Name of the function, supported values are:
+            - cummin
+            - cummax
+            - cumsum
+            - cumprod
+        skipna : bool, default True
+            If True, skip NA values.
+        **kwargs
+            Additional keyword arguments passed to the accumulation function.
+            Currently, there is no supported kwarg.
+
+        Returns
+        -------
+        array
+
+        Raises
+        ------
+        NotImplementedError : subclass does not define accumulations
+        """
+        if is_string_dtype(self):
+            return self._str_accumulate(name=name, skipna=skipna, **kwargs)
+        return super()._accumulate(name=name, skipna=skipna, **kwargs)
+
+    def _str_accumulate(
+        self, name: str, *, skipna: bool = True, **kwargs
+    ) -> StringArray:
+        """
+        Accumulate implementation for strings, see `_accumulate` docstring for details.
+
+        pyarrow.compute does not implement these methods for strings.
+        """
+        if name == "cumprod":
+            msg = f"operation '{name}' not supported for dtype '{self.dtype}'"
+            raise TypeError(msg)
+
+        # We may need to strip out trailing NA values
+        tail: np.array | None = None
+        na_mask: np.array | None = None
+        ndarray = self._ndarray
+        np_func = {
+            "cumsum": np.cumsum,
+            "cummin": np.minimum.accumulate,
+            "cummax": np.maximum.accumulate,
+        }[name]
+
+        from pandas.core import missing
+
+        if self._hasna:
+            na_mask = isna(ndarray)
+            if np.all(na_mask):
+                return type(self)(ndarray)
+            if skipna:
+                if name == "cumsum":
+                    ndarray = np.where(na_mask, "", ndarray)
+                else:
+                    # We can retain the running min/max by forward/backward filling.
+                    ndarray = ndarray.copy()
+                    missing.pad_or_backfill_inplace(
+                        ndarray.T,
+                        method="pad",
+                        axis=0,
+                    )
+                    missing.pad_or_backfill_inplace(
+                        ndarray.T,
+                        method="backfill",
+                        axis=0,
+                    )
+            else:
+                # When not skipping NA values, the result should be null from
+                # the first NA value onward.
+                idx = np.argmax(na_mask)
+                tail = np.empty(len(ndarray) - idx, dtype="object")
+                tail[:] = np.nan
+                ndarray = ndarray[:idx]
+
+        np_result = np_func(ndarray)
+
+        if tail is not None:
+            np_result = np.hstack((np_result, tail))
+        elif na_mask is not None:
+            np_result = np.where(na_mask, np.nan, np_result)
+
+        result = type(self)(np_result)
+        return result
+
     def _wrap_reduction_result(self, axis: AxisInt | None, result) -> Any:
         if self.dtype.na_value is np.nan and result is libmissing.NA:
             # the masked_reductions use pd.NA -> convert to np.nan
diff --git a/pandas/tests/apply/test_str.py b/pandas/tests/apply/test_str.py
@@ -5,7 +5,6 @@
 import pytest
 
 from pandas.compat import (
-    HAS_PYARROW,
     WASM,
 )
 
@@ -166,13 +165,13 @@ def test_agg_cython_table_transform_series(request, series, func, expected):
     # GH21224
     # test transforming functions in
     # pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum)
-    if series.dtype == "string" and func == "cumsum" and not HAS_PYARROW:
-        request.applymarker(
-            pytest.mark.xfail(
-                raises=NotImplementedError,
-                reason="TODO(infer_string) cumsum not yet implemented for string",
-            )
-        )
+    # if series.dtype == "string" and func == "cumsum" and not HAS_PYARROW:
+    #     request.applymarker(
+    #         pytest.mark.xfail(
+    #             raises=NotImplementedError,
+    #             reason="TODO(infer_string) cumsum not yet implemented for string",
+    #         )
+    #     )
     warn = None if isinstance(func, str) else FutureWarning
     with tm.assert_produces_warning(warn, match="is currently using Series.*"):
         result = series.agg(func)
diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py
@@ -196,11 +196,7 @@ def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool:
 
     def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool:
         assert isinstance(ser.dtype, StorageExtensionDtype)
-        return ser.dtype.storage == "pyarrow" and op_name in [
-            "cummin",
-            "cummax",
-            "cumsum",
-        ]
+        return op_name in ["cummin", "cummax", "cumsum"]
 
     def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result):
         dtype = cast(StringDtype, tm.get_dtype(obj))
diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py
@@ -325,9 +325,9 @@ def test_observed(request, using_infer_string, observed):
     # gh-8138 (back-compat)
     # gh-8869
 
-    if using_infer_string and not observed:
-        # TODO(infer_string) this fails with filling the string column with 0
-        request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)"))
+    # if using_infer_string and not observed:
+    #     # TODO(infer_string) this fails with filling the string column with 0
+    #     request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)"))
 
     cat1 = Categorical(["a", "a", "b", "b"], categories=["a", "b", "z"], ordered=True)
     cat2 = Categorical(["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True)
@@ -355,10 +355,12 @@ def test_observed(request, using_infer_string, observed):
     )
     result = gb.sum()
     if not observed:
+        fill_value = "" if using_infer_string else 0
         expected = cartesian_product_for_groupers(
-            expected, [cat1, cat2], list("AB"), fill_value=0
+            expected, [cat1, cat2], list("AB"), fill_value=fill_value
         )
-
+    print(result)
+    print(expected)
     tm.assert_frame_equal(result, expected)
 
 
diff --git a/pandas/tests/series/test_cumulative.py b/pandas/tests/series/test_cumulative.py
@@ -266,12 +266,12 @@ def test_cumprod_timedelta(self):
         ],
     )
     def test_cum_methods_pyarrow_strings(
-        self, pyarrow_string_dtype, data, op, skipna, expected_data
+        self, string_dtype_no_object, data, op, skipna, expected_data
     ):
         # https://github.com/pandas-dev/pandas/pull/60633
-        ser = pd.Series(data, dtype=pyarrow_string_dtype)
+        ser = pd.Series(data, dtype=string_dtype_no_object)
         method = getattr(ser, op)
-        expected = pd.Series(expected_data, dtype=pyarrow_string_dtype)
+        expected = pd.Series(expected_data, dtype=string_dtype_no_object)
         result = method(skipna=skipna)
         tm.assert_series_equal(result, expected)