Commit 72276dc

ENH: Support skipna parameter in GroupBy prod, var, std and sem methods
1 parent 42bf375 commit 72276dc

File tree: 8 files changed, +141 −20 lines changed


doc/source/whatsnew/v3.0.0.rst

Lines changed: 1 addition & 1 deletion
@@ -58,9 +58,9 @@ Other enhancements
 - :meth:`Series.cummin` and :meth:`Series.cummax` now supports :class:`CategoricalDtype` (:issue:`52335`)
 - :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
 - :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`)
+- :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` methods ``sum``, ``mean``, ``prod``, ``std``, ``var`` and ``sem`` now accept ``skipna`` parameter (:issue:`15675`)
 - :class:`Rolling` and :class:`Expanding` now support aggregations ``first`` and ``last`` (:issue:`33155`)
 - :func:`read_parquet` accepts ``to_pandas_kwargs`` which are forwarded to :meth:`pyarrow.Table.to_pandas` which enables passing additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as python dictionaries (:issue:`56842`)
-- :meth:`.DataFrameGroupBy.mean`, :meth:`.DataFrameGroupBy.sum`, :meth:`.SeriesGroupBy.mean` and :meth:`.SeriesGroupBy.sum` now accept ``skipna`` parameter (:issue:`15675`)
 - :meth:`.DataFrameGroupBy.transform`, :meth:`.SeriesGroupBy.transform`, :meth:`.DataFrameGroupBy.agg`, :meth:`.SeriesGroupBy.agg`, :meth:`.SeriesGroupBy.apply`, :meth:`.DataFrameGroupBy.apply` now support ``kurt`` (:issue:`40139`)
 - :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`RollingGroupby.apply`, :meth:`ExpandingGroupby.apply`, :meth:`Rolling.apply`, :meth:`Expanding.apply`, :meth:`DataFrame.apply` with ``engine="numba"`` now supports positional arguments passed as kwargs (:issue:`58995`)
 - :meth:`Rolling.agg`, :meth:`Expanding.agg` and :meth:`ExponentialMovingWindow.agg` now accept :class:`NamedAgg` aggregations through ``**kwargs`` (:issue:`28333`)
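
For a quick sense of the change, here is an illustrative usage sketch (the frame and values below are made up for demonstration; only the ``skipna`` parameter itself comes from this commit):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"key": ["a", "a", "b", "b"], "val": [1.0, np.nan, 2.0, 3.0]})
    gb = df.groupby("key")["val"]

    # Default behaviour is unchanged: NA values are skipped.
    gb.prod()              # expected: a -> 1.0, b -> 6.0

    # With skipna=False, any NA in a group propagates to that group's result.
    gb.prod(skipna=False)  # expected: a -> NaN, b -> 6.0
    gb.std(skipna=False)   # expected: a -> NaN
    gb.sem(skipna=False)   # expected: a -> NaN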

pandas/_libs/groupby.pyi

Lines changed: 2 additions & 0 deletions
@@ -76,6 +76,7 @@ def group_prod(
     mask: np.ndarray | None,
     result_mask: np.ndarray | None = ...,
     min_count: int = ...,
+    skipna: bool = ...,
 ) -> None: ...
 def group_var(
     out: np.ndarray,  # floating[:, ::1]
@@ -88,6 +89,7 @@ def group_var(
     result_mask: np.ndarray | None = ...,
     is_datetimelike: bool = ...,
     name: str = ...,
+    skipna: bool = ...,
 ) -> None: ...
 def group_skew(
     out: np.ndarray,  # float64_t[:, ::1]

pandas/_libs/groupby.pyx

Lines changed: 38 additions & 1 deletion
@@ -806,13 +806,14 @@ def group_prod(
     const uint8_t[:, ::1] mask,
     uint8_t[:, ::1] result_mask=None,
     Py_ssize_t min_count=0,
+    bint skipna=True,
 ) -> None:
     """
     Only aggregates on axis=0
     """
     cdef:
         Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-        int64float_t val
+        int64float_t val, nan_val
         int64float_t[:, ::1] prodx
         int64_t[:, ::1] nobs
         Py_ssize_t len_values = len(values), len_labels = len(labels)
@@ -825,6 +826,13 @@
     prodx = np.ones((<object>out).shape, dtype=(<object>out).base.dtype)

     N, K = (<object>values).shape
+    if uses_mask:
+        nan_val = 0
+    elif int64float_t is int64_t or int64float_t is uint64_t:
+        # This has no effect as int64 can't be nan. Setting to 0 to avoid type error
+        nan_val = 0
+    else:
+        nan_val = NAN

     with nogil:
         for i in range(N):
@@ -836,6 +844,13 @@
             for j in range(K):
                 val = values[i, j]

+                if not skipna and (
+                    (uses_mask and result_mask[lab, j]) or
+                    _treat_as_na(prodx[lab, j], False)
+                ):
+                    # If prod is already NA, no need to update it
+                    continue
+
                 if uses_mask:
                     isna_entry = mask[i, j]
                 else:
@@ -844,6 +859,11 @@
                 if not isna_entry:
                     nobs[lab, j] += 1
                     prodx[lab, j] *= val
+                elif not skipna:
+                    if uses_mask:
+                        result_mask[lab, j] = True
+                    else:
+                        prodx[lab, j] = nan_val

     _check_below_mincount(
         out, uses_mask, result_mask, ncounts, K, nobs, min_count, prodx
@@ -864,6 +884,7 @@ def group_var(
     uint8_t[:, ::1] result_mask=None,
     bint is_datetimelike=False,
     str name="var",
+    bint skipna=True,
 ) -> None:
     cdef:
         Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
@@ -898,6 +919,16 @@
             for j in range(K):
                 val = values[i, j]

+                if not skipna and (
+                    (uses_mask and result_mask[lab, j]) or
+                    (is_datetimelike and out[lab, j] == NPY_NAT) or
+                    _treat_as_na(out[lab, j], False)
+                ):
+                    # If aggregate is already NA, don't add to it. This is important for
+                    # datetimelike because adding a value to NPY_NAT may not result
+                    # in a NPY_NAT
+                    continue
+
                 if uses_mask:
                     isna_entry = mask[i, j]
                 elif is_datetimelike:
@@ -913,6 +944,12 @@
                     oldmean = mean[lab, j]
                     mean[lab, j] += (val - oldmean) / nobs[lab, j]
                     out[lab, j] += (val - mean[lab, j]) * (val - oldmean)
+                elif not skipna:
+                    nobs[lab, j] = 0
+                    if uses_mask:
+                        result_mask[lab, j] = True
+                    else:
+                        out[lab, j] = NAN

     for i in range(ncounts):
         for j in range(K):
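
To make the control flow above easier to follow, here is a rough pure-Python sketch of what the new ``skipna`` handling does, reduced to a 1-D product (names and structure are illustrative, not the actual Cython internals):

    import numpy as np

    def group_prod_1d(values, labels, ngroups, skipna=True):
        # One running product and observation count per group.
        prodx = np.ones(ngroups)
        nobs = np.zeros(ngroups, dtype=np.int64)
        for val, lab in zip(values, labels):
            if lab < 0:
                continue
            # With skipna=False, a group that is already NA stays NA.
            if not skipna and np.isnan(prodx[lab]):
                continue
            if not np.isnan(val):
                nobs[lab] += 1
                prodx[lab] *= val
            elif not skipna:
                # First NA seen with skipna=False poisons the group's result.
                prodx[lab] = np.nan
        return prodx, nobs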

pandas/core/_numba/kernels/var_.py

Lines changed: 10 additions & 1 deletion
@@ -176,6 +176,7 @@ def grouped_var(
     ngroups: int,
     min_periods: int,
     ddof: int = 1,
+    skipna: bool = True,
 ) -> tuple[np.ndarray, list[int]]:
     N = len(labels)

@@ -190,7 +191,15 @@
         lab = labels[i]
         val = values[i]

-        if lab < 0:
+        if lab < 0 or np.isnan(output[lab]):
+            continue
+
+        if not skipna and np.isnan(val):
+            output[lab] = np.nan
+            nobs_arr[lab] += 1
+            comp_arr[lab] = np.nan
+            consecutive_counts[lab] = 1
+            prev_vals[lab] = np.nan
             continue

         mean_x = means[lab]
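
The numba kernel mirrors the Cython behaviour, so the two engines should agree. A sanity check along these lines is one way to see that (assuming the optional numba dependency is installed; the data is made up):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(
        {"key": ["a", "a", "a", "b", "b"], "val": [1.0, np.nan, 3.0, 2.0, 4.0]}
    )
    gb = df.groupby("key")["val"]

    cython_res = gb.var(skipna=False)
    numba_res = gb.var(skipna=False, engine="numba")  # expected to match the Cython path
    pd.testing.assert_series_equal(cython_res, numba_res)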

pandas/core/groupby/groupby.py

Lines changed: 45 additions & 6 deletions
@@ -2349,6 +2349,7 @@ def std(
         engine: Literal["cython", "numba"] | None = None,
         engine_kwargs: dict[str, bool] | None = None,
         numeric_only: bool = False,
+        skipna: bool = True,
     ):
         """
         Compute standard deviation of groups, excluding missing values.
@@ -2387,6 +2388,12 @@

                 numeric_only now defaults to ``False``.

+        skipna : bool, default True
+            Exclude NA/null values. If an entire row/column is NA, the result
+            will be NA.
+
+            .. versionadded:: 3.0.0
+
         Returns
         -------
         Series or DataFrame
@@ -2441,14 +2448,16 @@
                     engine_kwargs,
                     min_periods=0,
                     ddof=ddof,
+                    skipna=skipna,
                 )
             )
         else:
             return self._cython_agg_general(
                 "std",
-                alt=lambda x: Series(x, copy=False).std(ddof=ddof),
+                alt=lambda x: Series(x, copy=False).std(ddof=ddof, skipna=skipna),
                 numeric_only=numeric_only,
                 ddof=ddof,
+                skipna=skipna,
             )

     @final
@@ -2460,6 +2469,7 @@ def var(
         engine: Literal["cython", "numba"] | None = None,
         engine_kwargs: dict[str, bool] | None = None,
         numeric_only: bool = False,
+        skipna: bool = True,
     ):
         """
         Compute variance of groups, excluding missing values.
@@ -2497,6 +2507,12 @@

                 numeric_only now defaults to ``False``.

+        skipna : bool, default True
+            Exclude NA/null values. If an entire row/column is NA, the result
+            will be NA.
+
+            .. versionadded:: 3.0.0
+
         Returns
         -------
         Series or DataFrame
@@ -2550,13 +2566,15 @@
                 engine_kwargs,
                 min_periods=0,
                 ddof=ddof,
+                skipna=skipna,
             )
         else:
             return self._cython_agg_general(
                 "var",
-                alt=lambda x: Series(x, copy=False).var(ddof=ddof),
+                alt=lambda x: Series(x, copy=False).var(ddof=ddof, skipna=skipna),
                 numeric_only=numeric_only,
                 ddof=ddof,
+                skipna=skipna,
             )

     @final
@@ -2686,7 +2704,9 @@ def _value_counts(
             return result.__finalize__(self.obj, method="value_counts")

     @final
-    def sem(self, ddof: int = 1, numeric_only: bool = False) -> NDFrameT:
+    def sem(
+        self, ddof: int = 1, numeric_only: bool = False, skipna: bool = True
+    ) -> NDFrameT:
         """
         Compute standard error of the mean of groups, excluding missing values.

@@ -2706,6 +2726,12 @@ def sem(self, ddof: int = 1, numeric_only: bool = False) -> NDFrameT:

                 numeric_only now defaults to ``False``.

+        skipna : bool, default True
+            Exclude NA/null values. If an entire row/column is NA, the result
+            will be NA.
+
+            .. versionadded:: 3.0.0
+
         Returns
         -------
         Series or DataFrame
@@ -2780,9 +2806,10 @@ def sem(self, ddof: int = 1, numeric_only: bool = False) -> NDFrameT:
             )
         return self._cython_agg_general(
             "sem",
-            alt=lambda x: Series(x, copy=False).sem(ddof=ddof),
+            alt=lambda x: Series(x, copy=False).sem(ddof=ddof, skipna=skipna),
             numeric_only=numeric_only,
             ddof=ddof,
+            skipna=skipna,
         )

     @final
@@ -2959,7 +2986,9 @@ def sum(
         return result

     @final
-    def prod(self, numeric_only: bool = False, min_count: int = 0) -> NDFrameT:
+    def prod(
+        self, numeric_only: bool = False, min_count: int = 0, skipna: bool = True
+    ) -> NDFrameT:
         """
         Compute prod of group values.

@@ -2976,6 +3005,12 @@ def prod(self, numeric_only: bool = False, min_count: int = 0) -> NDFrameT:
             The required number of valid values to perform the operation. If fewer
             than ``min_count`` non-NA values are present the result will be NA.

+        skipna : bool, default True
+            Exclude NA/null values. If an entire row/column is NA, the result
+            will be NA.
+
+            .. versionadded:: 3.0.0
+
         Returns
         -------
         Series or DataFrame
@@ -3024,7 +3059,11 @@ def prod(self, numeric_only: bool = False, min_count: int = 0) -> NDFrameT:
         2  30  72
         """
         return self._agg_general(
-            numeric_only=numeric_only, min_count=min_count, alias="prod", npfunc=np.prod
+            numeric_only=numeric_only,
+            min_count=min_count,
+            skipna=skipna,
+            alias="prod",
+            npfunc=np.prod,
         )

     @final
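
As the new docstrings describe, when ``skipna=False`` a group containing any NA yields a missing result; with nullable (masked) dtypes the missing value is ``pd.NA`` rather than ``NaN``. An illustrative sketch (data made up; dtype behaviour matches the new test table below):

    import pandas as pd

    df = pd.DataFrame(
        {"key": ["a", "a", "b"], "val": pd.array([1, None, 3], dtype="Int64")}
    )
    df.groupby("key")["val"].prod(skipna=False)
    # expected roughly:
    # key
    # a    <NA>
    # b       3
    # Name: val, dtype: Int64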

pandas/tests/groupby/aggregate/test_numba.py

Lines changed: 1 addition & 1 deletion
@@ -186,7 +186,7 @@ def test_multifunc_numba_vs_cython_frame(agg_kwargs):
     tm.assert_frame_equal(result, expected)


-@pytest.mark.parametrize("func", ["sum", "mean"])
+@pytest.mark.parametrize("func", ["sum", "mean", "var", "std"])
 def test_multifunc_numba_vs_cython_frame_noskipna(func):
     pytest.importorskip("numba")
     data = DataFrame(

pandas/tests/groupby/test_api.py

Lines changed: 8 additions & 10 deletions
@@ -176,14 +176,13 @@ def test_frame_consistency(groupby_func):
     elif groupby_func in ("max", "min"):
         exclude_expected = {"axis", "kwargs", "skipna"}
         exclude_result = {"min_count", "engine", "engine_kwargs"}
-    elif groupby_func in ("sum", "mean"):
+    elif groupby_func in ("sum", "mean", "std", "var"):
         exclude_expected = {"axis", "kwargs"}
         exclude_result = {"engine", "engine_kwargs"}
-    elif groupby_func in ("std", "var"):
-        exclude_expected = {"axis", "kwargs", "skipna"}
-        exclude_result = {"engine", "engine_kwargs"}
-    elif groupby_func in ("median", "prod", "sem"):
+    elif groupby_func in ("median"):
         exclude_expected = {"axis", "kwargs", "skipna"}
+    elif groupby_func in ("prod", "sem"):
+        exclude_expected = {"axis", "kwargs"}
     elif groupby_func in ("bfill", "ffill"):
         exclude_expected = {"inplace", "axis", "limit_area"}
     elif groupby_func in ("cummax", "cummin"):
@@ -237,14 +236,13 @@ def test_series_consistency(request, groupby_func):
     elif groupby_func in ("max", "min"):
         exclude_expected = {"axis", "kwargs", "skipna"}
         exclude_result = {"min_count", "engine", "engine_kwargs"}
-    elif groupby_func in ("sum", "mean"):
+    elif groupby_func in ("sum", "mean", "std", "var"):
         exclude_expected = {"axis", "kwargs"}
         exclude_result = {"engine", "engine_kwargs"}
-    elif groupby_func in ("std", "var"):
-        exclude_expected = {"axis", "kwargs", "skipna"}
-        exclude_result = {"engine", "engine_kwargs"}
-    elif groupby_func in ("median", "prod", "sem"):
+    elif groupby_func in ("median"):
         exclude_expected = {"axis", "kwargs", "skipna"}
+    elif groupby_func in ("prod", "sem"):
+        exclude_expected = {"axis", "kwargs"}
     elif groupby_func in ("bfill", "ffill"):
         exclude_expected = {"inplace", "axis", "limit_area"}
     elif groupby_func in ("cummax", "cummin"):

pandas/tests/groupby/test_reductions.py

Lines changed: 36 additions & 0 deletions
@@ -514,6 +514,42 @@ def test_sum_skipna_object(skipna):
     tm.assert_series_equal(result, expected)


+@pytest.mark.parametrize(
+    "func, values, dtype, result_dtype",
+    [
+        ("prod", [0, 1, 3, np.nan, 4, 5, 6, 7, -8, 9], "float64", "float64"),
+        ("prod", [0, -1, 3, 4, 5, np.nan, 6, 7, 8, 9], "Float64", "Float64"),
+        ("prod", [0, 1, 3, -4, 5, 6, 7, -8, np.nan, 9], "Int64", "Int64"),
+        ("var", [0, -1, 3, 4, np.nan, 5, 6, 7, 8, 9], "float64", "float64"),
+        ("var", [0, 1, 3, -4, 5, 6, 7, -8, 9, np.nan], "Float64", "Float64"),
+        ("var", [0, -1, 3, 4, 5, -6, 7, np.nan, 8, 9], "Int64", "Float64"),
+        ("std", [0, 1, 3, -4, 5, 6, 7, -8, np.nan, 9], "float64", "float64"),
+        ("std", [0, -1, 3, 4, 5, -6, 7, np.nan, 8, 9], "Float64", "Float64"),
+        ("std", [0, 1, 3, -4, 5, 6, 7, -8, 9, np.nan], "Int64", "Float64"),
+        ("sem", [0, -1, 3, 4, 5, -6, 7, np.nan, 8, 9], "float64", "float64"),
+        ("sem", [0, 1, 3, -4, 5, 6, 7, -8, np.nan, 9], "Float64", "Float64"),
+        ("sem", [0, -1, 3, 4, 5, -6, 7, 8, 9, np.nan], "Int64", "Float64"),
+    ],
+)
+def test_multifunc_skipna(func, values, dtype, result_dtype, skipna):
+    # GH#15675
+    df = DataFrame(
+        {
+            "val": values,
+            "cat": ["A", "B"] * 5,
+        }
+    ).astype({"val": dtype})
+    # We need to recast the expected values to the result_dtype as some operations
+    # change the dtype
+    expected = (
+        df.groupby("cat")["val"]
+        .apply(lambda x: getattr(x, func)(skipna=skipna))
+        .astype(result_dtype)
+    )
+    result = getattr(df.groupby("cat")["val"], func)(skipna=skipna)
+    tm.assert_series_equal(result, expected)
+
+
 def test_cython_median():
     arr = np.random.default_rng(2).standard_normal(1000)
     arr[::2] = np.nan
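
The test above checks the cythonized groupby reduction against applying the equivalent Series method group by group. A stripped-down version of that comparison (illustrative data, not taken from the diff):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"cat": ["A", "B"] * 3, "val": [0.0, 1.0, np.nan, 3.0, 4.0, 5.0]})
    gb = df.groupby("cat")["val"]

    for skipna in (True, False):
        result = gb.var(skipna=skipna)
        expected = gb.apply(lambda s: s.var(skipna=skipna))
        pd.testing.assert_series_equal(result, expected, check_names=False)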
