
Commit e259679

Add skipna to groupby median
1 parent 0c58a7d commit e259679

File tree: 6 files changed (+56, -15 lines)

doc/source/whatsnew/v3.0.0.rst

Lines changed: 1 addition & 1 deletion
@@ -58,7 +58,7 @@ Other enhancements
 - :meth:`Series.cummin` and :meth:`Series.cummax` now supports :class:`CategoricalDtype` (:issue:`52335`)
 - :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
 - :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`)
-- :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` methods ``sum``, ``mean``, ``prod``, ``min``, ``max``, ``std``, ``var`` and ``sem`` now accept ``skipna`` parameter (:issue:`15675`)
+- :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` methods ``sum``, ``mean``, ``median``, ``prod``, ``min``, ``max``, ``std``, ``var`` and ``sem`` now accept ``skipna`` parameter (:issue:`15675`)
 - :class:`Rolling` and :class:`Expanding` now support aggregations ``first`` and ``last`` (:issue:`33155`)
 - :func:`read_parquet` accepts ``to_pandas_kwargs`` which are forwarded to :meth:`pyarrow.Table.to_pandas` which enables passing additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as python dictionaries (:issue:`56842`)
 - :meth:`.DataFrameGroupBy.transform`, :meth:`.SeriesGroupBy.transform`, :meth:`.DataFrameGroupBy.agg`, :meth:`.SeriesGroupBy.agg`, :meth:`.SeriesGroupBy.apply`, :meth:`.DataFrameGroupBy.apply` now support ``kurt`` (:issue:`40139`)
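
For context, a minimal usage sketch of the newly documented parameter; the frame, column names and values below are illustrative, not part of the commit:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"key": ["a", "a", "b", "b"], "val": [1.0, np.nan, 3.0, 4.0]})

    # Default behaviour: NaN values are skipped within each group.
    df.groupby("key")["val"].median(skipna=True)   # a -> 1.0, b -> 3.5

    # New in this change: a NaN in a group propagates to that group's median.
    df.groupby("key")["val"].median(skipna=False)  # a -> NaN, b -> 3.5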

pandas/_libs/groupby.pyi

Lines changed: 1 addition & 0 deletions
@@ -13,6 +13,7 @@ def group_median_float64(
     mask: np.ndarray | None = ...,
     result_mask: np.ndarray | None = ...,
     is_datetimelike: bool = ...,  # bint
+    skipna: bool = ...,
 ) -> None: ...
 def group_cumprod(
     out: np.ndarray,  # float64_t[:, ::1]

pandas/_libs/groupby.pyx

Lines changed: 13 additions & 6 deletions
@@ -62,7 +62,12 @@ cdef enum InterpolationEnumType:
     INTERPOLATION_MIDPOINT
 
 
-cdef float64_t median_linear_mask(float64_t* a, int n, uint8_t* mask) noexcept nogil:
+cdef float64_t median_linear_mask(
+    float64_t* a,
+    int n,
+    uint8_t* mask,
+    bint skipna=True
+) noexcept nogil:
     cdef:
         int i, j, na_count = 0
         float64_t* tmp
@@ -77,7 +82,7 @@ cdef float64_t median_linear_mask(float64_t* a, int n, uint8_t* mask) noexcept nogil:
             na_count += 1
 
     if na_count:
-        if na_count == n:
+        if na_count == n or not skipna:
             return NaN
 
         tmp = <float64_t*>malloc((n - na_count) * sizeof(float64_t))
@@ -104,7 +109,8 @@ cdef float64_t median_linear_mask(float64_t* a, int n, uint8_t* mask) noexcept nogil:
 cdef float64_t median_linear(
     float64_t* a,
     int n,
-    bint is_datetimelike=False
+    bint is_datetimelike=False,
+    bint skipna=True,
 ) noexcept nogil:
     cdef:
         int i, j, na_count = 0
@@ -125,7 +131,7 @@ cdef float64_t median_linear(
             na_count += 1
 
     if na_count:
-        if na_count == n:
+        if na_count == n or not skipna:
             return NaN
 
         tmp = <float64_t*>malloc((n - na_count) * sizeof(float64_t))
@@ -186,6 +192,7 @@ def group_median_float64(
     const uint8_t[:, :] mask=None,
     uint8_t[:, ::1] result_mask=None,
     bint is_datetimelike=False,
+    bint skipna=True,
 ) -> None:
     """
     Only aggregates on axis=0
@@ -229,7 +236,7 @@
 
             for j in range(ngroups):
                 size = _counts[j + 1]
-                result = median_linear_mask(ptr, size, ptr_mask)
+                result = median_linear_mask(ptr, size, ptr_mask, skipna)
                 out[j, i] = result
 
                 if result != result:
@@ -244,7 +251,7 @@
             ptr += _counts[0]
             for j in range(ngroups):
                 size = _counts[j + 1]
-                out[j, i] = median_linear(ptr, size, is_datetimelike)
+                out[j, i] = median_linear(ptr, size, is_datetimelike, skipna)
                 ptr += size
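
To make the kernel change easier to follow, here is a rough pure-Python sketch of the skipna behaviour that median_linear / median_linear_mask now implement. It is only an illustration of the logic above, not the actual implementation; the real kernels operate on raw float64 buffers, track missing values via a mask or NaN, and select the median in place rather than sorting:

    import math

    def median_linear_sketch(values, skipna=True):
        # values: list of floats, with NaN marking missing entries
        if not values:
            return float("nan")
        non_na = [v for v in values if not math.isnan(v)]
        na_count = len(values) - len(non_na)
        if na_count:
            # an all-NA group, or any NA when skipna=False, yields NaN
            if na_count == len(values) or not skipna:
                return float("nan")
        non_na.sort()
        n = len(non_na)
        mid = n // 2
        if n % 2:
            return non_na[mid]
        return (non_na[mid - 1] + non_na[mid]) / 2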

pandas/core/groupby/groupby.py

Lines changed: 11 additions & 2 deletions
@@ -2248,7 +2248,7 @@ def mean(
         return result.__finalize__(self.obj, method="groupby")
 
     @final
-    def median(self, numeric_only: bool = False) -> NDFrameT:
+    def median(self, numeric_only: bool = False, skipna: bool = True) -> NDFrameT:
         """
         Compute median of groups, excluding missing values.
 
@@ -2263,6 +2263,12 @@ def median(self, numeric_only: bool = False) -> NDFrameT:
 
             numeric_only no longer accepts ``None`` and defaults to False.
 
+        skipna : bool, default True
+            Exclude NA/null values. If an entire row/column is NA, the result
+            will be NA.
+
+            .. versionadded:: 3.0.0
+
         Returns
         -------
         Series or DataFrame
@@ -2335,8 +2341,11 @@ def median(self, numeric_only: bool = False) -> NDFrameT:
         """
         result = self._cython_agg_general(
             "median",
-            alt=lambda x: Series(x, copy=False).median(numeric_only=numeric_only),
+            alt=lambda x: Series(x, copy=False).median(
+                numeric_only=numeric_only, skipna=skipna
+            ),
             numeric_only=numeric_only,
+            skipna=skipna,
         )
         return result.__finalize__(self.obj, method="groupby")
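
When the Cython kernel cannot be used for a dtype, the ``alt`` callable above is applied per group and simply defers to the existing Series reduction, which already honours ``skipna``. A small illustration of that fallback behaviour, with hypothetical group values:

    import pandas as pd

    group_values = pd.Series([1.0, float("nan"), 3.0])

    group_values.median(skipna=True)   # 2.0 -- the NaN is ignored
    group_values.median(skipna=False)  # nan -- the NaN propagates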

pandas/tests/groupby/test_api.py

Lines changed: 2 additions & 6 deletions
@@ -179,9 +179,7 @@ def test_frame_consistency(groupby_func):
     elif groupby_func in ("sum", "mean", "std", "var"):
         exclude_expected = {"axis", "kwargs"}
         exclude_result = {"engine", "engine_kwargs"}
-    elif groupby_func in ("median"):
-        exclude_expected = {"axis", "kwargs", "skipna"}
-    elif groupby_func in ("prod", "sem"):
+    elif groupby_func in ("median", "prod", "sem"):
         exclude_expected = {"axis", "kwargs"}
     elif groupby_func in ("bfill", "ffill"):
         exclude_expected = {"inplace", "axis", "limit_area"}
@@ -239,9 +237,7 @@ def test_series_consistency(request, groupby_func):
     elif groupby_func in ("sum", "mean", "std", "var"):
         exclude_expected = {"axis", "kwargs"}
         exclude_result = {"engine", "engine_kwargs"}
-    elif groupby_func in ("median"):
-        exclude_expected = {"axis", "kwargs", "skipna"}
-    elif groupby_func in ("prod", "sem"):
+    elif groupby_func in ("median", "prod", "sem"):
         exclude_expected = {"axis", "kwargs"}
     elif groupby_func in ("bfill", "ffill"):
         exclude_expected = {"inplace", "axis", "limit_area"}

pandas/tests/groupby/test_reductions.py

Lines changed: 28 additions & 0 deletions
@@ -585,6 +585,34 @@ def test_sum_skipna_object(skipna):
             "datetime64[ns]",
             "datetime64[ns]",
         ),
+        ("median", [0, -1, 3, 4, 5, -6, 7, np.nan, 8, 9], "float64", "float64"),
+        ("median", [0, 1, 3, -4, 5, 6, 7, -8, np.nan, 9], "Float64", "Float64"),
+        ("median", [0, -1, 3, 4, 5, -6, 7, 8, 9, np.nan], "Int64", "Float64"),
+        (
+            "median",
+            [0, 1, np.nan, 3, 4, 5, 6, 7, 8, 9],
+            "timedelta64[ns]",
+            "timedelta64[ns]",
+        ),
+        (
+            "median",
+            pd.to_datetime(
+                [
+                    "2019-05-09",
+                    pd.NaT,
+                    "2019-05-11",
+                    "2019-05-12",
+                    "2019-05-13",
+                    "2019-05-14",
+                    "2019-05-15",
+                    "2019-05-16",
+                    "2019-05-17",
+                    "2019-05-18",
+                ]
+            ),
+            "datetime64[ns]",
+            "datetime64[ns]",
+        ),
     ],
 )
 def test_multifunc_skipna(func, values, dtype, result_dtype, skipna):
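
The body of test_multifunc_skipna is outside this diff; to show what the new parametrized cases exercise, here is one of the float64 cases run by hand (illustrative only, not the test code itself):

    import numpy as np
    import pandas as pd

    values = [0, -1, 3, 4, 5, -6, 7, np.nan, 8, 9]
    df = pd.DataFrame({"key": ["x"] * len(values), "val": values})

    df.groupby("key")["val"].median(skipna=True)   # 4.0 -- median of the nine non-NaN values
    df.groupby("key")["val"].median(skipna=False)  # NaN -- the group contains a NaN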
