
Commit e259679

Add skipna to groupby median
1 parent 0c58a7d commit e259679

File tree: 6 files changed (+56, -15 lines)

doc/source/whatsnew/v3.0.0.rst

Lines changed: 1 addition & 1 deletion
@@ -58,7 +58,7 @@ Other enhancements
 - :meth:`Series.cummin` and :meth:`Series.cummax` now supports :class:`CategoricalDtype` (:issue:`52335`)
 - :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
 - :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`)
-- :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` methods ``sum``, ``mean``, ``prod``, ``min``, ``max``, ``std``, ``var`` and ``sem`` now accept ``skipna`` parameter (:issue:`15675`)
+- :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` methods ``sum``, ``mean``, ``median``, ``prod``, ``min``, ``max``, ``std``, ``var`` and ``sem`` now accept ``skipna`` parameter (:issue:`15675`)
 - :class:`Rolling` and :class:`Expanding` now support aggregations ``first`` and ``last`` (:issue:`33155`)
 - :func:`read_parquet` accepts ``to_pandas_kwargs`` which are forwarded to :meth:`pyarrow.Table.to_pandas` which enables passing additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as python dictionaries (:issue:`56842`)
 - :meth:`.DataFrameGroupBy.transform`, :meth:`.SeriesGroupBy.transform`, :meth:`.DataFrameGroupBy.agg`, :meth:`.SeriesGroupBy.agg`, :meth:`.SeriesGroupBy.apply`, :meth:`.DataFrameGroupBy.apply` now support ``kurt`` (:issue:`40139`)
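
For context, a minimal usage sketch of the newly documented parameter; the frame, column names and values below are illustrative, not part of the commit:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"key": ["a", "a", "b", "b"], "val": [1.0, np.nan, 3.0, 4.0]})

    # Default behaviour: NaN values are skipped within each group.
    df.groupby("key")["val"].median(skipna=True)   # a -> 1.0, b -> 3.5

    # New in this change: a NaN in a group propagates to that group's median.
    df.groupby("key")["val"].median(skipna=False)  # a -> NaN, b -> 3.5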

pandas/_libs/groupby.pyi

Lines changed: 1 addition & 0 deletions
@@ -13,6 +13,7 @@ def group_median_float64(
     mask: np.ndarray | None = ...,
     result_mask: np.ndarray | None = ...,
     is_datetimelike: bool = ...,  # bint
+    skipna: bool = ...,
 ) -> None: ...
 def group_cumprod(
     out: np.ndarray,  # float64_t[:, ::1]

pandas/_libs/groupby.pyx

Lines changed: 13 additions & 6 deletions
@@ -62,7 +62,12 @@ cdef enum InterpolationEnumType:
     INTERPOLATION_MIDPOINT
 
 
-cdef float64_t median_linear_mask(float64_t* a, int n, uint8_t* mask) noexcept nogil:
+cdef float64_t median_linear_mask(
+    float64_t* a,
+    int n,
+    uint8_t* mask,
+    bint skipna=True
+) noexcept nogil:
     cdef:
         int i, j, na_count = 0
         float64_t* tmp
@@ -77,7 +82,7 @@ cdef float64_t median_linear_mask(float64_t* a, int n, uint8_t* mask) noexcept nogil:
             na_count += 1
 
     if na_count:
-        if na_count == n:
+        if na_count == n or not skipna:
             return NaN
 
         tmp = <float64_t*>malloc((n - na_count) * sizeof(float64_t))
@@ -104,7 +109,8 @@ cdef float64_t median_linear_mask(float64_t* a, int n, uint8_t* mask) noexcept nogil:
 cdef float64_t median_linear(
     float64_t* a,
     int n,
-    bint is_datetimelike=False
+    bint is_datetimelike=False,
+    bint skipna=True,
 ) noexcept nogil:
     cdef:
         int i, j, na_count = 0
@@ -125,7 +131,7 @@ cdef float64_t median_linear(
             na_count += 1
 
     if na_count:
-        if na_count == n:
+        if na_count == n or not skipna:
             return NaN
 
         tmp = <float64_t*>malloc((n - na_count) * sizeof(float64_t))
@@ -186,6 +192,7 @@ def group_median_float64(
     const uint8_t[:, :] mask=None,
     uint8_t[:, ::1] result_mask=None,
     bint is_datetimelike=False,
+    bint skipna=True,
 ) -> None:
     """
     Only aggregates on axis=0
@@ -229,7 +236,7 @@
 
             for j in range(ngroups):
                 size = _counts[j + 1]
-                result = median_linear_mask(ptr, size, ptr_mask)
+                result = median_linear_mask(ptr, size, ptr_mask, skipna)
                 out[j, i] = result
 
                 if result != result:
@@ -244,7 +251,7 @@
             ptr += _counts[0]
             for j in range(ngroups):
                 size = _counts[j + 1]
-                out[j, i] = median_linear(ptr, size, is_datetimelike)
+                out[j, i] = median_linear(ptr, size, is_datetimelike, skipna)
                 ptr += size
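
To make the kernel change easier to follow, here is a rough pure-Python sketch of the skipna behaviour that median_linear / median_linear_mask now implement. It is only an illustration of the logic above, not the actual implementation; the real kernels operate on raw float64 buffers, track missing values via a mask or NaN, and select the median in place rather than sorting:

    import math

    def median_linear_sketch(values, skipna=True):
        # values: list of floats, with NaN marking missing entries
        if not values:
            return float("nan")
        non_na = [v for v in values if not math.isnan(v)]
        na_count = len(values) - len(non_na)
        if na_count:
            # an all-NA group, or any NA when skipna=False, yields NaN
            if na_count == len(values) or not skipna:
                return float("nan")
        non_na.sort()
        n = len(non_na)
        mid = n // 2
        if n % 2:
            return non_na[mid]
        return (non_na[mid - 1] + non_na[mid]) / 2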

pandas/core/groupby/groupby.py

Lines changed: 11 additions & 2 deletions
@@ -2248,7 +2248,7 @@ def mean(
         return result.__finalize__(self.obj, method="groupby")
 
     @final
-    def median(self, numeric_only: bool = False) -> NDFrameT:
+    def median(self, numeric_only: bool = False, skipna: bool = True) -> NDFrameT:
         """
         Compute median of groups, excluding missing values.
 
@@ -2263,6 +2263,12 @@ def median(self, numeric_only: bool = False) -> NDFrameT:
 
             numeric_only no longer accepts ``None`` and defaults to False.
 
+        skipna : bool, default True
+            Exclude NA/null values. If an entire row/column is NA, the result
+            will be NA.
+
+            .. versionadded:: 3.0.0
+
         Returns
         -------
         Series or DataFrame
@@ -2335,8 +2341,11 @@ def median(self, numeric_only: bool = False) -> NDFrameT:
         """
         result = self._cython_agg_general(
             "median",
-            alt=lambda x: Series(x, copy=False).median(numeric_only=numeric_only),
+            alt=lambda x: Series(x, copy=False).median(
+                numeric_only=numeric_only, skipna=skipna
+            ),
             numeric_only=numeric_only,
+            skipna=skipna,
         )
         return result.__finalize__(self.obj, method="groupby")
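
When the Cython kernel cannot be used for a dtype, the ``alt`` callable above is applied per group and simply defers to the existing Series reduction, which already honours ``skipna``. A small illustration of that fallback behaviour, with hypothetical group values:

    import pandas as pd

    group_values = pd.Series([1.0, float("nan"), 3.0])

    group_values.median(skipna=True)   # 2.0 -- the NaN is ignored
    group_values.median(skipna=False)  # nan -- the NaN propagates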

pandas/tests/groupby/test_api.py

Lines changed: 2 additions & 6 deletions
@@ -179,9 +179,7 @@ def test_frame_consistency(groupby_func):
     elif groupby_func in ("sum", "mean", "std", "var"):
         exclude_expected = {"axis", "kwargs"}
         exclude_result = {"engine", "engine_kwargs"}
-    elif groupby_func in ("median"):
-        exclude_expected = {"axis", "kwargs", "skipna"}
-    elif groupby_func in ("prod", "sem"):
+    elif groupby_func in ("median", "prod", "sem"):
         exclude_expected = {"axis", "kwargs"}
     elif groupby_func in ("bfill", "ffill"):
         exclude_expected = {"inplace", "axis", "limit_area"}
@@ -239,9 +237,7 @@ def test_series_consistency(request, groupby_func):
     elif groupby_func in ("sum", "mean", "std", "var"):
         exclude_expected = {"axis", "kwargs"}
         exclude_result = {"engine", "engine_kwargs"}
-    elif groupby_func in ("median"):
-        exclude_expected = {"axis", "kwargs", "skipna"}
-    elif groupby_func in ("prod", "sem"):
+    elif groupby_func in ("median", "prod", "sem"):
         exclude_expected = {"axis", "kwargs"}
     elif groupby_func in ("bfill", "ffill"):
         exclude_expected = {"inplace", "axis", "limit_area"}

pandas/tests/groupby/test_reductions.py

Lines changed: 28 additions & 0 deletions
@@ -585,6 +585,34 @@ def test_sum_skipna_object(skipna):
             "datetime64[ns]",
             "datetime64[ns]",
         ),
+        ("median", [0, -1, 3, 4, 5, -6, 7, np.nan, 8, 9], "float64", "float64"),
+        ("median", [0, 1, 3, -4, 5, 6, 7, -8, np.nan, 9], "Float64", "Float64"),
+        ("median", [0, -1, 3, 4, 5, -6, 7, 8, 9, np.nan], "Int64", "Float64"),
+        (
+            "median",
+            [0, 1, np.nan, 3, 4, 5, 6, 7, 8, 9],
+            "timedelta64[ns]",
+            "timedelta64[ns]",
+        ),
+        (
+            "median",
+            pd.to_datetime(
+                [
+                    "2019-05-09",
+                    pd.NaT,
+                    "2019-05-11",
+                    "2019-05-12",
+                    "2019-05-13",
+                    "2019-05-14",
+                    "2019-05-15",
+                    "2019-05-16",
+                    "2019-05-17",
+                    "2019-05-18",
+                ]
+            ),
+            "datetime64[ns]",
+            "datetime64[ns]",
+        ),
     ],
 )
 def test_multifunc_skipna(func, values, dtype, result_dtype, skipna):
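
The body of test_multifunc_skipna is outside this diff; to show what the new parametrized cases exercise, here is one of the float64 cases run by hand (illustrative only, not the test code itself):

    import numpy as np
    import pandas as pd

    values = [0, -1, 3, 4, 5, -6, 7, np.nan, 8, 9]
    df = pd.DataFrame({"key": ["x"] * len(values), "val": values})

    df.groupby("key")["val"].median(skipna=True)   # 4.0 -- median of the nine non-NaN values
    df.groupby("key")["val"].median(skipna=False)  # NaN -- the group contains a NaN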
