Merged
doc/source/whatsnew/v3.0.0.rst (2 changes: 1 addition & 1 deletion)
@@ -58,7 +58,7 @@ Other enhancements
 - :meth:`Series.cummin` and :meth:`Series.cummax` now supports :class:`CategoricalDtype` (:issue:`52335`)
 - :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
 - :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`)
-- :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` methods ``sum``, ``mean``, ``prod``, ``std``, ``var`` and ``sem`` now accept ``skipna`` parameter (:issue:`15675`)
+- :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` methods ``sum``, ``mean``, ``prod``, ``min``, ``max``, ``std``, ``var`` and ``sem`` now accept ``skipna`` parameter (:issue:`15675`)
 - :class:`Rolling` and :class:`Expanding` now support aggregations ``first`` and ``last`` (:issue:`33155`)
 - :func:`read_parquet` accepts ``to_pandas_kwargs`` which are forwarded to :meth:`pyarrow.Table.to_pandas` which enables passing additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as python dictionaries (:issue:`56842`)
 - :meth:`.DataFrameGroupBy.transform`, :meth:`.SeriesGroupBy.transform`, :meth:`.DataFrameGroupBy.agg`, :meth:`.SeriesGroupBy.agg`, :meth:`.SeriesGroupBy.apply`, :meth:`.DataFrameGroupBy.apply` now support ``kurt`` (:issue:`40139`)
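Note: the amended whatsnew entry above can be exercised as follows. This is a minimal sketch assuming a pandas build that includes this PR; the expected outputs in the comments follow the documented ``skipna`` semantics.

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "b"], "val": [1.0, np.nan, 3.0]})

# Default behavior is unchanged: missing values are skipped.
df.groupby("key")["val"].min()              # a -> 1.0, b -> 3.0

# New in this PR: skipna=False propagates NaN into the group result.
df.groupby("key")["val"].min(skipna=False)  # a -> NaN, b -> 3.0
```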
pandas/_libs/groupby.pyi (2 changes: 2 additions & 0 deletions)
@@ -185,6 +185,7 @@ def group_max(
     is_datetimelike: bool = ...,
     mask: np.ndarray | None = ...,
     result_mask: np.ndarray | None = ...,
+    skipna: bool = ...,
 ) -> None: ...
 def group_min(
     out: np.ndarray,  # groupby_t[:, ::1]
@@ -195,6 +196,7 @@ def group_min(
     is_datetimelike: bool = ...,
     mask: np.ndarray | None = ...,
     result_mask: np.ndarray | None = ...,
+    skipna: bool = ...,
 ) -> None: ...
 def group_idxmin_idxmax(
     out: npt.NDArray[np.intp],
pandas/_libs/groupby.pyx (81 changes: 49 additions & 32 deletions)
@@ -815,7 +815,7 @@ def group_prod(
         int64float_t[:, ::1] prodx
         int64_t[:, ::1] nobs
         Py_ssize_t len_values = len(values), len_labels = len(labels)
-        bint isna_entry, uses_mask = mask is not None
+        bint isna_entry, isna_result, uses_mask = mask is not None
 
     if len_values != len_labels:
         raise ValueError("len(index) != len(labels)")
@@ -842,17 +842,16 @@
             for j in range(K):
                 val = values[i, j]
 
-                if not skipna and (
-                    (uses_mask and result_mask[lab, j]) or
-                    _treat_as_na(prodx[lab, j], False)
-                ):
-                    # If prod is already NA, no need to update it
-                    continue
-
                 if uses_mask:
                     isna_entry = mask[i, j]
+                    isna_result = result_mask[lab, j]
                 else:
                     isna_entry = _treat_as_na(val, False)
+                    isna_result = _treat_as_na(prodx[lab, j], False)
+
+                if not skipna and isna_result:
+                    # If prod is already NA, no need to update it
+                    continue
 
                 if not isna_entry:
                     nobs[lab, j] += 1
@@ -890,7 +889,7 @@ def group_var(
         floating[:, ::1] mean
         int64_t[:, ::1] nobs
         Py_ssize_t len_values = len(values), len_labels = len(labels)
-        bint isna_entry, uses_mask = mask is not None
+        bint isna_entry, isna_result, uses_mask = mask is not None
         bint is_std = name == "std"
         bint is_sem = name == "sem"
 
@@ -917,25 +916,24 @@ def group_var(
             for j in range(K):
                 val = values[i, j]
 
-                if not skipna and (
-                    (uses_mask and result_mask[lab, j]) or
-                    (is_datetimelike and out[lab, j] == NPY_NAT) or
-                    _treat_as_na(out[lab, j], False)
-                ):
-                    # If aggregate is already NA, don't add to it. This is important for
-                    # datetimelike because adding a value to NPY_NAT may not result
-                    # in a NPY_NAT
-                    continue
-
                 if uses_mask:
                     isna_entry = mask[i, j]
+                    isna_result = result_mask[lab, j]
                 elif is_datetimelike:
                     # With group_var, we cannot just use _treat_as_na bc
                     # datetimelike dtypes get cast to float64 instead of
                     # to int64.
                     isna_entry = val == NPY_NAT
+                    isna_result = out[lab, j] == NPY_NAT
                 else:
                     isna_entry = _treat_as_na(val, is_datetimelike)
+                    isna_result = _treat_as_na(out[lab, j], is_datetimelike)
+
+                if not skipna and isna_result:
+                    # If aggregate is already NA, don't add to it. This is important for
+                    # datetimelike because adding a value to NPY_NAT may not result
+                    # in a NPY_NAT
+                    continue
 
                 if not isna_entry:
                     nobs[lab, j] += 1
@@ -1201,7 +1199,7 @@ def group_mean(
         mean_t[:, ::1] sumx, compensation
         int64_t[:, ::1] nobs
         Py_ssize_t len_values = len(values), len_labels = len(labels)
-        bint isna_entry, uses_mask = mask is not None
+        bint isna_entry, isna_result, uses_mask = mask is not None
 
     assert min_count == -1, "'min_count' only used in sum and prod"
 
Expand Down Expand Up @@ -1231,25 +1229,24 @@ def group_mean(
for j in range(K):
val = values[i, j]

if not skipna and (
(uses_mask and result_mask[lab, j]) or
(is_datetimelike and sumx[lab, j] == NPY_NAT) or
_treat_as_na(sumx[lab, j], False)
):
# If sum is already NA, don't add to it. This is important for
# datetimelike because adding a value to NPY_NAT may not result
# in NPY_NAT
continue

if uses_mask:
isna_entry = mask[i, j]
isna_result = result_mask[lab, j]
elif is_datetimelike:
# With group_mean, we cannot just use _treat_as_na bc
# datetimelike dtypes get cast to float64 instead of
# to int64.
isna_entry = val == NPY_NAT
isna_result = sumx[lab, j] == NPY_NAT
else:
isna_entry = _treat_as_na(val, is_datetimelike)
isna_result = _treat_as_na(sumx[lab, j], is_datetimelike)

if not skipna and isna_result:
# If sum is already NA, don't add to it. This is important for
# datetimelike because adding a value to NPY_NAT may not result
# in NPY_NAT
continue

if not isna_entry:
nobs[lab, j] += 1
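The ``NPY_NAT`` comments in `group_var` and `group_mean` above exist because datetimelike values are handled specially. A hedged example of the behavior those guards protect (assumes a build with the ``skipna`` support for ``mean`` already noted in the whatsnew entry):

```python
import pandas as pd

ts = pd.Series(pd.to_datetime(["2024-01-01", None, "2024-01-03"]))
key = ["g1", "g1", "g2"]

# With skipna=False, the NaT in group g1 poisons its mean; adding a value
# to NPY_NAT must not silently produce a non-NaT result, hence the guard.
ts.groupby(key).mean(skipna=False)  # g1 -> NaT, g2 -> 2024-01-03
```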
@@ -1843,6 +1840,7 @@ cdef group_min_max(
     bint compute_max=True,
     const uint8_t[:, ::1] mask=None,
     uint8_t[:, ::1] result_mask=None,
+    bint skipna=True,
 ):
     """
     Compute minimum/maximum of columns of `values`, in row groups `labels`.
Expand Down Expand Up @@ -1870,6 +1868,8 @@ cdef group_min_max(
result_mask : ndarray[bool, ndim=2], optional
If not None, these specify locations in the output that are NA.
Modified in-place.
skipna : bool, default True
If True, ignore nans in `values`.

Notes
-----
@@ -1878,17 +1878,18 @@ cdef group_min_max(
     """
     cdef:
         Py_ssize_t i, j, N, K, lab, ngroups = len(counts)
-        numeric_t val
+        numeric_t val, nan_val
         numeric_t[:, ::1] group_min_or_max
         int64_t[:, ::1] nobs
         bint uses_mask = mask is not None
-        bint isna_entry
+        bint isna_entry, isna_result
 
     if not len(values) == len(labels):
         raise AssertionError("len(index) != len(labels)")
 
     min_count = max(min_count, 1)
     nobs = np.zeros((<object>out).shape, dtype=np.int64)
+    nan_val = _get_na_val(<numeric_t>0, is_datetimelike)
 
     group_min_or_max = np.empty_like(out)
     group_min_or_max[:] = _get_min_or_max(<numeric_t>0, compute_max, is_datetimelike)
@@ -1907,8 +1908,15 @@ cdef group_min_max(
 
                 if uses_mask:
                     isna_entry = mask[i, j]
+                    isna_result = result_mask[lab, j]
                 else:
                     isna_entry = _treat_as_na(val, is_datetimelike)
+                    isna_result = _treat_as_na(group_min_or_max[lab, j],
+                                               is_datetimelike)
+
+                if not skipna and isna_result:
+                    # If current min/max is already NA, it will always be NA
+                    continue
 
                 if not isna_entry:
                     nobs[lab, j] += 1
@@ -1918,6 +1926,11 @@ cdef group_min_max(
                     else:
                         if val < group_min_or_max[lab, j]:
                             group_min_or_max[lab, j] = val
+                elif not skipna:
+                    if uses_mask:
+                        result_mask[lab, j] = True
+                    else:
+                        group_min_or_max[lab, j] = nan_val
 
     _check_below_mincount(
         out, uses_mask, result_mask, ngroups, K, nobs, min_count, group_min_or_max
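As a plain-Python illustration of the control flow `group_min_max` now implements (not the actual Cython kernel; `grouped_min_sketch` is a made-up name for this sketch): once a group's running min is NA under ``skipna=False``, later values can never overwrite it.

```python
import numpy as np

def grouped_min_sketch(values, labels, ngroups, skipna=True):
    # Illustrative analogue of the loop above: track per-group NA state
    # instead of a result_mask / nan_val pair.
    out = np.full(ngroups, np.inf)
    na_result = np.zeros(ngroups, dtype=bool)
    for val, lab in zip(values, labels):
        if lab < 0:
            continue
        if not skipna and na_result[lab]:
            continue  # result is already NA; it stays NA
        if np.isnan(val):
            if not skipna:
                na_result[lab] = True
                out[lab] = np.nan  # mirrors group_min_or_max[lab, j] = nan_val
            continue
        out[lab] = min(out[lab], val)
    return out

vals = np.array([1.0, np.nan, 3.0])
labs = np.array([0, 0, 1])
grouped_min_sketch(vals, labs, 2)                # [1.0, 3.0]
grouped_min_sketch(vals, labs, 2, skipna=False)  # [nan, 3.0]
```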
@@ -2049,6 +2062,7 @@ def group_max(
     bint is_datetimelike=False,
     const uint8_t[:, ::1] mask=None,
     uint8_t[:, ::1] result_mask=None,
+    bint skipna=True,
 ) -> None:
     """See group_min_max.__doc__"""
     group_min_max(
@@ -2061,6 +2075,7 @@ def group_max(
         compute_max=True,
         mask=mask,
         result_mask=result_mask,
+        skipna=skipna,
     )
 
 
@@ -2075,6 +2090,7 @@ def group_min(
     bint is_datetimelike=False,
     const uint8_t[:, ::1] mask=None,
     uint8_t[:, ::1] result_mask=None,
+    bint skipna=True,
 ) -> None:
     """See group_min_max.__doc__"""
     group_min_max(
@@ -2087,6 +2103,7 @@ def group_min(
         compute_max=False,
         mask=mask,
         result_mask=result_mask,
+        skipna=skipna,
     )
 
 
pandas/core/_numba/kernels/min_max_.py (10 changes: 7 additions & 3 deletions)
@@ -80,14 +80,15 @@ def sliding_min_max(
     return output, na_pos
 
 
-@numba.jit(nopython=True, nogil=True, parallel=False)
+@numba.jit(nopython=True, nogil=False, parallel=False)
 def grouped_min_max(
     values: np.ndarray,
     result_dtype: np.dtype,
     labels: npt.NDArray[np.intp],
     ngroups: int,
     min_periods: int,
     is_max: bool,
+    skipna: bool = True,
 ) -> tuple[np.ndarray, list[int]]:
     N = len(labels)
     nobs = np.zeros(ngroups, dtype=np.int64)
@@ -97,13 +98,16 @@ def grouped_min_max(
     for i in range(N):
         lab = labels[i]
         val = values[i]
-        if lab < 0:
+        if lab < 0 or (nobs[lab] >= 1 and np.isnan(output[lab])):
            continue
 
         if values.dtype.kind == "i" or not np.isnan(val):
             nobs[lab] += 1
         else:
-            # NaN value cannot be a min/max value
+            if not skipna:
+                # If skipna is False and we encounter a NaN,
+                # both min and max of the group will be NaN
+                output[lab] = np.nan
             continue
 
         if nobs[lab] == 1:
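The same semantics apply on the numba engine path backed by this kernel. A hedged usage sketch (assumes numba is installed; the data frame is hypothetical):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "b"], "val": [1.0, np.nan, 3.0]})

# Routes through grouped_min_max above; with skipna=False the NaN in
# group "a" propagates, matching the Cython engine.
df.groupby("key")["val"].max(engine="numba", skipna=False)  # a -> NaN, b -> 3.0
```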
pandas/core/groupby/groupby.py (12 changes: 10 additions & 2 deletions)
@@ -3068,12 +3068,13 @@ def prod(
 
     @final
     @doc(
-        _groupby_agg_method_engine_template,
+        _groupby_agg_method_skipna_engine_template,
         fname="min",
         no=False,
         mc=-1,
         e=None,
         ek=None,
+        s=True,
         example=dedent(
             """\
             For SeriesGroupBy:
@@ -3113,6 +3114,7 @@ def min(
         self,
         numeric_only: bool = False,
         min_count: int = -1,
+        skipna: bool = True,
         engine: Literal["cython", "numba"] | None = None,
         engine_kwargs: dict[str, bool] | None = None,
     ):
@@ -3125,23 +3127,26 @@ def min(
                 engine_kwargs,
                 min_periods=min_count,
                 is_max=False,
+                skipna=skipna,
             )
         else:
             return self._agg_general(
                 numeric_only=numeric_only,
                 min_count=min_count,
+                skipna=skipna,
                 alias="min",
                 npfunc=np.min,
             )
 
     @final
     @doc(
-        _groupby_agg_method_engine_template,
+        _groupby_agg_method_skipna_engine_template,
         fname="max",
         no=False,
         mc=-1,
         e=None,
         ek=None,
+        s=True,
         example=dedent(
             """\
             For SeriesGroupBy:
@@ -3181,6 +3186,7 @@ def max(
         self,
         numeric_only: bool = False,
         min_count: int = -1,
+        skipna: bool = True,
         engine: Literal["cython", "numba"] | None = None,
         engine_kwargs: dict[str, bool] | None = None,
     ):
@@ -3193,11 +3199,13 @@ def max(
                 engine_kwargs,
                 min_periods=min_count,
                 is_max=True,
+                skipna=skipna,
             )
         else:
             return self._agg_general(
                 numeric_only=numeric_only,
                 min_count=min_count,
+                skipna=skipna,
                 alias="max",
                 npfunc=np.max,
             )
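Both public entry points now thread ``skipna`` through to the kernels. One more hedged sketch for the masked-array path, where the kernel sets the result mask instead of writing a NaN sentinel, so the propagated missing value is ``pd.NA`` rather than NaN:

```python
import pandas as pd

s = pd.Series([1, None, 5, 4], dtype="Int64")
key = ["x", "x", "y", "y"]

# Nullable dtypes use the masked kernels: with skipna=False the group
# containing pd.NA reports <NA>; groups without missing values are unaffected.
s.groupby(key).max(skipna=False)  # x -> <NA>, y -> 5
```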
pandas/tests/groupby/test_api.py (4 changes: 2 additions & 2 deletions)
@@ -174,7 +174,7 @@ def test_frame_consistency(groupby_func):
     elif groupby_func in ("nunique",):
         exclude_expected = {"axis"}
     elif groupby_func in ("max", "min"):
-        exclude_expected = {"axis", "kwargs", "skipna"}
+        exclude_expected = {"axis", "kwargs"}
         exclude_result = {"min_count", "engine", "engine_kwargs"}
     elif groupby_func in ("sum", "mean", "std", "var"):
         exclude_expected = {"axis", "kwargs"}
@@ -234,7 +234,7 @@ def test_series_consistency(request, groupby_func):
     if groupby_func in ("any", "all"):
         exclude_expected = {"kwargs", "bool_only", "axis"}
     elif groupby_func in ("max", "min"):
-        exclude_expected = {"axis", "kwargs", "skipna"}
+        exclude_expected = {"axis", "kwargs"}
         exclude_result = {"min_count", "engine", "engine_kwargs"}
     elif groupby_func in ("sum", "mean", "std", "var"):
         exclude_expected = {"axis", "kwargs"}