Merged
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v3.0.0.rst
@@ -58,9 +58,9 @@ Other enhancements
- :meth:`Series.cummin` and :meth:`Series.cummax` now support :class:`CategoricalDtype` (:issue:`52335`)
- :meth:`Series.plot` now correctly handles the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
- :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`)
- :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` methods ``sum``, ``mean``, ``prod``, ``std``, ``var`` and ``sem`` now accept ``skipna`` parameter (:issue:`15675`)
- :class:`Rolling` and :class:`Expanding` now support aggregations ``first`` and ``last`` (:issue:`33155`)
- :func:`read_parquet` accepts ``to_pandas_kwargs``, which is forwarded to :meth:`pyarrow.Table.to_pandas`; this enables passing additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as Python dictionaries (:issue:`56842`)
- :meth:`.DataFrameGroupBy.mean`, :meth:`.DataFrameGroupBy.sum`, :meth:`.SeriesGroupBy.mean` and :meth:`.SeriesGroupBy.sum` now accept ``skipna`` parameter (:issue:`15675`)
- :meth:`.DataFrameGroupBy.transform`, :meth:`.SeriesGroupBy.transform`, :meth:`.DataFrameGroupBy.agg`, :meth:`.SeriesGroupBy.agg`, :meth:`.SeriesGroupBy.apply`, :meth:`.DataFrameGroupBy.apply` now support ``kurt`` (:issue:`40139`)
- :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`RollingGroupby.apply`, :meth:`ExpandingGroupby.apply`, :meth:`Rolling.apply`, :meth:`Expanding.apply`, :meth:`DataFrame.apply` with ``engine="numba"`` now support positional arguments passed as kwargs (:issue:`58995`)
- :meth:`Rolling.agg`, :meth:`Expanding.agg` and :meth:`ExponentialMovingWindow.agg` now accept :class:`NamedAgg` aggregations through ``**kwargs`` (:issue:`28333`)
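
A minimal usage sketch of the new ``skipna`` behavior described in the entries above (data and names are illustrative, not from this PR):

import numpy as np
import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "a", "b", "b"], "val": [1.0, 2.0, np.nan, 3.0, 4.0]})

df.groupby("key")["val"].prod()              # a: 2.0, b: 12.0 (NA skipped; previously the only behavior)
df.groupby("key")["val"].prod(skipna=False)  # a: NaN (group contains NA), b: 12.0
df.groupby("key")["val"].var(skipna=False)   # a: NaN, b: 0.5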
2 changes: 2 additions & 0 deletions pandas/_libs/groupby.pyi
@@ -76,6 +76,7 @@ def group_prod(
mask: np.ndarray | None,
result_mask: np.ndarray | None = ...,
min_count: int = ...,
skipna: bool = ...,
) -> None: ...
def group_var(
out: np.ndarray, # floating[:, ::1]
@@ -88,6 +89,7 @@ def group_var(
result_mask: np.ndarray | None = ...,
is_datetimelike: bool = ...,
name: str = ...,
skipna: bool = ...,
) -> None: ...
def group_skew(
out: np.ndarray, # float64_t[:, ::1]
39 changes: 38 additions & 1 deletion pandas/_libs/groupby.pyx
@@ -806,13 +806,14 @@ def group_prod(
const uint8_t[:, ::1] mask,
uint8_t[:, ::1] result_mask=None,
Py_ssize_t min_count=0,
bint skipna=True,
) -> None:
"""
Only aggregates on axis=0
"""
cdef:
Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
int64float_t val
int64float_t val, nan_val
int64float_t[:, ::1] prodx
int64_t[:, ::1] nobs
Py_ssize_t len_values = len(values), len_labels = len(labels)
@@ -825,6 +826,13 @@ def group_prod(
prodx = np.ones((<object>out).shape, dtype=(<object>out).base.dtype)

N, K = (<object>values).shape
if uses_mask:
nan_val = 0
elif int64float_t is int64_t or int64float_t is uint64_t:
# This has no effect as int64 can't be NaN. Set to 0 to avoid a type error
nan_val = 0
else:
nan_val = NAN

with nogil:
for i in range(N):
@@ -836,6 +844,13 @@
for j in range(K):
val = values[i, j]

if not skipna and (
(uses_mask and result_mask[lab, j]) or
_treat_as_na(prodx[lab, j], False)
):
# If prod is already NA, no need to update it
continue

if uses_mask:
isna_entry = mask[i, j]
else:
@@ -844,6 +859,11 @@
if not isna_entry:
nobs[lab, j] += 1
prodx[lab, j] *= val
elif not skipna:
if uses_mask:
result_mask[lab, j] = True
else:
prodx[lab, j] = nan_val

_check_below_mincount(
out, uses_mask, result_mask, ncounts, K, nobs, min_count, prodx
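
For readers following the Cython above, a pure-Python sketch of the same skipna=False propagation in group_prod (simplified: floats only, no mask, min_count, or nobs bookkeeping):

import numpy as np

def group_prod_sketch(values, labels, ngroups, skipna=True):
    # Running product per group; NaN marks a group already poisoned by an NA.
    prodx = np.ones(ngroups)
    for val, lab in zip(values, labels):
        if not skipna and np.isnan(prodx[lab]):
            continue  # product is already NA; nothing more to do for this group
        if not np.isnan(val):
            prodx[lab] *= val
        elif not skipna:
            prodx[lab] = np.nan
    return prodx

group_prod_sketch(np.array([1.0, 2.0, np.nan]), np.array([0, 0, 0]), 1, skipna=False)  # -> array([nan])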
@@ -864,6 +884,7 @@ def group_var(
uint8_t[:, ::1] result_mask=None,
bint is_datetimelike=False,
str name="var",
bint skipna=True,
) -> None:
cdef:
Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
@@ -898,6 +919,16 @@
for j in range(K):
val = values[i, j]

if not skipna and (
(uses_mask and result_mask[lab, j]) or
(is_datetimelike and out[lab, j] == NPY_NAT) or
_treat_as_na(out[lab, j], False)
):
# If the aggregate is already NA, don't add to it. This is important for
# datetimelike because adding a value to NPY_NAT may not result
# in NPY_NAT
continue

if uses_mask:
isna_entry = mask[i, j]
elif is_datetimelike:
@@ -913,6 +944,12 @@
oldmean = mean[lab, j]
mean[lab, j] += (val - oldmean) / nobs[lab, j]
out[lab, j] += (val - mean[lab, j]) * (val - oldmean)
elif not skipna:
nobs[lab, j] = 0
if uses_mask:
result_mask[lab, j] = True
else:
out[lab, j] = NAN

for i in range(ncounts):
for j in range(K):
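
The oldmean/mean/out updates above are Welford's online variance recurrence; a hedged pure-Python sketch of the grouped version with the same skipna=False short-circuit (floats only, no masks or datetimelike handling):

import numpy as np

def grouped_var_sketch(values, labels, ngroups, ddof=1, skipna=True):
    nobs = np.zeros(ngroups)
    mean = np.zeros(ngroups)
    m2 = np.zeros(ngroups)  # running sum of squared deviations (Welford's M2)
    for val, lab in zip(values, labels):
        if not skipna and np.isnan(m2[lab]):
            continue  # group already NA; don't add to it
        if np.isnan(val):
            if not skipna:
                m2[lab] = np.nan
            continue
        nobs[lab] += 1
        delta = val - mean[lab]
        mean[lab] += delta / nobs[lab]
        m2[lab] += delta * (val - mean[lab])
    with np.errstate(invalid="ignore", divide="ignore"):
        return np.where(nobs > ddof, m2 / (nobs - ddof), np.nan)

grouped_var_sketch(np.array([1.0, 2.0, np.nan]), np.array([0, 0, 0]), 1)                # -> array([0.5])
grouped_var_sketch(np.array([1.0, 2.0, np.nan]), np.array([0, 0, 0]), 1, skipna=False)  # -> array([nan])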
11 changes: 10 additions & 1 deletion pandas/core/_numba/kernels/var_.py
@@ -176,6 +176,7 @@ def grouped_var(
ngroups: int,
min_periods: int,
ddof: int = 1,
skipna: bool = True,
) -> tuple[np.ndarray, list[int]]:
N = len(labels)

@@ -190,7 +191,15 @@
lab = labels[i]
val = values[i]

if lab < 0:
if lab < 0 or np.isnan(output[lab]):
continue

if not skipna and np.isnan(val):
output[lab] = np.nan
nobs_arr[lab] += 1
comp_arr[lab] = np.nan
consecutive_counts[lab] = 1
prev_vals[lab] = np.nan
continue

mean_x = means[lab]
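
The kernel above backs the ``engine="numba"`` execution path; a usage sketch (assumes numba is installed; the kernel is JIT-compiled on first use, so the first call is slow):

import numpy as np
import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "a", "b", "b"], "val": [1.0, 2.0, np.nan, 3.0, 4.0]})

df.groupby("key")["val"].var(engine="numba")                # a: 0.5, b: 0.5 (NA skipped)
df.groupby("key")["val"].var(engine="numba", skipna=False)  # a: NaN, b: 0.5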
51 changes: 45 additions & 6 deletions pandas/core/groupby/groupby.py
@@ -2349,6 +2349,7 @@ def std(
engine: Literal["cython", "numba"] | None = None,
engine_kwargs: dict[str, bool] | None = None,
numeric_only: bool = False,
skipna: bool = True,
):
"""
Compute standard deviation of groups, excluding missing values.
@@ -2387,6 +2388,12 @@

numeric_only now defaults to ``False``.

skipna : bool, default True
Exclude NA/null values. If an entire group is NA, the result
will be NA.

.. versionadded:: 3.0.0

Returns
-------
Series or DataFrame
@@ -2441,14 +2448,16 @@
engine_kwargs,
min_periods=0,
ddof=ddof,
skipna=skipna,
)
)
else:
return self._cython_agg_general(
"std",
alt=lambda x: Series(x, copy=False).std(ddof=ddof),
alt=lambda x: Series(x, copy=False).std(ddof=ddof, skipna=skipna),
numeric_only=numeric_only,
ddof=ddof,
skipna=skipna,
)

@final
@@ -2460,6 +2469,7 @@ def var(
engine: Literal["cython", "numba"] | None = None,
engine_kwargs: dict[str, bool] | None = None,
numeric_only: bool = False,
skipna: bool = True,
):
"""
Compute variance of groups, excluding missing values.
@@ -2497,6 +2507,12 @@

numeric_only now defaults to ``False``.

skipna : bool, default True
Exclude NA/null values. If an entire group is NA, the result
will be NA.

.. versionadded:: 3.0.0

Returns
-------
Series or DataFrame
@@ -2550,13 +2566,15 @@
engine_kwargs,
min_periods=0,
ddof=ddof,
skipna=skipna,
)
else:
return self._cython_agg_general(
"var",
alt=lambda x: Series(x, copy=False).var(ddof=ddof),
alt=lambda x: Series(x, copy=False).var(ddof=ddof, skipna=skipna),
numeric_only=numeric_only,
ddof=ddof,
skipna=skipna,
)

@final
@@ -2686,7 +2704,9 @@ def _value_counts(
return result.__finalize__(self.obj, method="value_counts")

@final
def sem(self, ddof: int = 1, numeric_only: bool = False) -> NDFrameT:
def sem(
self, ddof: int = 1, numeric_only: bool = False, skipna: bool = True
) -> NDFrameT:
"""
Compute standard error of the mean of groups, excluding missing values.

@@ -2706,6 +2726,12 @@ def sem(self, ddof: int = 1, numeric_only: bool = False) -> NDFrameT:

numeric_only now defaults to ``False``.

skipna : bool, default True
Exclude NA/null values. If an entire group is NA, the result
will be NA.

.. versionadded:: 3.0.0

Returns
-------
Series or DataFrame
@@ -2780,9 +2806,10 @@ def sem(self, ddof: int = 1, numeric_only: bool = False) -> NDFrameT:
)
return self._cython_agg_general(
"sem",
alt=lambda x: Series(x, copy=False).sem(ddof=ddof),
alt=lambda x: Series(x, copy=False).sem(ddof=ddof, skipna=skipna),
numeric_only=numeric_only,
ddof=ddof,
skipna=skipna,
)

@final
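
As a reference point for the sem changes above, the standard error of the mean is the group standard deviation divided by the square root of the number of valid observations; a quick identity check (illustrative, not from this PR):

import numpy as np
import pandas as pd

s = pd.Series([1.0, 3.0, 2.0, 4.0], index=["a", "a", "b", "b"])
g = s.groupby(level=0)

# Holds under the default ddof=1.
assert np.allclose(g.sem(), g.std() / np.sqrt(g.count()))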
@@ -2959,7 +2986,9 @@ def sum(
return result

@final
def prod(self, numeric_only: bool = False, min_count: int = 0) -> NDFrameT:
def prod(
self, numeric_only: bool = False, min_count: int = 0, skipna: bool = True
) -> NDFrameT:
"""
Compute prod of group values.

@@ -2976,6 +3005,12 @@ def prod(self, numeric_only: bool = False, min_count: int = 0) -> NDFrameT:
The required number of valid values to perform the operation. If fewer
than ``min_count`` non-NA values are present the result will be NA.

skipna : bool, default True
Exclude NA/null values. If an entire group is NA, the result
will be NA.

.. versionadded:: 3.0.0

Returns
-------
Series or DataFrame
@@ -3024,7 +3059,11 @@ def prod(self, numeric_only: bool = False, min_count: int = 0) -> NDFrameT:
2 30 72
"""
return self._agg_general(
numeric_only=numeric_only, min_count=min_count, alias="prod", npfunc=np.prod
numeric_only=numeric_only,
min_count=min_count,
skipna=skipna,
alias="prod",
npfunc=np.prod,
)

@final
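
A short sketch of how ``min_count`` and the new ``skipna`` differ for prod: ``min_count`` demands enough valid values, while ``skipna=False`` makes the result NA as soon as any NA is present (illustrative data):

import numpy as np
import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "b"], "val": [2.0, np.nan, 3.0]})

df.groupby("key")["val"].prod()              # a: 2.0, b: 3.0
df.groupby("key")["val"].prod(min_count=2)   # a: NaN, b: NaN (fewer than 2 valid values)
df.groupby("key")["val"].prod(skipna=False)  # a: NaN (group contains NA), b: 3.0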
51 changes: 50 additions & 1 deletion pandas/core/resample.py
@@ -1440,12 +1440,61 @@ def var(
return self._downsample("var", ddof=ddof, numeric_only=numeric_only)

@final
@doc(GroupBy.sem)
def sem(
self,
ddof: int = 1,
numeric_only: bool = False,
):
"""
Compute standard error of the mean of groups, excluding missing values.

For multiple groupings, the result index will be a MultiIndex.

Parameters
----------
ddof : int, default 1
Degrees of freedom.

numeric_only : bool, default False
Include only `float`, `int` or `boolean` data.

.. versionadded:: 1.5.0

.. versionchanged:: 2.0.0

numeric_only now defaults to ``False``.

Returns
-------
Series or DataFrame
Standard error of the mean of values within each group.

See Also
--------
DataFrame.sem : Return unbiased standard error of the mean over requested axis.
Series.sem : Return unbiased standard error of the mean over requested axis.

Examples
--------

>>> ser = pd.Series(
... [1, 3, 2, 4, 3, 8],
... index=pd.DatetimeIndex(
... [
... "2023-01-01",
... "2023-01-10",
... "2023-01-15",
... "2023-02-01",
... "2023-02-10",
... "2023-02-15",
... ]
... ),
... )
>>> ser.resample("MS").sem()
2023-01-01 0.577350
2023-02-01 1.527525
Freq: MS, dtype: float64
"""
return self._downsample("sem", ddof=ddof, numeric_only=numeric_only)

@final
2 changes: 1 addition & 1 deletion pandas/tests/groupby/aggregate/test_numba.py
@@ -186,7 +186,7 @@ def test_multifunc_numba_vs_cython_frame(agg_kwargs):
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("func", ["sum", "mean"])
@pytest.mark.parametrize("func", ["sum", "mean", "var", "std"])
def test_multifunc_numba_vs_cython_frame_noskipna(func):
pytest.importorskip("numba")
data = DataFrame(
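
A hedged sketch of the comparison this parametrized test performs, reconstructed from the visible signature (assumes numba is installed; the helper name and data are illustrative, and the real test body may differ):

import numpy as np
import pandas as pd
import pandas._testing as tm

def check_numba_vs_cython_noskipna(func: str) -> None:
    df = pd.DataFrame({"key": ["a", "a", "b"], "val": [1.0, np.nan, 3.0]})
    grouped = df.groupby("key")
    result = getattr(grouped, func)(skipna=False, engine="numba")
    expected = getattr(grouped, func)(skipna=False, engine="cython")
    tm.assert_frame_equal(result, expected)

for func in ["sum", "mean", "var", "std"]:
    check_numba_vs_cython_noskipna(func)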