Skip to content

Commit 7712840

Browse files
committed
ENH: Support kurtosis (kurt) in DataFrameGroupBy and SeriesGroupBy
1 parent 106f33c commit 7712840

File tree

18 files changed

+379
-20
lines changed

18 files changed

+379
-20
lines changed

doc/source/whatsnew/v3.0.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ Other enhancements
5656
- :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`)
5757
- :func:`read_parquet` accepts ``to_pandas_kwargs`` which are forwarded to :meth:`pyarrow.Table.to_pandas` which enables passing additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as python dictionaries (:issue:`56842`)
5858
- :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`RollingGroupby.apply`, :meth:`ExpandingGroupby.apply`, :meth:`Rolling.apply`, :meth:`Expanding.apply`, :meth:`DataFrame.apply` with ``engine="numba"`` now supports positional arguments passed as kwargs (:issue:`58995`)
59+
- Added :meth:`DataFrameGroupBy.kurt` and :meth:`SeriesGroupBy.kurt`; :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`SeriesGroupBy.apply`, :meth:`DataFrame.apply` now support ``kurt`` (:issue:`40139`)
5960
- :meth:`Series.map` can now accept kwargs to pass on to func (:issue:`59814`)
6061
- :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`)
6162
- :meth:`str.get_dummies` now accepts a ``dtype`` parameter to specify the dtype of the resulting DataFrame (:issue:`47872`)

pandas/_libs/groupby.pyi

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,15 @@ def group_skew(
9797
result_mask: np.ndarray | None = ...,
9898
skipna: bool = ...,
9999
) -> None: ...
100+
def group_kurt(
101+
out: np.ndarray, # float64_t[:, ::1]
102+
counts: np.ndarray, # int64_t[::1]
103+
values: np.ndarray, # ndarray[float64_t, ndim=2]
104+
labels: np.ndarray, # const intp_t[::1]
105+
mask: np.ndarray | None = ...,
106+
result_mask: np.ndarray | None = ...,
107+
skipna: bool = ...,
108+
) -> None: ...
100109
def group_mean(
101110
out: np.ndarray, # floating[:, ::1]
102111
counts: np.ndarray, # int64_t[::1]

pandas/_libs/groupby.pyx

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -995,6 +995,100 @@ def group_skew(
995995
)
996996

997997

998+
@cython.wraparound(False)
999+
@cython.boundscheck(False)
1000+
@cython.cdivision(True)
1001+
@cython.cpow
1002+
def group_kurt(
1003+
float64_t[:, ::1] out,
1004+
int64_t[::1] counts,
1005+
ndarray[float64_t, ndim=2] values,
1006+
const intp_t[::1] labels,
1007+
const uint8_t[:, ::1] mask=None,
1008+
uint8_t[:, ::1] result_mask=None,
1009+
bint skipna=True,
1010+
) -> None:
1011+
cdef:
1012+
Py_ssize_t i, j, N, K, lab, ngroups = len(counts)
1013+
int64_t[:, ::1] nobs
1014+
Py_ssize_t len_values = len(values), len_labels = len(labels)
1015+
bint isna_entry, uses_mask = mask is not None
1016+
float64_t[:, ::1] M1, M2, M3, M4
1017+
float64_t delta, delta_n, delta_n2, term1, val
1018+
int64_t n1, n
1019+
float64_t ct, num, den, adj
1020+
1021+
if len_values != len_labels:
1022+
raise ValueError("len(index) != len(labels)")
1023+
1024+
nobs = np.zeros((<object>out).shape, dtype=np.int64)
1025+
1026+
# M1 is the running mean; M2, M3 and M4 are the running sums of the 2nd, 3rd and 4th powers of deviations from that mean
1027+
M1 = np.zeros((<object>out).shape, dtype=np.float64)
1028+
M2 = np.zeros((<object>out).shape, dtype=np.float64)
1029+
M3 = np.zeros((<object>out).shape, dtype=np.float64)
1030+
M4 = np.zeros((<object>out).shape, dtype=np.float64)
1031+
1032+
N, K = (<object>values).shape
1033+
1034+
out[:, :] = 0.0
1035+
1036+
with nogil:
1037+
for i in range(N):
1038+
lab = labels[i]
1039+
if lab < 0:
1040+
continue
1041+
1042+
counts[lab] += 1
1043+
1044+
for j in range(K):
1045+
val = values[i, j]
1046+
1047+
if uses_mask:
1048+
isna_entry = mask[i, j]
1049+
else:
1050+
isna_entry = _treat_as_na(val, False)
1051+
1052+
if not isna_entry:
1053+
# Based on RunningStats::Push from
1054+
# https://www.johndcook.com/blog/skewness_kurtosis/
1055+
n1 = nobs[lab, j]
1056+
n = n1 + 1
1057+
1058+
nobs[lab, j] = n
1059+
delta = val - M1[lab, j]
1060+
delta_n = delta / n
1061+
delta_n2 = delta_n * delta_n
1062+
term1 = delta * delta_n * n1
1063+
1064+
M1[lab, j] += delta_n
1065+
M4[lab, j] += (term1 * delta_n2 * (n*n - 3*n + 3)
1066+
+ 6 * delta_n2 * M2[lab, j]
1067+
- 4 * delta_n * M3[lab, j])
1068+
M3[lab, j] += term1 * delta_n * (n - 2) - 3 * delta_n * M2[lab, j]
1069+
M2[lab, j] += term1
1070+
elif not skipna:
1071+
M1[lab, j] = NaN
1072+
M2[lab, j] = NaN
1073+
M3[lab, j] = NaN
1074+
M4[lab, j] = NaN
1075+
1076+
for i in range(ngroups):
1077+
for j in range(K):
1078+
ct = <float64_t>nobs[i, j]
1079+
if ct < 4:
1080+
if result_mask is not None:
1081+
result_mask[i, j] = 1
1082+
out[i, j] = NaN
1083+
elif M2[i, j] == 0:
1084+
out[i, j] = 0
1085+
else:
1086+
num = ct * (ct + 1) * (ct - 1) * M4[i, j]
1087+
den = (ct - 2) * (ct - 3) * M2[i, j] ** 2
1088+
adj = 3.0 * (ct - 1) ** 2 / ((ct - 2) * (ct - 3))
1089+
out[i, j] = num / den - adj
1090+
1091+
9981092
@cython.wraparound(False)
9991093
@cython.boundscheck(False)
10001094
def group_mean(

pandas/core/arrays/base.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2618,6 +2618,7 @@ def _groupby_op(
26182618
"sem",
26192619
"var",
26202620
"skew",
2621+
"kurt",
26212622
]:
26222623
raise TypeError(
26232624
f"dtype '{self.dtype}' does not support operation '{how}'"

pandas/core/arrays/categorical.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2736,7 +2736,7 @@ def _groupby_op(
27362736
op = WrappedCythonOp(how=how, kind=kind, has_dropped_na=has_dropped_na)
27372737

27382738
dtype = self.dtype
2739-
if how in ["sum", "prod", "cumsum", "cumprod", "skew"]:
2739+
if how in ["sum", "prod", "cumsum", "cumprod", "skew", "kurt"]:
27402740
raise TypeError(f"{dtype} type does not support {how} operations")
27412741
if how in ["min", "max", "rank", "idxmin", "idxmax"] and not dtype.ordered:
27422742
# raise TypeError instead of NotImplementedError to ensure we

pandas/core/arrays/datetimelike.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1656,7 +1656,7 @@ def _groupby_op(
16561656
dtype = self.dtype
16571657
if dtype.kind == "M":
16581658
# Adding/multiplying datetimes is not valid
1659-
if how in ["sum", "prod", "cumsum", "cumprod", "var", "skew"]:
1659+
if how in ["sum", "prod", "cumsum", "cumprod", "var", "skew", "kurt"]:
16601660
raise TypeError(f"datetime64 type does not support operation '{how}'")
16611661
if how in ["any", "all"]:
16621662
# GH#34479
@@ -1667,7 +1667,7 @@ def _groupby_op(
16671667

16681668
elif isinstance(dtype, PeriodDtype):
16691669
# Adding/multiplying Periods is not valid
1670-
if how in ["sum", "prod", "cumsum", "cumprod", "var", "skew"]:
1670+
if how in ["sum", "prod", "cumsum", "cumprod", "var", "skew", "kurt"]:
16711671
raise TypeError(f"Period type does not support {how} operations")
16721672
if how in ["any", "all"]:
16731673
# GH#34479
@@ -1677,7 +1677,7 @@ def _groupby_op(
16771677
)
16781678
else:
16791679
# timedeltas we can add but not multiply
1680-
if how in ["prod", "cumprod", "skew", "var"]:
1680+
if how in ["prod", "cumprod", "skew", "kurt", "var"]:
16811681
raise TypeError(f"timedelta64 type does not support {how} operations")
16821682

16831683
# All of the functions implemented here are ordinal, so we can

pandas/core/groupby/base.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ class OutputKey:
5050
"sem",
5151
"size",
5252
"skew",
53+
"kurt",
5354
"std",
5455
"sum",
5556
"var",

pandas/core/groupby/generic.py

Lines changed: 188 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1279,6 +1279,84 @@ def alt(obj):
12791279
"skew", alt=alt, skipna=skipna, numeric_only=numeric_only, **kwargs
12801280
)
12811281

1282+
def kurt(
1283+
self,
1284+
skipna: bool = True,
1285+
numeric_only: bool = False,
1286+
**kwargs,
1287+
) -> Series:
1288+
"""
1289+
Return unbiased kurtosis within groups.
1290+
1291+
Parameters
1292+
----------
1293+
skipna : bool, default True
1294+
Exclude NA/null values when computing the result.
1295+
1296+
numeric_only : bool, default False
1297+
Include only float, int, boolean columns. Not implemented for Series.
1298+
1299+
**kwargs
1300+
Additional keyword arguments to be passed to the function.
1301+
1302+
Returns
1303+
-------
1304+
Series
1305+
Unbiased kurtosis within groups.
1306+
1307+
See Also
1308+
--------
1309+
Series.kurt : Return unbiased kurtosis over requested axis.
1310+
1311+
Examples
1312+
--------
1313+
>>> ser = pd.Series(
1314+
... [390.0, 350.0, 357.0, 333.0, np.nan, 22.0, 20.0, 30.0, 40.0, 41.0],
1315+
... index=[
1316+
... "Falcon",
1317+
... "Falcon",
1318+
... "Falcon",
1319+
... "Falcon",
1320+
... "Falcon",
1321+
... "Parrot",
1322+
... "Parrot",
1323+
... "Parrot",
1324+
... "Parrot",
1325+
... "Parrot",
1326+
... ],
1327+
... name="Max Speed",
1328+
... )
1329+
>>> ser
1330+
Falcon 390.0
1331+
Falcon 350.0
1332+
Falcon 357.0
1333+
Falcon 333.0
1334+
Falcon NaN
1335+
Parrot 22.0
1336+
Parrot 20.0
1337+
Parrot 30.0
1338+
Parrot 40.0
1339+
Parrot 41.0
1340+
Name: Max Speed, dtype: float64
1341+
>>> ser.groupby(level=0).kurt()
1342+
Falcon 1.622109
1343+
Parrot -2.878714
1344+
Name: Max Speed, dtype: float64
1345+
>>> ser.groupby(level=0).kurt(skipna=False)
1346+
Falcon NaN
1347+
Parrot -2.878714
1348+
Name: Max Speed, dtype: float64
1349+
"""
1350+
1351+
def alt(obj):
1352+
# This should not be reached since the cython path should raise
1353+
# TypeError and not NotImplementedError.
1354+
raise TypeError(f"'kurt' is not supported for dtype={obj.dtype}")
1355+
1356+
return self._cython_agg_general(
1357+
"kurt", alt=alt, skipna=skipna, numeric_only=numeric_only, **kwargs
1358+
)
1359+
12821360
@property
12831361
@doc(Series.plot.__doc__)
12841362
def plot(self) -> GroupByPlot:
@@ -2905,6 +2983,116 @@ def alt(obj):
29052983
"skew", alt=alt, skipna=skipna, numeric_only=numeric_only, **kwargs
29062984
)
29072985

2986+
def kurt(
2987+
self,
2988+
skipna: bool = True,
2989+
numeric_only: bool = False,
2990+
**kwargs,
2991+
) -> DataFrame:
2992+
"""
2993+
Return unbiased kurtosis within groups.
2994+
2995+
Parameters
2996+
----------
2997+
skipna : bool, default True
2998+
Exclude NA/null values when computing the result.
2999+
3000+
numeric_only : bool, default False
3001+
Include only float, int, boolean columns.
3002+
3003+
**kwargs
3004+
Additional keyword arguments to be passed to the function.
3005+
3006+
Returns
3007+
-------
3008+
DataFrame
3009+
Unbiased kurtosis within groups.
3010+
3011+
See Also
3012+
--------
3013+
DataFrame.kurt : Return unbiased kurtosis over requested axis.
3014+
3015+
Examples
3016+
--------
3017+
>>> arrays = [
3018+
... [
3019+
... "falcon",
3020+
... "parrot",
3021+
... "cockatoo",
3022+
... "kiwi",
3023+
... "eagle",
3024+
... "lion",
3025+
... "monkey",
3026+
... "rabbit",
3027+
... "dog",
3028+
... "wolf",
3029+
... ],
3030+
... [
3031+
... "bird",
3032+
... "bird",
3033+
... "bird",
3034+
... "bird",
3035+
... "bird",
3036+
... "mammal",
3037+
... "mammal",
3038+
... "mammal",
3039+
... "mammal",
3040+
... "mammal",
3041+
... ],
3042+
... ]
3043+
>>> index = pd.MultiIndex.from_arrays(arrays, names=("name", "class"))
3044+
>>> df = pd.DataFrame(
3045+
... {
3046+
... "max_speed": [
3047+
... 389.0,
3048+
... 24.0,
3049+
... 70.0,
3050+
... np.nan,
3051+
... 350.0,
3052+
... 80.5,
3053+
... 21.5,
3054+
... 15.0,
3055+
... 40.0,
3056+
... 50.0,
3057+
... ]
3058+
... },
3059+
... index=index,
3060+
... )
3061+
>>> df
3062+
max_speed
3063+
name class
3064+
falcon bird 389.0
3065+
parrot bird 24.0
3066+
cockatoo bird 70.0
3067+
kiwi bird NaN
3068+
eagle bird 350.0
3069+
lion mammal 80.5
3070+
monkey mammal 21.5
3071+
rabbit mammal 15.0
3072+
dog mammal 40.0
3073+
wolf mammal 50.0
3074+
>>> gb = df.groupby(["class"])
3075+
>>> gb.kurt()
3076+
max_speed
3077+
class
3078+
bird -5.493277
3079+
mammal 0.204125
3080+
>>> gb.kurt(skipna=False)
3081+
max_speed
3082+
class
3083+
bird NaN
3084+
mammal 0.204125
3085+
"""
3086+
3087+
def alt(obj):
3088+
# This should not be reached since the cython path should raise
3089+
# TypeError and not NotImplementedError.
3090+
raise TypeError(f"'kurt' is not supported for dtype={obj.dtype}")
3091+
3092+
return self._cython_agg_general(
3093+
"kurt", alt=alt, skipna=skipna, numeric_only=numeric_only, **kwargs
3094+
)
3095+
29083096
@property
29093097
@doc(DataFrame.plot.__doc__)
29103098
def plot(self) -> GroupByPlot:

0 commit comments

Comments
 (0)