Skip to content

Commit 4e92c63

Browse files
authored
BUG: zero result in Series.kurt for low variance arrays (#62405)
1 parent dab2f73 commit 4e92c63

File tree

3 files changed

+76
-17
lines changed

3 files changed

+76
-17
lines changed

doc/source/whatsnew/v3.0.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1148,6 +1148,7 @@ Other
11481148
- Bug in :meth:`Series.diff` allowing non-integer values for the ``periods`` argument. (:issue:`56607`)
11491149
- Bug in :meth:`Series.dt` methods in :class:`ArrowDtype` that were returning incorrect values. (:issue:`57355`)
11501150
- Bug in :meth:`Series.isin` raising ``TypeError`` when series is large (>10**6) and ``values`` contains NA (:issue:`60678`)
1151+
- Bug in :meth:`Series.kurt` and :meth:`Series.skew` resulting in zero for low variance arrays (:issue:`57972`)
11511152
- Bug in :meth:`Series.map` with a ``timestamp[pyarrow]`` dtype or ``duration[pyarrow]`` dtype incorrectly returning all-``NaN`` entries (:issue:`61231`)
11521153
- Bug in :meth:`Series.mode` where an exception was raised when taking the mode with nullable types with no null values in the series. (:issue:`58926`)
11531154
- Bug in :meth:`Series.rank` that doesn't preserve missing values for nullable integers when ``na_option='keep'``. (:issue:`56976`)

pandas/core/nanops.py

Lines changed: 39 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1273,12 +1273,13 @@ def nanskew(
12731273
m2 = adjusted2.sum(axis, dtype=np.float64)
12741274
m3 = adjusted3.sum(axis, dtype=np.float64)
12751275

1276-
# floating point error
1277-
#
1278-
# #18044 in _libs/windows.pyx calc_skew follow this behavior
1279-
# to fix the fperr to treat m2 <1e-14 as zero
1280-
m2 = _zero_out_fperr(m2)
1281-
m3 = _zero_out_fperr(m3)
1276+
# floating point error. See comment in [nankurt]
1277+
max_abs = np.abs(values).max(axis, initial=0.0)
1278+
eps = np.finfo(m2.dtype).eps
1279+
constant_tolerance2 = ((eps * max_abs) ** 2) * count
1280+
constant_tolerance3 = ((eps * max_abs) ** 3) * count
1281+
m2 = _zero_out_fperr(m2, constant_tolerance2)
1282+
m3 = _zero_out_fperr(m3, constant_tolerance3)
12821283

12831284
with np.errstate(invalid="ignore", divide="ignore"):
12841285
result = (count * (count - 1) ** 0.5 / (count - 2)) * (m3 / m2**1.5)
@@ -1361,18 +1362,40 @@ def nankurt(
13611362
m2 = adjusted2.sum(axis, dtype=np.float64)
13621363
m4 = adjusted4.sum(axis, dtype=np.float64)
13631364

1365+
# Several floating point errors may occur during the summation due to rounding.
1366+
# This computation is similar to the one in Scipy
1367+
# https://github.com/scipy/scipy/blob/04d6d9c460b1fed83f2919ecec3d743cfa2e8317/scipy/stats/_stats_py.py#L1429
1368+
# With a few modifications, like using the maximum value instead of the averages
1369+
# and some adaptations because they use the average and we use the sum for `m2`.
1370+
# We need to estimate an upper bound to the error to consider the data constant.
1371+
# Lets call:
1372+
# x: true value in data
1373+
# y: floating point representation
1374+
# e: relative approximation error
1375+
# n: number of observations in array
1376+
#
1377+
# We have that:
1378+
# |x - y|/|x| <= e (See https://en.wikipedia.org/wiki/Machine_epsilon)
1379+
# (|x - y|/|x|)² <= e²
1380+
# Σ (|x - y|/|x|)² <= ne²
1381+
#
1382+
# Lets say that the fperr upper bound for m2 is constrained by the summation.
1383+
# |m2 - y|/|m2| <= ne²
1384+
# |m2 - y| <= n|m2|e²
1385+
#
1386+
# We will use max (x²) to estimate |m2|
1387+
max_abs = np.abs(values).max(axis, initial=0.0)
1388+
eps = np.finfo(m2.dtype).eps
1389+
constant_tolerance2 = ((eps * max_abs) ** 2) * count
1390+
constant_tolerance4 = ((eps * max_abs) ** 4) * count
1391+
m2 = _zero_out_fperr(m2, constant_tolerance2)
1392+
m4 = _zero_out_fperr(m4, constant_tolerance4)
1393+
13641394
with np.errstate(invalid="ignore", divide="ignore"):
13651395
adj = 3 * (count - 1) ** 2 / ((count - 2) * (count - 3))
13661396
numerator = count * (count + 1) * (count - 1) * m4
13671397
denominator = (count - 2) * (count - 3) * m2**2
13681398

1369-
# floating point error
1370-
#
1371-
# #18044 in _libs/windows.pyx calc_kurt follow this behavior
1372-
# to fix the fperr to treat denom <1e-14 as zero
1373-
numerator = _zero_out_fperr(numerator)
1374-
denominator = _zero_out_fperr(denominator)
1375-
13761399
if not isinstance(denominator, np.ndarray):
13771400
# if ``denom`` is a scalar, check these corner cases first before
13781401
# doing division
@@ -1587,12 +1610,12 @@ def check_below_min_count(
15871610
return False
15881611

15891612

1590-
def _zero_out_fperr(arg):
1613+
def _zero_out_fperr(arg, tol: float | np.ndarray):
15911614
# #18044 reference this behavior to fix rolling skew/kurt issue
15921615
if isinstance(arg, np.ndarray):
1593-
return np.where(np.abs(arg) < 1e-14, 0, arg)
1616+
return np.where(np.abs(arg) < tol, 0, arg)
15941617
else:
1595-
return arg.dtype.type(0) if np.abs(arg) < 1e-14 else arg
1618+
return arg.dtype.type(0) if np.abs(arg) < tol else arg
15961619

15971620

15981621
@disallow("M8", "m8")

pandas/tests/test_nanops.py

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1051,6 +1051,23 @@ def test_nans_skipna(self, samples, actual_skew):
10511051
skew = nanops.nanskew(samples, skipna=True)
10521052
tm.assert_almost_equal(skew, actual_skew)
10531053

1054+
@pytest.mark.parametrize(
1055+
"initial_data, nobs",
1056+
[
1057+
([-2.05191341e-05, -4.10391103e-05], 27),
1058+
([-2.05191341e-10, -4.10391103e-10], 27),
1059+
([-2.05191341e-05, -4.10391103e-05], 10_000),
1060+
([-2.05191341e-10, -4.10391103e-10], 10_000),
1061+
],
1062+
)
1063+
def test_low_variance(self, initial_data, nobs):
1064+
st = pytest.importorskip("scipy.stats")
1065+
data = np.zeros((nobs,), dtype=np.float64)
1066+
data[: len(initial_data)] = initial_data
1067+
skew = nanops.nanskew(data)
1068+
expected = st.skew(data, bias=False)
1069+
tm.assert_almost_equal(skew, expected)
1070+
10541071
@property
10551072
def prng(self):
10561073
return np.random.default_rng(2)
@@ -1072,7 +1089,7 @@ def test_constant_series(self, val):
10721089
# xref GH 11974
10731090
data = val * np.ones(300)
10741091
kurt = nanops.nankurt(data)
1075-
assert kurt == 0.0
1092+
tm.assert_equal(kurt, 0.0)
10761093

10771094
def test_all_finite(self):
10781095
alpha, beta = 0.3, 0.1
@@ -1102,6 +1119,24 @@ def test_nans_skipna(self, samples, actual_kurt):
11021119
kurt = nanops.nankurt(samples, skipna=True)
11031120
tm.assert_almost_equal(kurt, actual_kurt)
11041121

1122+
@pytest.mark.parametrize(
1123+
"initial_data, nobs",
1124+
[
1125+
([-2.05191341e-05, -4.10391103e-05], 27),
1126+
([-2.05191341e-10, -4.10391103e-10], 27),
1127+
([-2.05191341e-05, -4.10391103e-05], 10_000),
1128+
([-2.05191341e-10, -4.10391103e-10], 10_000),
1129+
],
1130+
)
1131+
def test_low_variance(self, initial_data, nobs):
1132+
# GH#57972
1133+
st = pytest.importorskip("scipy.stats")
1134+
data = np.zeros((nobs,), dtype=np.float64)
1135+
data[: len(initial_data)] = initial_data
1136+
kurt = nanops.nankurt(data)
1137+
expected = st.kurtosis(data, bias=False)
1138+
tm.assert_almost_equal(kurt, expected)
1139+
11051140
@property
11061141
def prng(self):
11071142
return np.random.default_rng(2)

0 commit comments

Comments
 (0)