1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
@@ -1148,6 +1148,7 @@ Other
 - Bug in :meth:`Series.diff` allowing non-integer values for the ``periods`` argument. (:issue:`56607`)
 - Bug in :meth:`Series.dt` methods in :class:`ArrowDtype` that were returning incorrect values. (:issue:`57355`)
 - Bug in :meth:`Series.isin` raising ``TypeError`` when series is large (>10**6) and ``values`` contains NA (:issue:`60678`)
+- Bug in :meth:`Series.kurt` and :meth:`Series.skew` resulting in zero for low variance arrays (:issue:`57972`)
 - Bug in :meth:`Series.map` with a ``timestamp[pyarrow]`` dtype or ``duration[pyarrow]`` dtype incorrectly returning all-``NaN`` entries (:issue:`61231`)
 - Bug in :meth:`Series.mode` where an exception was raised when taking the mode with nullable types with no null values in the series. (:issue:`58926`)
 - Bug in :meth:`Series.rank` that doesn't preserve missing values for nullable integers when ``na_option='keep'``. (:issue:`56976`)
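For context, a minimal sketch of what this entry fixes (an illustration, assuming SciPy is available for reference; not part of the PR): with low-variance input, the old fixed 1e-14 cutoff zeroed out the second central moment, so skew and kurt silently came back as 0.0.

```python
# Sketch of the GH#57972 symptom with a near-constant array.
import numpy as np
import pandas as pd
from scipy import stats

data = np.zeros(27)
data[:2] = [-2.05191341e-10, -4.10391103e-10]  # values reused in the new tests
s = pd.Series(data)

# After this fix these agree with SciPy's bias-corrected estimators;
# before it, both pandas results were exactly 0.0.
print(s.skew(), stats.skew(data, bias=False))
print(s.kurt(), stats.kurtosis(data, bias=False))
```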
51 changes: 35 additions & 16 deletions pandas/core/nanops.py
@@ -1273,12 +1273,13 @@ def nanskew(
     m2 = adjusted2.sum(axis, dtype=np.float64)
     m3 = adjusted3.sum(axis, dtype=np.float64)

-    # floating point error
-    #
-    # #18044 in _libs/windows.pyx calc_skew follow this behavior
-    # to fix the fperr to treat m2 <1e-14 as zero
-    m2 = _zero_out_fperr(m2)
-    m3 = _zero_out_fperr(m3)
+    # floating point error. See comment in [nankurt]
+    max_abs = np.abs(values).max(axis, initial=0.0)
+    eps = np.finfo(m2.dtype).eps
+    constant_tolerance2 = ((eps * max_abs) ** 2) * count
+    constant_tolerance3 = ((eps * max_abs) ** 3) * count
+    m2 = _zero_out_fperr(m2, constant_tolerance2)
+    m3 = _zero_out_fperr(m3, constant_tolerance3)

     with np.errstate(invalid="ignore", divide="ignore"):
         result = (count * (count - 1) ** 0.5 / (count - 2)) * (m3 / m2**1.5)
@@ -1361,18 +1362,36 @@ def nankurt(
     m2 = adjusted2.sum(axis, dtype=np.float64)
     m4 = adjusted4.sum(axis, dtype=np.float64)

+    # Several floating point errors may occur during the summation due to rounding.
+    # We need to estimate an upper bound to the error to consider the data constant.
[Review comment, Member]: Could you also mention and maybe link to where Scipy does a similar calculation?

[Reply, Member Author]: I added the permalink. It's also important to point out that I adapted their code to use the maximum instead of the mean for scaling; I was worried about a zero mean resulting in tolerance=0. Let me know if it needs any further modifications.
+    # Let's call:
+    #   x: true value in data
+    #   y: floating point representation
+    #   e: relative approximation error
+    #   n: number of observations in array
+    #
+    # We have that:
+    #   |x - y|/|x| <= e  (see https://en.wikipedia.org/wiki/Machine_epsilon)
+    #   (|x - y|/|x|)² <= e²
+    #   Σ (|x - y|/|x|)² <= ne²
+    #
+    # Let's say that the fperr upper bound for m2 is constrained by the summation:
+    #   |m2 - y|/|m2| <= ne²
+    #   |m2 - y| <= n|m2|e²
+    #
+    # We will use max(x²) to estimate |m2|
+    max_abs = np.abs(values).max(axis, initial=0.0)
+    eps = np.finfo(m2.dtype).eps
+    constant_tolerance2 = ((eps * max_abs) ** 2) * count
+    constant_tolerance4 = ((eps * max_abs) ** 4) * count
+    m2 = _zero_out_fperr(m2, constant_tolerance2)
+    m4 = _zero_out_fperr(m4, constant_tolerance4)

     with np.errstate(invalid="ignore", divide="ignore"):
         adj = 3 * (count - 1) ** 2 / ((count - 2) * (count - 3))
         numerator = count * (count + 1) * (count - 1) * m4
         denominator = (count - 2) * (count - 3) * m2**2

-    # floating point error
-    #
-    # #18044 in _libs/windows.pyx calc_kurt follow this behavior
-    # to fix the fperr to treat denom <1e-14 as zero
-    numerator = _zero_out_fperr(numerator)
-    denominator = _zero_out_fperr(denominator)

     if not isinstance(denominator, np.ndarray):
         # if ``denom`` is a scalar, check these corner cases first before
         # doing division
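The bound in the new comment can be checked outside pandas. A minimal standalone sketch (not the pandas internals; the helper below merely mirrors the m2 computation): the low-variance data from GH#57972 sits above the new scale-aware tolerance but below the old fixed 1e-14 cutoff, which is exactly why it used to be zeroed out.

```python
# Standalone check of the tolerance n * (eps * max|x|)**2 described above.
import numpy as np

def m2_and_tol(values: np.ndarray) -> tuple[float, float]:
    n = len(values)
    m2 = ((values - values.mean()) ** 2).sum(dtype=np.float64)
    max_abs = np.abs(values).max(initial=0.0)
    tol = n * (np.finfo(np.float64).eps * max_abs) ** 2
    return m2, tol

low_var = np.zeros(10_000)
low_var[:2] = [-2.05191341e-10, -4.10391103e-10]
m2, tol = m2_and_tol(low_var)

# Above the new scale-aware bound, yet below the old absolute cutoff:
# the old code treated this m2 as rounding error and zeroed it.
assert tol < m2 < 1e-14
```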
@@ -1587,12 +1606,12 @@ def check_below_min_count(
     return False


-def _zero_out_fperr(arg):
+def _zero_out_fperr(arg, tol: float | np.ndarray):
     # #18044 reference this behavior to fix rolling skew/kurt issue
     if isinstance(arg, np.ndarray):
-        return np.where(np.abs(arg) < 1e-14, 0, arg)
+        return np.where(np.abs(arg) < tol, 0, arg)
     else:
-        return arg.dtype.type(0) if np.abs(arg) < 1e-14 else arg
+        return arg.dtype.type(0) if np.abs(arg) < tol else arg


@disallow("M8", "m8")
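For reference, a standalone re-implementation sketch of the updated helper (illustrative only, not an import from pandas internals), showing how the caller-supplied tolerance replaces the fixed 1e-14 cutoff:

```python
import numpy as np

def zero_out_fperr(arg, tol):
    # Mirror of the patched helper: magnitudes below the caller-supplied
    # tolerance are treated as accumulated floating point error.
    if isinstance(arg, np.ndarray):
        return np.where(np.abs(arg) < tol, 0, arg)
    return arg.dtype.type(0) if np.abs(arg) < tol else arg

m2 = np.array([1e-30, 2.1e-19, 3.5])
print(zero_out_fperr(m2, 1e-20))  # only the entry below the tolerance is zeroed
```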
37 changes: 36 additions & 1 deletion pandas/tests/test_nanops.py
@@ -1051,6 +1051,23 @@ def test_nans_skipna(self, samples, actual_skew):
         skew = nanops.nanskew(samples, skipna=True)
         tm.assert_almost_equal(skew, actual_skew)

+    @pytest.mark.parametrize(
+        "initial_data, nobs",
+        [
+            ([-2.05191341e-05, -4.10391103e-05], 27),
+            ([-2.05191341e-10, -4.10391103e-10], 27),
+            ([-2.05191341e-05, -4.10391103e-05], 10_000),
+            ([-2.05191341e-10, -4.10391103e-10], 10_000),
+        ],
+    )
+    def test_low_variance(self, initial_data, nobs):
+        # GH#57972
+        st = pytest.importorskip("scipy.stats")
+        data = np.zeros((nobs,), dtype=np.float64)
+        data[: len(initial_data)] = initial_data
+        skew = nanops.nanskew(data)
+        expected = st.skew(data, bias=False)
+        tm.assert_almost_equal(skew, expected)

     @property
     def prng(self):
         return np.random.default_rng(2)
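The expected value here comes from SciPy. A quick standalone cross-check (a sketch, assuming numpy and scipy are available) that the adjusted Fisher-Pearson formula used in nanskew matches scipy.stats.skew with bias=False:

```python
import numpy as np
from scipy import stats

rng = np.random.default_rng(2)
x = rng.normal(size=27)

n = len(x)
m2 = ((x - x.mean()) ** 2).sum()
m3 = ((x - x.mean()) ** 3).sum()
g1 = n * (n - 1) ** 0.5 / (n - 2) * m3 / m2**1.5  # formula from nanskew above

assert np.isclose(g1, stats.skew(x, bias=False))
```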
@@ -1072,7 +1089,7 @@ def test_constant_series(self, val):
         # xref GH 11974
         data = val * np.ones(300)
         kurt = nanops.nankurt(data)
-        assert kurt == 0.0
+        tm.assert_equal(kurt, 0.0)

     def test_all_finite(self):
         alpha, beta = 0.3, 0.1
@@ -1102,6 +1119,24 @@ def test_nans_skipna(self, samples, actual_kurt):
         kurt = nanops.nankurt(samples, skipna=True)
         tm.assert_almost_equal(kurt, actual_kurt)

+    @pytest.mark.parametrize(
+        "initial_data, nobs",
+        [
+            ([-2.05191341e-05, -4.10391103e-05], 27),
+            ([-2.05191341e-10, -4.10391103e-10], 27),
+            ([-2.05191341e-05, -4.10391103e-05], 10_000),
+            ([-2.05191341e-10, -4.10391103e-10], 10_000),
+        ],
+    )
+    def test_low_variance(self, initial_data, nobs):
+        # GH#57972
+        st = pytest.importorskip("scipy.stats")
+        data = np.zeros((nobs,), dtype=np.float64)
+        data[: len(initial_data)] = initial_data
+        kurt = nanops.nankurt(data)
+        expected = st.kurtosis(data, bias=False)
+        tm.assert_almost_equal(kurt, expected)

     @property
     def prng(self):
         return np.random.default_rng(2)
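And the analogous cross-check for the kurtosis path (again a sketch, assuming scipy): nankurt's numerator/denominator minus adj is the bias-corrected excess kurtosis that scipy.stats.kurtosis(..., bias=False) returns:

```python
import numpy as np
from scipy import stats

rng = np.random.default_rng(2)
x = rng.normal(size=27)

n = len(x)
m2 = ((x - x.mean()) ** 2).sum()
m4 = ((x - x.mean()) ** 4).sum()
adj = 3 * (n - 1) ** 2 / ((n - 2) * (n - 3))
g2 = n * (n + 1) * (n - 1) * m4 / ((n - 2) * (n - 3) * m2**2) - adj

assert np.isclose(g2, stats.kurtosis(x, bias=False))
```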