BUG: zero result in Series.kurt for low variance arrays (#62405)

Alvaro-Kothe · web-flow · commit 4e92c631cd82 · 2025-09-22T10:27:13.000-07:00
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -1148,6 +1148,7 @@ Other
 - Bug in :meth:`Series.diff` allowing non-integer values for the ``periods`` argument. (:issue:`56607`)
 - Bug in :meth:`Series.dt` methods in :class:`ArrowDtype` that were returning incorrect values. (:issue:`57355`)
 - Bug in :meth:`Series.isin` raising ``TypeError`` when series is large (>10**6) and ``values`` contains NA (:issue:`60678`)
+- Bug in :meth:`Series.kurt` and :meth:`Series.skew` resulting in zero for low variance arrays (:issue:`57972`)
 - Bug in :meth:`Series.map` with a ``timestamp[pyarrow]`` dtype or ``duration[pyarrow]`` dtype incorrectly returning all-``NaN`` entries (:issue:`61231`)
 - Bug in :meth:`Series.mode` where an exception was raised when taking the mode with nullable types with no null values in the series. (:issue:`58926`)
 - Bug in :meth:`Series.rank` that doesn't preserve missing values for nullable integers when ``na_option='keep'``. (:issue:`56976`)
diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py
@@ -1273,12 +1273,13 @@ def nanskew(
     m2 = adjusted2.sum(axis, dtype=np.float64)
     m3 = adjusted3.sum(axis, dtype=np.float64)
 
-    # floating point error
-    #
-    # #18044 in _libs/windows.pyx calc_skew follow this behavior
-    # to fix the fperr to treat m2 <1e-14 as zero
-    m2 = _zero_out_fperr(m2)
-    m3 = _zero_out_fperr(m3)
+    # floating point error. See comment in [nankurt]
+    max_abs = np.abs(values).max(axis, initial=0.0)
+    eps = np.finfo(m2.dtype).eps
+    constant_tolerance2 = ((eps * max_abs) ** 2) * count
+    constant_tolerance3 = ((eps * max_abs) ** 3) * count
+    m2 = _zero_out_fperr(m2, constant_tolerance2)
+    m3 = _zero_out_fperr(m3, constant_tolerance3)
 
     with np.errstate(invalid="ignore", divide="ignore"):
         result = (count * (count - 1) ** 0.5 / (count - 2)) * (m3 / m2**1.5)
@@ -1361,18 +1362,40 @@ def nankurt(
     m2 = adjusted2.sum(axis, dtype=np.float64)
     m4 = adjusted4.sum(axis, dtype=np.float64)
 
+    # Several floating point errors may occur during the summation due to rounding.
+    # This computation is similar to the one in Scipy
+    # https://github.com/scipy/scipy/blob/04d6d9c460b1fed83f2919ecec3d743cfa2e8317/scipy/stats/_stats_py.py#L1429
+    # With a few modifications, like using the maximum value instead of the averages
+    # and some adaptations because they use the average and we use the sum for `m2`.
+    # We need to estimate an upper bound to the error to consider the data constant.
+    # Lets call:
+    # x: true value in data
+    # y: floating point representation
+    # e: relative approximation error
+    # n: number of observations in array
+    #
+    # We have that:
+    # |x - y|/|x| <= e (See https://en.wikipedia.org/wiki/Machine_epsilon)
+    # (|x - y|/|x|)² <= e²
+    # Σ (|x - y|/|x|)² <= ne²
+    #
+    # Lets say that the fperr upper bound for m2 is constrained by the summation.
+    # |m2 - y|/|m2| <= ne²
+    # |m2 - y| <= n|m2|e²
+    #
+    # We will use max (x²) to estimate |m2|
+    max_abs = np.abs(values).max(axis, initial=0.0)
+    eps = np.finfo(m2.dtype).eps
+    constant_tolerance2 = ((eps * max_abs) ** 2) * count
+    constant_tolerance4 = ((eps * max_abs) ** 4) * count
+    m2 = _zero_out_fperr(m2, constant_tolerance2)
+    m4 = _zero_out_fperr(m4, constant_tolerance4)
+
     with np.errstate(invalid="ignore", divide="ignore"):
         adj = 3 * (count - 1) ** 2 / ((count - 2) * (count - 3))
         numerator = count * (count + 1) * (count - 1) * m4
         denominator = (count - 2) * (count - 3) * m2**2
 
-    # floating point error
-    #
-    # #18044 in _libs/windows.pyx calc_kurt follow this behavior
-    # to fix the fperr to treat denom <1e-14 as zero
-    numerator = _zero_out_fperr(numerator)
-    denominator = _zero_out_fperr(denominator)
-
     if not isinstance(denominator, np.ndarray):
         # if ``denom`` is a scalar, check these corner cases first before
         # doing division
@@ -1587,12 +1610,12 @@ def check_below_min_count(
     return False
 
 
-def _zero_out_fperr(arg):
+def _zero_out_fperr(arg, tol: float | np.ndarray):
     # #18044 reference this behavior to fix rolling skew/kurt issue
     if isinstance(arg, np.ndarray):
-        return np.where(np.abs(arg) < 1e-14, 0, arg)
+        return np.where(np.abs(arg) < tol, 0, arg)
     else:
-        return arg.dtype.type(0) if np.abs(arg) < 1e-14 else arg
+        return arg.dtype.type(0) if np.abs(arg) < tol else arg
 
 
 @disallow("M8", "m8")
diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py
@@ -1051,6 +1051,23 @@ def test_nans_skipna(self, samples, actual_skew):
         skew = nanops.nanskew(samples, skipna=True)
         tm.assert_almost_equal(skew, actual_skew)
 
+    @pytest.mark.parametrize(
+        "initial_data, nobs",
+        [
+            ([-2.05191341e-05, -4.10391103e-05], 27),
+            ([-2.05191341e-10, -4.10391103e-10], 27),
+            ([-2.05191341e-05, -4.10391103e-05], 10_000),
+            ([-2.05191341e-10, -4.10391103e-10], 10_000),
+        ],
+    )
+    def test_low_variance(self, initial_data, nobs):
+        st = pytest.importorskip("scipy.stats")
+        data = np.zeros((nobs,), dtype=np.float64)
+        data[: len(initial_data)] = initial_data
+        skew = nanops.nanskew(data)
+        expected = st.skew(data, bias=False)
+        tm.assert_almost_equal(skew, expected)
+
     @property
     def prng(self):
         return np.random.default_rng(2)
@@ -1072,7 +1089,7 @@ def test_constant_series(self, val):
         # xref GH 11974
         data = val * np.ones(300)
         kurt = nanops.nankurt(data)
-        assert kurt == 0.0
+        tm.assert_equal(kurt, 0.0)
 
     def test_all_finite(self):
         alpha, beta = 0.3, 0.1
@@ -1102,6 +1119,24 @@ def test_nans_skipna(self, samples, actual_kurt):
         kurt = nanops.nankurt(samples, skipna=True)
         tm.assert_almost_equal(kurt, actual_kurt)
 
+    @pytest.mark.parametrize(
+        "initial_data, nobs",
+        [
+            ([-2.05191341e-05, -4.10391103e-05], 27),
+            ([-2.05191341e-10, -4.10391103e-10], 27),
+            ([-2.05191341e-05, -4.10391103e-05], 10_000),
+            ([-2.05191341e-10, -4.10391103e-10], 10_000),
+        ],
+    )
+    def test_low_variance(self, initial_data, nobs):
+        # GH#57972
+        st = pytest.importorskip("scipy.stats")
+        data = np.zeros((nobs,), dtype=np.float64)
+        data[: len(initial_data)] = initial_data
+        kurt = nanops.nankurt(data)
+        expected = st.kurtosis(data, bias=False)
+        tm.assert_almost_equal(kurt, expected)
+
     @property
     def prng(self):
         return np.random.default_rng(2)