Skip to content

Commit f51a5df

Browse files
committed
MAINT: stats.goodness_of_fit: transition to rng (SPEC 7)
1 parent c3da43f commit f51a5df

File tree

2 files changed

+38
-43
lines changed

2 files changed

+38
-43
lines changed

scipy/stats/_fit.py

Lines changed: 18 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,7 @@
22
from collections import namedtuple
33
import numpy as np
44
from scipy import optimize, stats
5-
from scipy._lib._util import check_random_state
5+
from scipy._lib._util import check_random_state, _transition_to_rng
66

77

88
def _combine_bounds(name, user_bounds, shape_domain, integral):
@@ -738,9 +738,10 @@ def nlpsf(free_params, data=data): # bind data NOW
738738
'null_distribution'))
739739

740740

741+
@_transition_to_rng('random_state')
741742
def goodness_of_fit(dist, data, *, known_params=None, fit_params=None,
742743
guessed_params=None, statistic='ad', n_mc_samples=9999,
743-
random_state=None):
744+
rng=None):
744745
r"""
745746
Perform a goodness of fit test comparing data to a distribution family.
746747
@@ -797,18 +798,11 @@ def goodness_of_fit(dist, data, *, known_params=None, fit_params=None,
797798
The number of Monte Carlo samples drawn from the null hypothesized
798799
distribution to form the null distribution of the statistic. The
799800
sample size of each is the same as the given `data`.
800-
random_state : {None, int, `numpy.random.Generator`,
801-
`numpy.random.RandomState`}, optional
802-
803-
Pseudorandom number generator state used to generate the Monte Carlo
804-
samples.
805-
806-
If `random_state` is ``None`` (default), the
807-
`numpy.random.RandomState` singleton is used.
808-
If `random_state` is an int, a new ``RandomState`` instance is used,
809-
seeded with `random_state`.
810-
If `random_state` is already a ``Generator`` or ``RandomState``
811-
instance, then the provided instance is used.
801+
rng : `numpy.random.Generator`, optional
802+
Pseudorandom number generator state. When `rng` is None, a new
803+
`numpy.random.Generator` is created using entropy from the
804+
operating system. Types other than `numpy.random.Generator` are
805+
passed to `numpy.random.default_rng` to instantiate a ``Generator``.
812806
813807
Returns
814808
-------
@@ -996,7 +990,7 @@ def goodness_of_fit(dist, data, *, known_params=None, fit_params=None,
996990
997991
>>> known_params = {'loc': loc, 'scale': scale}
998992
>>> res = stats.goodness_of_fit(stats.norm, x, known_params=known_params,
999-
... statistic='ks', random_state=rng)
993+
... statistic='ks', rng=rng)
1000994
>>> res.statistic, res.pvalue
1001995
(0.1119257570456813, 0.2788)
1002996
@@ -1030,7 +1024,7 @@ def goodness_of_fit(dist, data, *, known_params=None, fit_params=None,
10301024
as described above. This is where `goodness_of_fit` excels.
10311025
10321026
>>> res = stats.goodness_of_fit(stats.norm, x, statistic='ks',
1033-
... random_state=rng)
1027+
... rng=rng)
10341028
>>> res.statistic, res.pvalue
10351029
(0.1119257570456813, 0.0196)
10361030
@@ -1062,7 +1056,7 @@ def goodness_of_fit(dist, data, *, known_params=None, fit_params=None,
10621056
estimate it directly.
10631057
10641058
>>> res = stats.goodness_of_fit(stats.norm, x, statistic='ad',
1065-
... random_state=rng)
1059+
... rng=rng)
10661060
>>> res.statistic, res.pvalue
10671061
(1.2139573337497467, 0.0034)
10681062
@@ -1078,7 +1072,7 @@ def goodness_of_fit(dist, data, *, known_params=None, fit_params=None,
10781072
>>> rng = np.random.default_rng()
10791073
>>> x = stats.chi(df=2.2, loc=0, scale=2).rvs(size=1000, random_state=rng)
10801074
>>> res = stats.goodness_of_fit(stats.rayleigh, x, statistic='cvm',
1081-
... known_params={'loc': 0}, random_state=rng)
1075+
... known_params={'loc': 0}, rng=rng)
10821076
10831077
This executes fairly quickly, but to check the reliability of the ``fit``
10841078
method, we should inspect the fit result.
@@ -1118,9 +1112,9 @@ def goodness_of_fit(dist, data, *, known_params=None, fit_params=None,
11181112
11191113
"""
11201114
args = _gof_iv(dist, data, known_params, fit_params, guessed_params,
1121-
statistic, n_mc_samples, random_state)
1115+
statistic, n_mc_samples, rng)
11221116
(dist, data, fixed_nhd_params, fixed_rfd_params, guessed_nhd_params,
1123-
guessed_rfd_params, statistic, n_mc_samples_int, random_state) = args
1117+
guessed_rfd_params, statistic, n_mc_samples_int, rng) = args
11241118

11251119
# Fit null hypothesis distribution to data
11261120
nhd_fit_fun = _get_fit_fun(dist, data, guessed_nhd_params,
@@ -1129,7 +1123,7 @@ def goodness_of_fit(dist, data, *, known_params=None, fit_params=None,
11291123
nhd_dist = dist(*nhd_vals)
11301124

11311125
def rvs(size):
1132-
return nhd_dist.rvs(size=size, random_state=random_state)
1126+
return nhd_dist.rvs(size=size, random_state=rng)
11331127

11341128
# Define statistic
11351129
fit_fun = _get_fit_fun(dist, data, guessed_rfd_params, fixed_rfd_params)
@@ -1299,7 +1293,7 @@ def _cramer_von_mises(dist, data, axis):
12991293

13001294

13011295
def _gof_iv(dist, data, known_params, fit_params, guessed_params, statistic,
1302-
n_mc_samples, random_state):
1296+
n_mc_samples, rng):
13031297

13041298
if not isinstance(dist, stats.rv_continuous):
13051299
message = ("`dist` must be a (non-frozen) instance of "
@@ -1349,7 +1343,7 @@ def _gof_iv(dist, data, known_params, fit_params, guessed_params, statistic,
13491343
message = "`n_mc_samples` must be an integer."
13501344
raise TypeError(message)
13511345

1352-
random_state = check_random_state(random_state)
1346+
rng = check_random_state(rng)
13531347

13541348
return (dist, data, fixed_nhd_params, fixed_rfd_params, guessed_nhd_params,
1355-
guessed_rfd_params, statistic, n_mc_samples_int, random_state)
1349+
guessed_rfd_params, statistic, n_mc_samples_int, rng)

scipy/stats/tests/test_fit.py

Lines changed: 20 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -828,23 +828,24 @@ def test_gof_iv(self):
828828
with pytest.raises(TypeError, match=message):
829829
goodness_of_fit(dist, x, n_mc_samples=1000.5)
830830

831-
message = "'herring' cannot be used to seed a"
832-
with pytest.raises(ValueError, match=message):
833-
goodness_of_fit(dist, x, random_state='herring')
831+
message = "SeedSequence expects int or sequence"
832+
with pytest.raises(TypeError, match=message):
833+
goodness_of_fit(dist, x, rng='herring')
834834

835835
def test_against_ks(self):
836836
rng = np.random.default_rng(8517426291317196949)
837837
x = examgrades
838838
known_params = {'loc': np.mean(x), 'scale': np.std(x, ddof=1)}
839839
res = goodness_of_fit(stats.norm, x, known_params=known_params,
840-
statistic='ks', random_state=rng)
840+
statistic='ks', rng=rng)
841841
ref = stats.kstest(x, stats.norm(**known_params).cdf, method='exact')
842842
assert_allclose(res.statistic, ref.statistic) # ~0.0848
843843
assert_allclose(res.pvalue, ref.pvalue, atol=5e-3) # ~0.335
844844

845845
def test_against_lilliefors(self):
846846
rng = np.random.default_rng(2291803665717442724)
847847
x = examgrades
848+
# preserve use of old random_state during SPEC 7 transition
848849
res = goodness_of_fit(stats.norm, x, statistic='ks', random_state=rng)
849850
known_params = {'loc': np.mean(x), 'scale': np.std(x, ddof=1)}
850851
ref = stats.kstest(x, stats.norm(**known_params).cdf, method='exact')
@@ -856,7 +857,7 @@ def test_against_cvm(self):
856857
x = examgrades
857858
known_params = {'loc': np.mean(x), 'scale': np.std(x, ddof=1)}
858859
res = goodness_of_fit(stats.norm, x, known_params=known_params,
859-
statistic='cvm', random_state=rng)
860+
statistic='cvm', rng=rng)
860861
ref = stats.cramervonmises(x, stats.norm(**known_params).cdf)
861862
assert_allclose(res.statistic, ref.statistic) # ~0.090
862863
assert_allclose(res.pvalue, ref.pvalue, atol=5e-3) # ~0.636
@@ -868,7 +869,7 @@ def test_against_anderson_case_0(self):
868869
# loc that produced critical value of statistic found w/ root_scalar
869870
known_params = {'loc': 45.01575354024957, 'scale': 30}
870871
res = goodness_of_fit(stats.norm, x, known_params=known_params,
871-
statistic='ad', random_state=rng)
872+
statistic='ad', rng=rng)
872873
assert_allclose(res.statistic, 2.492) # See [1] Table 1A 1.0
873874
assert_allclose(res.pvalue, 0.05, atol=5e-3)
874875

@@ -879,7 +880,7 @@ def test_against_anderson_case_1(self):
879880
# scale that produced critical value of statistic found w/ root_scalar
880881
known_params = {'scale': 29.957112639101933}
881882
res = goodness_of_fit(stats.norm, x, known_params=known_params,
882-
statistic='ad', random_state=rng)
883+
statistic='ad', rng=rng)
883884
assert_allclose(res.statistic, 0.908) # See [1] Table 1B 1.1
884885
assert_allclose(res.pvalue, 0.1, atol=5e-3)
885886

@@ -890,7 +891,7 @@ def test_against_anderson_case_2(self):
890891
# loc that produced critical value of statistic found w/ root_scalar
891892
known_params = {'loc': 44.5680212261933}
892893
res = goodness_of_fit(stats.norm, x, known_params=known_params,
893-
statistic='ad', random_state=rng)
894+
statistic='ad', rng=rng)
894895
assert_allclose(res.statistic, 2.904) # See [1] Table 1B 1.2
895896
assert_allclose(res.pvalue, 0.025, atol=5e-3)
896897

@@ -900,7 +901,7 @@ def test_against_anderson_case_3(self):
900901
# c that produced critical value of statistic found w/ root_scalar
901902
x = stats.skewnorm.rvs(1.4477847789132101, loc=1, scale=2, size=100,
902903
random_state=rng)
903-
res = goodness_of_fit(stats.norm, x, statistic='ad', random_state=rng)
904+
res = goodness_of_fit(stats.norm, x, statistic='ad', rng=rng)
904905
assert_allclose(res.statistic, 0.559) # See [1] Table 1B 1.2
905906
assert_allclose(res.pvalue, 0.15, atol=5e-3)
906907

@@ -911,7 +912,7 @@ def test_against_anderson_gumbel_r(self):
911912
x = stats.genextreme(0.051896837188595134, loc=0.5,
912913
scale=1.5).rvs(size=1000, random_state=rng)
913914
res = goodness_of_fit(stats.gumbel_r, x, statistic='ad',
914-
random_state=rng)
915+
rng=rng)
915916
ref = stats.anderson(x, dist='gumbel_r')
916917
assert_allclose(res.statistic, ref.critical_values[0])
917918
assert_allclose(res.pvalue, ref.significance_level[0]/100, atol=5e-3)
@@ -922,7 +923,7 @@ def test_against_filliben_norm(self):
922923
y = [6, 1, -4, 8, -2, 5, 0]
923924
known_params = {'loc': 0, 'scale': 1}
924925
res = stats.goodness_of_fit(stats.norm, y, known_params=known_params,
925-
statistic="filliben", random_state=rng)
926+
statistic="filliben", rng=rng)
926927
# Slight discrepancy presumably due to roundoff in Filliben's
927928
# calculation. Using exact order statistic medians instead of
928929
# Filliben's approximation doesn't account for it.
@@ -944,10 +945,10 @@ def test_filliben_property(self):
944945
rng = np.random.default_rng(8535677809395478813)
945946
x = rng.normal(loc=10, scale=0.5, size=100)
946947
res = stats.goodness_of_fit(stats.norm, x,
947-
statistic="filliben", random_state=rng)
948+
statistic="filliben", rng=rng)
948949
known_params = {'loc': 0, 'scale': 1}
949950
ref = stats.goodness_of_fit(stats.norm, x, known_params=known_params,
950-
statistic="filliben", random_state=rng)
951+
statistic="filliben", rng=rng)
951952
assert_allclose(res.statistic, ref.statistic, rtol=1e-15)
952953

953954
@pytest.mark.parametrize('case', [(25, [.928, .937, .950, .958, .966]),
@@ -960,7 +961,7 @@ def test_against_filliben_norm_table(self, case):
960961
x = rng.random(n)
961962
known_params = {'loc': 0, 'scale': 1}
962963
res = stats.goodness_of_fit(stats.norm, x, known_params=known_params,
963-
statistic="filliben", random_state=rng)
964+
statistic="filliben", rng=rng)
964965
percentiles = np.array([0.005, 0.01, 0.025, 0.05, 0.1])
965966
res = stats.scoreatpercentile(res.null_distribution, percentiles*100)
966967
assert_allclose(res, ref, atol=2e-3)
@@ -980,7 +981,7 @@ def test_against_ppcc(self, case):
980981
rng = np.random.default_rng(7777775561439803116)
981982
x = rng.normal(size=n)
982983
res = stats.goodness_of_fit(stats.rayleigh, x, statistic="filliben",
983-
random_state=rng)
984+
rng=rng)
984985
assert_allclose(res.statistic, ref_statistic, rtol=1e-4)
985986
assert_allclose(res.pvalue, ref_pvalue, atol=1.5e-2)
986987

@@ -1000,7 +1001,7 @@ def test_params_effects(self):
10001001
res1 = goodness_of_fit(stats.weibull_min, x, n_mc_samples=2,
10011002
guessed_params=guessed_params,
10021003
fit_params=fit_params,
1003-
known_params=known_params, random_state=rng)
1004+
known_params=known_params, rng=rng)
10041005
assert not np.allclose(res1.fit_result.params.c, 13.4)
10051006
assert_equal(res1.fit_result.params.scale, 13.73)
10061007
assert_equal(res1.fit_result.params.loc, -13.85)
@@ -1012,7 +1013,7 @@ def test_params_effects(self):
10121013
res2 = goodness_of_fit(stats.weibull_min, x, n_mc_samples=2,
10131014
guessed_params=guessed_params,
10141015
fit_params=fit_params,
1015-
known_params=known_params, random_state=rng)
1016+
known_params=known_params, rng=rng)
10161017
assert not np.allclose(res2.fit_result.params.c,
10171018
res1.fit_result.params.c, rtol=1e-8)
10181019
assert not np.allclose(res2.null_distribution,
@@ -1028,7 +1029,7 @@ def test_params_effects(self):
10281029
res3 = goodness_of_fit(stats.weibull_min, x, n_mc_samples=2,
10291030
guessed_params=guessed_params,
10301031
fit_params=fit_params,
1031-
known_params=known_params, random_state=rng)
1032+
known_params=known_params, rng=rng)
10321033
assert_equal(res3.fit_result.params.c, 13.4)
10331034
assert_equal(res3.fit_result.params.scale, 13.73)
10341035
assert_equal(res3.fit_result.params.loc, -13.85)
@@ -1058,7 +1059,7 @@ def greenwood(dist, data, *, axis):
10581059
data = stats.expon.rvs(size=5, random_state=rng)
10591060
result = goodness_of_fit(stats.expon, data,
10601061
known_params={'loc': 0, 'scale': 1},
1061-
statistic=greenwood, random_state=rng)
1062+
statistic=greenwood, rng=rng)
10621063
p = [.01, .05, .1, .2, .3, .4, .5, .6, .7, .8, .9, .95, .99]
10631064
exact_quantiles = [
10641065
.183863, .199403, .210088, .226040, .239947, .253677, .268422,

0 commit comments

Comments (0)