Merge pull request numpy#27034 from WarrenWeckesser/fix-beta-small-params

charris · web-flow · commit 08d600490f16 · 2024-07-26T16:55:35.000-06:00
BUG: random: Fix edge case of Johnk's algorithm for the beta distribution.
diff --git a/numpy/random/src/distributions/distributions.c b/numpy/random/src/distributions/distributions.c
@@ -436,16 +436,23 @@ double random_beta(bitgen_t *bitgen_state, double a, double b) {
       XpY = X + Y;
       /* Reject if both U and V are 0.0, which is approx 1 in 10^106 */
       if ((XpY <= 1.0) && (U + V > 0.0)) {
-        if (XpY > 0) {
+        if ((X > 0) && (Y > 0)) {
           return X / XpY;
         } else {
-          double logX = log(U) / a;
-          double logY = log(V) / b;
-          double logM = logX > logY ? logX : logY;
-          logX -= logM;
-          logY -= logM;
-
-          return exp(logX - log(exp(logX) + exp(logY)));
+          /*
+           * Either X or Y underflowed to 0, so we lost information in
+           * U**(1/a) or V**(1/b). We still compute X/(X+Y) here, but we
+           * work with logarithms as much as we can to avoid the underflow.
+           */
+          double logX = log(U)/a;
+          double logY = log(V)/b;
+          double delta = logX - logY;
+          if (delta > 0) {
+            return exp(-log1p(exp(-delta)));
+          }
+          else {
+            return exp(delta - log1p(exp(delta)));
+          }
         }
       }
     }
diff --git a/numpy/random/tests/test_generator_mt19937_regressions.py b/numpy/random/tests/test_generator_mt19937_regressions.py
@@ -86,6 +86,29 @@ def test_beta_ridiculously_small_parameters(self):
         x = self.mt19937.beta(tiny/32, tiny/40, size=50)
         assert not np.any(np.isnan(x))
 
+    def test_beta_expected_zero_frequency(self):
+        # gh-24475: For small a and b (e.g. a=0.0025, b=0.0025), beta
+        # would generate too many zeros.
+        a = 0.0025
+        b = 0.0025
+        n = 1000000
+        x = self.mt19937.beta(a, b, size=n)
+        nzeros = np.count_nonzero(x == 0)
+        # beta CDF at x = np.finfo(np.double).smallest_subnormal/2
+        # is p = 0.0776169083131899, e.g.,
+        #
+        #    import numpy as np
+        #    from mpmath import mp
+        #    mp.dps = 160
+        #    x = mp.mpf(np.finfo(np.float64).smallest_subnormal)/2
+        #    # CDF of the beta distribution at x:
+        #    p = mp.betainc(a, b, x1=0, x2=x, regularized=True)
+        #    n = 1000000
+        #    expected_freq = float(n*p)
+        #
+        expected_freq = 77616.90831318991
+        assert 0.95*expected_freq < nzeros < 1.05*expected_freq
+
     def test_choice_sum_of_probs_tolerance(self):
         # The sum of probs should be 1.0 with some tolerance.
         # For low precision dtypes the tolerance was too tight.