From b1a73253f42453cc8ff6b36915a73278f7ba3c77 Mon Sep 17 00:00:00 2001
From: Daniel Habermann <133031176+daniel-habermann@users.noreply.github.com>
Date: Fri, 11 Jul 2025 10:21:48 +0200
Subject: [PATCH 1/3] fix test_calibration_log_gamma_end_to_end unit test
 failing more often than expected

---
 tests/test_diagnostics/test_diagnostics_metrics.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/test_diagnostics/test_diagnostics_metrics.py b/tests/test_diagnostics/test_diagnostics_metrics.py
index f2c4c73c4..fb2d3d3c2 100644
--- a/tests/test_diagnostics/test_diagnostics_metrics.py
+++ b/tests/test_diagnostics/test_diagnostics_metrics.py
@@ -116,8 +116,8 @@ def run_sbc(N=N, S=S, D=D, bias=0):
         ranks = np.sum(posterior_draws < prior_draws, axis=0)

         # this is the distribution of gamma under uniform ranks
-        gamma_null = bf.diagnostics.metrics.gamma_null_distribution(D, S, num_null_draws=100)
-        lower, upper = np.quantile(gamma_null, (0.05, 0.995))
+        gamma_null = bf.diagnostics.metrics.gamma_null_distribution(D, S, num_null_draws=200)
+        lower, upper = np.quantile(gamma_null, (0.025, 0.975))

         # this is the empirical gamma
         observed_gamma = bf.diagnostics.metrics.gamma_discrepancy(ranks, num_post_draws=S)
@@ -127,7 +127,7 @@ def run_sbc(N=N, S=S, D=D, bias=0):
         return in_interval

     sbc_calibration = [run_sbc(N=N, S=S, D=D) for _ in range(100)]
-    lower_expected, upper_expected = binom.ppf((0.0005, 0.9995), 100, 0.95)
+    lower_expected, upper_expected = binom.ppf((0.0001, 0.9999), 100, 0.95)

     # this test should fail with a probability of 0.1%
     assert lower_expected <= np.sum(sbc_calibration) <= upper_expected

From ad4a7357345066398615f44de02d9d218e416972 Mon Sep 17 00:00:00 2001
From: Daniel Habermann <133031176+daniel-habermann@users.noreply.github.com>
Date: Fri, 11 Jul 2025 10:29:32 +0200
Subject: [PATCH 2/3] set alpha to 0.1% in binom.ppf

---
 tests/test_diagnostics/test_diagnostics_metrics.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_diagnostics/test_diagnostics_metrics.py b/tests/test_diagnostics/test_diagnostics_metrics.py
index fb2d3d3c2..54ee7262d 100644
--- a/tests/test_diagnostics/test_diagnostics_metrics.py
+++ b/tests/test_diagnostics/test_diagnostics_metrics.py
@@ -127,7 +127,7 @@ def run_sbc(N=N, S=S, D=D, bias=0):
         return in_interval

     sbc_calibration = [run_sbc(N=N, S=S, D=D) for _ in range(100)]
-    lower_expected, upper_expected = binom.ppf((0.0001, 0.9999), 100, 0.95)
+    lower_expected, upper_expected = binom.ppf((0.0005, 0.9995), 100, 0.95)

     # this test should fail with a probability of 0.1%
     assert lower_expected <= np.sum(sbc_calibration) <= upper_expected

From cb951caacc507fe1ae8e68f44ecde0fda27863aa Mon Sep 17 00:00:00 2001
From: Daniel Habermann <133031176+daniel-habermann@users.noreply.github.com>
Date: Fri, 11 Jul 2025 12:30:27 +0200
Subject: [PATCH 3/3] fix typo in comment

---
 tests/test_diagnostics/test_diagnostics_metrics.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_diagnostics/test_diagnostics_metrics.py b/tests/test_diagnostics/test_diagnostics_metrics.py
index 54ee7262d..92de891c4 100644
--- a/tests/test_diagnostics/test_diagnostics_metrics.py
+++ b/tests/test_diagnostics/test_diagnostics_metrics.py
@@ -132,7 +132,7 @@ def run_sbc(N=N, S=S, D=D, bias=0):
     # this test should fail with a probability of 0.1%
     assert lower_expected <= np.sum(sbc_calibration) <= upper_expected

-    # sbc should almost always fial for slightly biased posterior draws
+    # sbc should almost always fail for slightly biased posterior draws
     sbc_calibration = [run_sbc(N=N, S=S, D=D, bias=1) for _ in range(100)]
     assert not lower_expected <= np.sum(sbc_calibration) <= upper_expected
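A note on the numbers tuned above: each SBC run passes when the observed
gamma falls inside the central 95% interval of its null distribution, so
under perfect calibration each run passes with probability of about 0.95
and the pass count over 100 independent runs is Binomial(100, 0.95). The
quantiles (0.0005, 0.9995) passed to binom.ppf then give a two-sided
acceptance interval that a calibrated test leaves with probability of
about 0.1%, matching the comment in the test; this is also why PATCH 2/3
restores those quantiles after PATCH 1/3 had tightened them to
(0.0001, 0.9999). The sketch below illustrates this arithmetic only; it
is not the repository's test code, and the Bernoulli draws (with
made-up names rng, n_runs, p_pass) merely stand in for run_sbc():

    import numpy as np
    from scipy.stats import binom

    rng = np.random.default_rng(2025)  # illustrative seed

    # Under perfect calibration each run passes with probability ~0.95,
    # so the number of passes over 100 runs is Binomial(100, 0.95).
    n_runs, p_pass = 100, 0.95

    # Two-sided bounds at overall alpha = 0.1%: a calibrated test's
    # pass count leaves [lower, upper] with probability about 0.001.
    lower_expected, upper_expected = binom.ppf((0.0005, 0.9995), n_runs, p_pass)

    # Stand-in for the test's run_sbc() results: independent Bernoulli draws.
    sbc_calibration = rng.random(n_runs) < p_pass

    assert lower_expected <= np.sum(sbc_calibration) <= upper_expected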