From b1a73253f42453cc8ff6b36915a73278f7ba3c77 Mon Sep 17 00:00:00 2001
From: Daniel Habermann <133031176+daniel-habermann@users.noreply.github.com>
Date: Fri, 11 Jul 2025 10:21:48 +0200
Subject: [PATCH 1/3] fix test_calibration_log_gamma_end_to_end unit test
 failing more often than expected

---
 tests/test_diagnostics/test_diagnostics_metrics.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/test_diagnostics/test_diagnostics_metrics.py b/tests/test_diagnostics/test_diagnostics_metrics.py
index f2c4c73c4..fb2d3d3c2 100644
--- a/tests/test_diagnostics/test_diagnostics_metrics.py
+++ b/tests/test_diagnostics/test_diagnostics_metrics.py
@@ -116,8 +116,8 @@ def run_sbc(N=N, S=S, D=D, bias=0):
         ranks = np.sum(posterior_draws < prior_draws, axis=0)

         # this is the distribution of gamma under uniform ranks
-        gamma_null = bf.diagnostics.metrics.gamma_null_distribution(D, S, num_null_draws=100)
-        lower, upper = np.quantile(gamma_null, (0.05, 0.995))
+        gamma_null = bf.diagnostics.metrics.gamma_null_distribution(D, S, num_null_draws=200)
+        lower, upper = np.quantile(gamma_null, (0.025, 0.975))

         # this is the empirical gamma
         observed_gamma = bf.diagnostics.metrics.gamma_discrepancy(ranks, num_post_draws=S)
@@ -127,7 +127,7 @@ def run_sbc(N=N, S=S, D=D, bias=0):
         return in_interval

     sbc_calibration = [run_sbc(N=N, S=S, D=D) for _ in range(100)]
-    lower_expected, upper_expected = binom.ppf((0.0005, 0.9995), 100, 0.95)
+    lower_expected, upper_expected = binom.ppf((0.0001, 0.9999), 100, 0.95)

     # this test should fail with a probability of 0.1%
     assert lower_expected <= np.sum(sbc_calibration) <= upper_expected

From ad4a7357345066398615f44de02d9d218e416972 Mon Sep 17 00:00:00 2001
From: Daniel Habermann <133031176+daniel-habermann@users.noreply.github.com>
Date: Fri, 11 Jul 2025 10:29:32 +0200
Subject: [PATCH 2/3] set alpha to 0.1% in binom.ppf

---
 tests/test_diagnostics/test_diagnostics_metrics.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_diagnostics/test_diagnostics_metrics.py b/tests/test_diagnostics/test_diagnostics_metrics.py
index fb2d3d3c2..54ee7262d 100644
--- a/tests/test_diagnostics/test_diagnostics_metrics.py
+++ b/tests/test_diagnostics/test_diagnostics_metrics.py
@@ -127,7 +127,7 @@ def run_sbc(N=N, S=S, D=D, bias=0):
         return in_interval

     sbc_calibration = [run_sbc(N=N, S=S, D=D) for _ in range(100)]
-    lower_expected, upper_expected = binom.ppf((0.0001, 0.9999), 100, 0.95)
+    lower_expected, upper_expected = binom.ppf((0.0005, 0.9995), 100, 0.95)

     # this test should fail with a probability of 0.1%
     assert lower_expected <= np.sum(sbc_calibration) <= upper_expected

From cb951caacc507fe1ae8e68f44ecde0fda27863aa Mon Sep 17 00:00:00 2001
From: Daniel Habermann <133031176+daniel-habermann@users.noreply.github.com>
Date: Fri, 11 Jul 2025 12:30:27 +0200
Subject: [PATCH 3/3] fix typo in comment

---
 tests/test_diagnostics/test_diagnostics_metrics.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_diagnostics/test_diagnostics_metrics.py b/tests/test_diagnostics/test_diagnostics_metrics.py
index 54ee7262d..92de891c4 100644
--- a/tests/test_diagnostics/test_diagnostics_metrics.py
+++ b/tests/test_diagnostics/test_diagnostics_metrics.py
@@ -132,7 +132,7 @@ def run_sbc(N=N, S=S, D=D, bias=0):
     # this test should fail with a probability of 0.1%
     assert lower_expected <= np.sum(sbc_calibration) <= upper_expected

-    # sbc should almost always fial for slightly biased posterior draws
+    # sbc should almost always fail for slightly biased posterior draws
     sbc_calibration = [run_sbc(N=N, S=S, D=D, bias=1) for _ in range(100)]
     assert not lower_expected <= np.sum(sbc_calibration) <= upper_expected
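A note on the numbers tuned above: each SBC run passes when the observed
gamma falls inside the central 95% interval of its null distribution, so
under perfect calibration each run passes with probability of about 0.95
and the pass count over 100 independent runs is Binomial(100, 0.95). The
quantiles (0.0005, 0.9995) passed to binom.ppf then give a two-sided
acceptance interval that a calibrated test leaves with probability of
about 0.1%, matching the comment in the test; this is also why PATCH 2/3
restores those quantiles after PATCH 1/3 had tightened them to
(0.0001, 0.9999). The sketch below illustrates this arithmetic only; it
is not the repository's test code, and the Bernoulli draws (with
made-up names rng, n_runs, p_pass) merely stand in for run_sbc():

    import numpy as np
    from scipy.stats import binom

    rng = np.random.default_rng(2025)  # illustrative seed

    # Under perfect calibration each run passes with probability ~0.95,
    # so the number of passes over 100 runs is Binomial(100, 0.95).
    n_runs, p_pass = 100, 0.95

    # Two-sided bounds at overall alpha = 0.1%: a calibrated test's
    # pass count leaves [lower, upper] with probability about 0.001.
    lower_expected, upper_expected = binom.ppf((0.0005, 0.9995), n_runs, p_pass)

    # Stand-in for the test's run_sbc() results: independent Bernoulli draws.
    sbc_calibration = rng.random(n_runs) < p_pass

    assert lower_expected <= np.sum(sbc_calibration) <= upper_expected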