diff --git a/tests/test_diagnostics/test_diagnostics_metrics.py b/tests/test_diagnostics/test_diagnostics_metrics.py
index 92de891c4..8f132732c 100644
--- a/tests/test_diagnostics/test_diagnostics_metrics.py
+++ b/tests/test_diagnostics/test_diagnostics_metrics.py
@@ -103,6 +103,8 @@ def test_calibration_log_gamma_end_to_end():
     S = 1000  # number of posterior draws
     D = 1000  # number of datasets
 
+    gamma_null = bf.diagnostics.metrics.gamma_null_distribution(D, S, num_null_draws=10000)
+
     def run_sbc(N=N, S=S, D=D, bias=0):
         rng = np.random.default_rng()
         prior_draws = rng.beta(2, 2, size=D)
@@ -110,13 +112,12 @@ def run_sbc(N=N, S=S, D=D, bias=0):
 
         # Analytical posterior:
         # if theta ~ Beta(2, 2), then p(theta|successes) is Beta(2 + successes, 2 + N - successes).
-        posterior_draws = rng.beta(2 + successes + bias, 2 + N - successes + bias, size=(S, D))
+        posterior_draws = rng.beta(2 + successes + bias, 2 + N - successes, size=(S, D))
 
         # these ranks are uniform if bias=0
        ranks = np.sum(posterior_draws < prior_draws, axis=0)
 
         # this is the distribution of gamma under uniform ranks
-        gamma_null = bf.diagnostics.metrics.gamma_null_distribution(D, S, num_null_draws=200)
         lower, upper = np.quantile(gamma_null, (0.025, 0.975))
 
         # this is the empirical gamma
@@ -127,13 +128,13 @@ def run_sbc(N=N, S=S, D=D, bias=0):
         return in_interval
 
     sbc_calibration = [run_sbc(N=N, S=S, D=D) for _ in range(100)]
-    lower_expected, upper_expected = binom.ppf((0.0005, 0.9995), 100, 0.95)
+    lower_expected, upper_expected = binom.ppf((0.00005, 0.99995), 100, 0.95)
 
-    # this test should fail with a probability of 0.1%
+    # this test should fail with a probability of 0.01%
     assert lower_expected <= np.sum(sbc_calibration) <= upper_expected
 
     # sbc should almost always fail for slightly biased posterior draws
-    sbc_calibration = [run_sbc(N=N, S=S, D=D, bias=1) for _ in range(100)]
+    sbc_calibration = [run_sbc(N=N, S=S, D=D, bias=2) for _ in range(100)]
     assert not lower_expected <= np.sum(sbc_calibration) <= upper_expected
 
 
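
For reference: when the posterior is calibrated, each of the 100 run_sbc repetitions lands inside the central 95% interval of the null distribution with probability 0.95, so the count of passing repetitions follows Binomial(100, 0.95). A minimal sketch of the resulting false-failure rate, assuming only scipy.stats.binom (which the test already uses); p_fail here is an illustrative name, not part of the test:

    from scipy.stats import binom

    # Pass count under the null hypothesis: Binomial(100, 0.95).
    lower, upper = binom.ppf((0.00005, 0.99995), 100, 0.95)

    # The assertion fails when the count falls outside [lower, upper];
    # under the null this happens with probability at most ~0.01%
    # (P(X < lower) + P(X > upper), each bounded by 0.00005).
    p_fail = binom.cdf(lower - 1, 100, 0.95) + binom.sf(upper, 100, 0.95)
    print(lower, upper, p_fail)

Because the distribution is discrete, the realized failure probability is typically well below the 0.01% bound (e.g., P(X = 100) already exceeds 0.00005, so the upper bound of the interval is 100 and the upper tail never triggers a failure).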