add log_gamma diagnostic

daniel-habermann · daniel-habermann · commit b050ffbd1fb2 · 2025-06-27T00:49:49.000+02:00
diff --git a/bayesflow/diagnostics/metrics/log_gamma.py b/bayesflow/diagnostics/metrics/log_gamma.py
@@ -0,0 +1,149 @@
+from collections.abc import Mapping, Sequence
+
+import numpy as np
+from scipy.stats import binom
+
+from ...utils.dict_utils import dicts_to_arrays
+
+
+def log_gamma(
+    estimates: Mapping[str, np.ndarray] | np.ndarray,
+    targets: Mapping[str, np.ndarray] | np.ndarray,
+    variable_keys: Sequence[str] | None = None,
+    variable_names: Sequence[str] | None = None,
+    num_null_draws: int = 1000,
+    quantile: float = 0.05,
+) -> dict:
+    """
+    Compute the log gamma discrepancy statistic, see [1] for additional information.
+    Log gamma is log(gamma / gamma_null), where gamma_null is the chosen ``quantile``
+    (by default the 5th percentile) of the null distribution under uniformity of ranks.
+    That is, if adopting a hypothesis testing framework, then log_gamma < 0 implies
+    a rejection of the hypothesis of uniform ranks at the 5% level (default ``quantile``).
+    This diagnostic is typically more sensitive than the Kolmogorov-Smirnov test or
+    ChiSq test.
+
+    [1]  Martin Modrák. Angie H. Moon. Shinyoung Kim. Paul Bürkner. Niko Huurre.
+    Kateřina Faltejsková. Andrew Gelman. Aki Vehtari.
+    "Simulation-Based Calibration Checking for Bayesian Computation:
+    The Choice of Test Quantities Shapes Sensitivity."
+    Bayesian Anal. 20 (2) 461 - 488, June 2025. https://doi.org/10.1214/23-BA1404
+
+    Parameters
+    ----------
+    estimates : np.ndarray of shape (num_datasets, num_draws, num_variables)
+        The random draws from the approximate posteriors over ``num_datasets``
+    targets : np.ndarray of shape (num_datasets, num_variables)
+        The corresponding ground-truth values sampled from the prior
+    variable_keys : Sequence[str], optional (default = None)
+       Select keys from the dictionaries provided in estimates and targets.
+       By default, select all keys.
+    variable_names : Sequence[str], optional (default = None)
+        Optional variable names to show in the output.
+    num_null_draws : int, optional, default 1000
+        Number of gamma values simulated under uniformity to estimate the threshold.
+    quantile : float in (0, 1), optional, default 0.05
+        The quantile from the null distribution to be used as a threshold.
+        A lower quantile lowers the threshold, so uniformity is rejected less often.
+
+    Returns
+    -------
+    result : dict
+        Dictionary with the keys ``values`` (array with one log gamma per variable),
+        ``metric_name`` and ``variable_names``.
+    """
+    samples = dicts_to_arrays(
+        estimates=estimates,
+        targets=targets,
+        variable_keys=variable_keys,
+        variable_names=variable_names,
+    )
+    num_ranks, num_post_draws = samples["estimates"].shape[:2]
+
+    # rank statistics: number of posterior draws below the target, per dataset and variable
+    ranks = np.sum(samples["estimates"] < samples["targets"][:, None], axis=1)
+
+    # null distribution and threshold
+    null_distribution = gamma_null_distribution(num_ranks, num_post_draws, num_null_draws)
+    null_quantile = np.quantile(null_distribution, quantile)
+
+    # compute log gamma for each parameter
+    gammas = [gamma_discrepancy(ranks[:, i], num_post_draws=num_post_draws) for i in range(ranks.shape[-1])]
+    return {
+        "values": np.log(np.array(gammas) / null_quantile),
+        "metric_name": "Log Gamma",
+        "variable_names": samples["estimates"].variable_names,
+    }
+
+
+def gamma_null_distribution(num_ranks: int, num_post_draws: int = 1000, num_null_draws: int = 1000) -> np.ndarray:
+    """
+    Computes the distribution of expected gamma values under uniformity of ranks.
+
+    Parameters
+    ----------
+    num_ranks : int
+        Number of ranks to use for each gamma.
+    num_post_draws : int, optional, default 1000
+        Number of posterior draws that were used to calculate the rank distribution.
+    num_null_draws : int, optional, default 1000
+        Number of returned gamma values under uniformity of ranks.
+
+    Returns
+    -------
+    result : np.ndarray
+        Array of shape (num_null_draws,) containing gamma values under uniformity of ranks.
+    """
+    z_i = np.arange(1, num_post_draws + 2) / (num_post_draws + 1)
+    gamma = np.empty(num_null_draws)
+
+    # loop non-vectorized to reduce memory footprint
+    for i in range(num_null_draws):
+        u = np.random.uniform(size=num_ranks)
+        # integer counts: num_ranks * mean can round just below an integer and binom.cdf floors it
+        R_z = np.sum(u[:, None] < z_i, axis=0)
+        bin_1 = binom.cdf(R_z, num_ranks, z_i)
+        bin_2 = 1 - binom.cdf(R_z - 1, num_ranks, z_i)
+        gamma[i] = 2 * np.min(np.minimum(bin_1, bin_2))
+
+    return gamma
+
+
+def gamma_discrepancy(ranks: np.ndarray, num_post_draws: int = 100) -> float:
+    """
+    Quantifies deviation from uniformity by the likelihood of observing the
+    most extreme point on the empirical CDF of the given rank distribution
+    according to [1] (equation 7).
+
+    [1]  Martin Modrák. Angie H. Moon. Shinyoung Kim. Paul Bürkner. Niko Huurre.
+    Kateřina Faltejsková. Andrew Gelman. Aki Vehtari.
+    "Simulation-Based Calibration Checking for Bayesian Computation:
+    The Choice of Test Quantities Shapes Sensitivity."
+    Bayesian Anal. 20 (2) 461 - 488, June 2025. https://doi.org/10.1214/23-BA1404
+
+    Parameters
+    ----------
+    ranks : array of shape (num_ranks,)
+        Empirical rank distribution
+    num_post_draws : int, optional, default 100
+        Number of posterior draws used to generate ranks.
+
+    Returns
+    -------
+    result : float
+        Gamma discrepancy value of the given rank distribution.
+    """
+    num_ranks = len(ranks)
+    thresholds = np.arange(1, num_post_draws + 2)
+
+    # observed count of ranks smaller than i (binary search instead of a Python-level scan)
+    R_i = np.searchsorted(np.sort(ranks), thresholds, side="left")
+    # expected proportion of ranks smaller than i
+    z_i = thresholds / (num_post_draws + 1)
+
+    bin_1 = binom.cdf(R_i, num_ranks, z_i)
+    bin_2 = 1 - binom.cdf(R_i - 1, num_ranks, z_i)
+
+    # likelihood of obtaining the most extreme point on the empirical CDF
+    # if the rank distribution was indeed uniform
+    return float(2 * np.min(np.minimum(bin_1, bin_2)))
diff --git a/tests/test_diagnostics/test_diagnostics_metrics.py b/tests/test_diagnostics/test_diagnostics_metrics.py
@@ -1,6 +1,7 @@
-import numpy as np
 import keras
+import numpy as np
 import pytest
+from scipy.stats import binom
 
 import bayesflow as bf
 
@@ -84,6 +85,50 @@ def test_expected_calibration_error(pred_models, true_models, model_names):
         out = bf.diagnostics.metrics.expected_calibration_error(pred_models, true_models.transpose)
 
 
+def test_log_gamma():
+    # Functional test of the gamma statistic on simulation-based calibration (SBC) runs.
+    # Data come from a known generative process with an analytical posterior, so with
+    # bias=0 the ranks are uniform and the observed gamma should fall inside the bulk
+    # of its null distribution in roughly 95% of the repeated runs.
+
+    num_obs = 30  # number of samples
+    num_draws = 1000  # number of posterior draws
+    num_datasets = 1000  # number of datasets
+    num_runs = 100  # number of repeated SBC runs
+    metric = bf.diagnostics.metrics.log_gamma
+
+    def gamma_in_null_bulk(bias=0):
+        rng = np.random.default_rng()
+        theta = rng.beta(2, 2, size=num_datasets)
+        successes = rng.binomial(num_obs, theta)
+
+        # Analytical posterior for a Beta(2, 2) prior: Beta(2 + successes, 2 + num_obs - successes).
+        alpha = 2 + successes + bias
+        beta = 2 + num_obs - successes + bias
+        post = rng.beta(alpha, beta, size=(num_draws, num_datasets))
+
+        # these ranks are uniform if bias=0
+        ranks = np.sum(post < theta, axis=0)
+
+        # 5% and 99.5% quantiles of gamma under uniform ranks
+        null = metric.gamma_null_distribution(num_datasets, num_draws, num_null_draws=100)
+        lower, upper = np.quantile(null, (0.05, 0.995))
+
+        # empirical gamma of the simulated ranks
+        observed = metric.gamma_discrepancy(ranks, num_post_draws=num_draws)
+        return lower <= observed < upper
+
+    lower_expected, upper_expected = binom.ppf((0.005, 0.995), num_runs, 0.95)
+
+    # this check is expected to fail by chance in about 1% of test runs
+    num_passed = np.sum([gamma_in_null_bulk() for _ in range(num_runs)])
+    assert lower_expected <= num_passed <= upper_expected
+
+    # SBC should almost always fail for slightly biased posterior draws
+    num_passed = np.sum([gamma_in_null_bulk(bias=1) for _ in range(num_runs)])
+    assert not (lower_expected <= num_passed <= upper_expected)
+
+
 def test_bootstrap_comparison_shapes():
     """Test the bootstrap_comparison output shapes."""
     observed_samples = np.random.rand(10, 5)