From e329f4ba788ad487985613801af326f0a0c495cc Mon Sep 17 00:00:00 2001 From: Valentin Pratz <112951103+vpratz@users.noreply.github.com> Date: Sun, 22 Jun 2025 10:02:09 +0200 Subject: [PATCH 1/6] fix trainable parameters in distributions (#520) --- bayesflow/distributions/diagonal_normal.py | 13 +++++----- bayesflow/distributions/diagonal_student_t.py | 25 ++++++++++--------- bayesflow/distributions/mixture.py | 2 +- 3 files changed, 21 insertions(+), 19 deletions(-) diff --git a/bayesflow/distributions/diagonal_normal.py b/bayesflow/distributions/diagonal_normal.py index f8d93b945..6b64445c7 100644 --- a/bayesflow/distributions/diagonal_normal.py +++ b/bayesflow/distributions/diagonal_normal.py @@ -58,7 +58,6 @@ def __init__( self.seed_generator = seed_generator or keras.random.SeedGenerator() self.dim = None - self.log_normalization_constant = None self._mean = None self._std = None @@ -71,17 +70,18 @@ def build(self, input_shape: Shape) -> None: self.mean = ops.cast(ops.broadcast_to(self.mean, (self.dim,)), "float32") self.std = ops.cast(ops.broadcast_to(self.std, (self.dim,)), "float32") - self.log_normalization_constant = -0.5 * self.dim * math.log(2.0 * math.pi) - ops.sum(ops.log(self.std)) - if self.trainable_parameters: self._mean = self.add_weight( shape=ops.shape(self.mean), - initializer=keras.initializers.get(self.mean), + initializer=keras.initializers.get(keras.ops.copy(self.mean)), dtype="float32", trainable=True, ) self._std = self.add_weight( - shape=ops.shape(self.std), initializer=keras.initializers.get(self.std), dtype="float32", trainable=True + shape=ops.shape(self.std), + initializer=keras.initializers.get(keras.ops.copy(self.std)), + dtype="float32", + trainable=True, ) else: self._mean = self.mean @@ -91,7 +91,8 @@ def log_prob(self, samples: Tensor, *, normalize: bool = True) -> Tensor: result = -0.5 * ops.sum((samples - self._mean) ** 2 / self._std**2, axis=-1) if normalize: - result += self.log_normalization_constant + log_normalization_constant = -0.5 * self.dim * math.log(2.0 * math.pi) - ops.sum(ops.log(self._std)) + result += log_normalization_constant return result diff --git a/bayesflow/distributions/diagonal_student_t.py b/bayesflow/distributions/diagonal_student_t.py index 98e3fb7eb..9b02ee821 100644 --- a/bayesflow/distributions/diagonal_student_t.py +++ b/bayesflow/distributions/diagonal_student_t.py @@ -63,7 +63,6 @@ def __init__( self.seed_generator = seed_generator or keras.random.SeedGenerator() - self.log_normalization_constant = None self.dim = None self._loc = None self._scale = None @@ -78,21 +77,16 @@ def build(self, input_shape: Shape) -> None: self.loc = ops.cast(ops.broadcast_to(self.loc, (self.dim,)), "float32") self.scale = ops.cast(ops.broadcast_to(self.scale, (self.dim,)), "float32") - self.log_normalization_constant = ( - -0.5 * self.dim * math.log(self.df) - - 0.5 * self.dim * math.log(math.pi) - - math.lgamma(0.5 * self.df) - + math.lgamma(0.5 * (self.df + self.dim)) - - ops.sum(keras.ops.log(self.scale)) - ) - if self.trainable_parameters: self._loc = self.add_weight( - shape=ops.shape(self.loc), initializer=keras.initializers.get(self.loc), dtype="float32", trainable=True + shape=ops.shape(self.loc), + initializer=keras.initializers.get(keras.ops.copy(self.loc)), + dtype="float32", + trainable=True, ) self._scale = self.add_weight( shape=ops.shape(self.scale), - initializer=keras.initializers.get(self.scale), + initializer=keras.initializers.get(keras.ops.copy(self.scale)), dtype="float32", trainable=True, ) @@ -105,7 +99,14 @@ def 
log_prob(self, samples: Tensor, *, normalize: bool = True) -> Tensor: result = -0.5 * (self.df + self.dim) * ops.log1p(mahalanobis_term / self.df) if normalize: - result += self.log_normalization_constant + log_normalization_constant = ( + -0.5 * self.dim * math.log(self.df) + - 0.5 * self.dim * math.log(math.pi) + - math.lgamma(0.5 * self.df) + + math.lgamma(0.5 * (self.df + self.dim)) + - ops.sum(keras.ops.log(self._scale)) + ) + result += log_normalization_constant return result diff --git a/bayesflow/distributions/mixture.py b/bayesflow/distributions/mixture.py index a7bf2ea27..e1f04e88f 100644 --- a/bayesflow/distributions/mixture.py +++ b/bayesflow/distributions/mixture.py @@ -144,7 +144,7 @@ def build(self, input_shape: Shape) -> None: self._mixture_logits = self.add_weight( shape=(len(self.distributions),), - initializer=keras.initializers.get(self.mixture_logits), + initializer=keras.initializers.get(keras.ops.copy(self.mixture_logits)), dtype="float32", trainable=self.trainable_mixture, ) From f916855f9ba6b13e94fcbfc61f6b724dd3072d66 Mon Sep 17 00:00:00 2001 From: han-ol Date: Mon, 23 Jun 2025 15:29:04 +0200 Subject: [PATCH 2/6] Improve numerical precision in MVNScore.log_prob --- bayesflow/scores/multivariate_normal_score.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/bayesflow/scores/multivariate_normal_score.py b/bayesflow/scores/multivariate_normal_score.py index 16e851427..71d45d4b2 100644 --- a/bayesflow/scores/multivariate_normal_score.py +++ b/bayesflow/scores/multivariate_normal_score.py @@ -82,13 +82,15 @@ def log_prob(self, x: Tensor, mean: Tensor, cov_chol: Tensor) -> Tensor: """ diff = x - mean - # Calculate covariance from Cholesky factors - covariance = keras.ops.matmul( - cov_chol, - keras.ops.swapaxes(cov_chol, -2, -1), + # Calculate precision from Cholesky factors of covariance matrix + cov_chol_inv = keras.ops.inv(cov_chol) + precision = keras.ops.matmul( + keras.ops.swapaxes(cov_chol_inv, -2, -1), + cov_chol_inv, ) - precision = keras.ops.inv(covariance) - log_det_covariance = keras.ops.slogdet(covariance)[1] # Only take the log of the determinant part + + # Compute log determinant, exploiting Cholesky factors + log_det_covariance = keras.ops.log(keras.ops.prod(keras.ops.diagonal(cov_chol, axis1=1, axis2=2), axis=1)) * 2 # Compute the quadratic term in the exponential of the multivariate Gaussian quadratic_term = keras.ops.einsum("...i,...ij,...j->...", diff, precision, diff) From 0c99bd9bee5e23821696fe228366f0d3ebd88fed Mon Sep 17 00:00:00 2001 From: Daniel Habermann <133031176+daniel-habermann@users.noreply.github.com> Date: Mon, 30 Jun 2025 12:03:22 +0200 Subject: [PATCH 3/6] add log_gamma diagnostic (#522) * add log_gamma diagnostic * add missing export for log_gamma * add missing export for gamma_null_distribution, gamma_discrepancy * fix broken unit tests * rename log_gamma module to sbc * add test_log_gamma unit test * add return information to log_gamma doc string * fix typo in docstring, use fixed-length np array to collect log_gammas instead of appending to an empty list --- bayesflow/diagnostics/metrics/__init__.py | 1 + bayesflow/diagnostics/metrics/sbc.py | 162 ++++++++++++++++++ .../test_diagnostics_metrics.py | 55 +++++- 3 files changed, 217 insertions(+), 1 deletion(-) create mode 100644 bayesflow/diagnostics/metrics/sbc.py diff --git a/bayesflow/diagnostics/metrics/__init__.py b/bayesflow/diagnostics/metrics/__init__.py index 3e3496cda..2acd6b5b1 100644 --- a/bayesflow/diagnostics/metrics/__init__.py +++ 
b/bayesflow/diagnostics/metrics/__init__.py
@@ -4,3 +4,4 @@
 from .expected_calibration_error import expected_calibration_error
 from .classifier_two_sample_test import classifier_two_sample_test
 from .model_misspecification import bootstrap_comparison, summary_space_comparison
+from .sbc import log_gamma
diff --git a/bayesflow/diagnostics/metrics/sbc.py b/bayesflow/diagnostics/metrics/sbc.py
new file mode 100644
index 000000000..57b364a63
--- /dev/null
+++ b/bayesflow/diagnostics/metrics/sbc.py
@@ -0,0 +1,162 @@
+from collections.abc import Mapping, Sequence
+
+import numpy as np
+from scipy.stats import binom
+
+from ...utils.dict_utils import dicts_to_arrays
+
+
+def log_gamma(
+    estimates: Mapping[str, np.ndarray] | np.ndarray,
+    targets: Mapping[str, np.ndarray] | np.ndarray,
+    variable_keys: Sequence[str] = None,
+    variable_names: Sequence[str] = None,
+    num_null_draws: int = 1000,
+    quantile: float = 0.05,
+):
+    """
+    Compute the log gamma discrepancy statistic, see [1] for additional information.
+    Log gamma is log(gamma/gamma_null), where gamma_null is the 5th percentile of the
+    null distribution under uniformity of ranks.
+    That is, if adopting a hypothesis testing framework, then log_gamma < 0 implies
+    a rejection of the hypothesis of uniform ranks at the 5% level.
+    This diagnostic is typically more sensitive than the Kolmogorov-Smirnov test or
+    the chi-squared test.
+
+    [1] Martin Modrák. Angie H. Moon. Shinyoung Kim. Paul Bürkner. Niko Huurre.
+    Kateřina Faltejsková. Andrew Gelman. Aki Vehtari.
+    "Simulation-Based Calibration Checking for Bayesian Computation:
+    The Choice of Test Quantities Shapes Sensitivity."
+    Bayesian Anal. 20 (2) 461 - 488, June 2025. https://doi.org/10.1214/23-BA1404
+
+    Parameters
+    ----------
+    estimates : np.ndarray of shape (num_datasets, num_draws, num_variables)
+        The random draws from the approximate posteriors over ``num_datasets``
+    targets : np.ndarray of shape (num_datasets, num_variables)
+        The corresponding ground-truth values sampled from the prior
+    variable_keys : Sequence[str], optional (default = None)
+        Select keys from the dictionaries provided in estimates and targets.
+        By default, select all keys.
+    variable_names : Sequence[str], optional (default = None)
+        Optional variable names to show in the output.
+    quantile : float in (0, 1), optional, default 0.05
+        The quantile from the null distribution to be used as a threshold.
+        A lower quantile increases sensitivity to deviations from uniformity.
+
+    Returns
+    -------
+    result : dict
+        Dictionary containing:
+
+        - "values" : float or np.ndarray
+            The log gamma values per variable
+        - "metric_name" : str
+            The name of the metric ("Log Gamma").
+        - "variable_names" : str
+            The (inferred) variable names.
+ """ + samples = dicts_to_arrays( + estimates=estimates, + targets=targets, + variable_keys=variable_keys, + variable_names=variable_names, + ) + + num_ranks = samples["estimates"].shape[0] + num_post_draws = samples["estimates"].shape[1] + + # rank statistics + ranks = np.sum(samples["estimates"] < samples["targets"][:, None], axis=1) + + # null distribution and threshold + null_distribution = gamma_null_distribution(num_ranks, num_post_draws, num_null_draws) + null_quantile = np.quantile(null_distribution, quantile) + + # compute log gamma for each parameter + log_gammas = np.empty(ranks.shape[-1]) + + for i in range(ranks.shape[-1]): + gamma = gamma_discrepancy(ranks[:, i], num_post_draws=num_post_draws) + log_gammas[i] = np.log(gamma / null_quantile) + + output = { + "values": log_gammas, + "metric_name": "Log Gamma", + "variable_names": samples["estimates"].variable_names, + } + + return output + + +def gamma_null_distribution(num_ranks: int, num_post_draws: int = 1000, num_null_draws: int = 1000) -> np.ndarray: + """ + Computes the distribution of expected gamma values under uniformity of ranks. + + Parameters + ---------- + num_ranks : int + Number of ranks to use for each gamma. + num_post_draws : int, optional, default 1000 + Number of posterior draws that were used to calculate the rank distribution. + num_null_draws : int, optional, default 1000 + Number of returned gamma values under uniformity of ranks. + + Returns + ------- + result : np.ndarray + Array of shape (num_null_draws,) containing gamma values under uniformity of ranks. + """ + z_i = np.arange(1, num_post_draws + 2) / (num_post_draws + 1) + gamma = np.empty(num_null_draws) + + # loop non-vectorized to reduce memory footprint + for i in range(num_null_draws): + u = np.random.uniform(size=num_ranks) + F_z = np.mean(u[:, None] < z_i, axis=0) + bin_1 = binom.cdf(num_ranks * F_z, num_ranks, z_i) + bin_2 = 1 - binom.cdf(num_ranks * F_z - 1, num_ranks, z_i) + + gamma[i] = 2 * np.min(np.minimum(bin_1, bin_2)) + + return gamma + + +def gamma_discrepancy(ranks: np.ndarray, num_post_draws: int = 100) -> float: + """ + Quantifies deviation from uniformity by the likelihood of observing the + most extreme point on the empirical CDF of the given rank distribution + according to [1] (equation 7). + + [1] Martin Modrák. Angie H. Moon. Shinyoung Kim. Paul Bürkner. Niko Huurre. + Kateřina Faltejsková. Andrew Gelman. Aki Vehtari. + "Simulation-Based Calibration Checking for Bayesian Computation: + The Choice of Test Quantities Shapes Sensitivity." + Bayesian Anal. 20 (2) 461 - 488, June 2025. https://doi.org/10.1214/23-BA1404 + + Parameters + ---------- + ranks : array of shape (num_ranks,) + Empirical rank distribution + num_post_draws : int, optional, default 100 + Number of posterior draws used to generate ranks. + + Returns + ------- + result : float + Gamma discrepancy values for each parameter. 
+ """ + num_ranks = len(ranks) + + # observed count of ranks smaller than i + R_i = np.array([sum(ranks < i) for i in range(1, num_post_draws + 2)]) + + # expected proportion of ranks smaller than i + z_i = np.arange(1, num_post_draws + 2) / (num_post_draws + 1) + + bin_1 = binom.cdf(R_i, num_ranks, z_i) + bin_2 = 1 - binom.cdf(R_i - 1, num_ranks, z_i) + + # likelihood of obtaining the most extreme point on the empirical CDF + # if the rank distribution was indeed uniform + return float(2 * np.min(np.minimum(bin_1, bin_2))) diff --git a/tests/test_diagnostics/test_diagnostics_metrics.py b/tests/test_diagnostics/test_diagnostics_metrics.py index 3a2c711bc..789b6ea0c 100644 --- a/tests/test_diagnostics/test_diagnostics_metrics.py +++ b/tests/test_diagnostics/test_diagnostics_metrics.py @@ -1,6 +1,7 @@ -import numpy as np import keras +import numpy as np import pytest +from scipy.stats import binom import bayesflow as bf @@ -84,6 +85,58 @@ def test_expected_calibration_error(pred_models, true_models, model_names): out = bf.diagnostics.metrics.expected_calibration_error(pred_models, true_models.transpose) +def test_log_gamma(random_estimates, random_targets): + out = bf.diagnostics.metrics.log_gamma(random_estimates, random_targets) + assert list(out.keys()) == ["values", "metric_name", "variable_names"] + assert out["values"].shape == (num_variables(random_estimates),) + assert out["metric_name"] == "Log Gamma" + assert out["variable_names"] == ["beta_0", "beta_1", "sigma"] + + +def test_log_gamma_end_to_end(): + # This is a function test for simulation-based calibration. + # First, we sample from a known generative process and then run SBC. + # If the log gamma statistic is correctly implemented, a 95% interval should exclude + # the true value 5% of the time. + + N = 30 # number of samples + S = 1000 # number of posterior draws + D = 1000 # number of datasets + + def run_sbc(N=N, S=S, D=D, bias=0): + rng = np.random.default_rng() + prior_draws = rng.beta(2, 2, size=D) + successes = rng.binomial(N, prior_draws) + + # Analytical posterior: + # if theta ~ Beta(2, 2), then p(theta|successes) is Beta(2 + successes | 2 + N - successes). 
+        posterior_draws = rng.beta(2 + successes + bias, 2 + N - successes + bias, size=(S, D))
+
+        # these ranks are uniform if bias=0
+        ranks = np.sum(posterior_draws < prior_draws, axis=0)
+
+        # this is the distribution of gamma under uniform ranks
+        gamma_null = bf.diagnostics.metrics.sbc.gamma_null_distribution(D, S, num_null_draws=100)
+        lower, upper = np.quantile(gamma_null, (0.05, 0.995))
+
+        # this is the empirical gamma
+        observed_gamma = bf.diagnostics.metrics.sbc.gamma_discrepancy(ranks, num_post_draws=S)
+
+        in_interval = lower <= observed_gamma < upper
+
+        return in_interval
+
+    sbc_calibration = [run_sbc(N=N, S=S, D=D) for _ in range(100)]
+    lower_expected, upper_expected = binom.ppf((0.0005, 0.9995), 100, 0.95)
+
+    # this test should fail with a probability of 0.1%
+    assert lower_expected <= np.sum(sbc_calibration) <= upper_expected
+
+    # sbc should almost always fail for slightly biased posterior draws
+    sbc_calibration = [run_sbc(N=N, S=S, D=D, bias=1) for _ in range(100)]
+    assert not lower_expected <= np.sum(sbc_calibration) <= upper_expected
+
+
 def test_bootstrap_comparison_shapes():
     """Test the bootstrap_comparison output shapes."""
     observed_samples = np.random.rand(10, 5)

From 55d51dfff3ba4704cfc55fa830cbc9bb9c8e2258 Mon Sep 17 00:00:00 2001
From: Valentin Pratz <112951103+vpratz@users.noreply.github.com>
Date: Tue, 1 Jul 2025 10:39:15 +0200
Subject: [PATCH 4/6] Breaking changes: Fix bugs regarding counts in standardization layer (#525)

* standardization: add test for multi-input values (failing)

This test reveals two bugs in the standardization layer:
- count is updated multiple times
- batch_count is too small, as the sizes from reduce_axes have to be
  multiplied

* breaking: fix bugs regarding count in standardization layer

Fixes #524

This fixes the two bugs described in c4cc13373:

- count was accidentally updated, leading to wrong values
- count was calculated wrongly, as only the batch size was used. The
  correct value is the product of all reduce dimensions. This led to
  wrong standard deviations.

While the batch dimension is the same for all inputs, the size of the
second dimension might vary. For this reason, we need to introduce an
input-specific `count` variable. This breaks serialization.
* fix assert statement in test --- .../standardization/standardization.py | 10 +++--- .../test_approximator_standardization.py | 3 +- tests/test_approximators/test_build.py | 3 +- tests/test_networks/test_standardization.py | 33 +++++++++++++++++++ 4 files changed, 42 insertions(+), 7 deletions(-) diff --git a/bayesflow/networks/standardization/standardization.py b/bayesflow/networks/standardization/standardization.py index a30a9a1e9..9dfdf2cb2 100644 --- a/bayesflow/networks/standardization/standardization.py +++ b/bayesflow/networks/standardization/standardization.py @@ -40,7 +40,7 @@ def moving_std(self, index: int) -> Tensor: """ return keras.ops.where( self.moving_m2[index] > 0, - keras.ops.sqrt(self.moving_m2[index] / self.count), + keras.ops.sqrt(self.moving_m2[index] / self.count[index]), 1.0, ) @@ -53,7 +53,7 @@ def build(self, input_shape: Shape): self.moving_m2 = [ self.add_weight(shape=(shape[-1],), initializer="zeros", trainable=False) for shape in flattened_shapes ] - self.count = self.add_weight(shape=(), initializer="zeros", trainable=False) + self.count = [self.add_weight(shape=(), initializer="zeros", trainable=False) for _ in flattened_shapes] def call( self, @@ -150,7 +150,7 @@ def _update_moments(self, x: Tensor, index: int): """ reduce_axes = tuple(range(x.ndim - 1)) - batch_count = keras.ops.cast(keras.ops.shape(x)[0], self.count.dtype) + batch_count = keras.ops.cast(keras.ops.prod(keras.ops.shape(x)[:-1]), self.count[index].dtype) # Compute batch mean and M2 per feature batch_mean = keras.ops.mean(x, axis=reduce_axes) @@ -159,7 +159,7 @@ def _update_moments(self, x: Tensor, index: int): # Read current totals mean = self.moving_mean[index] m2 = self.moving_m2[index] - count = self.count + count = self.count[index] total_count = count + batch_count delta = batch_mean - mean @@ -169,4 +169,4 @@ def _update_moments(self, x: Tensor, index: int): self.moving_mean[index].assign(new_mean) self.moving_m2[index].assign(new_m2) - self.count.assign(total_count) + self.count[index].assign(total_count) diff --git a/tests/test_approximators/test_approximator_standardization/test_approximator_standardization.py b/tests/test_approximators/test_approximator_standardization/test_approximator_standardization.py index 9c73d4717..7cf0f6aba 100644 --- a/tests/test_approximators/test_approximator_standardization/test_approximator_standardization.py +++ b/tests/test_approximators/test_approximator_standardization/test_approximator_standardization.py @@ -8,7 +8,8 @@ def test_save_and_load(tmp_path, approximator, train_dataset, validation_dataset approximator.build(data_shapes) for layer in approximator.standardize_layers.values(): assert layer.built - assert layer.count == 0 + for count in layer.count: + assert count == 0.0 approximator.compute_metrics(**train_dataset[0]) keras.saving.save_model(approximator, tmp_path / "model.keras") diff --git a/tests/test_approximators/test_build.py b/tests/test_approximators/test_build.py index 5947783dd..bea897115 100644 --- a/tests/test_approximators/test_build.py +++ b/tests/test_approximators/test_build.py @@ -14,4 +14,5 @@ def test_build(approximator, simulator, batch_size, adapter): approximator.build(batch_shapes) for layer in approximator.standardize_layers.values(): assert layer.built - assert layer.count == 0 + for count in layer.count: + assert count == 0.0 diff --git a/tests/test_networks/test_standardization.py b/tests/test_networks/test_standardization.py index 8b83de498..86881a384 100644 --- a/tests/test_networks/test_standardization.py 
+++ b/tests/test_networks/test_standardization.py @@ -91,6 +91,39 @@ def test_nested_consistency_forward_inverse(): np.testing.assert_allclose(random_input["b"], recovered["b"], atol=1e-4) +def test_nested_accuracy_forward(): + from bayesflow.utils import tree_concatenate + + # create inputs for two training passes + random_input_a_1 = keras.random.normal((2, 3, 5)) + random_input_b_1 = keras.random.normal((4, 3)) + random_input_1 = {"a": random_input_a_1, "b": random_input_b_1} + + random_input_a_2 = keras.random.normal((3, 3, 5)) + random_input_b_2 = keras.random.normal((3, 3)) + random_input_2 = {"a": random_input_a_2, "b": random_input_b_2} + + # complete data for testing mean and std are 0 and 1 + random_input = tree_concatenate([random_input_1, random_input_2], axis=0) + + layer = Standardization() + + _ = layer(random_input_1, stage="training", forward=True) + _ = layer(random_input_2, stage="training", forward=True) + + standardized = layer(random_input, stage="inference", forward=True) + standardized = keras.tree.map_structure(keras.ops.convert_to_numpy, standardized) + + np.testing.assert_allclose( + np.mean(standardized["a"], axis=tuple(range(standardized["a"].ndim - 1))), 0.0, atol=1e-4 + ) + np.testing.assert_allclose( + np.mean(standardized["b"], axis=tuple(range(standardized["b"].ndim - 1))), 0.0, atol=1e-4 + ) + np.testing.assert_allclose(np.std(standardized["a"], axis=tuple(range(standardized["a"].ndim - 1))), 1.0, atol=1e-4) + np.testing.assert_allclose(np.std(standardized["b"], axis=tuple(range(standardized["b"].ndim - 1))), 1.0, atol=1e-4) + + def test_transformation_type_both_sides_scale(): # Fix a known covariance and mean in original (not standardized space) covariance = np.array([[1, 0.5], [0.5, 2.0]], dtype="float32") From 7c094c5462a8339e35fbabaf2c152910c84f094d Mon Sep 17 00:00:00 2001 From: Valentin Pratz Date: Tue, 1 Jul 2025 09:45:53 +0000 Subject: [PATCH 5/6] bump version to 2.0.5, adjust deprecation warnings --- bayesflow/approximators/continuous_approximator.py | 4 ++-- bayesflow/approximators/model_comparison_approximator.py | 4 ++-- pyproject.toml | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/bayesflow/approximators/continuous_approximator.py b/bayesflow/approximators/continuous_approximator.py index 46fe4bb0d..e9c9c1be3 100644 --- a/bayesflow/approximators/continuous_approximator.py +++ b/bayesflow/approximators/continuous_approximator.py @@ -578,9 +578,9 @@ def summarize(self, data: Mapping[str, np.ndarray], **kwargs) -> np.ndarray: def summaries(self, data: Mapping[str, np.ndarray], **kwargs) -> np.ndarray: """ .. deprecated:: 2.0.4 - `summaries` will be removed in version 2.0.5, it was renamed to `summarize` which should be used instead. + `summaries` will be removed in version 2.0.6, it was renamed to `summarize` which should be used instead. 
""" - warnings.warn("`summaries` was renamed to `summarize` and will be removed in version 2.0.5.", FutureWarning) + warnings.warn("`summaries` was renamed to `summarize` and will be removed in version 2.0.6.", FutureWarning) return self.summarize(data=data, **kwargs) def log_prob(self, data: Mapping[str, np.ndarray], **kwargs) -> np.ndarray: diff --git a/bayesflow/approximators/model_comparison_approximator.py b/bayesflow/approximators/model_comparison_approximator.py index 12038ea9c..c946da93a 100644 --- a/bayesflow/approximators/model_comparison_approximator.py +++ b/bayesflow/approximators/model_comparison_approximator.py @@ -442,9 +442,9 @@ def summarize(self, data: Mapping[str, np.ndarray], **kwargs) -> np.ndarray: def summaries(self, data: Mapping[str, np.ndarray], **kwargs) -> np.ndarray: """ .. deprecated:: 2.0.4 - `summaries` will be removed in version 2.0.5, it was renamed to `summarize` which should be used instead. + `summaries` will be removed in version 2.0.6, it was renamed to `summarize` which should be used instead. """ - warnings.warn("`summaries` was renamed to `summarize` and will be removed in version 2.0.5.", FutureWarning) + warnings.warn("`summaries` was renamed to `summarize` and will be removed in version 2.0.6.", FutureWarning) return self.summarize(data=data, **kwargs) def _compute_logits(self, classifier_conditions: Tensor) -> Tensor: diff --git a/pyproject.toml b/pyproject.toml index 902ffc1d2..0c1dd941b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "bayesflow" -version = "2.0.4" +version = "2.0.5" authors = [{ name = "The BayesFlow Team" }] classifiers = [ "Development Status :: 5 - Production/Stable", From 2a19d32f11c60fd68214d1dfc0768b228096c6bd Mon Sep 17 00:00:00 2001 From: Daniel Habermann <133031176+daniel-habermann@users.noreply.github.com> Date: Tue, 1 Jul 2025 14:33:52 +0200 Subject: [PATCH 6/6] rename log_gamma to calibration_log_gamma (#527) --- bayesflow/diagnostics/metrics/__init__.py | 2 +- .../metrics/{sbc.py => calibration_log_gamma.py} | 5 +++-- tests/test_diagnostics/test_diagnostics_metrics.py | 10 +++++----- 3 files changed, 9 insertions(+), 8 deletions(-) rename bayesflow/diagnostics/metrics/{sbc.py => calibration_log_gamma.py} (97%) diff --git a/bayesflow/diagnostics/metrics/__init__.py b/bayesflow/diagnostics/metrics/__init__.py index 2acd6b5b1..10d499e11 100644 --- a/bayesflow/diagnostics/metrics/__init__.py +++ b/bayesflow/diagnostics/metrics/__init__.py @@ -4,4 +4,4 @@ from .expected_calibration_error import expected_calibration_error from .classifier_two_sample_test import classifier_two_sample_test from .model_misspecification import bootstrap_comparison, summary_space_comparison -from .sbc import log_gamma +from .calibration_log_gamma import calibration_log_gamma, gamma_null_distribution, gamma_discrepancy diff --git a/bayesflow/diagnostics/metrics/sbc.py b/bayesflow/diagnostics/metrics/calibration_log_gamma.py similarity index 97% rename from bayesflow/diagnostics/metrics/sbc.py rename to bayesflow/diagnostics/metrics/calibration_log_gamma.py index 57b364a63..54551c857 100644 --- a/bayesflow/diagnostics/metrics/sbc.py +++ b/bayesflow/diagnostics/metrics/calibration_log_gamma.py @@ -6,7 +6,7 @@ from ...utils.dict_utils import dicts_to_arrays -def log_gamma( +def calibration_log_gamma( estimates: Mapping[str, np.ndarray] | np.ndarray, targets: Mapping[str, np.ndarray] | np.ndarray, variable_keys: Sequence[str] = None, @@ -15,7 +15,8 @@ def log_gamma( 
     quantile: float = 0.05,
 ):
     """
-    Compute the log gamma discrepancy statistic, see [1] for additional information.
+    Compute the log gamma discrepancy statistic to test posterior calibration,
+    see [1] for additional information.
     Log gamma is log(gamma/gamma_null), where gamma_null is the 5th percentile of the
     null distribution under uniformity of ranks.
     That is, if adopting a hypothesis testing framework, then log_gamma < 0 implies
     a rejection of the hypothesis of uniform ranks at the 5% level.
     This diagnostic is typically more sensitive than the Kolmogorov-Smirnov test or
diff --git a/tests/test_diagnostics/test_diagnostics_metrics.py b/tests/test_diagnostics/test_diagnostics_metrics.py
index 789b6ea0c..f2c4c73c4 100644
--- a/tests/test_diagnostics/test_diagnostics_metrics.py
+++ b/tests/test_diagnostics/test_diagnostics_metrics.py
@@ -85,15 +85,15 @@ def test_expected_calibration_error(pred_models, true_models, model_names):
     out = bf.diagnostics.metrics.expected_calibration_error(pred_models, true_models.transpose)
 
 
-def test_log_gamma(random_estimates, random_targets):
-    out = bf.diagnostics.metrics.log_gamma(random_estimates, random_targets)
+def test_calibration_log_gamma(random_estimates, random_targets):
+    out = bf.diagnostics.metrics.calibration_log_gamma(random_estimates, random_targets)
     assert list(out.keys()) == ["values", "metric_name", "variable_names"]
     assert out["values"].shape == (num_variables(random_estimates),)
     assert out["metric_name"] == "Log Gamma"
     assert out["variable_names"] == ["beta_0", "beta_1", "sigma"]
 
 
-def test_log_gamma_end_to_end():
+def test_calibration_log_gamma_end_to_end():
     # This is a function test for simulation-based calibration.
     # First, we sample from a known generative process and then run SBC.
     # If the log gamma statistic is correctly implemented, a 95% interval should exclude
     # the true value 5% of the time.
@@ -116,11 +116,11 @@ def run_sbc(N=N, S=S, D=D, bias=0):
         ranks = np.sum(posterior_draws < prior_draws, axis=0)
 
         # this is the distribution of gamma under uniform ranks
-        gamma_null = bf.diagnostics.metrics.sbc.gamma_null_distribution(D, S, num_null_draws=100)
+        gamma_null = bf.diagnostics.metrics.gamma_null_distribution(D, S, num_null_draws=100)
         lower, upper = np.quantile(gamma_null, (0.05, 0.995))
 
         # this is the empirical gamma
-        observed_gamma = bf.diagnostics.metrics.sbc.gamma_discrepancy(ranks, num_post_draws=S)
+        observed_gamma = bf.diagnostics.metrics.gamma_discrepancy(ranks, num_post_draws=S)
 
         in_interval = lower <= observed_gamma < upper
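
For orientation, below is a minimal usage sketch of the diagnostic as exposed after this series. It mirrors the Beta-Binomial setup of the end-to-end test above; the variable key "theta", the seed, and the sample sizes are illustrative assumptions, and input shapes follow the docstring and tests shown in the patches, not a separate API guarantee.

import numpy as np
import bayesflow as bf

rng = np.random.default_rng(2025)
num_datasets, num_draws, N = 500, 1000, 30

# Ground truths from the prior and exact conjugate posterior draws,
# as in test_calibration_log_gamma_end_to_end above.
prior_draws = rng.beta(2, 2, size=num_datasets)
successes = rng.binomial(N, prior_draws)
posterior_draws = rng.beta(2 + successes, 2 + N - successes, size=(num_draws, num_datasets))

estimates = {"theta": posterior_draws.T[..., None]}  # (num_datasets, num_draws, 1)
targets = {"theta": prior_draws[:, None]}            # (num_datasets, 1)

out = bf.diagnostics.metrics.calibration_log_gamma(estimates, targets)

# Well-calibrated draws should typically yield log gamma > 0; values below 0
# indicate a rejection of rank uniformity at the chosen quantile.
print(out["metric_name"], out["values"], out["variable_names"])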