1 change: 1 addition & 0 deletions bayesflow/diagnostics/__init__.py
@@ -7,6 +7,7 @@
    calibration_error,
    calibration_log_gamma,
    posterior_contraction,
    posterior_z_score,
    summary_space_comparison,
)

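With this re-export in place, the new metric should be importable directly from the diagnostics namespace; a minimal sketch (the shorter path is inferred from the import added above, not shown in the PR itself):

# after this PR, both paths are expected to resolve to the same function
from bayesflow.diagnostics import posterior_z_score
from bayesflow.diagnostics.metrics import posterior_z_score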
1 change: 1 addition & 0 deletions bayesflow/diagnostics/metrics/__init__.py
@@ -5,3 +5,4 @@
from .classifier_two_sample_test import classifier_two_sample_test
from .model_misspecification import bootstrap_comparison, summary_space_comparison
from .calibration_log_gamma import calibration_log_gamma, gamma_null_distribution, gamma_discrepancy
from .posterior_z_score import posterior_z_score
108 changes: 108 additions & 0 deletions bayesflow/diagnostics/metrics/posterior_z_score.py
@@ -0,0 +1,108 @@
from collections.abc import Sequence, Mapping, Callable
from typing import Any

import numpy as np

from ...utils.dict_utils import dicts_to_arrays, compute_test_quantities


def posterior_z_score(
    estimates: Mapping[str, np.ndarray] | np.ndarray,
    targets: Mapping[str, np.ndarray] | np.ndarray,
    variable_keys: Sequence[str] | None = None,
    variable_names: Sequence[str] | None = None,
    test_quantities: dict[str, Callable] | None = None,
    aggregation: Callable | None = np.median,
) -> dict[str, Any]:
    """
    Computes the posterior z-score for the given samples according to [1]:

    post_z_score = (posterior_mean - true_parameters) / posterior_std

    The score is adequate if it is centered around zero and spreads
    roughly within the interval [-3, 3].

    [1] Schad, D. J., Betancourt, M., & Vasishth, S. (2021).
    Toward a principled Bayesian workflow in cognitive science.
    Psychological Methods, 26(1), 103.

    Paper also available at https://arxiv.org/abs/1904.12765

    Parameters
    ----------
    estimates : np.ndarray of shape (num_datasets, num_draws_post, num_variables)
        Posterior samples, comprising `num_draws_post` random draws from the posterior distribution
        for each data set from `num_datasets`.
    targets : np.ndarray of shape (num_datasets, num_variables)
        Prior samples, comprising `num_datasets` ground truths.
    variable_keys : Sequence[str], optional (default = None)
        Select keys from the dictionaries provided in estimates and targets.
        By default, select all keys.
    variable_names : Sequence[str], optional (default = None)
        Optional variable names to show in the output.
    test_quantities : dict[str, Callable], optional (default = None)
        A dict that maps plot titles to functions that compute
        test quantities based on estimate/target draws.

        The dict keys are automatically added to ``variable_keys``
        and ``variable_names``.
        Test quantity functions are expected to accept a dict of draws with
        shape ``(batch_size, ...)`` as the first (typically only)
        positional argument and return a NumPy array of shape
        ``(batch_size,)``.
        The functions do not have to deal with an additional
        sample dimension, as appropriate reshaping is done internally.
    aggregation : callable or None, optional (default = np.median)
        Function to aggregate the z-scores across datasets. Typically `np.mean` or `np.median`.
        If None is provided, the individual values are returned.

    Returns
    -------
    result : dict
        Dictionary containing:

        - "values" : float or np.ndarray
            The (optionally aggregated) posterior z-score per variable.
        - "metric_name" : str
            The name of the metric ("Posterior z-score").
        - "variable_names" : Sequence[str]
            The (inferred) variable names.

    Notes
    -----
    The posterior z-score quantifies how far the posterior mean lies from
    the true generating parameter, in units of the posterior standard
    deviation. Values near 0 (in absolute terms) mean the posterior mean
    is close to the truth; large absolute values indicate substantial
    deviation. The sign shows the direction of the bias.
    """

    # Optionally, compute and prepend test quantities from draws
    if test_quantities is not None:
        updated_data = compute_test_quantities(
            targets=targets,
            estimates=estimates,
            variable_keys=variable_keys,
            variable_names=variable_names,
            test_quantities=test_quantities,
        )
        variable_names = updated_data["variable_names"]
        variable_keys = updated_data["variable_keys"]
        estimates = updated_data["estimates"]
        targets = updated_data["targets"]

    samples = dicts_to_arrays(
        estimates=estimates,
        targets=targets,
        variable_keys=variable_keys,
        variable_names=variable_names,
    )

    # Posterior mean and standard deviation per dataset and variable
    post_means = samples["estimates"].mean(axis=1)
    post_stds = samples["estimates"].std(axis=1, ddof=1)

    # Standardized distance of the posterior mean from the ground truth
    z_score = (post_means - samples["targets"]) / post_stds

    if aggregation is not None:
        z_score = aggregation(z_score, axis=0)
    variable_names = samples["estimates"].variable_names
    return {"values": z_score, "metric_name": "Posterior z-score", "variable_names": variable_names}
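A minimal usage sketch of the new metric on synthetic NumPy arrays; the toy generative model, shapes, and seed below are illustrative assumptions, not part of the PR:

import numpy as np
import bayesflow as bf

rng = np.random.default_rng(2025)
num_datasets, num_draws, num_vars = 100, 500, 3

# hypothetical ground truths, with posterior draws scattered around them
targets = rng.normal(size=(num_datasets, num_vars))
estimates = targets[:, None, :] + rng.normal(size=(num_datasets, num_draws, num_vars))

out = bf.diagnostics.metrics.posterior_z_score(estimates, targets)
print(out["metric_name"])  # "Posterior z-score"
print(out["values"])       # median z-score per variable; adequate if roughly within [-3, 3]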
7 changes: 6 additions & 1 deletion bayesflow/diagnostics/plots/plot_quantity.py
@@ -213,7 +213,12 @@ def _prepare_values(

if estimates is not None:
    if is_values_callable:
-       values = values(estimates=estimates, targets=targets, **filter_kwargs({"aggregation": None}, values))
+       values = values(
+           estimates=estimates,
+           targets=targets,
+           variable_keys=variable_keys,
+           **filter_kwargs({"aggregation": None}, values),
+       )

    data = dicts_to_arrays(
        estimates=estimates,
20 changes: 20 additions & 0 deletions tests/test_diagnostics/test_diagnostics_metrics.py
@@ -66,6 +66,26 @@ def test_posterior_contraction(random_estimates, random_targets):
    assert out["values"].shape[0] == len(test_quantities) + num_variables(random_estimates)


def test_posterior_z_score(random_estimates, random_targets):
    # basic functionality: automatic variable names
    out = bf.diagnostics.metrics.posterior_z_score(random_estimates, random_targets)
    assert list(out.keys()) == ["values", "metric_name", "variable_names"]
    assert out["values"].shape == (num_variables(random_estimates),)
    assert out["metric_name"] == "Posterior z-score"
    assert out["variable_names"] == ["beta_0", "beta_1", "sigma"]
    # test without aggregation
    out = bf.diagnostics.metrics.posterior_z_score(random_estimates, random_targets, aggregation=None)
    assert out["values"].shape == (random_estimates["sigma"].shape[0], num_variables(random_estimates))

    # test quantities
    test_quantities = {
        r"$\beta_1 + \beta_2$": lambda data: np.sum(data["beta"], axis=-1),
        r"$\beta_1 \cdot \beta_2$": lambda data: np.prod(data["beta"], axis=-1),
    }
    out = bf.diagnostics.metrics.posterior_z_score(random_estimates, random_targets, test_quantities=test_quantities)
    assert out["values"].shape[0] == len(test_quantities) + num_variables(random_estimates)


def test_root_mean_squared_error(random_estimates, random_targets):
    # basic functionality: automatic variable names
    out = bf.diagnostics.metrics.root_mean_squared_error(random_estimates, random_targets)