From 0e2d64672b0ea61bb9a968206e2f5cc12b57805a Mon Sep 17 00:00:00 2001 From: Svenja Jedhoff Date: Wed, 29 Oct 2025 15:06:44 +0100 Subject: [PATCH 1/3] Add posterior z score --- bayesflow/diagnostics/__init__.py | 1 + bayesflow/diagnostics/metrics/__init__.py | 1 + .../diagnostics/metrics/posterior_z_score.py | 109 ++++++++++++++++++ .../test_diagnostics_metrics.py | 20 ++++ 4 files changed, 131 insertions(+) create mode 100644 bayesflow/diagnostics/metrics/posterior_z_score.py diff --git a/bayesflow/diagnostics/__init__.py b/bayesflow/diagnostics/__init__.py index caef625cd..2f06fe4af 100644 --- a/bayesflow/diagnostics/__init__.py +++ b/bayesflow/diagnostics/__init__.py @@ -7,6 +7,7 @@ calibration_error, calibration_log_gamma, posterior_contraction, + posterior_z_score, summary_space_comparison, ) diff --git a/bayesflow/diagnostics/metrics/__init__.py b/bayesflow/diagnostics/metrics/__init__.py index 10d499e11..81f7a7485 100644 --- a/bayesflow/diagnostics/metrics/__init__.py +++ b/bayesflow/diagnostics/metrics/__init__.py @@ -5,3 +5,4 @@ from .classifier_two_sample_test import classifier_two_sample_test from .model_misspecification import bootstrap_comparison, summary_space_comparison from .calibration_log_gamma import calibration_log_gamma, gamma_null_distribution, gamma_discrepancy +from .posterior_z_score import posterior_z_score diff --git a/bayesflow/diagnostics/metrics/posterior_z_score.py b/bayesflow/diagnostics/metrics/posterior_z_score.py new file mode 100644 index 000000000..184df4f54 --- /dev/null +++ b/bayesflow/diagnostics/metrics/posterior_z_score.py @@ -0,0 +1,109 @@ +from collections.abc import Sequence, Mapping, Callable + +import numpy as np + +from ...utils.dict_utils import dicts_to_arrays, compute_test_quantities + + +def posterior_z_score( + estimates: Mapping[str, np.ndarray] | np.ndarray, + targets: Mapping[str, np.ndarray] | np.ndarray, + variable_keys: Sequence[str] = None, + variable_names: Sequence[str] = None, + test_quantities: 
dict[str, Callable] = None, + aggregation: Callable | None = np.median, +) -> dict[str, any]: + """ + Computes the posterior z-score from prior to posterior for the given samples according to [1]: + + post_z_score = (posterior_mean - true_parameters) / posterior_std + + The score is adequate if it centers around zero and spreads roughly + in the interval [-3, 3]. + + [1] Schad, D. J., Betancourt, M., & Vasishth, S. (2021). + Toward a principled Bayesian workflow in cognitive science. + Psychological Methods, 26(1), 103. + + Paper also available at https://arxiv.org/abs/1904.12765 + + Parameters + ---------- + estimates : np.ndarray of shape (num_datasets, num_draws_post, num_variables) + Posterior samples, comprising `num_draws_post` random draws from the posterior distribution + for each data set from `num_datasets`. + targets : np.ndarray of shape (num_datasets, num_variables) + Prior samples, comprising `num_datasets` ground truths. + variable_keys : Sequence[str], optional (default = None) + Select keys from the dictionaries provided in estimates and targets. + By default, select all keys. + variable_names : Sequence[str], optional (default = None) + Optional variable names to show in the output. + test_quantities : dict or None, optional, default: None + A dict that maps plot titles to functions that compute + test quantities based on estimate/target draws. + + The dict keys are automatically added to ``variable_keys`` + and ``variable_names``. + Test quantity functions are expected to accept a dict of draws with + shape ``(batch_size, ...)`` as the first (typically only) + positional argument and return a NumPy array of shape + ``(batch_size,)``. + The functions do not have to deal with an additional + sample dimension, as appropriate reshaping is done internally. + aggregation : callable or None, optional (default = np.median) + Function to aggregate the posterior z-score across datasets. Typically `np.mean` or `np.median`. 
+ If None is provided, the individual values are returned. + + Returns + ------- + result : dict + Dictionary containing: + + - "values" : float or np.ndarray + The (optionally aggregated) posterior z-score per variable + - "metric_name" : str + The name of the metric ("Posterior z-score"). + - "variable_names" : str + The (inferred) variable names. + + Notes + ----- + Posterior z-score quantifies how far the posterior mean lies from + the true generating parameter, in standard-error units. Values near 0 + (in absolute terms) mean the posterior mean is close to the truth; + large absolute values indicate substantial deviation. + The sign shows the direction of the bias. + + """ + + # Optionally, compute and prepend test quantities from draws + if test_quantities is not None: + updated_data = compute_test_quantities( + targets=targets, + estimates=estimates, + variable_keys=variable_keys, + variable_names=variable_names, + test_quantities=test_quantities, + ) + variable_names = updated_data["variable_names"] + variable_keys = updated_data["variable_keys"] + estimates = updated_data["estimates"] + targets = updated_data["targets"] + + samples = dicts_to_arrays( + estimates=estimates, + targets=targets, + variable_keys=variable_keys, + variable_names=variable_names, + ) + + post_vars = samples["estimates"].var(axis=1, ddof=1) + post_means = samples["estimates"].mean(axis=1) + post_stds = np.sqrt(post_vars) + prior_vars = samples["targets"].var(axis=0, keepdims=True, ddof=1) + z_score = (post_means - samples["targets"]) / post_stds + if aggregation is not None: + z_score = aggregation(z_score, axis=0) + variable_names = samples["estimates"].variable_names + return {"values": z_score, "metric_name": "Posterior z-score", "variable_names": variable_names} diff --git a/tests/test_diagnostics/test_diagnostics_metrics.py b/tests/test_diagnostics/test_diagnostics_metrics.py index daad874d0..75fa147f6 100644 --- a/tests/test_diagnostics/test_diagnostics_metrics.py +++ 
b/tests/test_diagnostics/test_diagnostics_metrics.py @@ -66,6 +66,26 @@ def test_posterior_contraction(random_estimates, random_targets): assert out["values"].shape[0] == len(test_quantities) + num_variables(random_estimates) +def test_posterior_z_score(random_estimates, random_targets): + # basic functionality: automatic variable names + out = bf.diagnostics.metrics.posterior_z_score(random_estimates, random_targets) + assert list(out.keys()) == ["values", "metric_name", "variable_names"] + assert out["values"].shape == (num_variables(random_estimates),) + assert out["metric_name"] == "Posterior z-score" + assert out["variable_names"] == ["beta_0", "beta_1", "sigma"] + # test without aggregation + out = bf.diagnostics.metrics.posterior_z_score(random_estimates, random_targets, aggregation=None) + assert out["values"].shape == (random_estimates["sigma"].shape[0], num_variables(random_estimates)) + + # test quantities + test_quantities = { + r"$\beta_1 + \beta_2$": lambda data: np.sum(data["beta"], axis=-1), + r"$\beta_1 \cdot \beta_2$": lambda data: np.prod(data["beta"], axis=-1), + } + out = bf.diagnostics.metrics.posterior_z_score(random_estimates, random_targets, test_quantities=test_quantities) + assert out["values"].shape[0] == len(test_quantities) + num_variables(random_estimates) + + def test_root_mean_squared_error(random_estimates, random_targets): # basic functionality: automatic variable names out = bf.diagnostics.metrics.root_mean_squared_error(random_estimates, random_targets) From 0f5cb5c39e50ad995e9c12a2e6061bd5848ca546 Mon Sep 17 00:00:00 2001 From: Svenja Jedhoff Date: Fri, 31 Oct 2025 10:48:25 +0100 Subject: [PATCH 2/3] Fixing test_quantities problem in plot_quantity/pairs_quantity --- bayesflow/diagnostics/plots/plot_quantity.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/bayesflow/diagnostics/plots/plot_quantity.py b/bayesflow/diagnostics/plots/plot_quantity.py index 2fd0c6841..c608afa1c 100644 --- 
a/bayesflow/diagnostics/plots/plot_quantity.py +++ b/bayesflow/diagnostics/plots/plot_quantity.py @@ -213,7 +213,12 @@ def _prepare_values( if estimates is not None: if is_values_callable: - values = values(estimates=estimates, targets=targets, **filter_kwargs({"aggregation": None}, values)) + values = values( + estimates=estimates, + targets=targets, + variable_keys=variable_keys, + **filter_kwargs({"aggregation": None}, values), + ) data = dicts_to_arrays( estimates=estimates, From bdfb903b5da0691d92ea9157bd44a5835804dd3d Mon Sep 17 00:00:00 2001 From: Svenja Jedhoff Date: Fri, 31 Oct 2025 11:05:17 +0100 Subject: [PATCH 3/3] Removing unused variable found by code style check --- bayesflow/diagnostics/metrics/posterior_z_score.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bayesflow/diagnostics/metrics/posterior_z_score.py b/bayesflow/diagnostics/metrics/posterior_z_score.py index 184df4f54..896c5a26d 100644 --- a/bayesflow/diagnostics/metrics/posterior_z_score.py +++ b/bayesflow/diagnostics/metrics/posterior_z_score.py @@ -101,7 +101,6 @@ def posterior_z_score( post_vars = samples["estimates"].var(axis=1, ddof=1) post_means = samples["estimates"].mean(axis=1) post_stds = np.sqrt(post_vars) - prior_vars = samples["targets"].var(axis=0, keepdims=True, ddof=1) z_score = (post_means - samples["targets"]) / post_stds if aggregation is not None: z_score = aggregation(z_score, axis=0)