1 change: 1 addition & 0 deletions bayesflow/diagnostics/__init__.py
@@ -7,6 +7,7 @@
    calibration_error,
    calibration_log_gamma,
    posterior_contraction,
    posterior_z_score,
    summary_space_comparison,
)

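With this re-export in place, the new metric should be importable directly from the diagnostics namespace; a minimal sketch (the shorter path is inferred from the import added above, not shown in the PR itself):

# after this PR, both paths are expected to resolve to the same function
from bayesflow.diagnostics import posterior_z_score
from bayesflow.diagnostics.metrics import posterior_z_score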
1 change: 1 addition & 0 deletions bayesflow/diagnostics/metrics/__init__.py
@@ -5,3 +5,4 @@
from .classifier_two_sample_test import classifier_two_sample_test
from .model_misspecification import bootstrap_comparison, summary_space_comparison
from .calibration_log_gamma import calibration_log_gamma, gamma_null_distribution, gamma_discrepancy
from .posterior_z_score import posterior_z_score
108 changes: 108 additions & 0 deletions bayesflow/diagnostics/metrics/posterior_z_score.py
@@ -0,0 +1,108 @@
from collections.abc import Sequence, Mapping, Callable
from typing import Any

import numpy as np

from ...utils.dict_utils import dicts_to_arrays, compute_test_quantities


def posterior_z_score(
    estimates: Mapping[str, np.ndarray] | np.ndarray,
    targets: Mapping[str, np.ndarray] | np.ndarray,
    variable_keys: Sequence[str] | None = None,
    variable_names: Sequence[str] | None = None,
    test_quantities: dict[str, Callable] | None = None,
    aggregation: Callable | None = np.median,
) -> dict[str, Any]:
    """
    Computes the posterior z-score for the given samples according to [1]:

    post_z_score = (posterior_mean - true_parameters) / posterior_std

    The score is adequate if it is centered around zero and spreads
    roughly within the interval [-3, 3].

    [1] Schad, D. J., Betancourt, M., & Vasishth, S. (2021).
    Toward a principled Bayesian workflow in cognitive science.
    Psychological Methods, 26(1), 103.

    Paper also available at https://arxiv.org/abs/1904.12765

    Parameters
    ----------
    estimates : np.ndarray of shape (num_datasets, num_draws_post, num_variables)
        Posterior samples, comprising `num_draws_post` random draws from the posterior distribution
        for each data set from `num_datasets`.
    targets : np.ndarray of shape (num_datasets, num_variables)
        Prior samples, comprising `num_datasets` ground truths.
    variable_keys : Sequence[str], optional (default = None)
        Select keys from the dictionaries provided in estimates and targets.
        By default, select all keys.
    variable_names : Sequence[str], optional (default = None)
        Optional variable names to show in the output.
    test_quantities : dict[str, Callable], optional (default = None)
        A dict that maps plot titles to functions that compute
        test quantities based on estimate/target draws.

        The dict keys are automatically added to ``variable_keys``
        and ``variable_names``.
        Test quantity functions are expected to accept a dict of draws with
        shape ``(batch_size, ...)`` as the first (typically only)
        positional argument and return a NumPy array of shape
        ``(batch_size,)``.
        The functions do not have to deal with an additional
        sample dimension, as appropriate reshaping is done internally.
    aggregation : callable or None, optional (default = np.median)
        Function to aggregate the z-scores across datasets. Typically `np.mean` or `np.median`.
        If None is provided, the individual values are returned.

    Returns
    -------
    result : dict
        Dictionary containing:

        - "values" : float or np.ndarray
            The (optionally aggregated) posterior z-score per variable.
        - "metric_name" : str
            The name of the metric ("Posterior z-score").
        - "variable_names" : Sequence[str]
            The (inferred) variable names.

    Notes
    -----
    The posterior z-score quantifies how far the posterior mean lies from
    the true generating parameter, in units of the posterior standard
    deviation. Values near 0 (in absolute terms) mean the posterior mean
    is close to the truth; large absolute values indicate substantial
    deviation. The sign shows the direction of the bias.
    """

    # Optionally, compute and prepend test quantities from draws
    if test_quantities is not None:
        updated_data = compute_test_quantities(
            targets=targets,
            estimates=estimates,
            variable_keys=variable_keys,
            variable_names=variable_names,
            test_quantities=test_quantities,
        )
        variable_names = updated_data["variable_names"]
        variable_keys = updated_data["variable_keys"]
        estimates = updated_data["estimates"]
        targets = updated_data["targets"]

    samples = dicts_to_arrays(
        estimates=estimates,
        targets=targets,
        variable_keys=variable_keys,
        variable_names=variable_names,
    )

    # Posterior mean and standard deviation per dataset and variable
    post_means = samples["estimates"].mean(axis=1)
    post_stds = samples["estimates"].std(axis=1, ddof=1)

    # Standardized distance of the posterior mean from the ground truth
    z_score = (post_means - samples["targets"]) / post_stds

    if aggregation is not None:
        z_score = aggregation(z_score, axis=0)
    variable_names = samples["estimates"].variable_names
    return {"values": z_score, "metric_name": "Posterior z-score", "variable_names": variable_names}
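A minimal usage sketch of the new metric on synthetic NumPy arrays; the toy generative model, shapes, and seed below are illustrative assumptions, not part of the PR:

import numpy as np
import bayesflow as bf

rng = np.random.default_rng(2025)
num_datasets, num_draws, num_vars = 100, 500, 3

# hypothetical ground truths, with posterior draws scattered around them
targets = rng.normal(size=(num_datasets, num_vars))
estimates = targets[:, None, :] + rng.normal(size=(num_datasets, num_draws, num_vars))

out = bf.diagnostics.metrics.posterior_z_score(estimates, targets)
print(out["metric_name"])  # "Posterior z-score"
print(out["values"])       # median z-score per variable; adequate if roughly within [-3, 3]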
7 changes: 6 additions & 1 deletion bayesflow/diagnostics/plots/plot_quantity.py
@@ -213,7 +213,12 @@ def _prepare_values(

if estimates is not None:
    if is_values_callable:
-       values = values(estimates=estimates, targets=targets, **filter_kwargs({"aggregation": None}, values))
+       values = values(
+           estimates=estimates,
+           targets=targets,
+           variable_keys=variable_keys,
+           **filter_kwargs({"aggregation": None}, values),
+       )

    data = dicts_to_arrays(
        estimates=estimates,
20 changes: 20 additions & 0 deletions tests/test_diagnostics/test_diagnostics_metrics.py
@@ -66,6 +66,26 @@ def test_posterior_contraction(random_estimates, random_targets):
    assert out["values"].shape[0] == len(test_quantities) + num_variables(random_estimates)


def test_posterior_z_score(random_estimates, random_targets):
    # basic functionality: automatic variable names
    out = bf.diagnostics.metrics.posterior_z_score(random_estimates, random_targets)
    assert list(out.keys()) == ["values", "metric_name", "variable_names"]
    assert out["values"].shape == (num_variables(random_estimates),)
    assert out["metric_name"] == "Posterior z-score"
    assert out["variable_names"] == ["beta_0", "beta_1", "sigma"]
    # test without aggregation
    out = bf.diagnostics.metrics.posterior_z_score(random_estimates, random_targets, aggregation=None)
    assert out["values"].shape == (random_estimates["sigma"].shape[0], num_variables(random_estimates))

    # test quantities
    test_quantities = {
        r"$\beta_1 + \beta_2$": lambda data: np.sum(data["beta"], axis=-1),
        r"$\beta_1 \cdot \beta_2$": lambda data: np.prod(data["beta"], axis=-1),
    }
    out = bf.diagnostics.metrics.posterior_z_score(random_estimates, random_targets, test_quantities=test_quantities)
    assert out["values"].shape[0] == len(test_quantities) + num_variables(random_estimates)


def test_root_mean_squared_error(random_estimates, random_targets):
    # basic functionality: automatic variable names
    out = bf.diagnostics.metrics.root_mean_squared_error(random_estimates, random_targets)