diff --git a/bayesflow/approximators/continuous_approximator.py b/bayesflow/approximators/continuous_approximator.py
index 46fe4bb0d..e9c9c1be3 100644
--- a/bayesflow/approximators/continuous_approximator.py
+++ b/bayesflow/approximators/continuous_approximator.py
@@ -578,9 +578,9 @@ def summarize(self, data: Mapping[str, np.ndarray], **kwargs) -> np.ndarray:
     def summaries(self, data: Mapping[str, np.ndarray], **kwargs) -> np.ndarray:
         """
         .. deprecated:: 2.0.4
-            `summaries` will be removed in version 2.0.5, it was renamed to `summarize` which should be used instead.
+            `summaries` will be removed in version 2.0.6, it was renamed to `summarize` which should be used instead.
         """
-        warnings.warn("`summaries` was renamed to `summarize` and will be removed in version 2.0.5.", FutureWarning)
+        warnings.warn("`summaries` was renamed to `summarize` and will be removed in version 2.0.6.", FutureWarning)
         return self.summarize(data=data, **kwargs)
 
     def log_prob(self, data: Mapping[str, np.ndarray], **kwargs) -> np.ndarray:
diff --git a/bayesflow/approximators/model_comparison_approximator.py b/bayesflow/approximators/model_comparison_approximator.py
index 12038ea9c..c946da93a 100644
--- a/bayesflow/approximators/model_comparison_approximator.py
+++ b/bayesflow/approximators/model_comparison_approximator.py
@@ -442,9 +442,9 @@ def summarize(self, data: Mapping[str, np.ndarray], **kwargs) -> np.ndarray:
     def summaries(self, data: Mapping[str, np.ndarray], **kwargs) -> np.ndarray:
         """
         .. deprecated:: 2.0.4
-            `summaries` will be removed in version 2.0.5, it was renamed to `summarize` which should be used instead.
+            `summaries` will be removed in version 2.0.6, it was renamed to `summarize` which should be used instead.
         """
-        warnings.warn("`summaries` was renamed to `summarize` and will be removed in version 2.0.5.", FutureWarning)
+        warnings.warn("`summaries` was renamed to `summarize` and will be removed in version 2.0.6.", FutureWarning)
        return self.summarize(data=data, **kwargs)
 
     def _compute_logits(self, classifier_conditions: Tensor) -> Tensor:
diff --git a/bayesflow/diagnostics/metrics/__init__.py b/bayesflow/diagnostics/metrics/__init__.py
index 3e3496cda..10d499e11 100644
--- a/bayesflow/diagnostics/metrics/__init__.py
+++ b/bayesflow/diagnostics/metrics/__init__.py
@@ -4,3 +4,4 @@
 from .expected_calibration_error import expected_calibration_error
 from .classifier_two_sample_test import classifier_two_sample_test
 from .model_misspecification import bootstrap_comparison, summary_space_comparison
+from .calibration_log_gamma import calibration_log_gamma, gamma_null_distribution, gamma_discrepancy
diff --git a/bayesflow/diagnostics/metrics/calibration_log_gamma.py b/bayesflow/diagnostics/metrics/calibration_log_gamma.py
new file mode 100644
index 000000000..54551c857
--- /dev/null
+++ b/bayesflow/diagnostics/metrics/calibration_log_gamma.py
@@ -0,0 +1,163 @@
+from collections.abc import Mapping, Sequence
+
+import numpy as np
+from scipy.stats import binom
+
+from ...utils.dict_utils import dicts_to_arrays
+
+
+def calibration_log_gamma(
+    estimates: Mapping[str, np.ndarray] | np.ndarray,
+    targets: Mapping[str, np.ndarray] | np.ndarray,
+    variable_keys: Sequence[str] = None,
+    variable_names: Sequence[str] = None,
+    num_null_draws: int = 1000,
+    quantile: float = 0.05,
+):
+    """
+    Compute the log gamma discrepancy statistic to test posterior calibration;
+    see [1] for additional information.
+
+    Log gamma is log(gamma / gamma_null), where gamma_null is the 5th percentile
+    (by default, see `quantile`) of the null distribution of gamma under uniformity
+    of ranks. That is, when adopting a hypothesis testing framework, log_gamma < 0
+    implies rejection of the hypothesis of uniform ranks at the 5% level.
+    This diagnostic is typically more sensitive than the Kolmogorov-Smirnov or
+    chi-squared test.
+
+    [1] Martin Modrák, Angie H. Moon, Shinyoung Kim, Paul Bürkner, Niko Huurre,
+    Kateřina Faltejsková, Andrew Gelman, Aki Vehtari.
+    "Simulation-Based Calibration Checking for Bayesian Computation:
+    The Choice of Test Quantities Shapes Sensitivity."
+    Bayesian Anal. 20 (2) 461-488, June 2025. https://doi.org/10.1214/23-BA1404
+
+    Parameters
+    ----------
+    estimates : np.ndarray of shape (num_datasets, num_draws, num_variables)
+        The random draws from the approximate posteriors over ``num_datasets``
+    targets : np.ndarray of shape (num_datasets, num_variables)
+        The corresponding ground-truth values sampled from the prior
+    variable_keys : Sequence[str], optional (default = None)
+        Select keys from the dictionaries provided in estimates and targets.
+        By default, select all keys.
+    variable_names : Sequence[str], optional (default = None)
+        Optional variable names to show in the output.
+    num_null_draws : int, optional, default 1000
+        Number of draws used to approximate the null distribution of gamma.
+    quantile : float in (0, 1), optional, default 0.05
+        The quantile from the null distribution to be used as a threshold.
+        A lower quantile increases sensitivity to deviations from uniformity.
+
+    Returns
+    -------
+    result : dict
+        Dictionary containing:
+
+        - "values" : np.ndarray
+            The log gamma values per variable
+        - "metric_name" : str
+            The name of the metric ("Log Gamma").
+        - "variable_names" : str
+            The (inferred) variable names.
+    """
+    samples = dicts_to_arrays(
+        estimates=estimates,
+        targets=targets,
+        variable_keys=variable_keys,
+        variable_names=variable_names,
+    )
+
+    num_ranks = samples["estimates"].shape[0]
+    num_post_draws = samples["estimates"].shape[1]
+
+    # rank statistics
+    ranks = np.sum(samples["estimates"] < samples["targets"][:, None], axis=1)
+
+    # null distribution and threshold
+    null_distribution = gamma_null_distribution(num_ranks, num_post_draws, num_null_draws)
+    null_quantile = np.quantile(null_distribution, quantile)
+
+    # compute log gamma for each parameter
+    log_gammas = np.empty(ranks.shape[-1])
+
+    for i in range(ranks.shape[-1]):
+        gamma = gamma_discrepancy(ranks[:, i], num_post_draws=num_post_draws)
+        log_gammas[i] = np.log(gamma / null_quantile)
+
+    output = {
+        "values": log_gammas,
+        "metric_name": "Log Gamma",
+        "variable_names": samples["estimates"].variable_names,
+    }
+
+    return output
+
+
+def gamma_null_distribution(num_ranks: int, num_post_draws: int = 1000, num_null_draws: int = 1000) -> np.ndarray:
+    """
+    Computes the distribution of expected gamma values under uniformity of ranks.
+
+    Parameters
+    ----------
+    num_ranks : int
+        Number of ranks to use for each gamma.
+    num_post_draws : int, optional, default 1000
+        Number of posterior draws that were used to calculate the rank distribution.
+    num_null_draws : int, optional, default 1000
+        Number of returned gamma values under uniformity of ranks.
+
+    Returns
+    -------
+    result : np.ndarray
+        Array of shape (num_null_draws,) containing gamma values under uniformity of ranks.
+    """
+    z_i = np.arange(1, num_post_draws + 2) / (num_post_draws + 1)
+    gamma = np.empty(num_null_draws)
+
+    # loop non-vectorized to reduce memory footprint
+    for i in range(num_null_draws):
+        u = np.random.uniform(size=num_ranks)
+        F_z = np.mean(u[:, None] < z_i, axis=0)
+        bin_1 = binom.cdf(num_ranks * F_z, num_ranks, z_i)
+        bin_2 = 1 - binom.cdf(num_ranks * F_z - 1, num_ranks, z_i)
+
+        gamma[i] = 2 * np.min(np.minimum(bin_1, bin_2))
+
+    return gamma
+
+
+def gamma_discrepancy(ranks: np.ndarray, num_post_draws: int = 100) -> float:
+    """
+    Quantifies deviation from uniformity as the likelihood of observing the
+    most extreme point on the empirical CDF of the given rank distribution,
+    according to [1] (equation 7).
+
+    [1] Martin Modrák, Angie H. Moon, Shinyoung Kim, Paul Bürkner, Niko Huurre,
+    Kateřina Faltejsková, Andrew Gelman, Aki Vehtari.
+    "Simulation-Based Calibration Checking for Bayesian Computation:
+    The Choice of Test Quantities Shapes Sensitivity."
+    Bayesian Anal. 20 (2) 461-488, June 2025. https://doi.org/10.1214/23-BA1404
+
+    Parameters
+    ----------
+    ranks : array of shape (num_ranks,)
+        Empirical rank distribution.
+    num_post_draws : int, optional, default 100
+        Number of posterior draws used to generate ranks.
+
+    Returns
+    -------
+    result : float
+        Gamma discrepancy value for the given rank distribution.
+    """
+    num_ranks = len(ranks)
+
+    # observed count of ranks smaller than i
+    R_i = np.array([sum(ranks < i) for i in range(1, num_post_draws + 2)])
+
+    # expected proportion of ranks smaller than i
+    z_i = np.arange(1, num_post_draws + 2) / (num_post_draws + 1)
+
+    bin_1 = binom.cdf(R_i, num_ranks, z_i)
+    bin_2 = 1 - binom.cdf(R_i - 1, num_ranks, z_i)
+
+    # likelihood of obtaining the most extreme point on the empirical CDF
+    # if the rank distribution was indeed uniform
+    return float(2 * np.min(np.minimum(bin_1, bin_2)))
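A minimal usage sketch of the new diagnostic (illustration only; the shapes follow the docstring above, and the variable names and seed are made up). Since the draws and targets here come from the same distribution, the ranks are uniform and the log gamma values should typically stay at or above zero:

```python
import numpy as np

import bayesflow as bf

rng = np.random.default_rng(42)

# 200 datasets, 500 posterior draws each, two hypothetical variables
estimates = {
    "mu": rng.normal(size=(200, 500, 1)),
    "sigma": rng.normal(size=(200, 500, 1)),
}
targets = {
    "mu": rng.normal(size=(200, 1)),
    "sigma": rng.normal(size=(200, 1)),
}

result = bf.diagnostics.metrics.calibration_log_gamma(estimates, targets)

# log gamma < 0 suggests rejecting uniformity of ranks at the 5% level
for name, value in zip(result["variable_names"], result["values"]):
    print(f"{name}: log gamma = {value:.3f}")
```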
diff --git a/bayesflow/distributions/diagonal_normal.py b/bayesflow/distributions/diagonal_normal.py
index f8d93b945..6b64445c7 100644
--- a/bayesflow/distributions/diagonal_normal.py
+++ b/bayesflow/distributions/diagonal_normal.py
@@ -58,7 +58,6 @@ def __init__(
         self.seed_generator = seed_generator or keras.random.SeedGenerator()
 
         self.dim = None
-        self.log_normalization_constant = None
         self._mean = None
         self._std = None
 
@@ -71,17 +70,18 @@ def build(self, input_shape: Shape) -> None:
         self.mean = ops.cast(ops.broadcast_to(self.mean, (self.dim,)), "float32")
         self.std = ops.cast(ops.broadcast_to(self.std, (self.dim,)), "float32")
 
-        self.log_normalization_constant = -0.5 * self.dim * math.log(2.0 * math.pi) - ops.sum(ops.log(self.std))
-
         if self.trainable_parameters:
             self._mean = self.add_weight(
                 shape=ops.shape(self.mean),
-                initializer=keras.initializers.get(self.mean),
+                initializer=keras.initializers.get(keras.ops.copy(self.mean)),
                 dtype="float32",
                 trainable=True,
             )
             self._std = self.add_weight(
-                shape=ops.shape(self.std), initializer=keras.initializers.get(self.std), dtype="float32", trainable=True
+                shape=ops.shape(self.std),
+                initializer=keras.initializers.get(keras.ops.copy(self.std)),
+                dtype="float32",
+                trainable=True,
             )
         else:
             self._mean = self.mean
@@ -91,7 +91,8 @@ def log_prob(self, samples: Tensor, *, normalize: bool = True) -> Tensor:
         result = -0.5 * ops.sum((samples - self._mean) ** 2 / self._std**2, axis=-1)
 
         if normalize:
-            result += self.log_normalization_constant
+            log_normalization_constant = -0.5 * self.dim * math.log(2.0 * math.pi) - ops.sum(ops.log(self._std))
+            result += log_normalization_constant
 
         return result
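Context for the change above: once `_std` is a trainable weight, a normalization constant cached at build time goes stale, so it is now recomputed from the live value inside `log_prob`. A standalone numpy check of the identity being computed (not BayesFlow code; names and values are illustrative):

```python
import math

import numpy as np
from scipy.stats import norm

rng = np.random.default_rng(1)
dim = 4
mean = rng.normal(size=dim)
std = rng.uniform(0.5, 2.0, size=dim)
x = rng.normal(size=dim)

# unnormalized part, matching log_prob above
result = -0.5 * np.sum((x - mean) ** 2 / std**2)

# log normalization constant recomputed from the *current* std
log_z = -0.5 * dim * math.log(2.0 * math.pi) - np.sum(np.log(std))

# the sum of independent normal log-densities is the diagonal-Gaussian log-density
reference = norm.logpdf(x, loc=mean, scale=std).sum()
assert np.isclose(result + log_z, reference)
```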
diff --git a/bayesflow/distributions/diagonal_student_t.py b/bayesflow/distributions/diagonal_student_t.py
index 98e3fb7eb..9b02ee821 100644
--- a/bayesflow/distributions/diagonal_student_t.py
+++ b/bayesflow/distributions/diagonal_student_t.py
@@ -63,7 +63,6 @@ def __init__(
         self.seed_generator = seed_generator or keras.random.SeedGenerator()
 
-        self.log_normalization_constant = None
         self.dim = None
         self._loc = None
         self._scale = None
 
@@ -78,21 +77,16 @@ def build(self, input_shape: Shape) -> None:
         self.loc = ops.cast(ops.broadcast_to(self.loc, (self.dim,)), "float32")
         self.scale = ops.cast(ops.broadcast_to(self.scale, (self.dim,)), "float32")
 
-        self.log_normalization_constant = (
-            -0.5 * self.dim * math.log(self.df)
-            - 0.5 * self.dim * math.log(math.pi)
-            - math.lgamma(0.5 * self.df)
-            + math.lgamma(0.5 * (self.df + self.dim))
-            - ops.sum(keras.ops.log(self.scale))
-        )
-
         if self.trainable_parameters:
             self._loc = self.add_weight(
-                shape=ops.shape(self.loc), initializer=keras.initializers.get(self.loc), dtype="float32", trainable=True
+                shape=ops.shape(self.loc),
+                initializer=keras.initializers.get(keras.ops.copy(self.loc)),
+                dtype="float32",
+                trainable=True,
             )
             self._scale = self.add_weight(
                 shape=ops.shape(self.scale),
-                initializer=keras.initializers.get(self.scale),
+                initializer=keras.initializers.get(keras.ops.copy(self.scale)),
                 dtype="float32",
                 trainable=True,
             )
@@ -105,7 +99,14 @@ def log_prob(self, samples: Tensor, *, normalize: bool = True) -> Tensor:
         result = -0.5 * (self.df + self.dim) * ops.log1p(mahalanobis_term / self.df)
 
         if normalize:
-            result += self.log_normalization_constant
+            log_normalization_constant = (
+                -0.5 * self.dim * math.log(self.df)
+                - 0.5 * self.dim * math.log(math.pi)
+                - math.lgamma(0.5 * self.df)
+                + math.lgamma(0.5 * (self.df + self.dim))
+                - ops.sum(keras.ops.log(self._scale))
+            )
+            result += log_normalization_constant
 
         return result
diff --git a/bayesflow/distributions/mixture.py b/bayesflow/distributions/mixture.py
index a7bf2ea27..e1f04e88f 100644
--- a/bayesflow/distributions/mixture.py
+++ b/bayesflow/distributions/mixture.py
@@ -144,7 +144,7 @@ def build(self, input_shape: Shape) -> None:
 
         self._mixture_logits = self.add_weight(
             shape=(len(self.distributions),),
-            initializer=keras.initializers.get(self.mixture_logits),
+            initializer=keras.initializers.get(keras.ops.copy(self.mixture_logits)),
             dtype="float32",
             trainable=self.trainable_mixture,
         )
diff --git a/bayesflow/networks/standardization/standardization.py b/bayesflow/networks/standardization/standardization.py
index a30a9a1e9..9dfdf2cb2 100644
--- a/bayesflow/networks/standardization/standardization.py
+++ b/bayesflow/networks/standardization/standardization.py
@@ -40,7 +40,7 @@ def moving_std(self, index: int) -> Tensor:
         """
         return keras.ops.where(
             self.moving_m2[index] > 0,
-            keras.ops.sqrt(self.moving_m2[index] / self.count),
+            keras.ops.sqrt(self.moving_m2[index] / self.count[index]),
             1.0,
         )
 
@@ -53,7 +53,7 @@ def build(self, input_shape: Shape):
         self.moving_m2 = [
             self.add_weight(shape=(shape[-1],), initializer="zeros", trainable=False) for shape in flattened_shapes
         ]
-        self.count = self.add_weight(shape=(), initializer="zeros", trainable=False)
+        self.count = [self.add_weight(shape=(), initializer="zeros", trainable=False) for _ in flattened_shapes]
 
     def call(
         self,
@@ -150,7 +150,7 @@ def _update_moments(self, x: Tensor, index: int):
         """
         reduce_axes = tuple(range(x.ndim - 1))
 
-        batch_count = keras.ops.cast(keras.ops.shape(x)[0], self.count.dtype)
+        batch_count = keras.ops.cast(keras.ops.prod(keras.ops.shape(x)[:-1]), self.count[index].dtype)
 
         # Compute batch mean and M2 per feature
         batch_mean = keras.ops.mean(x, axis=reduce_axes)
@@ -159,7 +159,7 @@ def _update_moments(self, x: Tensor, index: int):
         # Read current totals
         mean = self.moving_mean[index]
         m2 = self.moving_m2[index]
-        count = self.count
+        count = self.count[index]
 
         total_count = count + batch_count
         delta = batch_mean - mean
@@ -169,4 +169,4 @@ def _update_moments(self, x: Tensor, index: int):
 
         self.moving_mean[index].assign(new_mean)
         self.moving_m2[index].assign(new_m2)
-        self.count.assign(total_count)
+        self.count[index].assign(total_count)
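For reviewers: `_update_moments` implements the parallel (Chan et al.) variant of Welford's algorithm, now with one running `count` per input key so that keys with different leading dimensions stay consistent. A self-contained numpy sketch of the same merge rule (illustrative, not the layer code; here all leading axes are folded into the batch axis, matching the `prod(shape[:-1])` change above):

```python
import numpy as np

def merge_moments(count, mean, m2, x):
    """Merge a batch x of shape (batch, features) into running (count, mean, m2)."""
    batch_count = x.shape[0]
    batch_mean = x.mean(axis=0)
    batch_m2 = ((x - batch_mean) ** 2).sum(axis=0)

    total_count = count + batch_count
    delta = batch_mean - mean
    new_mean = mean + delta * batch_count / total_count
    new_m2 = m2 + batch_m2 + delta**2 * count * batch_count / total_count
    return total_count, new_mean, new_m2

rng = np.random.default_rng(2)
x1, x2 = rng.normal(size=(8, 3)), rng.normal(size=(5, 3))

count, mean, m2 = 0, np.zeros(3), np.zeros(3)
for batch in (x1, x2):
    count, mean, m2 = merge_moments(count, mean, m2, batch)

# the merged moments match the moments of the concatenated data
full = np.concatenate([x1, x2])
assert np.allclose(mean, full.mean(axis=0))
assert np.allclose(np.sqrt(m2 / count), full.std(axis=0))  # population std, as in moving_std
```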
diff --git a/bayesflow/scores/multivariate_normal_score.py b/bayesflow/scores/multivariate_normal_score.py
index 16e851427..71d45d4b2 100644
--- a/bayesflow/scores/multivariate_normal_score.py
+++ b/bayesflow/scores/multivariate_normal_score.py
@@ -82,13 +82,15 @@ def log_prob(self, x: Tensor, mean: Tensor, cov_chol: Tensor) -> Tensor:
         """
         diff = x - mean
 
-        # Calculate covariance from Cholesky factors
-        covariance = keras.ops.matmul(
-            cov_chol,
-            keras.ops.swapaxes(cov_chol, -2, -1),
+        # Calculate precision from Cholesky factors of covariance matrix
+        cov_chol_inv = keras.ops.inv(cov_chol)
+        precision = keras.ops.matmul(
+            keras.ops.swapaxes(cov_chol_inv, -2, -1),
+            cov_chol_inv,
         )
-        precision = keras.ops.inv(covariance)
-        log_det_covariance = keras.ops.slogdet(covariance)[1]  # Only take the log of the determinant part
+
+        # Compute log determinant, exploiting Cholesky factors
+        log_det_covariance = keras.ops.log(keras.ops.prod(keras.ops.diagonal(cov_chol, axis1=1, axis2=2), axis=1)) * 2
 
         # Compute the quadratic term in the exponential of the multivariate Gaussian
         quadratic_term = keras.ops.einsum("...i,...ij,...j->...", diff, precision, diff)
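The rewritten `log_prob` avoids forming the covariance matrix explicitly: it uses the identities inv(cov) = inv(L).T @ inv(L) and log|cov| = 2 * sum(log(diag(L))) for cov = L @ L.T. A quick numpy verification of both (illustration only; the matrix is made up):

```python
import numpy as np

rng = np.random.default_rng(3)
a = rng.normal(size=(4, 4))
cov = a @ a.T + 4 * np.eye(4)  # a well-conditioned covariance matrix
chol = np.linalg.cholesky(cov)  # lower-triangular L with cov = L @ L.T

# precision from the inverse Cholesky factor: inv(cov) = inv(L).T @ inv(L)
chol_inv = np.linalg.inv(chol)
precision = chol_inv.T @ chol_inv
assert np.allclose(precision, np.linalg.inv(cov))

# log-determinant from the diagonal of L: log|cov| = 2 * sum(log(diag(L)))
log_det = 2 * np.sum(np.log(np.diagonal(chol)))
assert np.isclose(log_det, np.linalg.slogdet(cov)[1])
```

The sum-of-logs form used in this sketch is algebraically equal to the `log(prod(diag)) * 2` in the diff, but is the numerically safer spelling as dimensions grow.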
diff --git a/pyproject.toml b/pyproject.toml
index 902ffc1d2..0c1dd941b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "bayesflow"
-version = "2.0.4"
+version = "2.0.5"
 authors = [{ name = "The BayesFlow Team" }]
 classifiers = [
     "Development Status :: 5 - Production/Stable",
diff --git a/tests/test_approximators/test_approximator_standardization/test_approximator_standardization.py b/tests/test_approximators/test_approximator_standardization/test_approximator_standardization.py
index 9c73d4717..7cf0f6aba 100644
--- a/tests/test_approximators/test_approximator_standardization/test_approximator_standardization.py
+++ b/tests/test_approximators/test_approximator_standardization/test_approximator_standardization.py
@@ -8,7 +8,8 @@ def test_save_and_load(tmp_path, approximator, train_dataset, validation_dataset):
     approximator.build(data_shapes)
     for layer in approximator.standardize_layers.values():
         assert layer.built
-        assert layer.count == 0
+        for count in layer.count:
+            assert count == 0.0
 
     approximator.compute_metrics(**train_dataset[0])
     keras.saving.save_model(approximator, tmp_path / "model.keras")
diff --git a/tests/test_approximators/test_build.py b/tests/test_approximators/test_build.py
index 5947783dd..bea897115 100644
--- a/tests/test_approximators/test_build.py
+++ b/tests/test_approximators/test_build.py
@@ -14,4 +14,5 @@ def test_build(approximator, simulator, batch_size, adapter):
     approximator.build(batch_shapes)
     for layer in approximator.standardize_layers.values():
         assert layer.built
-        assert layer.count == 0
+        for count in layer.count:
+            assert count == 0.0
diff --git a/tests/test_diagnostics/test_diagnostics_metrics.py b/tests/test_diagnostics/test_diagnostics_metrics.py
index 3a2c711bc..f2c4c73c4 100644
--- a/tests/test_diagnostics/test_diagnostics_metrics.py
+++ b/tests/test_diagnostics/test_diagnostics_metrics.py
@@ -1,6 +1,7 @@
-import numpy as np
 import keras
+import numpy as np
 import pytest
+from scipy.stats import binom
 
 import bayesflow as bf
 
@@ -84,6 +85,58 @@ def test_expected_calibration_error(pred_models, true_models, model_names):
     out = bf.diagnostics.metrics.expected_calibration_error(pred_models, true_models.transpose)
 
 
+def test_calibration_log_gamma(random_estimates, random_targets):
+    out = bf.diagnostics.metrics.calibration_log_gamma(random_estimates, random_targets)
+    assert list(out.keys()) == ["values", "metric_name", "variable_names"]
+    assert out["values"].shape == (num_variables(random_estimates),)
+    assert out["metric_name"] == "Log Gamma"
+    assert out["variable_names"] == ["beta_0", "beta_1", "sigma"]
+
+
+def test_calibration_log_gamma_end_to_end():
+    # This is a functional test for simulation-based calibration.
+    # First, we sample from a known generative process and then run SBC.
+    # If the log gamma statistic is correctly implemented, a 95% interval of the
+    # null distribution should exclude the observed gamma about 5% of the time.
+
+    N = 30  # number of samples
+    S = 1000  # number of posterior draws
+    D = 1000  # number of datasets
+
+    def run_sbc(N=N, S=S, D=D, bias=0):
+        rng = np.random.default_rng()
+        prior_draws = rng.beta(2, 2, size=D)
+        successes = rng.binomial(N, prior_draws)
+
+        # Analytical posterior:
+        # if theta ~ Beta(2, 2), then p(theta|successes) is Beta(2 + successes, 2 + N - successes).
+        posterior_draws = rng.beta(2 + successes + bias, 2 + N - successes + bias, size=(S, D))
+
+        # these ranks are uniform if bias=0
+        ranks = np.sum(posterior_draws < prior_draws, axis=0)
+
+        # this is the distribution of gamma under uniform ranks
+        gamma_null = bf.diagnostics.metrics.gamma_null_distribution(D, S, num_null_draws=100)
+        lower, upper = np.quantile(gamma_null, (0.05, 0.995))
+
+        # this is the empirical gamma
+        observed_gamma = bf.diagnostics.metrics.gamma_discrepancy(ranks, num_post_draws=S)
+
+        in_interval = lower <= observed_gamma < upper
+
+        return in_interval
+
+    sbc_calibration = [run_sbc(N=N, S=S, D=D) for _ in range(100)]
+    lower_expected, upper_expected = binom.ppf((0.0005, 0.9995), 100, 0.95)
+
+    # this test should fail with a probability of only 0.1%
+    assert lower_expected <= np.sum(sbc_calibration) <= upper_expected
+
+    # SBC should almost always fail for slightly biased posterior draws
+    sbc_calibration = [run_sbc(N=N, S=S, D=D, bias=1) for _ in range(100)]
+    assert not lower_expected <= np.sum(sbc_calibration) <= upper_expected
+
+
 def test_bootstrap_comparison_shapes():
     """Test the bootstrap_comparison output shapes."""
     observed_samples = np.random.rand(10, 5)
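The end-to-end test leans on Beta-Binomial conjugacy for its analytical posterior, which is worth spelling out:

```latex
\theta \sim \mathrm{Beta}(2, 2), \qquad
s \mid \theta \sim \mathrm{Binomial}(N, \theta)
\quad \Longrightarrow \quad
\theta \mid s \sim \mathrm{Beta}(2 + s,\; 2 + N - s)
```

The `bias` argument adds 1 to both shape parameters, which pulls the posterior slightly toward 0.5 and makes it narrower than the true posterior, so its SBC ranks are detectably non-uniform.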
diff --git a/tests/test_networks/test_standardization.py b/tests/test_networks/test_standardization.py
index 8b83de498..86881a384 100644
--- a/tests/test_networks/test_standardization.py
+++ b/tests/test_networks/test_standardization.py
@@ -91,6 +91,39 @@ def test_nested_consistency_forward_inverse():
     np.testing.assert_allclose(random_input["b"], recovered["b"], atol=1e-4)
 
 
+def test_nested_accuracy_forward():
+    from bayesflow.utils import tree_concatenate
+
+    # create inputs for two training passes
+    random_input_a_1 = keras.random.normal((2, 3, 5))
+    random_input_b_1 = keras.random.normal((4, 3))
+    random_input_1 = {"a": random_input_a_1, "b": random_input_b_1}
+
+    random_input_a_2 = keras.random.normal((3, 3, 5))
+    random_input_b_2 = keras.random.normal((3, 3))
+    random_input_2 = {"a": random_input_a_2, "b": random_input_b_2}
+
+    # the full data, for testing that the mean and std are 0 and 1
+    random_input = tree_concatenate([random_input_1, random_input_2], axis=0)
+
+    layer = Standardization()
+
+    _ = layer(random_input_1, stage="training", forward=True)
+    _ = layer(random_input_2, stage="training", forward=True)
+
+    standardized = layer(random_input, stage="inference", forward=True)
+    standardized = keras.tree.map_structure(keras.ops.convert_to_numpy, standardized)
+
+    np.testing.assert_allclose(
+        np.mean(standardized["a"], axis=tuple(range(standardized["a"].ndim - 1))), 0.0, atol=1e-4
+    )
+    np.testing.assert_allclose(
+        np.mean(standardized["b"], axis=tuple(range(standardized["b"].ndim - 1))), 0.0, atol=1e-4
+    )
+    np.testing.assert_allclose(np.std(standardized["a"], axis=tuple(range(standardized["a"].ndim - 1))), 1.0, atol=1e-4)
+    np.testing.assert_allclose(np.std(standardized["b"], axis=tuple(range(standardized["b"].ndim - 1))), 1.0, atol=1e-4)
+
+
 def test_transformation_type_both_sides_scale():
     # Fix a known covariance and mean in original (not standardized space)
     covariance = np.array([[1, 0.5], [0.5, 2.0]], dtype="float32")