bayesflow-org
diff --git a/‎bayesflow/diagnostics/metrics/calibration_error.py‎
Lines changed: 27 additions & 1 deletion b/‎bayesflow/diagnostics/metrics/calibration_error.py‎
Lines changed: 27 additions & 1 deletion
diff --git a/‎bayesflow/diagnostics/metrics/calibration_log_gamma.py‎
Lines changed: 30 additions & 2 deletions b/‎bayesflow/diagnostics/metrics/calibration_log_gamma.py‎
Lines changed: 30 additions & 2 deletions
diff --git a/‎bayesflow/diagnostics/metrics/posterior_contraction.py‎
Lines changed: 28 additions & 1 deletion b/‎bayesflow/diagnostics/metrics/posterior_contraction.py‎
Lines changed: 28 additions & 1 deletion
diff --git a/‎bayesflow/diagnostics/metrics/root_mean_squared_error.py‎
Lines changed: 28 additions & 1 deletion b/‎bayesflow/diagnostics/metrics/root_mean_squared_error.py‎
Lines changed: 28 additions & 1 deletion
diff --git a/‎bayesflow/diagnostics/plots/calibration_ecdf.py‎
Lines changed: 10 additions & 6 deletions b/‎bayesflow/diagnostics/plots/calibration_ecdf.py‎
Lines changed: 10 additions & 6 deletions
diff --git a/‎bayesflow/diagnostics/plots/calibration_ecdf_from_quantiles.py‎
Lines changed: 6 additions & 4 deletions b/‎bayesflow/diagnostics/plots/calibration_ecdf_from_quantiles.py‎
Lines changed: 6 additions & 4 deletions
diff --git a/‎bayesflow/diagnostics/plots/calibration_histogram.py‎
Lines changed: 29 additions & 1 deletion b/‎bayesflow/diagnostics/plots/calibration_histogram.py‎
Lines changed: 29 additions & 1 deletion
@@ -2,14 +2,15 @@
 
 import numpy as np
 
-from ...utils.dict_utils import dicts_to_arrays
+from ...utils.dict_utils import dicts_to_arrays, compute_test_quantities
 
 
 def calibration_error(
     estimates: Mapping[str, np.ndarray] | np.ndarray,
     targets: Mapping[str, np.ndarray] | np.ndarray,
     variable_keys: Sequence[str] = None,
     variable_names: Sequence[str] = None,
+    test_quantities: dict[str, Callable] = None,
     resolution: int = 20,
     aggregation: Callable = np.median,
     min_quantile: float = 0.005,
@@ -32,6 +33,18 @@ def calibration_error(
        By default, select all keys.
     variable_names : Sequence[str], optional (default = None)
         Optional variable names to show in the output.
+    test_quantities   : dict or None, optional, default: None
+        A dict that maps plot titles to functions that compute
+        test quantities based on estimate/target draws.
+
+        The dict keys are automatically added to ``variable_keys``
+        and ``variable_names``.
+        Test quantity functions are expected to accept a dict of draws with
+        shape ``(batch_size, ...)`` as the first (typically only)
+        positional argument and return a NumPy array of shape
+        ``(batch_size,)``.
+        The functions do not have to deal with an additional
+        sample dimension, as appropriate reshaping is done internally.
     resolution    : int, optional, default: 20
         The number of credibility intervals (CIs) to consider
     aggregation   : callable or None, optional, default: np.median
@@ -55,6 +68,19 @@ def calibration_error(
             The (inferred) variable names.
     """
 
+    if test_quantities is not None:
+        updated_data = compute_test_quantities(
+            targets=targets,
+            estimates=estimates,
+            variable_keys=variable_keys,
+            variable_names=variable_names,
+            test_quantities=test_quantities,
+        )
+        variable_names = updated_data["variable_names"]
+        variable_keys = updated_data["variable_keys"]
+        estimates = updated_data["estimates"]
+        targets = updated_data["targets"]
+
     samples = dicts_to_arrays(
         estimates=estimates,
         targets=targets,
 
@@ -1,16 +1,17 @@
-from collections.abc import Mapping, Sequence
+from collections.abc import Callable, Mapping, Sequence
 
 import numpy as np
 from scipy.stats import binom
 
-from ...utils.dict_utils import dicts_to_arrays
+from ...utils.dict_utils import dicts_to_arrays, compute_test_quantities
 
 
 def calibration_log_gamma(
     estimates: Mapping[str, np.ndarray] | np.ndarray,
     targets: Mapping[str, np.ndarray] | np.ndarray,
     variable_keys: Sequence[str] = None,
     variable_names: Sequence[str] = None,
+    test_quantities: dict[str, Callable] = None,
     num_null_draws: int = 1000,
     quantile: float = 0.05,
 ):
@@ -41,6 +42,18 @@ def calibration_log_gamma(
        By default, select all keys.
     variable_names : Sequence[str], optional (default = None)
         Optional variable names to show in the output.
+    test_quantities   : dict or None, optional, default: None
+        A dict that maps plot titles to functions that compute
+        test quantities based on estimate/target draws.
+
+        The dict keys are automatically added to ``variable_keys``
+        and ``variable_names``.
+        Test quantity functions are expected to accept a dict of draws with
+        shape ``(batch_size, ...)`` as the first (typically only)
+        positional argument and return a NumPy array of shape
+        ``(batch_size,)``.
+        The functions do not have to deal with an additional
+        sample dimension, as appropriate reshaping is done internally.
     quantile : float in (0, 1), optional, default 0.05
         The quantile from the null distribution to be used as a threshold.
         A lower quantile increases sensitivity to deviations from uniformity.
@@ -57,6 +70,21 @@ def calibration_log_gamma(
         - "variable_names" : str
             The (inferred) variable names.
     """
+
+    # Optionally, compute and prepend test quantities from draws
+    if test_quantities is not None:
+        updated_data = compute_test_quantities(
+            targets=targets,
+            estimates=estimates,
+            variable_keys=variable_keys,
+            variable_names=variable_names,
+            test_quantities=test_quantities,
+        )
+        variable_names = updated_data["variable_names"]
+        variable_keys = updated_data["variable_keys"]
+        estimates = updated_data["estimates"]
+        targets = updated_data["targets"]
+
     samples = dicts_to_arrays(
         estimates=estimates,
         targets=targets,
 
@@ -2,14 +2,15 @@
 
 import numpy as np
 
-from ...utils.dict_utils import dicts_to_arrays
+from ...utils.dict_utils import dicts_to_arrays, compute_test_quantities
 
 
 def posterior_contraction(
     estimates: Mapping[str, np.ndarray] | np.ndarray,
     targets: Mapping[str, np.ndarray] | np.ndarray,
     variable_keys: Sequence[str] = None,
     variable_names: Sequence[str] = None,
+    test_quantities: dict[str, Callable] = None,
     aggregation: Callable | None = np.median,
 ) -> dict[str, any]:
     """
@@ -27,6 +28,18 @@ def posterior_contraction(
        By default, select all keys.
     variable_names : Sequence[str], optional (default = None)
         Optional variable names to show in the output.
+    test_quantities   : dict or None, optional, default: None
+        A dict that maps plot titles to functions that compute
+        test quantities based on estimate/target draws.
+
+        The dict keys are automatically added to ``variable_keys``
+        and ``variable_names``.
+        Test quantity functions are expected to accept a dict of draws with
+        shape ``(batch_size, ...)`` as the first (typically only)
+        positional argument and return a NumPy array of shape
+        ``(batch_size,)``.
+        The functions do not have to deal with an additional
+        sample dimension, as appropriate reshaping is done internally.
     aggregation    : callable or None, optional (default = np.median)
         Function to aggregate the PC across draws. Typically `np.mean` or `np.median`.
         If None is provided, the individual values are returned.
@@ -50,6 +63,20 @@ def posterior_contraction(
     indicate low contraction.
     """
 
+    # Optionally, compute and prepend test quantities from draws
+    if test_quantities is not None:
+        updated_data = compute_test_quantities(
+            targets=targets,
+            estimates=estimates,
+            variable_keys=variable_keys,
+            variable_names=variable_names,
+            test_quantities=test_quantities,
+        )
+        variable_names = updated_data["variable_names"]
+        variable_keys = updated_data["variable_keys"]
+        estimates = updated_data["estimates"]
+        targets = updated_data["targets"]
+
     samples = dicts_to_arrays(
         estimates=estimates,
         targets=targets,
 
@@ -2,14 +2,15 @@
 
 import numpy as np
 
-from ...utils.dict_utils import dicts_to_arrays
+from ...utils.dict_utils import dicts_to_arrays, compute_test_quantities
 
 
 def root_mean_squared_error(
     estimates: Mapping[str, np.ndarray] | np.ndarray,
     targets: Mapping[str, np.ndarray] | np.ndarray,
     variable_keys: Sequence[str] = None,
     variable_names: Sequence[str] = None,
+    test_quantities: dict[str, Callable] = None,
     normalize: str | None = "range",
     aggregation: Callable = np.median,
 ) -> dict[str, any]:
@@ -28,6 +29,18 @@ def root_mean_squared_error(
        By default, select all keys.
     variable_names : Sequence[str], optional (default = None)
         Optional variable names to show in the output.
+    test_quantities   : dict or None, optional, default: None
+        A dict that maps plot titles to functions that compute
+        test quantities based on estimate/target draws.
+
+        The dict keys are automatically added to ``variable_keys``
+        and ``variable_names``.
+        Test quantity functions are expected to accept a dict of draws with
+        shape ``(batch_size, ...)`` as the first (typically only)
+        positional argument and return a NumPy array of shape
+        ``(batch_size,)``.
+        The functions do not have to deal with an additional
+        sample dimension, as appropriate reshaping is done internally.
     normalize      : str or None, optional (default = "range")
         Whether to normalize the RMSE using statistics of the prior samples.
         Possible options are ("mean", "range", "median", "iqr", "std", None)
@@ -52,6 +65,20 @@ def root_mean_squared_error(
             The (inferred) variable names.
     """
 
+    # Optionally, compute and prepend test quantities from draws
+    if test_quantities is not None:
+        updated_data = compute_test_quantities(
+            targets=targets,
+            estimates=estimates,
+            variable_keys=variable_keys,
+            variable_names=variable_names,
+            test_quantities=test_quantities,
+        )
+        variable_names = updated_data["variable_names"]
+        variable_keys = updated_data["variable_keys"]
+        estimates = updated_data["estimates"]
+        targets = updated_data["targets"]
+
     samples = dicts_to_arrays(
         estimates=estimates,
         targets=targets,
 
@@ -15,13 +15,13 @@ def calibration_ecdf(
     variable_keys: Sequence[str] = None,
     variable_names: Sequence[str] = None,
     test_quantities: dict[str, Callable] = None,
-    difference: bool = False,
+    difference: bool = True,
     stacked: bool = False,
     rank_type: str | np.ndarray = "fractional",
     figsize: Sequence[float] = None,
     label_fontsize: int = 16,
     legend_fontsize: int = 14,
-    legend_location: str = "upper right",
+    legend_location: str = "lower right",
     title_fontsize: int = 18,
     tick_fontsize: int = 12,
     rank_ecdf_color: str = "#132a70",
@@ -59,7 +59,7 @@ def calibration_ecdf(
         The posterior draws obtained from n_data_sets
     targets     : np.ndarray of shape (n_data_sets, n_params)
         The prior draws obtained for generating n_data_sets
-    difference        : bool, optional, default: False
+    difference        : bool, optional, default: True
         If `True`, plots the ECDF difference.
         Enables a more dynamic visualization range.
     stacked           : bool, optional, default: False
@@ -98,7 +98,9 @@ def calibration_ecdf(
     label_fontsize    : int, optional, default: 16
         The font size of the y-label and y-label texts
     legend_fontsize   : int, optional, default: 14
-        The font size of the legend text
+        The font size of the legend text.
+    legend_location   : str, optional, default: 'lower right'
+        The location of the legend.
     title_fontsize    : int, optional, default: 18
         The font size of the title text.
         Only relevant if `stacked=False`
@@ -211,11 +213,13 @@ def calibration_ecdf(
     else:
         titles = ["Stacked ECDFs"]
 
-    for ax, title in zip(plot_data["axes"].flat, titles):
+    for i, (ax, title) in enumerate(zip(plot_data["axes"].flat, titles)):
         ax.fill_between(z, L, U, color=fill_color, alpha=0.2, label=rf"{int((1 - alpha) * 100)}$\%$ Confidence Bands")
-        ax.legend(fontsize=legend_fontsize, loc=legend_location)
         ax.set_title(title, fontsize=title_fontsize)
 
+        if i == 0:
+            ax.legend(fontsize=legend_fontsize, loc=legend_location)
+
     prettify_subplots(plot_data["axes"], num_subplots=plot_data["num_variables"], tick_fontsize=tick_fontsize)
 
     add_titles_and_labels(
 
@@ -14,12 +14,12 @@ def calibration_ecdf_from_quantiles(
     quantiles_key: str = "quantiles",
     variable_keys: Sequence[str] = None,
     variable_names: Sequence[str] = None,
-    difference: bool = False,
+    difference: bool = True,
     stacked: bool = False,
     figsize: Sequence[float] = None,
     label_fontsize: int = 16,
     legend_fontsize: int = 14,
-    legend_location: str = "upper right",
+    legend_location: str = "lower right",
     title_fontsize: int = 18,
     tick_fontsize: int = 12,
     rank_ecdf_color: str = "#132a70",
@@ -69,7 +69,7 @@ def calibration_ecdf_from_quantiles(
     variable_names    : list or None, optional, default: None
         The parameter names for nice plot titles.
         Inferred if None. Only relevant if `stacked=False`.
-    difference        : bool, optional, default: False
+    difference        : bool, optional, default: True
         If `True`, plots the ECDF difference.
         Enables a more dynamic visualization range.
     stacked           : bool, optional, default: False
@@ -82,7 +82,9 @@ def calibration_ecdf_from_quantiles(
     label_fontsize    : int, optional, default: 16
         The font size of the y-label and y-label texts
     legend_fontsize   : int, optional, default: 14
-        The font size of the legend text
+        The font size of the legend text.
+    legend_location   : str, optional, default: 'lower right'
+        The location of the legend.
     title_fontsize    : int, optional, default: 18
         The font size of the title text.
         Only relevant if `stacked=False`
 
@@ -1,4 +1,4 @@
-from collections.abc import Sequence, Mapping
+from collections.abc import Callable, Mapping, Sequence
 
 import matplotlib.pyplot as plt
 import numpy as np
@@ -8,13 +8,15 @@
 
 from bayesflow.utils import logging
 from bayesflow.utils import prepare_plot_data, add_titles_and_labels, prettify_subplots
+from bayesflow.utils.dict_utils import compute_test_quantities
 
 
 def calibration_histogram(
     estimates: Mapping[str, np.ndarray] | np.ndarray,
     targets: Mapping[str, np.ndarray] | np.ndarray,
     variable_keys: Sequence[str] = None,
     variable_names: Sequence[str] = None,
+    test_quantities: dict[str, Callable] = None,
     figsize: Sequence[float] = None,
     num_bins: int = 10,
     binomial_interval: float = 0.99,
@@ -46,6 +48,18 @@ def calibration_histogram(
        By default, select all keys.
     variable_names    : list or None, optional, default: None
         The parameter names for nice plot titles. Inferred if None
+    test_quantities   : dict or None, optional, default: None
+        A dict that maps plot titles to functions that compute
+        test quantities based on estimate/target draws.
+
+        The dict keys are automatically added to ``variable_keys``
+        and ``variable_names``.
+        Test quantity functions are expected to accept a dict of draws with
+        shape ``(batch_size, ...)`` as the first (typically only)
+        positional argument and return a NumPy array of shape
+        ``(batch_size,)``.
+        The functions do not have to deal with an additional
+        sample dimension, as appropriate reshaping is done internally.
     figsize          : tuple or None, optional, default : None
         The figure size passed to the matplotlib constructor. Inferred if None
     num_bins          : int, optional, default: 10
@@ -75,6 +89,20 @@ def calibration_histogram(
         If there is a deviation form the expected shapes of `estimates` and `targets`.
     """
 
+    # Optionally, compute and prepend test quantities from draws
+    if test_quantities is not None:
+        updated_data = compute_test_quantities(
+            targets=targets,
+            estimates=estimates,
+            variable_keys=variable_keys,
+            variable_names=variable_names,
+            test_quantities=test_quantities,
+        )
+        variable_names = updated_data["variable_names"]
+        variable_keys = updated_data["variable_keys"]
+        estimates = updated_data["estimates"]
+        targets = updated_data["targets"]
+
     plot_data = prepare_plot_data(
         estimates=estimates,
         targets=targets,