1 change: 1 addition & 0 deletions bayesflow/diagnostics/metrics/__init__.py
@@ -2,3 +2,4 @@
from .posterior_contraction import posterior_contraction
from .root_mean_squared_error import root_mean_squared_error
from .expected_calibration_error import expected_calibration_error
from .classifier_two_sample_test import classifier_two_sample_test
136 changes: 136 additions & 0 deletions bayesflow/diagnostics/metrics/classifier_two_sample_test.py
@@ -0,0 +1,136 @@
from typing import Sequence, Mapping, Any

import numpy as np

import keras

from bayesflow.utils.exceptions import ShapeError
from bayesflow.networks import MLP


def classifier_two_sample_test(
estimates: np.ndarray,
targets: np.ndarray,
metric: str = "accuracy",
patience: int = 5,
max_epochs: int = 1000,
batch_size: int = 128,
return_metric_only: bool = True,
validation_split: float = 0.5,
standardize: bool = True,
mlp_widths: Sequence = (64, 64),
**kwargs,
) -> float | Mapping[str, Any]:
"""
    C2ST metric [1] between samples from two distributions, computed using a neural classifier.
    Can be computationally expensive if called in a loop, since a new classifier must be trained
    for each pair of sample sets.

    Note: The metric works best for large numbers of samples and when averaged across different posteriors.

[1] Lopez-Paz, D., & Oquab, M. (2016). Revisiting classifier two-sample tests. arXiv:1610.06545.

Parameters
----------
estimates : np.ndarray
Array of shape (num_samples_est, num_variables) containing samples representing estimated quantities
(e.g., approximate posterior samples).
targets : np.ndarray
Array of shape (num_samples_tar, num_variables) containing target samples
(e.g., samples from a reference posterior).
metric : str, optional
Metric to evaluate the classifier performance. Default is "accuracy".
patience : int, optional
Number of epochs with no improvement after which training will be stopped. Default is 5.
max_epochs : int, optional
Maximum number of epochs to train the classifier. Default is 1000.
batch_size : int, optional
        Number of samples per batch during training. Default is 128.
return_metric_only : bool, optional
If True, only the final validation metric is returned. Otherwise, a dictionary with the score, classifier, and
full training history is returned. Default is True.
validation_split : float, optional
Fraction of the training data to be used as validation data. Default is 0.5.
standardize : bool, optional
If True, both estimates and targets will be standardized using the mean and standard deviation of estimates.
Default is True.
mlp_widths : Sequence[int], optional
        Sequence specifying the number of units in each hidden layer of the MLP classifier. Default is (64, 64).
**kwargs
Additional keyword arguments. Recognized keyword:
mlp_kwargs : dict
Dictionary of additional parameters to pass to the MLP constructor.

Returns
-------
results : float or dict
If return_metric_only is True, returns the final validation metric (e.g., accuracy) as a float.
Otherwise, returns a dictionary with keys "score", "classifier", and "history", where "score"
is the final validation metric, "classifier" is the trained Keras model, and "history" contains the
full training history.
"""

    # Raise an error if the dimensionality of targets does not match that of estimates
    num_dims = estimates.shape[1]
    if num_dims != targets.shape[1]:
        raise ShapeError(
            "estimates and targets may have a different number of samples (1st dim), "
            "but must have the same dimensionality (2nd dim). "
            f"Found: estimates dim {estimates.shape[1]}, targets dim {targets.shape[1]}."
)

# Standardize both estimates and targets relative to estimates mean and std
if standardize:
estimates_mean = np.mean(estimates, axis=0)
estimates_std = np.std(estimates, axis=0)
estimates = (estimates - estimates_mean) / estimates_std
targets = (targets - estimates_mean) / estimates_std

# Create data for classification task
data = np.r_[estimates, targets]
labels = np.r_[np.zeros((estimates.shape[0],)), np.ones((targets.shape[0],))]

    # Important: shuffle manually, since keras does not shuffle before selecting the validation split
shuffle_idx = np.random.permutation(data.shape[0])
data = data[shuffle_idx]
labels = labels[shuffle_idx]

# Create and train classifier with optional stopping
classifier = keras.Sequential(
[MLP(widths=mlp_widths, **kwargs.get("mlp_kwargs", {})), keras.layers.Dense(1, activation="sigmoid")]
)

classifier.compile(optimizer="adam", loss="binary_crossentropy", metrics=[metric])

early_stopping = keras.callbacks.EarlyStopping(
monitor=f"val_{metric}", patience=patience, restore_best_weights=True
)

    # For the torch backend, gradients must be enabled explicitly, since BayesFlow turns them off by default
if keras.backend.backend() == "torch":
import torch

with torch.enable_grad():
history = classifier.fit(
x=data,
y=labels,
epochs=max_epochs,
batch_size=batch_size,
verbose=0,
callbacks=[early_stopping],
validation_split=validation_split,
)
else:
history = classifier.fit(
x=data,
y=labels,
epochs=max_epochs,
batch_size=batch_size,
verbose=0,
callbacks=[early_stopping],
validation_split=validation_split,
)

if return_metric_only:
return history.history[f"val_{metric}"][-1]
return {"score": history.history[f"val_{metric}"][-1], "classifier": classifier, "history": history.history}

18 changes: 9 additions & 9 deletions bayesflow/diagnostics/metrics/expected_calibration_error.py
@@ -3,14 +3,14 @@
from typing import Sequence, Any, Mapping

from ...utils.exceptions import ShapeError
from sklearn.calibration import calibration_curve
from ...utils.classification import calibration_curve


def expected_calibration_error(
estimates: np.ndarray,
targets: np.ndarray,
model_names: Sequence[str] = None,
n_bins: int = 10,
num_bins: int = 10,
return_probs: bool = False,
) -> Mapping[str, Any]:
"""
@@ -31,11 +31,11 @@ def expected_calibration_error(
The one-hot-encoded true model indices.
model_names : Sequence[str], optional (default = None)
Optional model names to show in the output. By default, models are called "M_" + model index.
n_bins : int, optional, default: 10
num_bins : int, optional, default: 10
The number of bins to use for the calibration curves (and marginal histograms).
Passed into ``sklearn.calibration.calibration_curve()``.
Passed into ``bayesflow.utils.calibration_curve()``.
return_probs : bool (default = False)
Do you want to obtain the output of ``sklearn.calibration.calibration_curve()``?
Do you want to obtain the output of ``bayesflow.utils.calibration_curve()``?

Returns
-------
@@ -48,9 +48,9 @@ def expected_calibration_error(
- "model_names" : str
The (inferred) variable names.
- "probs_true": (optional) list[np.ndarray]:
Outputs of ``sklearn.calibration.calibration_curve()`` per model
            Outputs of ``bayesflow.utils.calibration_curve()`` per model
- "probs_pred": (optional) list[np.ndarray]:
Outputs of ``sklearn.calibration.calibration_curve()`` per model
            Outputs of ``bayesflow.utils.calibration_curve()`` per model
"""

# Convert tensors to numpy, if passed
@@ -76,10 +76,10 @@ def expected_calibration_error(
for model_index in range(estimates.shape[-1]):
y_true = (targets == model_index).astype(np.float32)
y_prob = estimates[..., model_index]
prob_true, prob_pred = calibration_curve(y_true, y_prob, n_bins=n_bins)
prob_true, prob_pred = calibration_curve(y_true, y_prob, num_bins=num_bins)

# Compute ECE by weighting bin errors by bin size
bins = np.linspace(0.0, 1.0, n_bins + 1)
bins = np.linspace(0.0, 1.0, num_bins + 1)
binids = np.searchsorted(bins[1:-1], y_prob)
bin_total = np.bincount(binids, minlength=len(bins))
nonzero = bin_total != 0
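
A short usage sketch for the renamed `num_bins` argument (the toy model probabilities below are invented for illustration; shapes follow the docstring above):

```python
import numpy as np

from bayesflow.diagnostics.metrics import expected_calibration_error

rng = np.random.default_rng(7)
num_datasets, num_models = 1000, 3

# One-hot true model indices and softmax-normalized predicted model probabilities
true_idx = rng.integers(0, num_models, size=num_datasets)
targets = np.eye(num_models)[true_idx]
logits = rng.normal(size=(num_datasets, num_models)) + 2.0 * targets
estimates = np.exp(logits) / np.exp(logits).sum(axis=-1, keepdims=True)

ece = expected_calibration_error(estimates, targets, num_bins=10, return_probs=True)
print(ece["model_names"])    # inferred names, e.g. "M_" + model index
print(ece["probs_true"][0])  # calibration curve of the first model
```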
2 changes: 1 addition & 1 deletion bayesflow/diagnostics/plots/calibration_histogram.py
@@ -102,7 +102,7 @@ def calibration_histogram(
"Confidence intervals might be unreliable!"
)

# Set n_bins automatically, if nothing provided
# Set num_bins automatically, if nothing provided
if num_bins is None:
num_bins = int(ratio / 2)
# Attempt a fix if a single bin is determined so plot still makes sense
12 changes: 6 additions & 6 deletions bayesflow/diagnostics/plots/mc_calibration.py
@@ -18,7 +18,7 @@ def mc_calibration(
pred_models: dict[str, np.ndarray] | np.ndarray,
true_models: dict[str, np.ndarray] | np.ndarray,
model_names: Sequence[str] = None,
n_bins: int = 10,
num_bins: int = 10,
label_fontsize: int = 16,
title_fontsize: int = 18,
metric_fontsize: int = 14,
@@ -41,12 +41,12 @@ def mc_calibration(
The one-hot-encoded true model indices per data set.
model_names : list or None, optional, default: None
The model names for nice plot titles. Inferred if None.
n_bins : int, optional, default: 10
num_bins : int, optional, default: 10
The number of bins to use for the calibration curves (and marginal histograms).
label_fontsize : int, optional, default: 16
        The font size of the x-label and y-label texts
legend_fontsize : int, optional, default: 14
The font size of the legend text (ECE value)
metric_fontsize : int, optional, default: 14
The font size of the metric (e.g., ECE)
title_fontsize : int, optional, default: 18
The font size of the title text. Only relevant if `stacked=False`
tick_fontsize : int, optional, default: 12
@@ -83,7 +83,7 @@
estimates=pred_models,
targets=true_models,
model_names=plot_data["variable_names"],
n_bins=n_bins,
num_bins=num_bins,
return_probs=True,
)

@@ -92,7 +92,7 @@
ax.plot(ece["probs_pred"][j], ece["probs_true"][j], "o-", color=color)

# Plot PMP distribution over bins
uniform_bins = np.linspace(0.0, 1.0, n_bins + 1)
uniform_bins = np.linspace(0.0, 1.0, num_bins + 1)
norm_weights = np.ones_like(plot_data["estimates"]) / len(plot_data["estimates"])
ax.hist(plot_data["estimates"][:, j], bins=uniform_bins, weights=norm_weights[:, j], color="grey", alpha=0.3)

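
A hedged plotting sketch using the renamed argument (assuming `mc_calibration` is exposed under `bayesflow.diagnostics.plots` and returns a matplotlib figure, as the other BayesFlow plots do; the toy data mirrors the ECE example above):

```python
import numpy as np

from bayesflow.diagnostics.plots import mc_calibration

rng = np.random.default_rng(7)
num_datasets, num_models = 1000, 3

# Toy data: one-hot true model indices and predicted model probabilities
true_models = np.eye(num_models)[rng.integers(0, num_models, size=num_datasets)]
logits = rng.normal(size=(num_datasets, num_models)) + 2.0 * true_models
pred_models = np.exp(logits) / np.exp(logits).sum(axis=-1, keepdims=True)

fig = mc_calibration(pred_models=pred_models, true_models=true_models, num_bins=10)
```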
12 changes: 5 additions & 7 deletions bayesflow/diagnostics/plots/mc_confusion_matrix.py
@@ -5,9 +5,8 @@
from matplotlib.colors import LinearSegmentedColormap
import numpy as np

from sklearn.metrics import confusion_matrix

from bayesflow.utils.plot_utils import make_figure
from ...utils.plot_utils import make_figure
from ...utils.classification import confusion_matrix


def mc_confusion_matrix(
@@ -50,10 +49,9 @@ def mc_confusion_matrix(
ytick_rotation: int, optional, default: None
Rotation of y-axis tick labels (helps with long model names).
normalize : {'true', 'pred', 'all'}, default=None
Passed to sklearn.metrics.confusion_matrix.
Normalizes confusion matrix over the true (rows), predicted (columns)
conditions or all the population. If None, confusion matrix will not be
normalized.
Passed to confusion matrix. Normalizes confusion matrix over the true (rows),
predicted (columns) conditions or all the population. If None, confusion matrix
will not be normalized.
cmap : matplotlib.colors.Colormap or str, optional, default: None
Colormap to be used for the cells. If a str, it should be the name of a registered colormap,
e.g., 'viridis'. Default colormap matches the BayesFlow defaults by ranging from white to red.
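
The in-house `confusion_matrix` implementation is not part of this diff view (only its import in `bayesflow/utils/classification/__init__.py` below), so purely as an illustration, a minimal numpy sketch with an sklearn-like interface might look as follows:

```python
import numpy as np


def confusion_matrix(targets, predictions, labels=None, normalize=None):
    """Minimal sketch of an sklearn-like confusion matrix (illustrative, not the actual implementation)."""
    targets, predictions = np.asarray(targets), np.asarray(predictions)
    if labels is None:
        labels = np.unique(np.concatenate([targets, predictions]))
    index = {label: i for i, label in enumerate(labels)}

    # Count (true, predicted) label pairs
    cm = np.zeros((len(labels), len(labels)), dtype=float)
    for t, p in zip(targets, predictions):
        cm[index[t], index[p]] += 1.0

    # Optional normalization over rows ('true'), columns ('pred'), or all entries ('all')
    with np.errstate(invalid="ignore", divide="ignore"):
        if normalize == "true":
            cm = cm / cm.sum(axis=1, keepdims=True)
        elif normalize == "pred":
            cm = cm / cm.sum(axis=0, keepdims=True)
        elif normalize == "all":
            cm = cm / cm.sum()
    return np.nan_to_num(cm)
```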
1 change: 1 addition & 0 deletions bayesflow/utils/__init__.py
@@ -73,6 +73,7 @@
fill_triangular_matrix,
weighted_sum,
)
from .classification import calibration_curve, confusion_matrix
from .validators import check_lengths_same
from .workflow_utils import find_inference_network, find_summary_network

2 changes: 2 additions & 0 deletions bayesflow/utils/classification/__init__.py
@@ -0,0 +1,2 @@
from .confusion_matrix import confusion_matrix
from .calibration_curve import calibration_curve
82 changes: 82 additions & 0 deletions bayesflow/utils/classification/calibration_curve.py
@@ -0,0 +1,82 @@
import numpy as np


def calibration_curve(
targets: np.ndarray,
estimates: np.ndarray,
*,
pos_label: int | float | bool | str = 1,
num_bins: int = 5,
strategy: str = "uniform",
):
"""Compute true and predicted probabilities for a calibration curve.

    The method assumes the inputs come from a binary classifier and
    discretizes the [0, 1] interval into bins.

Code from: https://github.com/scikit-learn/scikit-learn/blob/98ed9dc73/sklearn/calibration.py#L927

Parameters
----------
targets : array-like of shape (n_samples,)
True targets.
estimates : array-like of shape (n_samples,)
Probabilities of the positive class.
pos_label : int, float, bool or str, default = 1
The label of the positive class.
num_bins : int, default=5
Number of bins to discretize the [0, 1] interval. A bigger number
requires more data. Bins with no samples (i.e. without
corresponding values in `estimates`) will not be returned, thus the
        returned arrays may have fewer than `num_bins` values.
strategy : {'uniform', 'quantile'}, default='uniform'
Strategy used to define the widths of the bins.

uniform
The bins have identical widths.
quantile
            The bins have the same number of samples and depend on `estimates`.

Returns
-------
prob_true : ndarray of shape (num_bins,) or smaller
The proportion of samples whose class is the positive class, in each
bin (fraction of positives).

prob_pred : ndarray of shape (num_bins,) or smaller
The mean estimated probability in each bin.

References
----------
Alexandru Niculescu-Mizil and Rich Caruana (2005) Predicting Good
Probabilities With Supervised Learning, in Proceedings of the 22nd
International Conference on Machine Learning (ICML).
"""

if estimates.min() < 0 or estimates.max() > 1:
raise ValueError("y_prob has values outside [0, 1].")

labels = np.unique(targets)
if len(labels) > 2:
raise ValueError(f"Only binary classification is supported. Provided labels {labels}.")

targets = targets == pos_label

if strategy == "quantile": # Determine bin edges by distribution of data
quantiles = np.linspace(0, 1, num_bins + 1)
        bins = np.percentile(estimates, quantiles * 100)
elif strategy == "uniform":
bins = np.linspace(0.0, 1.0, num_bins + 1)
else:
raise ValueError("Invalid entry to 'strategy' input. Strategy must be either 'quantile' or 'uniform'.")

binids = np.searchsorted(bins[1:-1], estimates)

bin_sums = np.bincount(binids, weights=estimates, minlength=len(bins))
bin_true = np.bincount(binids, weights=targets, minlength=len(bins))
bin_total = np.bincount(binids, minlength=len(bins))

nonzero = bin_total != 0
prob_true = bin_true[nonzero] / bin_total[nonzero]
prob_pred = bin_sums[nonzero] / bin_total[nonzero]

return prob_true, prob_pred
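
A quick usage sketch of the new utility (the synthetic labels are drawn so that the probabilities are well calibrated by construction; the top-level re-export comes from the `bayesflow/utils/__init__.py` change above):

```python
import numpy as np

from bayesflow.utils import calibration_curve

rng = np.random.default_rng(1)
estimates = rng.uniform(0.0, 1.0, size=5000)  # predicted probabilities of the positive class
targets = rng.binomial(1, estimates)          # labels sampled with exactly those probabilities

prob_true, prob_pred = calibration_curve(targets, estimates, num_bins=10)
# For well-calibrated probabilities, prob_true is close to prob_pred in every bin
print(np.round(prob_true, 2))
print(np.round(prob_pred, 2))
```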