diff --git a/bayesflow/diagnostics/metrics/__init__.py b/bayesflow/diagnostics/metrics/__init__.py
index 1fb5f2b8d..ceeca4cc4 100644
--- a/bayesflow/diagnostics/metrics/__init__.py
+++ b/bayesflow/diagnostics/metrics/__init__.py
@@ -2,3 +2,4 @@
 from .posterior_contraction import posterior_contraction
 from .root_mean_squared_error import root_mean_squared_error
 from .expected_calibration_error import expected_calibration_error
+from .classifier_two_sample_test import classifier_two_sample_test
diff --git a/bayesflow/diagnostics/metrics/classifier_two_sample_test.py b/bayesflow/diagnostics/metrics/classifier_two_sample_test.py
new file mode 100644
index 000000000..d1382ae5d
--- /dev/null
+++ b/bayesflow/diagnostics/metrics/classifier_two_sample_test.py
@@ -0,0 +1,136 @@
+from typing import Sequence, Mapping, Any
+
+import numpy as np
+
+import keras
+
+from bayesflow.utils.exceptions import ShapeError
+from bayesflow.networks import MLP
+
+
+def classifier_two_sample_test(
+    estimates: np.ndarray,
+    targets: np.ndarray,
+    metric: str = "accuracy",
+    patience: int = 5,
+    max_epochs: int = 1000,
+    batch_size: int = 128,
+    return_metric_only: bool = True,
+    validation_split: float = 0.5,
+    standardize: bool = True,
+    mlp_widths: Sequence = (64, 64),
+    **kwargs,
+) -> float | Mapping[str, Any]:
+    """
+    C2ST metric [1] between samples from two distributions computed using a neural classifier.
+    Can be computationally expensive if called in a loop, since it needs to train the classifier
+    for each new set of samples.
+
+    Note: works best for large numbers of samples and when averaged across different posteriors.
+
+    [1] Lopez-Paz, D., & Oquab, M. (2016). Revisiting classifier two-sample tests. arXiv:1610.06545.
+
+    Parameters
+    ----------
+    estimates : np.ndarray
+        Array of shape (num_samples_est, num_variables) containing samples representing estimated quantities
+        (e.g., approximate posterior samples).
+    targets : np.ndarray
+        Array of shape (num_samples_tar, num_variables) containing target samples
+        (e.g., samples from a reference posterior).
+    metric : str, optional
+        Metric to evaluate the classifier performance. Default is "accuracy".
+    patience : int, optional
+        Number of epochs with no improvement after which training will be stopped. Default is 5.
+    max_epochs : int, optional
+        Maximum number of epochs to train the classifier. Default is 1000.
+    batch_size : int, optional
+        Number of samples per batch during training. Default is 128.
+    return_metric_only : bool, optional
+        If True, only the final validation metric is returned. Otherwise, a dictionary with the score, classifier, and
+        full training history is returned. Default is True.
+    validation_split : float, optional
+        Fraction of the training data to be used as validation data. Default is 0.5.
+    standardize : bool, optional
+        If True, both estimates and targets will be standardized using the mean and standard deviation of estimates.
+        Default is True.
+    mlp_widths : Sequence[int], optional
+        Sequence specifying the number of units in each hidden layer of the MLP classifier. Default is (64, 64).
+    **kwargs
+        Additional keyword arguments. Recognized keyword:
+        mlp_kwargs : dict
+            Dictionary of additional parameters to pass to the MLP constructor.
+
+    Returns
+    -------
+    results : float or dict
+        If return_metric_only is True, returns the final validation metric (e.g., accuracy) as a float.
+        Otherwise, returns a dictionary with keys "score", "classifier", and "history", where "score"
+        is the final validation metric, "classifier" is the trained Keras model, and "history" contains the
+        full training history.
+    """
+
+    # Raise an error if the dimensionality of targets does not match that of estimates
+    num_dims = estimates.shape[1]
+    if num_dims != targets.shape[1]:
+        raise ShapeError(
+            "estimates and targets can have different numbers of samples (1st dim) "
+            "but must have the same dimensionality (2nd dim). "
+            f"Found: estimates dim {estimates.shape[1]}, targets dim {targets.shape[1]}."
+        )
+
+    # Standardize both estimates and targets relative to estimates mean and std
+    if standardize:
+        estimates_mean = np.mean(estimates, axis=0)
+        estimates_std = np.std(estimates, axis=0)
+        estimates = (estimates - estimates_mean) / estimates_std
+        targets = (targets - estimates_mean) / estimates_std
+
+    # Create data for classification task
+    data = np.r_[estimates, targets]
+    labels = np.r_[np.zeros((estimates.shape[0],)), np.ones((targets.shape[0],))]
+
+    # Important: shuffle manually, since keras does not shuffle before selecting the validation split
+    shuffle_idx = np.random.permutation(data.shape[0])
+    data = data[shuffle_idx]
+    labels = labels[shuffle_idx]
+
+    # Create and train classifier with early stopping
+    classifier = keras.Sequential(
+        [MLP(widths=mlp_widths, **kwargs.get("mlp_kwargs", {})), keras.layers.Dense(1, activation="sigmoid")]
+    )
+
+    classifier.compile(optimizer="adam", loss="binary_crossentropy", metrics=[metric])
+
+    early_stopping = keras.callbacks.EarlyStopping(
+        monitor=f"val_{metric}", patience=patience, restore_best_weights=True
+    )
+
+    # For now, we need to enable grads, since we turn them off by default
+    if keras.backend.backend() == "torch":
+        import torch
+
+        with torch.enable_grad():
+            history = classifier.fit(
+                x=data,
+                y=labels,
+                epochs=max_epochs,
+                batch_size=batch_size,
+                verbose=0,
+                callbacks=[early_stopping],
+                validation_split=validation_split,
+            )
+    else:
+        history = classifier.fit(
+            x=data,
+            y=labels,
+            epochs=max_epochs,
+            batch_size=batch_size,
+            verbose=0,
+            callbacks=[early_stopping],
+            validation_split=validation_split,
+        )
+
+    if return_metric_only:
+        return history.history[f"val_{metric}"][-1]
+    return {"score": history.history[f"val_{metric}"][-1], "classifier": classifier, "history": history.history}
diff --git a/bayesflow/diagnostics/metrics/expected_calibration_error.py b/bayesflow/diagnostics/metrics/expected_calibration_error.py
index bf6fb951e..81f4d9cb6 100644
--- a/bayesflow/diagnostics/metrics/expected_calibration_error.py
+++ b/bayesflow/diagnostics/metrics/expected_calibration_error.py
@@ -3,14 +3,14 @@ from typing import Sequence, Any, Mapping
 
 from ...utils.exceptions import ShapeError
 
-from sklearn.calibration import calibration_curve
+from ...utils.classification import calibration_curve
 
 
 def expected_calibration_error(
     estimates: np.ndarray,
     targets: np.ndarray,
     model_names: Sequence[str] = None,
-    n_bins: int = 10,
+    num_bins: int = 10,
     return_probs: bool = False,
 ) -> Mapping[str, Any]:
     """
@@ -31,11 +31,11 @@ def expected_calibration_error(
         The one-hot-encoded true model indices.
     model_names : Sequence[str], optional (default = None)
         Optional model names to show in the output. By default, models are called "M_" + model index.
-    n_bins : int, optional, default: 10
+    num_bins : int, optional, default: 10
         The number of bins to use for the calibration curves (and marginal histograms).
-        Passed into ``sklearn.calibration.calibration_curve()``.
+        Passed into ``bayesflow.utils.calibration_curve()``.
     return_probs : bool (default = False)
-        Do you want to obtain the output of ``sklearn.calibration.calibration_curve()``?
+        Do you want to obtain the output of ``bayesflow.utils.calibration_curve()``?
 
     Returns
     -------
@@ -48,9 +48,9 @@
         - "model_names" : str
            The (inferred) variable names.
         - "probs_true": (optional) list[np.ndarray]:
-            Outputs of ``sklearn.calibration.calibration_curve()`` per model
+            Outputs of ``bayesflow.utils.calibration_curve()`` per model
         - "probs_pred": (optional) list[np.ndarray]:
-            Outputs of ``sklearn.calibration.calibration_curve()`` per model
+            Outputs of ``bayesflow.utils.calibration_curve()`` per model
     """
 
     # Convert tensors to numpy, if passed
@@ -76,10 +76,10 @@
     for model_index in range(estimates.shape[-1]):
         y_true = (targets == model_index).astype(np.float32)
         y_prob = estimates[..., model_index]
-        prob_true, prob_pred = calibration_curve(y_true, y_prob, n_bins=n_bins)
+        prob_true, prob_pred = calibration_curve(y_true, y_prob, num_bins=num_bins)
 
         # Compute ECE by weighting bin errors by bin size
-        bins = np.linspace(0.0, 1.0, n_bins + 1)
+        bins = np.linspace(0.0, 1.0, num_bins + 1)
         binids = np.searchsorted(bins[1:-1], y_prob)
         bin_total = np.bincount(binids, minlength=len(bins))
         nonzero = bin_total != 0
diff --git a/bayesflow/diagnostics/plots/calibration_histogram.py b/bayesflow/diagnostics/plots/calibration_histogram.py
index c7c2de408..6ff10c881 100644
--- a/bayesflow/diagnostics/plots/calibration_histogram.py
+++ b/bayesflow/diagnostics/plots/calibration_histogram.py
@@ -102,7 +102,7 @@ def calibration_histogram(
             "Confidence intervals might be unreliable!"
         )
 
-    # Set n_bins automatically, if nothing provided
+    # Set num_bins automatically, if nothing provided
     if num_bins is None:
         num_bins = int(ratio / 2)
         # Attempt a fix if a single bin is determined so plot still makes sense
diff --git a/bayesflow/diagnostics/plots/mc_calibration.py b/bayesflow/diagnostics/plots/mc_calibration.py
index 1f1e3678f..0e80241e5 100644
--- a/bayesflow/diagnostics/plots/mc_calibration.py
+++ b/bayesflow/diagnostics/plots/mc_calibration.py
@@ -18,7 +18,7 @@ def mc_calibration(
     pred_models: dict[str, np.ndarray] | np.ndarray,
     true_models: dict[str, np.ndarray] | np.ndarray,
     model_names: Sequence[str] = None,
-    n_bins: int = 10,
+    num_bins: int = 10,
     label_fontsize: int = 16,
     title_fontsize: int = 18,
     metric_fontsize: int = 14,
@@ -41,12 +41,12 @@ def mc_calibration(
         The one-hot-encoded true model indices per data set.
     model_names : list or None, optional, default: None
         The model names for nice plot titles. Inferred if None.
-    n_bins : int, optional, default: 10
+    num_bins : int, optional, default: 10
         The number of bins to use for the calibration curves (and marginal histograms).
     label_fontsize : int, optional, default: 16
         The font size of the y-label and y-label texts
-    legend_fontsize : int, optional, default: 14
-        The font size of the legend text (ECE value)
+    metric_fontsize : int, optional, default: 14
+        The font size of the metric text (e.g., the ECE value)
     title_fontsize : int, optional, default: 18
         The font size of the title text. Only relevant if `stacked=False`
     tick_fontsize : int, optional, default: 12
@@ -83,7 +83,7 @@
         estimates=pred_models,
         targets=true_models,
         model_names=plot_data["variable_names"],
-        n_bins=n_bins,
+        num_bins=num_bins,
         return_probs=True,
     )
 
@@ -92,7 +92,7 @@
         ax.plot(ece["probs_pred"][j], ece["probs_true"][j], "o-", color=color)
 
         # Plot PMP distribution over bins
-        uniform_bins = np.linspace(0.0, 1.0, n_bins + 1)
+        uniform_bins = np.linspace(0.0, 1.0, num_bins + 1)
         norm_weights = np.ones_like(plot_data["estimates"]) / len(plot_data["estimates"])
         ax.hist(plot_data["estimates"][:, j], bins=uniform_bins, weights=norm_weights[:, j], color="grey", alpha=0.3)
 
diff --git a/bayesflow/diagnostics/plots/mc_confusion_matrix.py b/bayesflow/diagnostics/plots/mc_confusion_matrix.py
index 2d27b5073..db5165eae 100644
--- a/bayesflow/diagnostics/plots/mc_confusion_matrix.py
+++ b/bayesflow/diagnostics/plots/mc_confusion_matrix.py
@@ -5,9 +5,8 @@
 from matplotlib.colors import LinearSegmentedColormap
 import numpy as np
 
-from sklearn.metrics import confusion_matrix
-
-from bayesflow.utils.plot_utils import make_figure
+from ...utils.plot_utils import make_figure
+from ...utils.classification import confusion_matrix
 
 
 def mc_confusion_matrix(
@@ -50,10 +49,9 @@
     ytick_rotation: int, optional, default: None
         Rotation of y-axis tick labels (helps with long model names).
     normalize : {'true', 'pred', 'all'}, default=None
-        Passed to sklearn.metrics.confusion_matrix.
-        Normalizes confusion matrix over the true (rows), predicted (columns)
-        conditions or all the population. If None, confusion matrix will not be
-        normalized.
+        Passed to ``bayesflow.utils.confusion_matrix``. Normalizes the confusion matrix over the true (rows),
+        predicted (columns) conditions, or the entire population. If None, the confusion matrix
+        will not be normalized.
     cmap : matplotlib.colors.Colormap or str, optional, default: None
         Colormap to be used for the cells. If a str, it should be the name of a
         registered colormap, e.g., 'viridis'. Default colormap matches the BayesFlow defaults by ranging from white to red.
diff --git a/bayesflow/utils/__init__.py b/bayesflow/utils/__init__.py
index 84f5f47cb..5bdaea274 100644
--- a/bayesflow/utils/__init__.py
+++ b/bayesflow/utils/__init__.py
@@ -73,6 +73,7 @@
     fill_triangular_matrix,
     weighted_sum,
 )
+from .classification import calibration_curve, confusion_matrix
 from .validators import check_lengths_same
 from .workflow_utils import find_inference_network, find_summary_network
 
diff --git a/bayesflow/utils/classification/__init__.py b/bayesflow/utils/classification/__init__.py
new file mode 100644
index 000000000..94e6dbdfd
--- /dev/null
+++ b/bayesflow/utils/classification/__init__.py
@@ -0,0 +1,2 @@
+from .confusion_matrix import confusion_matrix
+from .calibration_curve import calibration_curve
diff --git a/bayesflow/utils/classification/calibration_curve.py b/bayesflow/utils/classification/calibration_curve.py
new file mode 100644
index 000000000..38e49fc08
--- /dev/null
+++ b/bayesflow/utils/classification/calibration_curve.py
@@ -0,0 +1,82 @@
+import numpy as np
+
+
+def calibration_curve(
+    targets: np.ndarray,
+    estimates: np.ndarray,
+    *,
+    pos_label: int | float | bool | str = 1,
+    num_bins: int = 5,
+    strategy: str = "uniform",
+):
+    """Compute true and predicted probabilities for a calibration curve.
+
+    The method assumes the inputs come from a binary classifier, and discretizes the [0, 1] interval into bins.
+
+    Code from: https://github.com/scikit-learn/scikit-learn/blob/98ed9dc73/sklearn/calibration.py#L927
+
+    Parameters
+    ----------
+    targets : array-like of shape (n_samples,)
+        True targets.
+    estimates : array-like of shape (n_samples,)
+        Probabilities of the positive class.
+    pos_label : int, float, bool or str, default = 1
+        The label of the positive class.
+    num_bins : int, default=5
+        Number of bins to discretize the [0, 1] interval. A bigger number
+        requires more data. Bins with no samples (i.e. without
+        corresponding values in `estimates`) will not be returned, thus the
+        returned arrays may have fewer than `num_bins` values.
+    strategy : {'uniform', 'quantile'}, default='uniform'
+        Strategy used to define the widths of the bins.
+
+        uniform
+            The bins have identical widths.
+        quantile
+            The bins have the same number of samples and depend on `estimates`.
+
+    Returns
+    -------
+    prob_true : ndarray of shape (num_bins,) or smaller
+        The proportion of samples whose class is the positive class, in each
+        bin (fraction of positives).
+
+    prob_pred : ndarray of shape (num_bins,) or smaller
+        The mean estimated probability in each bin.
+
+    References
+    ----------
+    Alexandru Niculescu-Mizil and Rich Caruana (2005) Predicting Good
+    Probabilities With Supervised Learning, in Proceedings of the 22nd
+    International Conference on Machine Learning (ICML).
+    """
+
+    if estimates.min() < 0 or estimates.max() > 1:
+        raise ValueError("estimates contains values outside the [0, 1] interval.")
+
+    labels = np.unique(targets)
+    if len(labels) > 2:
+        raise ValueError(f"Only binary classification is supported. Provided labels {labels}.")
+    targets = targets == pos_label
+
+    if strategy == "quantile":  # Determine bin edges by distribution of data
+        quantiles = np.linspace(0, 1, num_bins + 1)
+        bins = np.percentile(estimates, quantiles * 100)
+    elif strategy == "uniform":
+        bins = np.linspace(0.0, 1.0, num_bins + 1)
+    else:
+        raise ValueError("Invalid entry to 'strategy' input. Strategy must be either 'quantile' or 'uniform'.")
+
+    binids = np.searchsorted(bins[1:-1], estimates)
+
+    bin_sums = np.bincount(binids, weights=estimates, minlength=len(bins))
+    bin_true = np.bincount(binids, weights=targets, minlength=len(bins))
+    bin_total = np.bincount(binids, minlength=len(bins))
+
+    nonzero = bin_total != 0
+    prob_true = bin_true[nonzero] / bin_total[nonzero]
+    prob_pred = bin_sums[nonzero] / bin_total[nonzero]
+
+    return prob_true, prob_pred
diff --git a/bayesflow/utils/classification/confusion_matrix.py b/bayesflow/utils/classification/confusion_matrix.py
new file mode 100644
index 000000000..4aadda28f
--- /dev/null
+++ b/bayesflow/utils/classification/confusion_matrix.py
@@ -0,0 +1,61 @@
+from typing import Sequence
+
+import numpy as np
+
+
+def confusion_matrix(targets: np.ndarray, estimates: np.ndarray, labels: Sequence = None, normalize: str = None):
+    """
+    Compute a confusion matrix to evaluate the accuracy of a classification or model comparison setting.
+
+    Code inspired by: https://github.com/scikit-learn/scikit-learn/blob/98ed9dc73/sklearn/metrics/_classification.py
+
+    Parameters
+    ----------
+    targets : np.ndarray
+        Ground truth (correct) target values.
+    estimates : np.ndarray
+        Estimated targets as returned by a classifier.
+    labels : Sequence, optional
+        List of labels to index the matrix. This may be used to reorder or select a subset of labels.
+        If None, labels that appear at least once in targets or estimates are used in sorted order.
+    normalize : {'true', 'pred', 'all'}, optional
+        Normalizes the confusion matrix over the true (rows), predicted (columns)
+        conditions, or the entire population. If None, no normalization is applied.
+
+    Returns
+    -------
+    cm : np.ndarray of shape (num_labels, num_labels)
+        Confusion matrix. Rows represent true classes, columns represent predicted classes.
+    """
+
+    # Get unique labels
+    if labels is None:
+        labels = np.unique(np.concatenate((targets, estimates)))
+    else:
+        labels = np.asarray(labels)
+
+    label_to_index = {label: i for i, label in enumerate(labels)}
+    num_labels = len(labels)
+
+    # Initialize the confusion matrix
+    cm = np.zeros((num_labels, num_labels), dtype=np.int64)
+
+    # Fill confusion matrix
+    for t, p in zip(targets, estimates):
+        if t in label_to_index and p in label_to_index:
+            cm[label_to_index[t], label_to_index[p]] += 1
+
+    # Normalize if required (rows or columns with a zero sum are left as zeros)
+    if normalize == "true":
+        cm = cm.astype(np.float64)
+        row_sums = cm.sum(axis=1, keepdims=True)
+        cm = np.divide(cm, row_sums, out=np.zeros_like(cm), where=row_sums != 0)
+    elif normalize == "pred":
+        cm = cm.astype(np.float64)
+        col_sums = cm.sum(axis=0, keepdims=True)
+        cm = np.divide(cm, col_sums, out=np.zeros_like(cm), where=col_sums != 0)
+    elif normalize == "all":
+        cm = cm.astype(np.float64)
+        cm /= cm.sum()
+
+    return cm
diff --git a/pyproject.toml b/pyproject.toml
index 5f83611f2..8018b55a4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -27,7 +27,6 @@ dependencies = [
     "matplotlib",
     "numpy >= 1.24, <2.0",
    "pandas",
-    "scikit-learn",
    "scipy",
    "seaborn",
    "tqdm",
diff --git a/tests/test_diagnostics/conftest.py b/tests/test_diagnostics/conftest.py
index 9bd9bd907..5103ea455 100644
--- a/tests/test_diagnostics/conftest.py
+++ b/tests/test_diagnostics/conftest.py
@@ -8,6 +8,16 @@ def var_names():
     return [r"$\beta_0$", r"$\beta_1$", r"$\sigma$"]
 
 
+@pytest.fixture()
+def random_samples_a():
+    return np.random.normal(loc=0, scale=1, size=(5000, 8))
+
+
+@pytest.fixture()
+def random_samples_b():
+    return np.random.normal(loc=0, scale=3, size=(5000, 8))
+
+
 @pytest.fixture()
 def random_estimates():
     return {
diff --git a/tests/test_diagnostics/test_diagnostics_metrics.py b/tests/test_diagnostics/test_diagnostics_metrics.py
index 792cf558a..4fb0945b3 100644
--- a/tests/test_diagnostics/test_diagnostics_metrics.py
+++ b/tests/test_diagnostics/test_diagnostics_metrics.py
@@ -50,6 +50,14 @@ def test_root_mean_squared_error(random_estimates, random_targets):
     assert out["variable_names"] == ["beta_0", "beta_1", "sigma"]
 
 
+def test_classifier_two_sample_test(random_samples_a, random_samples_b):
+    metric = bf.diagnostics.metrics.classifier_two_sample_test(estimates=random_samples_a, targets=random_samples_a)
+    assert 0.55 > metric > 0.45
+
+    metric = bf.diagnostics.metrics.classifier_two_sample_test(estimates=random_samples_a, targets=random_samples_b)
+    assert metric > 0.55
+
+
 def test_expected_calibration_error(pred_models, true_models, model_names):
     out = bf.diagnostics.metrics.expected_calibration_error(pred_models, true_models, model_names=model_names)
     assert list(out.keys()) == ["values", "metric_name", "model_names"]
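
For reference, a minimal usage sketch of the new metric introduced by this diff (not part of the patch itself; the sample arrays and keyword values below are illustrative and mirror the call pattern exercised in the added tests):

import numpy as np
import bayesflow as bf

# Illustrative draws standing in for approximate and reference posterior samples
approx_draws = np.random.normal(loc=0.0, scale=1.0, size=(5000, 8))
reference_draws = np.random.normal(loc=0.0, scale=1.0, size=(5000, 8))

# Scalar validation accuracy; values near 0.5 mean the classifier cannot separate the two sample sets
score = bf.diagnostics.metrics.classifier_two_sample_test(estimates=approx_draws, targets=reference_draws)

# Full output: final score, the trained Keras classifier, and the training history
results = bf.diagnostics.metrics.classifier_two_sample_test(
    estimates=approx_draws,
    targets=reference_draws,
    return_metric_only=False,
    mlp_widths=(32, 32),
)
print(results["score"])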
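
Similarly, a small sketch of the scikit-learn-free helpers that replace the previous sklearn calls (again illustrative only; the random labels and probabilities are made up for the example):

import numpy as np
from bayesflow.utils import calibration_curve, confusion_matrix

rng = np.random.default_rng(seed=0)
y_true = rng.integers(0, 2, size=1000)
# Noisy probabilities that correlate with the true labels, clipped to [0, 1]
y_prob = np.clip(0.6 * y_true + rng.normal(loc=0.2, scale=0.2, size=1000), 0.0, 1.0)

# Fraction of positives and mean predicted probability per bin (empty bins are dropped)
prob_true, prob_pred = calibration_curve(targets=y_true, estimates=y_prob, num_bins=10)

# Row-normalized confusion matrix computed from hard predictions
y_pred = (y_prob > 0.5).astype(int)
cm = confusion_matrix(targets=y_true, estimates=y_pred, normalize="true")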