
Commit ee201d5: Remove sklearn as dep
Parent: 1a2e35d

10 files changed, +176 −17 lines

bayesflow/diagnostics/metrics/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -2,3 +2,4 @@
 from .posterior_contraction import posterior_contraction
 from .root_mean_squared_error import root_mean_squared_error
 from .expected_calibration_error import expected_calibration_error
+from .classifier_two_sample_test import classifier_two_sample_test

bayesflow/diagnostics/metrics/expected_calibration_error.py

Lines changed: 9 additions & 9 deletions
@@ -3,14 +3,14 @@
 from typing import Sequence, Any, Mapping

 from ...utils.exceptions import ShapeError
-from sklearn.calibration import calibration_curve
+from ...utils.classification import calibration_curve


 def expected_calibration_error(
     estimates: np.ndarray,
     targets: np.ndarray,
     model_names: Sequence[str] = None,
-    n_bins: int = 10,
+    num_bins: int = 10,
     return_probs: bool = False,
 ) -> Mapping[str, Any]:
     """
@@ -31,11 +31,11 @@ def expected_calibration_error(
         The one-hot-encoded true model indices.
     model_names : Sequence[str], optional (default = None)
         Optional model names to show in the output. By default, models are called "M_" + model index.
-    n_bins : int, optional, default: 10
+    num_bins : int, optional, default: 10
         The number of bins to use for the calibration curves (and marginal histograms).
-        Passed into ``sklearn.calibration.calibration_curve()``.
+        Passed into ``bayesflow.utils.calibration_curve()``.
     return_probs : bool (default = False)
-        Do you want to obtain the output of ``sklearn.calibration.calibration_curve()``?
+        Whether to also return the outputs of ``bayesflow.utils.calibration_curve()``.

     Returns
     -------
@@ -48,9 +48,9 @@ def expected_calibration_error(
     - "model_names" : str
         The (inferred) model names.
     - "probs_true": (optional) list[np.ndarray]:
-        Outputs of ``sklearn.calibration.calibration_curve()`` per model
+        Outputs of ``bayesflow.utils.classification.calibration_curve()`` per model
     - "probs_pred": (optional) list[np.ndarray]:
-        Outputs of ``sklearn.calibration.calibration_curve()`` per model
+        Outputs of ``bayesflow.utils.classification.calibration_curve()`` per model
     """

     # Convert tensors to numpy, if passed
@@ -76,10 +76,10 @@
     for model_index in range(estimates.shape[-1]):
         y_true = (targets == model_index).astype(np.float32)
         y_prob = estimates[..., model_index]
-        prob_true, prob_pred = calibration_curve(y_true, y_prob, n_bins=n_bins)
+        prob_true, prob_pred = calibration_curve(y_true, y_prob, num_bins=num_bins)

         # Compute ECE by weighting bin errors by bin size
-        bins = np.linspace(0.0, 1.0, n_bins + 1)
+        bins = np.linspace(0.0, 1.0, num_bins + 1)
         binids = np.searchsorted(bins[1:-1], y_prob)
         bin_total = np.bincount(binids, minlength=len(bins))
         nonzero = bin_total != 0
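
For reference, a call to the updated metric with the renamed keyword might look as follows. This is a minimal sketch: the array shapes, the softmax construction of the predicted probabilities, and the random data are illustrative assumptions; only the argument names and output keys come from the diff and the test suite below.

import numpy as np
import bayesflow as bf

rng = np.random.default_rng(0)

# Hypothetical predicted model probabilities for 500 data sets and 3 candidate models
logits = rng.normal(size=(500, 3))
estimates = np.exp(logits) / np.exp(logits).sum(axis=-1, keepdims=True)

# Hypothetical one-hot-encoded true model indices
targets = np.eye(3)[rng.integers(0, 3, size=500)]

# Note the renamed keyword: num_bins (previously n_bins)
out = bf.diagnostics.metrics.expected_calibration_error(
    estimates, targets, model_names=["M_0", "M_1", "M_2"], num_bins=10, return_probs=True
)
print(out["values"], out["model_names"])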

bayesflow/diagnostics/plots/mc_confusion_matrix.py

Lines changed: 5 additions & 7 deletions
@@ -5,9 +5,8 @@
 from matplotlib.colors import LinearSegmentedColormap
 import numpy as np

-from sklearn.metrics import confusion_matrix
-
-from bayesflow.utils.plot_utils import make_figure
+from ...utils.plot_utils import make_figure
+from ...utils.classification import confusion_matrix


 def mc_confusion_matrix(
@@ -50,10 +49,9 @@ def mc_confusion_matrix(
     ytick_rotation: int, optional, default: None
         Rotation of y-axis tick labels (helps with long model names).
     normalize : {'true', 'pred', 'all'}, default=None
-        Passed to sklearn.metrics.confusion_matrix.
-        Normalizes confusion matrix over the true (rows), predicted (columns)
-        conditions or all the population. If None, confusion matrix will not be
-        normalized.
+        Passed to the confusion matrix computation. Normalizes the confusion matrix over
+        the true (rows), predicted (columns) conditions or over the whole population.
+        If None, the confusion matrix will not be normalized.
     cmap : matplotlib.colors.Colormap or str, optional, default: None
         Colormap to be used for the cells. If a str, it should be the name of a registered colormap,
         e.g., 'viridis'. Default colormap matches the BayesFlow defaults by ranging from white to red.

bayesflow/utils/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -72,6 +72,7 @@
     tree_stack,
     fill_triangular_matrix,
 )
+from .classification import calibration_curve, confusion_matrix
 from .validators import check_lengths_same
 from .workflow_utils import find_inference_network, find_summary_network

bayesflow/utils/classification/__init__.py

Lines changed: 2 additions & 0 deletions

@@ -0,0 +1,2 @@
+from .confusion_matrix import confusion_matrix
+from .calibration_curve import calibration_curve
bayesflow/utils/classification/calibration_curve.py

Lines changed: 82 additions & 0 deletions

@@ -0,0 +1,82 @@
+import numpy as np
+
+
+def calibration_curve(
+    targets: np.ndarray,
+    estimates: np.ndarray,
+    *,
+    pos_label: int | float | bool | str = 1,
+    num_bins: int = 5,
+    strategy: str = "uniform",
+):
+    """Compute true and predicted probabilities for a calibration curve.
+
+    The method assumes the inputs come from a binary classifier and
+    discretizes the [0, 1] interval into bins.
+
+    Code from: https://github.com/scikit-learn/scikit-learn/blob/98ed9dc73/sklearn/calibration.py#L927
+
+    Parameters
+    ----------
+    targets : array-like of shape (n_samples,)
+        True targets.
+    estimates : array-like of shape (n_samples,)
+        Probabilities of the positive class.
+    pos_label : int, float, bool or str, default=1
+        The label of the positive class.
+    num_bins : int, default=5
+        Number of bins to discretize the [0, 1] interval. A bigger number
+        requires more data. Bins with no samples (i.e., without
+        corresponding values in ``estimates``) will not be returned, so the
+        returned arrays may have fewer than ``num_bins`` values.
+    strategy : {'uniform', 'quantile'}, default='uniform'
+        Strategy used to define the widths of the bins.
+
+        uniform
+            The bins have identical widths.
+        quantile
+            The bins have the same number of samples and depend on ``estimates``.
+
+    Returns
+    -------
+    prob_true : ndarray of shape (num_bins,) or smaller
+        The proportion of samples whose class is the positive class, in each
+        bin (fraction of positives).
+
+    prob_pred : ndarray of shape (num_bins,) or smaller
+        The mean estimated probability in each bin.
+
+    References
+    ----------
+    Alexandru Niculescu-Mizil and Rich Caruana (2005) Predicting Good
+    Probabilities With Supervised Learning, in Proceedings of the 22nd
+    International Conference on Machine Learning (ICML).
+    """
+
+    if estimates.min() < 0 or estimates.max() > 1:
+        raise ValueError("estimates contains values outside [0, 1].")
+
+    labels = np.unique(targets)
+    if len(labels) > 2:
+        raise ValueError(f"Only binary classification is supported. Provided labels {labels}.")
+    targets = targets == pos_label
+
+    if strategy == "quantile":  # Determine bin edges by the distribution of the data
+        quantiles = np.linspace(0, 1, num_bins + 1)
+        bins = np.percentile(estimates, quantiles * 100)
+    elif strategy == "uniform":
+        bins = np.linspace(0.0, 1.0, num_bins + 1)
+    else:
+        raise ValueError("Invalid entry to 'strategy' input. Strategy must be either 'quantile' or 'uniform'.")
+
+    binids = np.searchsorted(bins[1:-1], estimates)
+
+    bin_sums = np.bincount(binids, weights=estimates, minlength=len(bins))
+    bin_true = np.bincount(binids, weights=targets, minlength=len(bins))
+    bin_total = np.bincount(binids, minlength=len(bins))
+
+    nonzero = bin_total != 0
+    prob_true = bin_true[nonzero] / bin_total[nonzero]
+    prob_pred = bin_sums[nonzero] / bin_total[nonzero]
+
+    return prob_true, prob_pred
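
A small worked example of the binning behavior, assuming the module path bayesflow.utils.classification inferred from the imports above; the toy probabilities and labels are made up:

import numpy as np
from bayesflow.utils.classification import calibration_curve

estimates = np.array([0.1, 0.1, 0.3, 0.3, 0.7, 0.7, 0.9, 0.9, 0.9, 0.9])  # made-up probabilities
targets = np.array([0, 0, 0, 1, 1, 1, 1, 1, 1, 0])                        # made-up binary labels

prob_true, prob_pred = calibration_curve(targets, estimates, num_bins=5, strategy="uniform")
# The bin covering (0.4, 0.6] receives no samples and is dropped, so only four values remain:
# prob_true -> [0.0, 0.5, 1.0, 0.75], prob_pred -> [0.1, 0.3, 0.7, 0.9]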
bayesflow/utils/classification/confusion_matrix.py

Lines changed: 58 additions & 0 deletions

@@ -0,0 +1,58 @@
+from typing import Sequence
+
+import numpy as np
+
+
+def confusion_matrix(targets: np.ndarray, estimates: np.ndarray, labels: Sequence = None, normalize: str = None):
+    """
+    Compute a confusion matrix to evaluate the accuracy of a classification or model comparison setting.
+
+    Code inspired by: https://github.com/scikit-learn/scikit-learn/blob/98ed9dc73/sklearn/metrics/_classification.py
+
+    Parameters
+    ----------
+    targets : np.ndarray
+        Ground truth (correct) target values.
+    estimates : np.ndarray
+        Estimated targets as returned by a classifier.
+    labels : Sequence, optional
+        List of labels to index the matrix. This may be used to reorder or select a subset of labels.
+        If None, labels that appear at least once in ``targets`` or ``estimates`` are used in sorted order.
+    normalize : {'true', 'pred', 'all'}, optional
+        Normalizes the confusion matrix over the true (rows), predicted (columns)
+        conditions or all the population. If None, no normalization is applied.
+
+    Returns
+    -------
+    cm : np.ndarray of shape (num_labels, num_labels)
+        Confusion matrix. Rows represent true classes, columns represent predicted classes.
+    """
+
+    # Get unique labels if none are provided
+    if labels is None:
+        labels = np.unique(np.concatenate((targets, estimates)))
+    else:
+        labels = np.asarray(labels)
+
+    label_to_index = {label: i for i, label in enumerate(labels)}
+    num_labels = len(labels)
+
+    # Initialize the confusion matrix
+    cm = np.zeros((num_labels, num_labels), dtype=np.int64)
+
+    # Fill confusion matrix
+    for t, p in zip(targets, estimates):
+        if t in label_to_index and p in label_to_index:
+            cm[label_to_index[t], label_to_index[p]] += 1
+
+    # Normalize if required; rows or columns with a zero total are left at 0
+    if normalize == "true":
+        cm = cm.astype(np.float64)
+        row_sums = cm.sum(axis=1, keepdims=True)
+        cm = np.divide(cm, row_sums, out=np.zeros_like(cm), where=row_sums != 0)
+    elif normalize == "pred":
+        cm = cm.astype(np.float64)
+        col_sums = cm.sum(axis=0, keepdims=True)
+        cm = np.divide(cm, col_sums, out=np.zeros_like(cm), where=col_sums != 0)
+    elif normalize == "all":
+        cm = cm.astype(np.float64)
+        cm /= cm.sum()
+
+    return cm
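
A small worked example with made-up model indices, again assuming the bayesflow.utils.classification module path:

import numpy as np
from bayesflow.utils.classification import confusion_matrix

targets = np.array([0, 0, 1, 1, 2, 2])    # made-up true model indices
estimates = np.array([0, 1, 1, 1, 2, 0])  # made-up predicted model indices

cm = confusion_matrix(targets, estimates)
# [[1, 1, 0],
#  [0, 2, 0],
#  [1, 0, 1]]

cm_norm = confusion_matrix(targets, estimates, normalize="true")
# Each row now sums to 1: [[0.5, 0.5, 0.0], [0.0, 1.0, 0.0], [0.5, 0.0, 0.5]]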

pyproject.toml

Lines changed: 0 additions & 1 deletion
@@ -27,7 +27,6 @@ dependencies = [
     "matplotlib",
     "numpy >= 1.24, <2.0",
     "pandas",
-    "scikit-learn",
     "scipy",
     "seaborn",
     "tqdm",

tests/test_diagnostics/conftest.py

Lines changed: 10 additions & 0 deletions
@@ -8,6 +8,16 @@ def var_names():
     return [r"$\beta_0$", r"$\beta_1$", r"$\sigma$"]


+@pytest.fixture()
+def random_samples_a():
+    return np.random.normal(loc=0, scale=1, size=(1000, 8))
+
+
+@pytest.fixture()
+def random_samples_b():
+    return np.random.normal(loc=0, scale=3, size=(1000, 8))
+
+
 @pytest.fixture()
 def random_estimates():
     return {

tests/test_diagnostics/test_diagnostics_metrics.py

Lines changed: 8 additions & 0 deletions
@@ -50,6 +50,14 @@ def test_root_mean_squared_error(random_estimates, random_targets):
     assert out["variable_names"] == ["beta_0", "beta_1", "sigma"]


+def test_classifier_two_sample_test(random_samples_a, random_samples_b):
+    metric = bf.diagnostics.metrics.classifier_two_sample_test(estimates=random_samples_a, targets=random_samples_a)
+    assert 0.6 > metric > 0.4
+
+    metric = bf.diagnostics.metrics.classifier_two_sample_test(estimates=random_samples_a, targets=random_samples_b)
+    assert metric > 0.6
+
+
 def test_expected_calibration_error(pred_models, true_models, model_names):
     out = bf.diagnostics.metrics.expected_calibration_error(pred_models, true_models, model_names=model_names)
     assert list(out.keys()) == ["values", "metric_name", "model_names"]
