import numpy as np


| 4 | +def calibration_curve( |
| 5 | + targets: np.ndarray, |
| 6 | + estimates: np.ndarray, |
| 7 | + *, |
| 8 | + pos_label: int | float | bool | str = 1, |
| 9 | + num_bins: int = 5, |
| 10 | + strategy: str = "uniform", |
| 11 | +): |
| 12 | + """Compute true and predicted probabilities for a calibration curve. |
| 13 | +
|
| 14 | + The method assumes the inputs come from a binary classifier, and |
| 15 | + discretize the [0, 1] interval into bins. |
| 16 | +
|
| 17 | + Code from: https://github.com/scikit-learn/scikit-learn/blob/98ed9dc73/sklearn/calibration.py#L927 |
| 18 | +
|
| 19 | + Parameters |
| 20 | + ---------- |
| 21 | + targets : array-like of shape (n_samples,) |
| 22 | + True targets. |
| 23 | + estimates : array-like of shape (n_samples,) |
| 24 | + Probabilities of the positive class. |
| 25 | + pos_label : int, float, bool or str, default = 1 |
| 26 | + The label of the positive class. |
| 27 | + num_bins : int, default=5 |
| 28 | + Number of bins to discretize the [0, 1] interval. A bigger number |
| 29 | + requires more data. Bins with no samples (i.e. without |
| 30 | + corresponding values in `y_prob`) will not be returned, thus the |
| 31 | + returned arrays may have less than `n_bins` values. |
| 32 | + strategy : {'uniform', 'quantile'}, default='uniform' |
| 33 | + Strategy used to define the widths of the bins. |
| 34 | +
|
| 35 | + uniform |
| 36 | + The bins have identical widths. |
| 37 | + quantile |
| 38 | + The bins have the same number of samples and depend on `y_prob`. |
| 39 | +
|
| 40 | + Returns |
| 41 | + ------- |
| 42 | + prob_true : ndarray of shape (n_bins,) or smaller |
| 43 | + The proportion of samples whose class is the positive class, in each |
| 44 | + bin (fraction of positives). |
| 45 | +
|
| 46 | + prob_pred : ndarray of shape (n_bins,) or smaller |
| 47 | + The mean estimated probability in each bin. |
| 48 | +
|
| 49 | + References |
| 50 | + ---------- |
| 51 | + Alexandru Niculescu-Mizil and Rich Caruana (2005) Predicting Good |
| 52 | + Probabilities With Supervised Learning, in Proceedings of the 22nd |
| 53 | + International Conference on Machine Learning (ICML). |
| 54 | + """ |
| 55 | + |
| 56 | + if estimates.min() < 0 or estimates.max() > 1: |
| 57 | + raise ValueError("y_prob has values outside [0, 1].") |
| 58 | + |
| 59 | + labels = np.unique(targets) |
| 60 | + if len(labels) > 2: |
| 61 | + raise ValueError(f"Only binary classification is supported. Provided labels {labels}.") |
| 62 | + targets = targets == pos_label |
| 63 | + |
| 64 | + if strategy == "quantile": # Determine bin edges by distribution of data |
| 65 | + quantiles = np.linspace(0, 1, num_bins + 1) |
| 66 | + bins = np.percentile(estimates, quantiles * 100) |
| 67 | + elif strategy == "uniform": |
| 68 | + bins = np.linspace(0.0, 1.0, num_bins + 1) |
| 69 | + else: |
| 70 | + raise ValueError("Invalid entry to 'strategy' input. Strategy must be either 'quantile' or 'uniform'.") |
| 71 | + |
| 72 | + binids = np.searchsorted(bins[1:-1], estimates) |
| 73 | + |
| 74 | + bin_sums = np.bincount(binids, weights=estimates, minlength=len(bins)) |
| 75 | + bin_true = np.bincount(binids, weights=targets, minlength=len(bins)) |
| 76 | + bin_total = np.bincount(binids, minlength=len(bins)) |
| 77 | + |
| 78 | + nonzero = bin_total != 0 |
| 79 | + prob_true = bin_true[nonzero] / bin_total[nonzero] |
| 80 | + prob_pred = bin_sums[nonzero] / bin_total[nonzero] |
| 81 | + |
| 82 | + return prob_true, prob_pred |
0 commit comments