From 91daf3eb43cc7ab4289e380822f98540abb074ba Mon Sep 17 00:00:00 2001 From: Valentin Laurent Date: Wed, 23 Jul 2025 18:44:41 +0200 Subject: [PATCH 01/26] ENH: implement BinaryClassificationRisk and related instances --- mapie/risk_control.py | 52 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) diff --git a/mapie/risk_control.py b/mapie/risk_control.py index 5489eed11..1f2c96145 100644 --- a/mapie/risk_control.py +++ b/mapie/risk_control.py @@ -2,7 +2,7 @@ import warnings from itertools import chain -from typing import Iterable, Optional, Sequence, Tuple, Union, cast +from typing import Iterable, Optional, Sequence, Tuple, Union, cast, Callable import numpy as np from sklearn.base import BaseEstimator, ClassifierMixin @@ -706,3 +706,53 @@ def predict( self.lambdas_star[np.newaxis, np.newaxis, :] ) return y_pred, y_pred_proba_array + + +class BinaryClassificationRisk: + # Any risk that can be defined in the following way will work using the binary + # Hoeffding-Bentkus p-values used in MAPIE + # Take the example of precision in the docstring to explain how the class works. 
+ def __init__( + self, + occurrence: Callable[[int, int], Optional[int]], + # (y_true, y_pred), output: int (0 or 1) or None if undefined + higher_is_better: bool, + ): + self.occurrence = occurrence + self.higher_is_better = higher_is_better + + def get_value_and_effective_sample_size( + self, + y_true: NDArray[int], # shape (n_samples,), values in {0, 1} + y_pred: NDArray[int], # shape (n_samples,), values in {0, 1} + ) -> Optional[Tuple[float, int]]: + # float between 0 and 1, int between 0 and len(y_true) + risk_occurrences = [ + self.occurrence(y_true_i, y_pred_i) + for y_true_i, y_pred_i in zip(y_true, y_pred) + ] + effective_sample_size = len(y_true) - risk_occurrences.count(None) + if effective_sample_size != 0: + risk_value = sum( + occurrence for occurrence in risk_occurrences if occurrence is not None + ) / effective_sample_size + if self.higher_is_better: + risk_value = 1 - risk_value + return risk_value, effective_sample_size + return None + + +precision = BinaryClassificationRisk( + occurrence=lambda y_true, y_pred: None if y_pred == 0 else int(y_pred == y_true), + higher_is_better=True, +) + +accuracy = BinaryClassificationRisk( + occurrence=lambda y_true, y_pred: int(y_pred == y_true), + higher_is_better=True, +) + +recall = BinaryClassificationRisk( + occurrence=lambda y_true, y_pred: None if y_true == 0 else int(y_pred == y_true), + higher_is_better=True, +) \ No newline at end of file From ded36fee43d9e11c434dd37785e165fb39a6907f Mon Sep 17 00:00:00 2001 From: Valentin Laurent Date: Fri, 25 Jul 2025 16:25:06 +0200 Subject: [PATCH 02/26] ENH: simplify BinaryClassificationRisk API --- mapie/risk_control.py | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/mapie/risk_control.py b/mapie/risk_control.py index 1f2c96145..0dc597745 100644 --- a/mapie/risk_control.py +++ b/mapie/risk_control.py @@ -714,11 +714,12 @@ class BinaryClassificationRisk: # Take the example of precision in the docstring to 
explain how the class works. def __init__( self, - occurrence: Callable[[int, int], Optional[int]], - # (y_true, y_pred), output: int (0 or 1) or None if undefined + risk_occurrence: Callable[[int, int], int], + risk_condition: Callable[[int, int], bool], higher_is_better: bool, ): - self.occurrence = occurrence + self.risk_occurrence = risk_occurrence + self.risk_condition = risk_condition self.higher_is_better = higher_is_better def get_value_and_effective_sample_size( @@ -728,14 +729,20 @@ def get_value_and_effective_sample_size( ) -> Optional[Tuple[float, int]]: # float between 0 and 1, int between 0 and len(y_true) risk_occurrences = [ - self.occurrence(y_true_i, y_pred_i) + self.risk_occurrence(y_true_i, y_pred_i) for y_true_i, y_pred_i in zip(y_true, y_pred) ] - effective_sample_size = len(y_true) - risk_occurrences.count(None) + risk_conditions = [ + self.risk_condition(y_true_i, y_pred_i) + for y_true_i, y_pred_i in zip(y_true, y_pred) + ] + effective_sample_size = len(y_true) - risk_conditions.count(False) if effective_sample_size != 0: - risk_value = sum( - occurrence for occurrence in risk_occurrences if occurrence is not None - ) / effective_sample_size + risk_sum = sum( + risk_occurrence for risk_occurrence, risk_condition + in zip(risk_occurrences, risk_conditions) + if risk_condition) + risk_value = risk_sum / effective_sample_size if self.higher_is_better: risk_value = 1 - risk_value return risk_value, effective_sample_size @@ -743,16 +750,19 @@ def get_value_and_effective_sample_size( precision = BinaryClassificationRisk( - occurrence=lambda y_true, y_pred: None if y_pred == 0 else int(y_pred == y_true), + risk_occurrence=lambda y_true, y_pred: int(y_pred == y_true), + risk_condition=lambda y_true, y_pred: y_pred == 1, higher_is_better=True, ) accuracy = BinaryClassificationRisk( - occurrence=lambda y_true, y_pred: int(y_pred == y_true), + risk_occurrence=lambda y_true, y_pred: int(y_pred == y_true), + risk_condition=lambda y_true, y_pred: True, 
higher_is_better=True, ) recall = BinaryClassificationRisk( - occurrence=lambda y_true, y_pred: None if y_true == 0 else int(y_pred == y_true), + risk_occurrence=lambda y_true, y_pred: int(y_pred == y_true), + risk_condition=lambda y_true, y_pred: y_true == 1, higher_is_better=True, ) \ No newline at end of file From f404ffe0da9b48c62ca8246a8ca961c9a056782f Mon Sep 17 00:00:00 2001 From: Valentin Laurent Date: Tue, 29 Jul 2025 16:09:29 +0200 Subject: [PATCH 03/26] ENH & MTN - Use BinaryClassificationRisk to compute risk - Use warning instead of error when risk is not controlled. Throw error when predicting - Remove useless check on lambda=None in ltt_procedure - Remove useless p_values from ltt_procedure outputs - Add possibility to pass an array of n_obs to ltt_procedure and subsequent p-values calculations (needed for binary classification) --- mapie/__init__.py | 2 - mapie/control_risk/ltt.py | 49 +++++----- mapie/control_risk/p_values.py | 43 ++++++--- mapie/risk_control.py | 16 +++- mapie/risk_control_draft.py | 157 +++++++------------------------ mapie/tests/test_control_risk.py | 10 +- 6 files changed, 98 insertions(+), 179 deletions(-) diff --git a/mapie/__init__.py b/mapie/__init__.py index 35fd5b090..5ec9939e1 100644 --- a/mapie/__init__.py +++ b/mapie/__init__.py @@ -4,7 +4,6 @@ regression, utils, risk_control, - risk_control_draft, calibration, subsample, ) @@ -14,7 +13,6 @@ "regression", "classification", "risk_control", - "risk_control_draft", "calibration", "metrics", "utils", diff --git a/mapie/control_risk/ltt.py b/mapie/control_risk/ltt.py index e19d3b849..c1f9e9c9a 100644 --- a/mapie/control_risk/ltt.py +++ b/mapie/control_risk/ltt.py @@ -1,5 +1,5 @@ import warnings -from typing import Any, List, Optional, Tuple +from typing import Any, List, Optional, Tuple, Union import numpy as np @@ -9,29 +9,26 @@ def ltt_procedure( - r_hat: NDArray[np.float32], - alpha_np: NDArray[np.float32], - delta: Optional[float], - n_obs: int, - binary: bool = False, # 
TODO: maybe should pass p_values fonction instead -) -> Tuple[List[List[Any]], NDArray[np.float32]]: + r_hat: NDArray[float], + alpha_np: NDArray[float], + delta: float, + n_obs: Union[int, NDArray], + binary: bool = False, +) -> List[List[Any]]: """ Apply the Learn-Then-Test procedure for risk control. Note that we will do a multiple test for ``r_hat`` that are less than level ``alpha_np``. The procedure follows the instructions in [1]: - - Calculate p-values for each lambdas descretized - - Apply a family wise error rate algorithm, - here Bonferonni correction - - Return the index lambdas that give you the control - at alpha level + - Calculate p-values for each lambdas discretized + - Apply a family wise error rate algorithm, here Bonferonni correction + - Return the index lambdas that give you the control at alpha level Parameters ---------- r_hat: NDArray of shape (n_lambdas, ). - Empirical risk with respect - to the lambdas. - Here lambdas are thresholds that impact decision making, + Empirical risk with respect to the lambdas. + Here lambdas are thresholds that impact decision-making, therefore empirical risk. alpha_np: NDArray of shape (n_alpha, ). @@ -44,34 +41,34 @@ def ltt_procedure( Correspond to proportion of failure we don't want to exceed. + n_obs: Union[int, NDArray] + Correspond to the number of observations used to compute the risk. + In the case of a conditional loss, n_obs must be the + number of effective observations used to compute the empirical risk + for each lambda, hence of shape (n_lambdas, ). + + binary: bool, default=False + Must be True if the loss associated to the risk is binary. + Returns ------- valid_index: List[List[Any]]. - Contain the valid index that satisfy fwer control + Contain the valid index that satisfy FWER control for each alpha (length aren't the same for each alpha). - p_values: NDArray of shape (n_lambda, n_alpha). - Contains the values of p_value for different alpha. - References ---------- [1] Angelopoulos, A. 
N., Bates, S., Candès, E. J., Jordan, M. I., & Lei, L. (2021). Learn then test: "Calibrating predictive algorithms to achieve risk control". """ - if delta is None: - raise ValueError( - "Invalid delta: delta cannot be None while" - + " controlling precision with LTT. " - ) p_values = compute_hoeffdding_bentkus_p_value(r_hat, n_obs, alpha_np, binary) N = len(p_values) valid_index = [] for i in range(len(alpha_np)): l_index = np.where(p_values[:, i] <= delta/N)[0].tolist() valid_index.append(l_index) - return valid_index, p_values # TODO : p_values is not used, we could remove it - # Or return corrected p_values + return valid_index def find_lambda_control_star( diff --git a/mapie/control_risk/p_values.py b/mapie/control_risk/p_values.py index d1a420a4c..f7ca882a0 100644 --- a/mapie/control_risk/p_values.py +++ b/mapie/control_risk/p_values.py @@ -8,11 +8,11 @@ def compute_hoeffdding_bentkus_p_value( - r_hat: NDArray[np.float32], - n_obs: int, - alpha: Union[float, NDArray[np.float32]], + r_hat: NDArray[float], + n_obs: Union[int, NDArray], + alpha: Union[float, NDArray[float]], binary: bool = False, -) -> NDArray[np.float32]: +) -> NDArray[float]: """ The method computes the p_values according to the Hoeffding_Bentkus inequality for each @@ -30,9 +30,11 @@ def compute_hoeffdding_bentkus_p_value( Here lambdas are thresholds that impact decision making and therefore empirical risk. - n_obs: int. - Correspond to the number of observations in - dataset. + n_obs: Union[int, NDArray] + Correspond to the number of observations used to compute the risk. + In the case of a conditional loss, n_obs must be the + number of effective observations used to compute the empirical risk + for each lambda, hence of shape (n_lambdas, ). alpha: Union[float, Iterable[float]]. Contains the different alphas control level. @@ -40,6 +42,11 @@ def compute_hoeffdding_bentkus_p_value( If it is a iterable, it is a NDArray of shape (n_alpha, ). 
+ binary: bool, default=False + Must be True if the loss associated to the risk is binary. + If True, we use a tighter version of the Bentkus p-value, valid when the + loss associated to the risk is binary. See section 3.2 of [1]. + Returns ------- hb_p_values: NDArray of shape (n_lambda, n_alpha). @@ -62,9 +69,17 @@ def compute_hoeffdding_bentkus_p_value( len(r_hat), axis=0 ) + if isinstance(n_obs, int): + n_obs = np.full_like(r_hat, n_obs, dtype=float) + n_obs_repeat = np.repeat( + np.expand_dims(n_obs, axis=1), + len(alpha_np), + axis=1 + ) + hoeffding_p_value = np.exp( - -n_obs * _h1( - np.where( # TODO : shouldn't we use np.minimum ? + -n_obs_repeat * _h1( + np.where( r_hat_repeat > alpha_repeat, alpha_repeat, r_hat_repeat @@ -74,9 +89,9 @@ def compute_hoeffdding_bentkus_p_value( ) factor = 1 if binary else np.e bentkus_p_value = factor * binom.cdf( - np.ceil(n_obs * r_hat_repeat), n_obs, alpha_repeat + np.ceil(n_obs_repeat * r_hat_repeat), n_obs, alpha_repeat ) - hb_p_value = np.where( # TODO : shouldn't we use np.minimum ? + hb_p_value = np.where( bentkus_p_value > hoeffding_p_value, hoeffding_p_value, bentkus_p_value @@ -85,8 +100,8 @@ def compute_hoeffdding_bentkus_p_value( def _h1( - r_hats: NDArray[np.float32], alphas: NDArray[np.float32] -) -> NDArray[np.float32]: + r_hats: NDArray[float], alphas: NDArray[float] +) -> NDArray[float]: """ This function allow us to compute the tighter version of hoeffding inequality. @@ -113,7 +128,7 @@ def _h1( Returns ------- - NDArray of shape a(n_lambdas, n_alpha). + NDArray of shape (n_lambdas, n_alpha). 
""" elt1 = np.zeros_like(r_hats, dtype=float) diff --git a/mapie/risk_control.py b/mapie/risk_control.py index 0dc597745..b59950f3e 100644 --- a/mapie/risk_control.py +++ b/mapie/risk_control.py @@ -681,8 +681,8 @@ def predict( if self.metric_control == 'precision': self.n_obs = len(self.risks) self.r_hat = self.risks.mean(axis=0) - self.valid_index, self.p_values = ltt_procedure( - self.r_hat, alpha_np, delta, self.n_obs + self.valid_index = ltt_procedure( + self.r_hat, alpha_np, cast(float, delta), self.n_obs ) self._check_valid_index(alpha_np) self.lambdas_star, self.r_star = find_lambda_control_star( @@ -724,8 +724,8 @@ def __init__( def get_value_and_effective_sample_size( self, - y_true: NDArray[int], # shape (n_samples,), values in {0, 1} - y_pred: NDArray[int], # shape (n_samples,), values in {0, 1} + y_true: NDArray[int], # shape (n_samples,), values in {0, 1} + y_pred: NDArray[int], # shape (n_samples,), values in {0, 1} ) -> Optional[Tuple[float, int]]: # float between 0 and 1, int between 0 and len(y_true) risk_occurrences = [ @@ -765,4 +765,10 @@ def get_value_and_effective_sample_size( risk_occurrence=lambda y_true, y_pred: int(y_pred == y_true), risk_condition=lambda y_true, y_pred: y_true == 1, higher_is_better=True, -) \ No newline at end of file +) + +_automatic_best_predict_param_choice = { + precision: recall, + recall: precision, + accuracy: accuracy, +} diff --git a/mapie/risk_control_draft.py b/mapie/risk_control_draft.py index a4f1f9485..af8241fde 100644 --- a/mapie/risk_control_draft.py +++ b/mapie/risk_control_draft.py @@ -1,168 +1,79 @@ -from typing import Any, Optional, Union +import warnings +from typing import Optional, Union, Callable import numpy as np from numpy._typing import ArrayLike, NDArray -from sklearn.utils import check_random_state from mapie.control_risk.ltt import ltt_procedure -from mapie.utils import _check_n_jobs, _check_verbose +from mapie.risk_control import BinaryClassificationRisk + # General TODOs: -# TODO: maybe 
use type float instead of float32? # TODO : in calibration and prediction, # use _transform_pred_proba or a function adapted to binary # to get the probabilities depending on the classifier +# TODO: remove the no cover below class BinaryClassificationController: # pragma: no cover # TODO : test that this is working with a sklearn pipeline # TODO : test that this is working with a pandas dataframes - """ - Controller for the calibration of our binary classifier. - - Parameters - ---------- - fitted_binary_classifier: Any - Any object that provides a `predict_proba` method. - - metric: str - The performance metric we want to control (ex: "precision") - - target_level: float - The target performance level we want to achieve (ex: 0.8) - - confidence_level: float - The maximum acceptable probability of the precision falling below the - target precision level (ex: 0.8) - - Attributes - ---------- - precision_per_threshold: NDArray - Precision of the binary classifier on the calibration set for each - threshold from self._thresholds. - - valid_threshold: NDArray - Thresholds that meet the target precision with the desired confidence. - - best_threshold: float - Valid threshold that maximizes the recall, i.e. the smallest valid - threshold. 
- """ - def __init__( self, - fitted_binary_classifier: Any, - metric: str, + # X -> y_proba of shape (n_samples, 2) + predict_function: Callable[[ArrayLike], ArrayLike], + risk: BinaryClassificationRisk, target_level: float, confidence_level: float = 0.9, - n_jobs: Optional[int] = None, - random_state: Optional[Union[int, np.random.RandomState]] = None, - verbose: int = 0 + best_predict_param_choice: Union[str, BinaryClassificationRisk] = "auto", ): - _check_n_jobs(n_jobs) - _check_verbose(verbose) - check_random_state(random_state) - - self._classifier = fitted_binary_classifier + self._predict_function = predict_function + self._risk = risk + self._best_predict_param_choice = best_predict_param_choice self._alpha = 1 - target_level self._delta = 1 - confidence_level - self._n_jobs = n_jobs # TODO : use this in the class or delete - self._random_state = random_state # TODO : use this in the class or delete - self._verbose = verbose # TODO : use this in the class or delete - self._thresholds: NDArray[np.float32] = np.arange(0, 1, 0.01) + self._thresholds: NDArray[float] = np.linspace(0, 0.99, 100) # TODO: add a _is_calibrated attribute to check at prediction time - self.valid_thresholds: Optional[NDArray[np.float32]] = None + self.valid_thresholds: Optional[NDArray[float]] = None self.best_threshold: Optional[float] = None def calibrate(self, X_calibrate: ArrayLike, y_calibrate: ArrayLike) -> None: - """ - Find the threshold that statistically guarantees the desired precision - level while maximizing the recall. - - Parameters - ---------- - X_calibrate: ArrayLike - Features of the calibration set. - - y_calibrate: ArrayLike - True labels of the calibration set. - - Raises - ------ - ValueError - If no thresholds that meet the target precision with the desired - confidence level are found. 
- """ - # TODO: Make sure this works with sklearn train_test_split/Series + # TODO: Make sure the following works with sklearn train_test_split/Series y_calibrate_ = np.asarray(y_calibrate) - predictions_proba = self._classifier.predict_proba(X_calibrate)[:, 1] + predictions_proba = self._predict_function(X_calibrate)[:, 1] + + predictions_per_threshold = ( + predictions_proba[:, np.newaxis] >= self._thresholds + ).T.astype(int) - risk_per_threshold = 1 - self._compute_precision( - predictions_proba, y_calibrate_ + risks_and_eff_sizes = np.array( + [self._risk.get_value_and_effective_sample_size( + y_calibrate_, + predictions + ) for predictions in predictions_per_threshold] ) - valid_thresholds_index, _ = ltt_procedure( - risk_per_threshold, + valid_thresholds_index = ltt_procedure( + risks_and_eff_sizes[:, 0], np.array([self._alpha]), self._delta, - len(y_calibrate_), + risks_and_eff_sizes[:, 1], True, ) self.valid_thresholds = self._thresholds[valid_thresholds_index[0]] if len(self.valid_thresholds) == 0: - # TODO: just warn, and raise error at prediction if no valid thresholds - raise ValueError("No valid thresholds found") + warnings.warn("No predict parameters were found to control the risk.") # Minimum in case of precision control only self.best_threshold = min(self.valid_thresholds) def predict(self, X_test: ArrayLike) -> NDArray: - """ - Predict binary labels on the test set, using the best threshold found - during calibration. - - Parameters - ---------- - X_test: ArrayLike - Features of the test set. - - Returns - ------- - ArrayLike - Predicted labels (0 or 1) for each sample in the test set. - """ - predictions_proba = self._classifier.predict_proba(X_test)[:, 1] + if self.best_threshold is None: + raise ValueError( + "No predict parameters were found to control the risk. Cannot predict." 
+ ) + predictions_proba = self._predict_function(X_test)[:, 1] return (predictions_proba >= self.best_threshold).astype(int) - - def _compute_precision( # TODO: use sklearn or MAPIE ? - self, predictions_proba: NDArray[np.float32], y_cal: NDArray[np.float32] - ) -> NDArray[np.float32]: - """ - Compute the precision for each threshold. - """ - predictions_per_threshold = ( - predictions_proba[:, np.newaxis] >= self._thresholds - ).astype(int) - - true_positives = np.sum( - (predictions_per_threshold == 1) & (y_cal[:, np.newaxis] == 1), - axis=0, - ) - false_positives = np.sum( - (predictions_per_threshold == 1) & (y_cal[:, np.newaxis] == 0), - axis=0, - ) - - positive_predictions = true_positives + false_positives - - # Avoid division by zero - precision_per_threshold = np.ones_like(self._thresholds, dtype=float) - nonzero_mask = positive_predictions > 0 - precision_per_threshold[nonzero_mask] = ( - true_positives[nonzero_mask] / positive_predictions[nonzero_mask] - ) - - return precision_per_threshold diff --git a/mapie/tests/test_control_risk.py b/mapie/tests/test_control_risk.py index 66eaab09f..a0a513de6 100644 --- a/mapie/tests/test_control_risk.py +++ b/mapie/tests/test_control_risk.py @@ -151,9 +151,8 @@ def test_ltt_type_output_alpha_delta( delta: float ) -> None: """Test type output _ltt_procedure""" - valid_index, p_values = ltt_procedure(r_hat, alpha, delta, n) + valid_index = ltt_procedure(r_hat, alpha, delta, n) assert isinstance(valid_index, list) - assert isinstance(p_values, np.ndarray) @pytest.mark.parametrize("valid_index", [[[0, 1]]]) @@ -181,10 +180,3 @@ def test_invalid_shape_alpha_hb() -> None: """Test error message when invalid alpha shape""" with pytest.raises(ValueError, match=r".*Invalid confidence_level"): compute_hoeffdding_bentkus_p_value(r_hat, n, wrong_alpha_shape) - - -@pytest.mark.parametrize("delta", [None]) -def test_delta_none_ltt(delta: Optional[float]) -> None: - """Test error message when invalid delta""" - with 
pytest.raises(ValueError, match=r".*Invalid delta"): - ltt_procedure(r_hat, alpha, delta, n) From e8dae57f963e00e69774a07301b584f41799c0df Mon Sep 17 00:00:00 2001 From: Valentin Laurent Date: Wed, 30 Jul 2025 10:57:22 +0200 Subject: [PATCH 04/26] ENH & MTN & FIX - Fix bentkus_p_value calculation - Fix and move higher_is_better logic in the same place - Implement unit test for BinaryClassificationRiskControl - Fix parametrizing of existing test --- mapie/control_risk/p_values.py | 2 +- mapie/risk_control.py | 2 -- mapie/risk_control_draft.py | 10 +++++-- mapie/tests/test_risk_control.py | 49 ++++++++++++++++++++++++++++++-- 4 files changed, 56 insertions(+), 7 deletions(-) diff --git a/mapie/control_risk/p_values.py b/mapie/control_risk/p_values.py index f7ca882a0..297d103d6 100644 --- a/mapie/control_risk/p_values.py +++ b/mapie/control_risk/p_values.py @@ -89,7 +89,7 @@ def compute_hoeffdding_bentkus_p_value( ) factor = 1 if binary else np.e bentkus_p_value = factor * binom.cdf( - np.ceil(n_obs_repeat * r_hat_repeat), n_obs, alpha_repeat + np.ceil(n_obs_repeat * r_hat_repeat), n_obs_repeat, alpha_repeat ) hb_p_value = np.where( bentkus_p_value > hoeffding_p_value, diff --git a/mapie/risk_control.py b/mapie/risk_control.py index b59950f3e..1f56c235a 100644 --- a/mapie/risk_control.py +++ b/mapie/risk_control.py @@ -743,8 +743,6 @@ def get_value_and_effective_sample_size( in zip(risk_occurrences, risk_conditions) if risk_condition) risk_value = risk_sum / effective_sample_size - if self.higher_is_better: - risk_value = 1 - risk_value return risk_value, effective_sample_size return None diff --git a/mapie/risk_control_draft.py b/mapie/risk_control_draft.py index af8241fde..4a5a426e4 100644 --- a/mapie/risk_control_draft.py +++ b/mapie/risk_control_draft.py @@ -30,7 +30,7 @@ def __init__( self._predict_function = predict_function self._risk = risk self._best_predict_param_choice = best_predict_param_choice - self._alpha = 1 - target_level + self._target_level = 
target_level self._delta = 1 - confidence_level self._thresholds: NDArray[float] = np.linspace(0, 0.99, 100) @@ -56,9 +56,15 @@ def calibrate(self, X_calibrate: ArrayLike, y_calibrate: ArrayLike) -> None: ) for predictions in predictions_per_threshold] ) + if self._risk.higher_is_better: + risks_and_eff_sizes[:, 0] = 1 - risks_and_eff_sizes[:, 0] + alpha = self._target_level + else: + alpha = 1 - self._target_level + valid_thresholds_index = ltt_procedure( risks_and_eff_sizes[:, 0], - np.array([self._alpha]), + np.array([alpha]), self._delta, risks_and_eff_sizes[:, 1], True, diff --git a/mapie/tests/test_risk_control.py b/mapie/tests/test_risk_control.py index abd1e5f09..26d379b6f 100644 --- a/mapie/tests/test_risk_control.py +++ b/mapie/tests/test_risk_control.py @@ -11,10 +11,17 @@ from sklearn.pipeline import Pipeline, make_pipeline from sklearn.preprocessing import OneHotEncoder from sklearn.utils.validation import check_is_fitted +from sklearn.metrics import precision_score, recall_score, accuracy_score from typing_extensions import TypedDict from numpy.typing import NDArray -from mapie.risk_control import PrecisionRecallController +from mapie.risk_control import ( + PrecisionRecallController, + precision, + recall, + accuracy, + BinaryClassificationRisk, +) Params = TypedDict( "Params", @@ -260,7 +267,7 @@ def test_predict_output_shape( X, alpha=alpha, bound=args["bound"], - delta=.1 + delta=delta ) n_alpha = len(alpha) if hasattr(alpha, "__len__") else 1 assert y_pred.shape == y.shape @@ -808,3 +815,41 @@ def test_method_none_recall() -> None: ) mapie_clf.fit(X_toy, y_toy) assert mapie_clf.method == "crc" + + +# The following test is voluntarily agnostic +# to the specific binary classification risk control implementation. 
+@pytest.mark.parametrize( + "risk_instance, metric_func, effective_sample_func", + [ + (precision, precision_score, lambda y_true, y_pred: np.sum(y_pred == 1)), + (recall, recall_score, lambda y_true, y_pred: np.sum(y_true == 1)), + (accuracy, accuracy_score, lambda y_true, y_pred: len(y_true)), + ], +) +@pytest.mark.parametrize( + "y_true, y_pred", + [ + (np.array([1, 0, 1, 0]), np.array([1, 1, 0, 0])), + (np.array([1, 1, 0, 0]), np.array([1, 1, 1, 0])), + (np.array([0, 0, 0, 0]), np.array([0, 1, 0, 1])), + ], +) +def test_binary_classification_risk( + risk_instance: BinaryClassificationRisk, + metric_func, + effective_sample_func, + y_true, + y_pred +): + result = risk_instance.get_value_and_effective_sample_size(y_true, y_pred) + if effective_sample_func(y_true, y_pred) == 0: + assert result is None + elif result is None: + raise ValueError() + else: + value, n = result + expected_value = metric_func(y_true, y_pred) + expected_n = effective_sample_func(y_true, y_pred) + assert np.isclose(value, expected_value) + assert n == expected_n \ No newline at end of file From a580aa474b44fc9dd6487ebe697b1f1d54d33b53 Mon Sep 17 00:00:00 2001 From: Valentin Laurent Date: Wed, 30 Jul 2025 15:30:11 +0200 Subject: [PATCH 05/26] TEST - hoeffdding_bentkus_p_value with n_obs as an array --- mapie/tests/test_control_risk.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/mapie/tests/test_control_risk.py b/mapie/tests/test_control_risk.py index a0a513de6..7693d1433 100644 --- a/mapie/tests/test_control_risk.py +++ b/mapie/tests/test_control_risk.py @@ -180,3 +180,26 @@ def test_invalid_shape_alpha_hb() -> None: """Test error message when invalid alpha shape""" with pytest.raises(ValueError, match=r".*Invalid confidence_level"): compute_hoeffdding_bentkus_p_value(r_hat, n, wrong_alpha_shape) + + +def test_p_values_n_obs_int_vs_array() -> None: + """Test that using n_obs as an array gives the same values as an int""" + r_hat = np.array([0.5, 0.8]) + n_obs 
= np.array([1100, 1200]) + alpha = np.array([0.6, 0.7]) + + pval_0 = compute_hoeffdding_bentkus_p_value( + np.array([r_hat[0]]), + int(n_obs[0]), + alpha + ) + pval_1 = compute_hoeffdding_bentkus_p_value( + np.array([r_hat[1]]), + int(n_obs[1]), + alpha + ) + pval_manual = np.vstack([pval_0, pval_1]) + + pval_array = compute_hoeffdding_bentkus_p_value(r_hat, n_obs, alpha) + + np.testing.assert_allclose(pval_manual, pval_array, rtol=1e-12) From 9e8b0921826ce7e2ccd102ca36f003b973347264 Mon Sep 17 00:00:00 2001 From: Valentin Laurent Date: Wed, 30 Jul 2025 15:44:21 +0200 Subject: [PATCH 06/26] FIX - linting --- mapie/control_risk/ltt.py | 2 +- mapie/risk_control.py | 28 ++++++++++++++++------------ mapie/tests/test_control_risk.py | 2 +- mapie/tests/test_risk_control.py | 2 +- 4 files changed, 19 insertions(+), 15 deletions(-) diff --git a/mapie/control_risk/ltt.py b/mapie/control_risk/ltt.py index c1f9e9c9a..2b59a9b58 100644 --- a/mapie/control_risk/ltt.py +++ b/mapie/control_risk/ltt.py @@ -1,5 +1,5 @@ import warnings -from typing import Any, List, Optional, Tuple, Union +from typing import Any, List, Tuple, Union import numpy as np diff --git a/mapie/risk_control.py b/mapie/risk_control.py index 1f56c235a..5eb028764 100644 --- a/mapie/risk_control.py +++ b/mapie/risk_control.py @@ -11,8 +11,10 @@ from sklearn.multioutput import MultiOutputClassifier from sklearn.pipeline import Pipeline from sklearn.utils import check_random_state -from sklearn.utils.validation import (_check_y, _num_samples, check_is_fitted, - indexable) +from sklearn.utils.validation import ( + _check_y, _num_samples, check_is_fitted, + indexable, +) from numpy.typing import ArrayLike, NDArray from .control_risk.crc_rcps import find_lambda_star, get_r_hat_plus @@ -218,8 +220,9 @@ def _check_method(self) -> None: "Invalid method for metric: " + "You are controlling " + self.metric_control + " and you are using invalid method: " + self.method - + ". 
Use instead: " + "".join(self.valid_methods_by_metric_[ - self.metric_control] + + ". Use instead: " + "".join( + self.valid_methods_by_metric_[ + self.metric_control] ) ) @@ -365,10 +368,10 @@ def _check_estimator( LogisticRegression() ) X_train, X_conf, y_train, y_conf = train_test_split( - X, - y, - test_size=self.conformalize_size, - random_state=self.random_state, + X, + y, + test_size=self.conformalize_size, + random_state=self.random_state, ) estimator.fit(X_train, y_train) warnings.warn( @@ -686,7 +689,7 @@ def predict( ) self._check_valid_index(alpha_np) self.lambdas_star, self.r_star = find_lambda_control_star( - self.r_hat, self.valid_index, self.lambdas + self.r_hat, self.valid_index, self.lambdas ) y_pred_proba_array = ( y_pred_proba_array > @@ -739,9 +742,10 @@ def get_value_and_effective_sample_size( effective_sample_size = len(y_true) - risk_conditions.count(False) if effective_sample_size != 0: risk_sum = sum( - risk_occurrence for risk_occurrence, risk_condition - in zip(risk_occurrences, risk_conditions) - if risk_condition) + occurrence for occurrence, condition in zip( + risk_occurrences, risk_conditions + ) if condition + ) risk_value = risk_sum / effective_sample_size return risk_value, effective_sample_size return None diff --git a/mapie/tests/test_control_risk.py b/mapie/tests/test_control_risk.py index 7693d1433..15779100a 100644 --- a/mapie/tests/test_control_risk.py +++ b/mapie/tests/test_control_risk.py @@ -2,7 +2,7 @@ Testing for control_risk module. 
Testing for now risks for multilabel classification """ -from typing import List, Optional, Union +from typing import List, Union import numpy as np import pytest diff --git a/mapie/tests/test_risk_control.py b/mapie/tests/test_risk_control.py index 26d379b6f..2003ce8fd 100644 --- a/mapie/tests/test_risk_control.py +++ b/mapie/tests/test_risk_control.py @@ -852,4 +852,4 @@ def test_binary_classification_risk( expected_value = metric_func(y_true, y_pred) expected_n = effective_sample_func(y_true, y_pred) assert np.isclose(value, expected_value) - assert n == expected_n \ No newline at end of file + assert n == expected_n From 5fbc94019584681e4401a8c6009608f3cdf91b3b Mon Sep 17 00:00:00 2001 From: Valentin Laurent Date: Wed, 30 Jul 2025 16:17:42 +0200 Subject: [PATCH 07/26] ENH - Performance, warning and docstring improvements --- mapie/control_risk/p_values.py | 10 ++-------- mapie/risk_control.py | 16 ++++++---------- mapie/risk_control_draft.py | 6 +++++- 3 files changed, 13 insertions(+), 19 deletions(-) diff --git a/mapie/control_risk/p_values.py b/mapie/control_risk/p_values.py index 297d103d6..190ecae90 100644 --- a/mapie/control_risk/p_values.py +++ b/mapie/control_risk/p_values.py @@ -103,11 +103,8 @@ def _h1( r_hats: NDArray[float], alphas: NDArray[float] ) -> NDArray[float]: """ - This function allow us to compute - the tighter version of hoeffding inequality. - This function is then used in the - hoeffding_bentkus_p_value function for the - computation of p-values. + This function allow us to compute the tighter version of hoeffding inequality. + When r_hat = 0, the log is undefined, but the limit is 0, so we set the result to 0. Parameters ---------- @@ -131,9 +128,6 @@ def _h1( NDArray of shape (n_lambdas, n_alpha). 
""" elt1 = np.zeros_like(r_hats, dtype=float) - - # Compute only where r_hats != 0 to avoid log(0) - # TODO: check Angelopoulos implementation mask = r_hats != 0 elt1[mask] = r_hats[mask] * np.log(r_hats[mask] / alphas[mask]) elt2 = (1 - r_hats) * np.log((1 - r_hats) / (1 - alphas)) diff --git a/mapie/risk_control.py b/mapie/risk_control.py index 5eb028764..b764fd930 100644 --- a/mapie/risk_control.py +++ b/mapie/risk_control.py @@ -731,21 +731,17 @@ def get_value_and_effective_sample_size( y_pred: NDArray[int], # shape (n_samples,), values in {0, 1} ) -> Optional[Tuple[float, int]]: # float between 0 and 1, int between 0 and len(y_true) - risk_occurrences = [ + risk_occurrences = np.array([ self.risk_occurrence(y_true_i, y_pred_i) for y_true_i, y_pred_i in zip(y_true, y_pred) - ] - risk_conditions = [ + ]) + risk_conditions = np.array([ self.risk_condition(y_true_i, y_pred_i) for y_true_i, y_pred_i in zip(y_true, y_pred) - ] - effective_sample_size = len(y_true) - risk_conditions.count(False) + ]) + effective_sample_size = len(y_true) - np.sum(~risk_conditions) if effective_sample_size != 0: - risk_sum = sum( - occurrence for occurrence, condition in zip( - risk_occurrences, risk_conditions - ) if condition - ) + risk_sum = np.sum(risk_occurrences[risk_conditions]) risk_value = risk_sum / effective_sample_size return risk_value, effective_sample_size return None diff --git a/mapie/risk_control_draft.py b/mapie/risk_control_draft.py index 4a5a426e4..2e018750a 100644 --- a/mapie/risk_control_draft.py +++ b/mapie/risk_control_draft.py @@ -71,7 +71,11 @@ def calibrate(self, X_calibrate: ArrayLike, y_calibrate: ArrayLike) -> None: ) self.valid_thresholds = self._thresholds[valid_thresholds_index[0]] if len(self.valid_thresholds) == 0: - warnings.warn("No predict parameters were found to control the risk.") + warnings.warn( + "No predict parameters were found to control the risk at the given " + "target and confidence levels. 
" + "Try using a larger calibration set or a better model.", + ) # Minimum in case of precision control only self.best_threshold = min(self.valid_thresholds) From cc88354d7f62e4d01a182d08c588cc06d84f4c82 Mon Sep 17 00:00:00 2001 From: Valentin Laurent Date: Wed, 30 Jul 2025 16:24:40 +0200 Subject: [PATCH 08/26] FIX - Fix local typing issue, investigate CI typing issues --- Makefile | 1 + mapie/risk_control.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 8414450a7..43120ac09 100644 --- a/Makefile +++ b/Makefile @@ -7,6 +7,7 @@ lint: flake8 examples mapie notebooks --max-line-length=88 type-check: + mypy --version mypy mapie coverage: diff --git a/mapie/risk_control.py b/mapie/risk_control.py index b764fd930..9b9c974a4 100644 --- a/mapie/risk_control.py +++ b/mapie/risk_control.py @@ -741,7 +741,7 @@ def get_value_and_effective_sample_size( ]) effective_sample_size = len(y_true) - np.sum(~risk_conditions) if effective_sample_size != 0: - risk_sum = np.sum(risk_occurrences[risk_conditions]) + risk_sum: int = np.sum(risk_occurrences[risk_conditions]) risk_value = risk_sum / effective_sample_size return risk_value, effective_sample_size return None From feb075dac0c6a129c5f58ceb432ceac2f325e68b Mon Sep 17 00:00:00 2001 From: Valentin Laurent Date: Wed, 30 Jul 2025 16:31:39 +0200 Subject: [PATCH 09/26] FIX - Continue investigating CI typing issues --- .github/workflows/test.yml | 1 + Makefile | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 4de289019..19234720c 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -63,6 +63,7 @@ jobs: run: make lint - name: Check static typing run: make type-check + continue-on-error: true - name: Test and coverage with pytest run: make coverage - name: Code coverage diff --git a/Makefile b/Makefile index 43120ac09..8414450a7 100644 --- a/Makefile +++ b/Makefile @@ -7,7 +7,6 @@ lint: flake8 
examples mapie notebooks --max-line-length=88 type-check: - mypy --version mypy mapie coverage: From bf28de0a31dc4af6f3b4d5fc84b4f2c861607878 Mon Sep 17 00:00:00 2001 From: Valentin Laurent Date: Thu, 31 Jul 2025 15:25:47 +0200 Subject: [PATCH 10/26] MTN - remove relative import --- mapie/control_risk/ltt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mapie/control_risk/ltt.py b/mapie/control_risk/ltt.py index 2b59a9b58..119363a0d 100644 --- a/mapie/control_risk/ltt.py +++ b/mapie/control_risk/ltt.py @@ -5,7 +5,7 @@ from numpy.typing import ArrayLike, NDArray -from .p_values import compute_hoeffdding_bentkus_p_value +from mapie.control_risk.p_values import compute_hoeffdding_bentkus_p_value def ltt_procedure( From 09e87510e4efc17c32aa1e4346b69ab3fce1aae5 Mon Sep 17 00:00:00 2001 From: Valentin Laurent Date: Thu, 31 Jul 2025 18:37:47 +0200 Subject: [PATCH 11/26] ENH & TEST - Handle the case of undefined risk (ex: precision with no positive predictions) --- mapie/risk_control.py | 8 ++++++-- mapie/tests/test_control_risk.py | 16 +++++++++++++++- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/mapie/risk_control.py b/mapie/risk_control.py index 9b9c974a4..7b22b69b8 100644 --- a/mapie/risk_control.py +++ b/mapie/risk_control.py @@ -729,8 +729,12 @@ def get_value_and_effective_sample_size( self, y_true: NDArray[int], # shape (n_samples,), values in {0, 1} y_pred: NDArray[int], # shape (n_samples,), values in {0, 1} - ) -> Optional[Tuple[float, int]]: + ) -> Tuple[float, int]: # float between 0 and 1, int between 0 and len(y_true) + # returns (1, -1) when the risk is not defined (condition never met) + # In this case, the corresponding lambda shouldn't be considered valid. 
+ # In the current LTT implementation, providing n_obs=-1 will result + # in an infinite p_value, effectively invaliding the lambda risk_occurrences = np.array([ self.risk_occurrence(y_true_i, y_pred_i) for y_true_i, y_pred_i in zip(y_true, y_pred) @@ -744,7 +748,7 @@ def get_value_and_effective_sample_size( risk_sum: int = np.sum(risk_occurrences[risk_conditions]) risk_value = risk_sum / effective_sample_size return risk_value, effective_sample_size - return None + return 1, -1 precision = BinaryClassificationRisk( diff --git a/mapie/tests/test_control_risk.py b/mapie/tests/test_control_risk.py index 15779100a..17f635741 100644 --- a/mapie/tests/test_control_risk.py +++ b/mapie/tests/test_control_risk.py @@ -182,7 +182,7 @@ def test_invalid_shape_alpha_hb() -> None: compute_hoeffdding_bentkus_p_value(r_hat, n, wrong_alpha_shape) -def test_p_values_n_obs_int_vs_array() -> None: +def test_hb_p_values_n_obs_int_vs_array() -> None: """Test that using n_obs as an array gives the same values as an int""" r_hat = np.array([0.5, 0.8]) n_obs = np.array([1100, 1200]) @@ -203,3 +203,17 @@ def test_p_values_n_obs_int_vs_array() -> None: pval_array = compute_hoeffdding_bentkus_p_value(r_hat, n_obs, alpha) np.testing.assert_allclose(pval_manual, pval_array, rtol=1e-12) + + +def test_ltt_procedure_n_obs_negative() -> None: + """ + Test ltt_procedure with negative n_obs. + This happens when the risk, defined as the conditional expectation of + a loss, is undefined because the condition is never met. + This should return an invalid lambda. 
+ """ + r_hat = np.array([0.5]) + n_obs = np.array([-1]) + alpha_np = np.array([0.6]) + binary = True + assert ltt_procedure(r_hat, alpha_np, 0.1, n_obs, binary) == [[]] From 1f1879585b1787432936701cbe3b06ee0cc7beb1 Mon Sep 17 00:00:00 2001 From: Valentin Laurent Date: Fri, 1 Aug 2025 13:45:57 +0200 Subject: [PATCH 12/26] MTN - Revert formatting to avoid changes unrelated to current PR --- mapie/risk_control.py | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/mapie/risk_control.py b/mapie/risk_control.py index 7b22b69b8..917474678 100644 --- a/mapie/risk_control.py +++ b/mapie/risk_control.py @@ -11,10 +11,8 @@ from sklearn.multioutput import MultiOutputClassifier from sklearn.pipeline import Pipeline from sklearn.utils import check_random_state -from sklearn.utils.validation import ( - _check_y, _num_samples, check_is_fitted, - indexable, -) +from sklearn.utils.validation import (_check_y, _num_samples, check_is_fitted, + indexable) from numpy.typing import ArrayLike, NDArray from .control_risk.crc_rcps import find_lambda_star, get_r_hat_plus @@ -220,9 +218,8 @@ def _check_method(self) -> None: "Invalid method for metric: " + "You are controlling " + self.metric_control + " and you are using invalid method: " + self.method - + ". Use instead: " + "".join( - self.valid_methods_by_metric_[ - self.metric_control] + + ". 
Use instead: " + "".join(self.valid_methods_by_metric_[ + self.metric_control] ) ) @@ -368,10 +365,10 @@ def _check_estimator( LogisticRegression() ) X_train, X_conf, y_train, y_conf = train_test_split( - X, - y, - test_size=self.conformalize_size, - random_state=self.random_state, + X, + y, + test_size=self.conformalize_size, + random_state=self.random_state, ) estimator.fit(X_train, y_train) warnings.warn( @@ -689,7 +686,7 @@ def predict( ) self._check_valid_index(alpha_np) self.lambdas_star, self.r_star = find_lambda_control_star( - self.r_hat, self.valid_index, self.lambdas + self.r_hat, self.valid_index, self.lambdas ) y_pred_proba_array = ( y_pred_proba_array > From e661adb0ddacb77824b2fa693fac3788226931b2 Mon Sep 17 00:00:00 2001 From: Valentin Laurent Date: Fri, 1 Aug 2025 13:54:25 +0200 Subject: [PATCH 13/26] MTN - Clarify code --- mapie/risk_control_draft.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/mapie/risk_control_draft.py b/mapie/risk_control_draft.py index 2e018750a..570dcc31e 100644 --- a/mapie/risk_control_draft.py +++ b/mapie/risk_control_draft.py @@ -56,17 +56,20 @@ def calibrate(self, X_calibrate: ArrayLike, y_calibrate: ArrayLike) -> None: ) for predictions in predictions_per_threshold] ) + risks_per_threshold = risks_and_eff_sizes[:, 0] + eff_sample_sizes_per_threshold = risks_and_eff_sizes[:, 1] + if self._risk.higher_is_better: - risks_and_eff_sizes[:, 0] = 1 - risks_and_eff_sizes[:, 0] + risks_per_threshold = 1 - risks_per_threshold alpha = self._target_level else: alpha = 1 - self._target_level valid_thresholds_index = ltt_procedure( - risks_and_eff_sizes[:, 0], + risks_per_threshold, np.array([alpha]), self._delta, - risks_and_eff_sizes[:, 1], + eff_sample_sizes_per_threshold, True, ) self.valid_thresholds = self._thresholds[valid_thresholds_index[0]] From f232d5dfc3f20274975bbaaf9f12982f214f4209 Mon Sep 17 00:00:00 2001 From: Valentin Laurent Date: Wed, 27 Aug 2025 15:16:40 +0200 Subject: [PATCH 14/26] 
TEST - Fix test following handling of undefined risk --- mapie/tests/test_risk_control.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mapie/tests/test_risk_control.py b/mapie/tests/test_risk_control.py index 2003ce8fd..16a072298 100644 --- a/mapie/tests/test_risk_control.py +++ b/mapie/tests/test_risk_control.py @@ -844,9 +844,7 @@ def test_binary_classification_risk( ): result = risk_instance.get_value_and_effective_sample_size(y_true, y_pred) if effective_sample_func(y_true, y_pred) == 0: - assert result is None - elif result is None: - raise ValueError() + assert result == (1, -1) else: value, n = result expected_value = metric_func(y_true, y_pred) From df343ca6f035c9f1e459d1f904d454dffbe39f8b Mon Sep 17 00:00:00 2001 From: Valentin Laurent Date: Wed, 27 Aug 2025 16:40:25 +0200 Subject: [PATCH 15/26] FIX - Fix typing issues in Python 3.9, revert CI back to normal --- .github/workflows/test.yml | 1 - mapie/control_risk/ltt.py | 4 ++-- mapie/control_risk/p_values.py | 10 +++++----- mapie/risk_control.py | 4 ++-- mapie/risk_control_draft.py | 6 +++--- 5 files changed, 12 insertions(+), 13 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 19234720c..4de289019 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -63,7 +63,6 @@ jobs: run: make lint - name: Check static typing run: make type-check - continue-on-error: true - name: Test and coverage with pytest run: make coverage - name: Code coverage diff --git a/mapie/control_risk/ltt.py b/mapie/control_risk/ltt.py index 119363a0d..ce0212943 100644 --- a/mapie/control_risk/ltt.py +++ b/mapie/control_risk/ltt.py @@ -9,8 +9,8 @@ def ltt_procedure( - r_hat: NDArray[float], - alpha_np: NDArray[float], + r_hat: NDArray, + alpha_np: NDArray, delta: float, n_obs: Union[int, NDArray], binary: bool = False, diff --git a/mapie/control_risk/p_values.py b/mapie/control_risk/p_values.py index 190ecae90..ba5ac1db3 100644 --- 
a/mapie/control_risk/p_values.py +++ b/mapie/control_risk/p_values.py @@ -8,11 +8,11 @@ def compute_hoeffdding_bentkus_p_value( - r_hat: NDArray[float], + r_hat: NDArray, n_obs: Union[int, NDArray], - alpha: Union[float, NDArray[float]], + alpha: Union[float, NDArray], binary: bool = False, -) -> NDArray[float]: +) -> NDArray: """ The method computes the p_values according to the Hoeffding_Bentkus inequality for each @@ -100,8 +100,8 @@ def compute_hoeffdding_bentkus_p_value( def _h1( - r_hats: NDArray[float], alphas: NDArray[float] -) -> NDArray[float]: + r_hats: NDArray, alphas: NDArray +) -> NDArray: """ This function allow us to compute the tighter version of hoeffding inequality. When r_hat = 0, the log is undefined, but the limit is 0, so we set the result to 0. diff --git a/mapie/risk_control.py b/mapie/risk_control.py index 917474678..d7eb06a3b 100644 --- a/mapie/risk_control.py +++ b/mapie/risk_control.py @@ -724,8 +724,8 @@ def __init__( def get_value_and_effective_sample_size( self, - y_true: NDArray[int], # shape (n_samples,), values in {0, 1} - y_pred: NDArray[int], # shape (n_samples,), values in {0, 1} + y_true: NDArray, # shape (n_samples,), values in {0, 1} + y_pred: NDArray, # shape (n_samples,), values in {0, 1} ) -> Tuple[float, int]: # float between 0 and 1, int between 0 and len(y_true) # returns (1, -1) when the risk is not defined (condition never met) diff --git a/mapie/risk_control_draft.py b/mapie/risk_control_draft.py index 570dcc31e..cc3a3f40e 100644 --- a/mapie/risk_control_draft.py +++ b/mapie/risk_control_draft.py @@ -21,7 +21,7 @@ class BinaryClassificationController: # pragma: no cover def __init__( self, # X -> y_proba of shape (n_samples, 2) - predict_function: Callable[[ArrayLike], ArrayLike], + predict_function: Callable[[ArrayLike], NDArray], risk: BinaryClassificationRisk, target_level: float, confidence_level: float = 0.9, @@ -33,10 +33,10 @@ def __init__( self._target_level = target_level self._delta = 1 - confidence_level 
- self._thresholds: NDArray[float] = np.linspace(0, 0.99, 100) + self._thresholds: NDArray = np.linspace(0, 0.99, 100) # TODO: add a _is_calibrated attribute to check at prediction time - self.valid_thresholds: Optional[NDArray[float]] = None + self.valid_thresholds: Optional[NDArray] = None self.best_threshold: Optional[float] = None def calibrate(self, X_calibrate: ArrayLike, y_calibrate: ArrayLike) -> None: From 8bf31fa7b51cb027eee6b927bfa7bf74f1af0679 Mon Sep 17 00:00:00 2001 From: Valentin Laurent Date: Thu, 28 Aug 2025 01:21:18 +0200 Subject: [PATCH 16/26] WIP - try to fix typing (can't reproduce locally) --- mapie/risk_control.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mapie/risk_control.py b/mapie/risk_control.py index d7eb06a3b..ce54d86aa 100644 --- a/mapie/risk_control.py +++ b/mapie/risk_control.py @@ -740,7 +740,7 @@ def get_value_and_effective_sample_size( self.risk_condition(y_true_i, y_pred_i) for y_true_i, y_pred_i in zip(y_true, y_pred) ]) - effective_sample_size = len(y_true) - np.sum(~risk_conditions) + effective_sample_size: int = len(y_true) - np.sum(~risk_conditions) if effective_sample_size != 0: risk_sum: int = np.sum(risk_occurrences[risk_conditions]) risk_value = risk_sum / effective_sample_size From c819bcd92c7ff74d596ffb5b65066e91c2135bc4 Mon Sep 17 00:00:00 2001 From: Valentin Laurent Date: Thu, 28 Aug 2025 01:27:22 +0200 Subject: [PATCH 17/26] WIP - try to fix typing (can't reproduce locally) --- mapie/risk_control.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mapie/risk_control.py b/mapie/risk_control.py index ce54d86aa..64e1a578c 100644 --- a/mapie/risk_control.py +++ b/mapie/risk_control.py @@ -740,7 +740,8 @@ def get_value_and_effective_sample_size( self.risk_condition(y_true_i, y_pred_i) for y_true_i, y_pred_i in zip(y_true, y_pred) ]) - effective_sample_size: int = len(y_true) - np.sum(~risk_conditions) + effective_sample_size = len(y_true) - np.sum(~risk_conditions) + cast(int, 
effective_sample_size) # Needed for MyPy with Python 3.9 if effective_sample_size != 0: risk_sum: int = np.sum(risk_occurrences[risk_conditions]) risk_value = risk_sum / effective_sample_size From 6b4fff59b988c8bb5f5e14fc5b7944b0cfc3ec28 Mon Sep 17 00:00:00 2001 From: Valentin Laurent Date: Thu, 28 Aug 2025 01:33:14 +0200 Subject: [PATCH 18/26] WIP - try to fix typing (can't reproduce locally) --- mapie/risk_control.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mapie/risk_control.py b/mapie/risk_control.py index 64e1a578c..7776a9a2c 100644 --- a/mapie/risk_control.py +++ b/mapie/risk_control.py @@ -741,10 +741,12 @@ def get_value_and_effective_sample_size( for y_true_i, y_pred_i in zip(y_true, y_pred) ]) effective_sample_size = len(y_true) - np.sum(~risk_conditions) - cast(int, effective_sample_size) # Needed for MyPy with Python 3.9 if effective_sample_size != 0: risk_sum: int = np.sum(risk_occurrences[risk_conditions]) risk_value = risk_sum / effective_sample_size + # Casting needed for MyPy with Python 3.9 + cast(float, risk_value) + cast(int, effective_sample_size) return risk_value, effective_sample_size return 1, -1 From e428c893614942f5681d71b19e8a74f3c3772a9e Mon Sep 17 00:00:00 2001 From: Valentin Laurent Date: Thu, 28 Aug 2025 01:40:23 +0200 Subject: [PATCH 19/26] WIP - try to fix typing (can't reproduce locally) --- mapie/risk_control.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mapie/risk_control.py b/mapie/risk_control.py index 7776a9a2c..fe7b9622f 100644 --- a/mapie/risk_control.py +++ b/mapie/risk_control.py @@ -741,12 +741,10 @@ def get_value_and_effective_sample_size( for y_true_i, y_pred_i in zip(y_true, y_pred) ]) effective_sample_size = len(y_true) - np.sum(~risk_conditions) + effective_sample_size cast(int, effective_sample_size) # Needed for MyPy with Python 3.9 if effective_sample_size != 0: risk_sum: int = np.sum(risk_occurrences[risk_conditions]) risk_value = risk_sum / 
effective_sample_size - # Casting needed for MyPy with Python 3.9 - cast(float, risk_value) - cast(int, effective_sample_size) return risk_value, effective_sample_size return 1, -1 From 78017bee239e9bdd05d091c00e32ed00137c8110 Mon Sep 17 00:00:00 2001 From: Valentin Laurent Date: Thu, 28 Aug 2025 01:45:03 +0200 Subject: [PATCH 20/26] WIP - try to fix typing (can't reproduce locally) --- mapie/risk_control.py | 2 +- mapie/risk_control_draft.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/mapie/risk_control.py b/mapie/risk_control.py index fe7b9622f..48760fdf4 100644 --- a/mapie/risk_control.py +++ b/mapie/risk_control.py @@ -741,7 +741,7 @@ def get_value_and_effective_sample_size( for y_true_i, y_pred_i in zip(y_true, y_pred) ]) effective_sample_size = len(y_true) - np.sum(~risk_conditions) - effective_sample_size cast(int, effective_sample_size) # Needed for MyPy with Python 3.9 + effective_sample_size = cast(int, effective_sample_size) # Needed for MyPy with Python 3.9 if effective_sample_size != 0: risk_sum: int = np.sum(risk_occurrences[risk_conditions]) risk_value = risk_sum / effective_sample_size diff --git a/mapie/risk_control_draft.py b/mapie/risk_control_draft.py index cc3a3f40e..e4f400a87 100644 --- a/mapie/risk_control_draft.py +++ b/mapie/risk_control_draft.py @@ -79,9 +79,9 @@ def calibrate(self, X_calibrate: ArrayLike, y_calibrate: ArrayLike) -> None: "target and confidence levels. 
" "Try using a larger calibration set or a better model.", ) - - # Minimum in case of precision control only - self.best_threshold = min(self.valid_thresholds) + else: + # Minimum in case of precision control only + self.best_threshold = min(self.valid_thresholds) def predict(self, X_test: ArrayLike) -> NDArray: if self.best_threshold is None: From 54aac1e5f5fc443a3c407ed9cb48eec82c4d684d Mon Sep 17 00:00:00 2001 From: Valentin Laurent Date: Thu, 28 Aug 2025 01:52:16 +0200 Subject: [PATCH 21/26] WIP - try to fix typing (can't reproduce locally) --- mapie/risk_control.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mapie/risk_control.py b/mapie/risk_control.py index 48760fdf4..ffd5b7cb1 100644 --- a/mapie/risk_control.py +++ b/mapie/risk_control.py @@ -741,7 +741,8 @@ def get_value_and_effective_sample_size( for y_true_i, y_pred_i in zip(y_true, y_pred) ]) effective_sample_size = len(y_true) - np.sum(~risk_conditions) - effective_sample_size = cast(int, effective_sample_size) # Needed for MyPy with Python 3.9 + # Casting needed for MyPy with Python 3.9 + effective_sample_size = cast(int, effective_sample_size) if effective_sample_size != 0: risk_sum: int = np.sum(risk_occurrences[risk_conditions]) risk_value = risk_sum / effective_sample_size From d8e615ff5b24dbd1d1004b832c6ffac010906517 Mon Sep 17 00:00:00 2001 From: Valentin Laurent Date: Thu, 28 Aug 2025 09:45:38 +0200 Subject: [PATCH 22/26] WIP - try to fix typing (can't reproduce locally) --- mapie/risk_control.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mapie/risk_control.py b/mapie/risk_control.py index ffd5b7cb1..a3ea41cb0 100644 --- a/mapie/risk_control.py +++ b/mapie/risk_control.py @@ -742,11 +742,11 @@ def get_value_and_effective_sample_size( ]) effective_sample_size = len(y_true) - np.sum(~risk_conditions) # Casting needed for MyPy with Python 3.9 - effective_sample_size = cast(int, effective_sample_size) - if effective_sample_size != 0: + 
effective_sample_size_int = cast(int, effective_sample_size) + if effective_sample_size_int != 0: risk_sum: int = np.sum(risk_occurrences[risk_conditions]) - risk_value = risk_sum / effective_sample_size - return risk_value, effective_sample_size + risk_value = risk_sum / effective_sample_size_int + return risk_value, effective_sample_size_int return 1, -1 From dda38d5f6738806885f150a69042971804df7303 Mon Sep 17 00:00:00 2001 From: Valentin Laurent Date: Thu, 28 Aug 2025 18:12:31 +0200 Subject: [PATCH 23/26] ENH - Add theoretical validity notebook to documentation --- ...risk_control_theoretical_tests_proto.ipynb | 484 ++++++++++++++++++ 1 file changed, 484 insertions(+) create mode 100644 notebooks/risk_control/risk_control_theoretical_tests_proto.ipynb diff --git a/notebooks/risk_control/risk_control_theoretical_tests_proto.ipynb b/notebooks/risk_control/risk_control_theoretical_tests_proto.ipynb new file mode 100644 index 000000000..f8aa15880 --- /dev/null +++ b/notebooks/risk_control/risk_control_theoretical_tests_proto.ipynb @@ -0,0 +1,484 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "2ae91ff6-9706-41f1-bfdb-c39f5f2bfb9d", + "metadata": { + "id": "2ae91ff6-9706-41f1-bfdb-c39f5f2bfb9d" + }, + "source": [ + "# Binary classification risk control - Theoretical tests prototype" + ] + }, + { + "cell_type": "code", + "id": "1c564c4f-1e63-4c2f-bdd5-d84029c1473a", + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-28T15:14:24.815466Z", + "start_time": "2025-08-28T15:14:21.457372Z" + } + }, + "source": [ + "%reload_ext autoreload\n", + "%autoreload 2" + ], + "outputs": [], + "execution_count": 20 + }, + { + "cell_type": "code", + "id": "f1c2e64a", + "metadata": { + "id": "f1c2e64a", + "ExecuteTime": { + "end_time": "2025-08-28T15:14:25.924379Z", + "start_time": "2025-08-28T15:14:24.915913Z" + } + }, + "source": [ + "from sklearn.datasets import make_classification\n", + "import numpy as np\n", + "from mapie.risk_control import precision, accuracy, 
recall\n", + "from mapie.risk_control_draft import BinaryClassificationController\n", + "from itertools import product" + ], + "outputs": [], + "execution_count": 21 + }, + { + "cell_type": "code", + "id": "6c0b5e81-81f1-4688-a4d7-57c6adba44b4", + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-28T15:14:26.859600Z", + "start_time": "2025-08-28T15:14:26.069276Z" + } + }, + "source": [ + "# Using sklearn.dummy.DummyClassifier would be clearer\n", + "class RandomClassifier:\n", + " def __init__(self, seed=42, threshold=0.5):\n", + " self.seed = seed\n", + " self.threshold = threshold\n", + "\n", + " def _get_prob(self, x):\n", + " local_seed = hash((x, self.seed)) % (2**32)\n", + " rng = np.random.RandomState(local_seed)\n", + " return np.round(rng.rand(), 2)\n", + "\n", + " def predict_proba(self, X):\n", + " probs = np.array([self._get_prob(x) for x in X])\n", + " return np.vstack([1 - probs, probs]).T\n", + "\n", + " def predict(self, X):\n", + " probs = self.predict_proba(X)[:, 1]\n", + " return (probs >= self.threshold).astype(int)" + ], + "outputs": [], + "execution_count": 22 + }, + { + "cell_type": "code", + "id": "03383363-b86d-4593-adf4-80215b6f1dcf", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 376 + }, + "id": "03383363-b86d-4593-adf4-80215b6f1dcf", + "outputId": "b15146cf-518e-4a93-8128-6c1865a08b01", + "ExecuteTime": { + "end_time": "2025-08-28T15:42:03.789703Z", + "start_time": "2025-08-28T15:41:28.346628Z" + } + }, + "source": [ + "N = [100, 5] # size of the calibration set\n", + "risk = [\n", + " {\"name\": \"precision\", \"risk\": precision},\n", + " {\"name\": \"recall\", \"risk\": recall},\n", + " {\"name\": \"accuracy\", \"risk\": accuracy},\n", + "]\n", + "predict_params = [np.linspace(0, 0.99, 100), np.array([0.5])]\n", + "target_level = [0.1, 0.9]\n", + "confidence_level = [0.8, 0.2]\n", + "\n", + "n_repeats = 100\n", + "invalid_experiments = []\n", + "\n", + "for i, combination in 
enumerate(product(N, risk, predict_params, target_level, confidence_level)):\n", + " N, risk, predict_params, target_level, confidence_level = combination\n", + "\n", + " clf = RandomClassifier()\n", + " nb_errors = 0 # number of iterations where the risk is not controlled (i.e., not all the valid thresholds found by LTT are actually valid)\n", + "\n", + " for _ in range(n_repeats):\n", + "\n", + " X_calibrate, y_calibrate = make_classification(\n", + " n_samples=N,\n", + " n_features=1,\n", + " n_informative=1,\n", + " n_redundant=0,\n", + " n_repeated=0,\n", + " n_classes=2,\n", + " n_clusters_per_class=1,\n", + " weights=[0.5, 0.5],\n", + " flip_y=0,\n", + " random_state=None\n", + " )\n", + " X_calibrate = X_calibrate.squeeze()\n", + "\n", + " controller = BinaryClassificationController(\n", + " predict_function=clf.predict_proba,\n", + " risk=risk[\"risk\"],\n", + " target_level=target_level,\n", + " confidence_level=confidence_level,\n", + " )\n", + " controller._thresholds = predict_params\n", + " controller.calibrate(X_calibrate, y_calibrate)\n", + " valid_parameters = controller.valid_thresholds\n", + "\n", + " # The following works because the data is balanced\n", + " if risk[\"risk\"] == precision or risk[\"risk\"] == accuracy:\n", + " if target_level > p and len(valid_parameters) >= 1:\n", + " nb_errors += 1\n", + " elif risk[\"risk\"] == recall:\n", + " if any(x < 0 or x > np.round(1-target_level, 2) for x in valid_parameters) and len(valid_parameters) >= 1:\n", + " nb_errors += 1\n", + "\n", + " print(f\"Proportion of times the risk is not controlled: {nb_errors/n_repeats}\")\n", + " print(f\"Risk level: {1-confidence_level}\")\n", + "\n", + " if nb_errors/n_repeats <= 1 - confidence_level:\n", + " #print(\"Valid experiment\")\n", + " pass\n", + " else:\n", + " print(\"Unvalid experiment\")\n", + " print(f\"{N=} {risk['name']=} {predict_params=} {target_level=} {confidence_level=}\")\n", + " invalid_experiments.append(i)" + ], + "outputs": [ + { + 
"name": "stdout", + "output_type": "stream", + "text": [ + "Proportion of times the risk is not controlled: 0.0\n", + "Risk level: 0.19999999999999996\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Risk level: 0.8\n", + "Proportion of times the risk is not controlled: 1.0\n", + "Risk level: 0.19999999999999996\n", + "Unvalid experiment\n", + "N=100 risk['name']='precision' predict_params=array([0. , 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 ,\n", + " 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2 , 0.21,\n", + " 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3 , 0.31, 0.32,\n", + " 0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4 , 0.41, 0.42, 0.43,\n", + " 0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5 , 0.51, 0.52, 0.53, 0.54,\n", + " 0.55, 0.56, 0.57, 0.58, 0.59, 0.6 , 0.61, 0.62, 0.63, 0.64, 0.65,\n", + " 0.66, 0.67, 0.68, 0.69, 0.7 , 0.71, 0.72, 0.73, 0.74, 0.75, 0.76,\n", + " 0.77, 0.78, 0.79, 0.8 , 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87,\n", + " 0.88, 0.89, 0.9 , 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98,\n", + " 0.99]) target_level=0.9 confidence_level=0.8\n", + "Proportion of times the risk is not controlled: 1.0\n", + "Risk level: 0.8\n", + "Unvalid experiment\n", + "N=100 risk['name']='precision' predict_params=array([0. 
, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 ,\n", + " 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2 , 0.21,\n", + " 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3 , 0.31, 0.32,\n", + " 0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4 , 0.41, 0.42, 0.43,\n", + " 0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5 , 0.51, 0.52, 0.53, 0.54,\n", + " 0.55, 0.56, 0.57, 0.58, 0.59, 0.6 , 0.61, 0.62, 0.63, 0.64, 0.65,\n", + " 0.66, 0.67, 0.68, 0.69, 0.7 , 0.71, 0.72, 0.73, 0.74, 0.75, 0.76,\n", + " 0.77, 0.78, 0.79, 0.8 , 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87,\n", + " 0.88, 0.89, 0.9 , 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98,\n", + " 0.99]) target_level=0.9 confidence_level=0.2\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Risk level: 0.19999999999999996\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Risk level: 0.8\n", + "Proportion of times the risk is not controlled: 1.0\n", + "Risk level: 0.19999999999999996\n", + "Unvalid experiment\n", + "N=100 risk['name']='precision' predict_params=array([0.5]) target_level=0.9 confidence_level=0.8\n", + "Proportion of times the risk is not controlled: 1.0\n", + "Risk level: 0.8\n", + "Unvalid experiment\n", + "N=100 risk['name']='precision' predict_params=array([0.5]) target_level=0.9 confidence_level=0.2\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Risk level: 0.19999999999999996\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Risk level: 0.8\n", + "Proportion of times the risk is not controlled: 1.0\n", + "Risk level: 0.19999999999999996\n", + "Unvalid experiment\n", + "N=100 risk['name']='recall' predict_params=array([0. 
, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 ,\n", + " 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2 , 0.21,\n", + " 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3 , 0.31, 0.32,\n", + " 0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4 , 0.41, 0.42, 0.43,\n", + " 0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5 , 0.51, 0.52, 0.53, 0.54,\n", + " 0.55, 0.56, 0.57, 0.58, 0.59, 0.6 , 0.61, 0.62, 0.63, 0.64, 0.65,\n", + " 0.66, 0.67, 0.68, 0.69, 0.7 , 0.71, 0.72, 0.73, 0.74, 0.75, 0.76,\n", + " 0.77, 0.78, 0.79, 0.8 , 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87,\n", + " 0.88, 0.89, 0.9 , 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98,\n", + " 0.99]) target_level=0.9 confidence_level=0.8\n", + "Proportion of times the risk is not controlled: 1.0\n", + "Risk level: 0.8\n", + "Unvalid experiment\n", + "N=100 risk['name']='recall' predict_params=array([0. , 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 ,\n", + " 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2 , 0.21,\n", + " 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3 , 0.31, 0.32,\n", + " 0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4 , 0.41, 0.42, 0.43,\n", + " 0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5 , 0.51, 0.52, 0.53, 0.54,\n", + " 0.55, 0.56, 0.57, 0.58, 0.59, 0.6 , 0.61, 0.62, 0.63, 0.64, 0.65,\n", + " 0.66, 0.67, 0.68, 0.69, 0.7 , 0.71, 0.72, 0.73, 0.74, 0.75, 0.76,\n", + " 0.77, 0.78, 0.79, 0.8 , 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87,\n", + " 0.88, 0.89, 0.9 , 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98,\n", + " 0.99]) target_level=0.9 confidence_level=0.2\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Risk level: 0.19999999999999996\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Risk level: 0.8\n", + "Proportion of times the risk is not controlled: 1.0\n", + "Risk level: 0.19999999999999996\n", + "Unvalid experiment\n", + "N=100 risk['name']='recall' predict_params=array([0.5]) target_level=0.9 confidence_level=0.8\n", + 
"Proportion of times the risk is not controlled: 1.0\n", + "Risk level: 0.8\n", + "Unvalid experiment\n", + "N=100 risk['name']='recall' predict_params=array([0.5]) target_level=0.9 confidence_level=0.2\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Risk level: 0.19999999999999996\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Risk level: 0.8\n", + "Proportion of times the risk is not controlled: 1.0\n", + "Risk level: 0.19999999999999996\n", + "Unvalid experiment\n", + "N=100 risk['name']='accuracy' predict_params=array([0. , 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 ,\n", + " 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2 , 0.21,\n", + " 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3 , 0.31, 0.32,\n", + " 0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4 , 0.41, 0.42, 0.43,\n", + " 0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5 , 0.51, 0.52, 0.53, 0.54,\n", + " 0.55, 0.56, 0.57, 0.58, 0.59, 0.6 , 0.61, 0.62, 0.63, 0.64, 0.65,\n", + " 0.66, 0.67, 0.68, 0.69, 0.7 , 0.71, 0.72, 0.73, 0.74, 0.75, 0.76,\n", + " 0.77, 0.78, 0.79, 0.8 , 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87,\n", + " 0.88, 0.89, 0.9 , 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98,\n", + " 0.99]) target_level=0.9 confidence_level=0.8\n", + "Proportion of times the risk is not controlled: 1.0\n", + "Risk level: 0.8\n", + "Unvalid experiment\n", + "N=100 risk['name']='accuracy' predict_params=array([0. 
, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 ,\n", + " 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2 , 0.21,\n", + " 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3 , 0.31, 0.32,\n", + " 0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4 , 0.41, 0.42, 0.43,\n", + " 0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5 , 0.51, 0.52, 0.53, 0.54,\n", + " 0.55, 0.56, 0.57, 0.58, 0.59, 0.6 , 0.61, 0.62, 0.63, 0.64, 0.65,\n", + " 0.66, 0.67, 0.68, 0.69, 0.7 , 0.71, 0.72, 0.73, 0.74, 0.75, 0.76,\n", + " 0.77, 0.78, 0.79, 0.8 , 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87,\n", + " 0.88, 0.89, 0.9 , 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98,\n", + " 0.99]) target_level=0.9 confidence_level=0.2\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Risk level: 0.19999999999999996\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Risk level: 0.8\n", + "Proportion of times the risk is not controlled: 1.0\n", + "Risk level: 0.19999999999999996\n", + "Unvalid experiment\n", + "N=100 risk['name']='accuracy' predict_params=array([0.5]) target_level=0.9 confidence_level=0.8\n", + "Proportion of times the risk is not controlled: 1.0\n", + "Risk level: 0.8\n", + "Unvalid experiment\n", + "N=100 risk['name']='accuracy' predict_params=array([0.5]) target_level=0.9 confidence_level=0.2\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Risk level: 0.19999999999999996\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Risk level: 0.8\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Risk level: 0.19999999999999996\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Risk level: 0.8\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Risk level: 0.19999999999999996\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Risk level: 0.8\n", + "Proportion of times the risk is not controlled: 0.46\n", + "Risk level: 0.19999999999999996\n", + "Unvalid experiment\n", + 
"N=5 risk['name']='precision' predict_params=array([0.5]) target_level=0.9 confidence_level=0.8\n", + "Proportion of times the risk is not controlled: 0.76\n", + "Risk level: 0.8\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Risk level: 0.19999999999999996\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Risk level: 0.8\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Risk level: 0.19999999999999996\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Risk level: 0.8\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Risk level: 0.19999999999999996\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Risk level: 0.8\n", + "Proportion of times the risk is not controlled: 0.8\n", + "Risk level: 0.19999999999999996\n", + "Unvalid experiment\n", + "N=5 risk['name']='recall' predict_params=array([0.5]) target_level=0.9 confidence_level=0.8\n", + "Proportion of times the risk is not controlled: 0.75\n", + "Risk level: 0.8\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Risk level: 0.19999999999999996\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Risk level: 0.8\n", + "Proportion of times the risk is not controlled: 0.53\n", + "Risk level: 0.19999999999999996\n", + "Unvalid experiment\n", + "N=5 risk['name']='accuracy' predict_params=array([0. 
, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 ,\n", + " 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2 , 0.21,\n", + " 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3 , 0.31, 0.32,\n", + " 0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4 , 0.41, 0.42, 0.43,\n", + " 0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5 , 0.51, 0.52, 0.53, 0.54,\n", + " 0.55, 0.56, 0.57, 0.58, 0.59, 0.6 , 0.61, 0.62, 0.63, 0.64, 0.65,\n", + " 0.66, 0.67, 0.68, 0.69, 0.7 , 0.71, 0.72, 0.73, 0.74, 0.75, 0.76,\n", + " 0.77, 0.78, 0.79, 0.8 , 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87,\n", + " 0.88, 0.89, 0.9 , 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98,\n", + " 0.99]) target_level=0.9 confidence_level=0.8\n", + "Proportion of times the risk is not controlled: 0.46\n", + "Risk level: 0.8\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Risk level: 0.19999999999999996\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Risk level: 0.8\n", + "Proportion of times the risk is not controlled: 0.81\n", + "Risk level: 0.19999999999999996\n", + "Unvalid experiment\n", + "N=5 risk['name']='accuracy' predict_params=array([0.5]) target_level=0.9 confidence_level=0.8\n", + "Proportion of times the risk is not controlled: 0.99\n", + "Risk level: 0.8\n", + "Unvalid experiment\n", + "N=5 risk['name']='accuracy' predict_params=array([0.5]) target_level=0.9 confidence_level=0.2\n" + ] + } + ], + "execution_count": 38 + }, + { + "cell_type": "code", + "id": "104c7232-c8b1-432e-94dd-3f65e730483f", + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-28T15:42:04.124169Z", + "start_time": "2025-08-28T15:42:03.820976Z" + } + }, + "source": "print(invalid_experiments)", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 30, 38, 42, 46, 47]\n" + ] + } + ], + "execution_count": 39 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-28T15:40:33.298549Z", + "start_time": 
"2025-08-28T15:40:32.942139Z" + } + }, + "cell_type": "code", + "source": "[2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 30, 38, 42, 46, 47]", + "id": "64a8c25a0488d9dd", + "outputs": [ + { + "data": { + "text/plain": [ + "[2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 30, 38, 42, 46, 47]" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 35 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-28T16:08:52.638117Z", + "start_time": "2025-08-28T16:08:51.776497Z" + } + }, + "cell_type": "code", + "source": "print(i)", + "id": "7d45e8b38f509cba", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "47\n" + ] + } + ], + "execution_count": 40 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-28T16:09:05.979621Z", + "start_time": "2025-08-28T16:09:05.600091Z" + } + }, + "cell_type": "code", + "source": "print(len(invalid_experiments))", + "id": "68ee725d094bdc8b", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "17\n" + ] + } + ], + "execution_count": 41 + }, + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": "", + "id": "60c8d5117a5b6030" + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.17" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 42f48b2f54e349c872aae344a6179f057c985f0d Mon Sep 17 00:00:00 2001 From: Valentin Laurent Date: Thu, 28 Aug 2025 18:39:59 +0200 Subject: [PATCH 24/26] FIX - Fix theoretical validity notebook --- .../risk_control/risk_control_theoretical_tests_proto.ipynb | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/risk_control/risk_control_theoretical_tests_proto.ipynb b/notebooks/risk_control/risk_control_theoretical_tests_proto.ipynb index f8aa15880..e982b7a7e 100644 --- a/notebooks/risk_control/risk_control_theoretical_tests_proto.ipynb +++ b/notebooks/risk_control/risk_control_theoretical_tests_proto.ipynb @@ -141,7 +141,7 @@ "\n", " # The following works because the data is balanced\n", " if risk[\"risk\"] == precision or risk[\"risk\"] == accuracy:\n", - " if target_level > p and len(valid_parameters) >= 1:\n", + " if target_level > 0.5 and len(valid_parameters) >= 1:\n", " nb_errors += 1\n", " elif risk[\"risk\"] == recall:\n", " if any(x < 0 or x > np.round(1-target_level, 2) for x in valid_parameters) and len(valid_parameters) >= 1:\n", From a3a1ec25a278d3022706c50fcbb7ac313c3ecf1f Mon Sep 17 00:00:00 2001 From: Valentin Laurent Date: Fri, 29 Aug 2025 12:15:40 +0200 Subject: [PATCH 25/26] FIX - Fix implementation error in BinaryClassificationController, improve theoretical test notebook --- mapie/risk_control_draft.py | 4 +- ...risk_control_theoretical_tests_proto.ipynb | 800 +++++++++++------- 2 files changed, 477 insertions(+), 327 deletions(-) diff --git a/mapie/risk_control_draft.py b/mapie/risk_control_draft.py index e4f400a87..b9431b586 100644 --- a/mapie/risk_control_draft.py +++ b/mapie/risk_control_draft.py @@ -61,9 +61,9 @@ def calibrate(self, X_calibrate: ArrayLike, y_calibrate: ArrayLike) -> None: if self._risk.higher_is_better: risks_per_threshold = 1 - risks_per_threshold - alpha = self._target_level - else: alpha = 1 - self._target_level + else: + alpha = self._target_level valid_thresholds_index = ltt_procedure( risks_per_threshold, diff --git a/notebooks/risk_control/risk_control_theoretical_tests_proto.ipynb b/notebooks/risk_control/risk_control_theoretical_tests_proto.ipynb index e982b7a7e..18859131b 100644 --- a/notebooks/risk_control/risk_control_theoretical_tests_proto.ipynb +++ 
b/notebooks/risk_control/risk_control_theoretical_tests_proto.ipynb @@ -1,60 +1,55 @@ { "cells": [ { + "metadata": {}, "cell_type": "markdown", - "id": "2ae91ff6-9706-41f1-bfdb-c39f5f2bfb9d", - "metadata": { - "id": "2ae91ff6-9706-41f1-bfdb-c39f5f2bfb9d" - }, - "source": [ - "# Binary classification risk control - Theoretical tests prototype" - ] + "source": "# Binary classification risk control - Theoretical tests prototype", + "id": "ed592eb3f8989aa8" }, { - "cell_type": "code", - "id": "1c564c4f-1e63-4c2f-bdd5-d84029c1473a", "metadata": { "ExecuteTime": { - "end_time": "2025-08-28T15:14:24.815466Z", - "start_time": "2025-08-28T15:14:21.457372Z" + "end_time": "2025-08-29T10:08:15.315556Z", + "start_time": "2025-08-29T10:08:14.792182Z" } }, + "cell_type": "code", "source": [ "%reload_ext autoreload\n", "%autoreload 2" ], + "id": "9b1422ae620955fd", "outputs": [], - "execution_count": 20 + "execution_count": 80 }, { - "cell_type": "code", - "id": "f1c2e64a", "metadata": { - "id": "f1c2e64a", "ExecuteTime": { - "end_time": "2025-08-28T15:14:25.924379Z", - "start_time": "2025-08-28T15:14:24.915913Z" + "end_time": "2025-08-29T10:08:15.956177Z", + "start_time": "2025-08-29T10:08:15.331400Z" } }, + "cell_type": "code", "source": [ "from sklearn.datasets import make_classification\n", "import numpy as np\n", "from mapie.risk_control import precision, accuracy, recall\n", "from mapie.risk_control_draft import BinaryClassificationController\n", - "from itertools import product" + "from itertools import product\n", + "from decimal import Decimal" ], + "id": "faeb2f47a92dbf35", "outputs": [], - "execution_count": 21 + "execution_count": 81 }, { - "cell_type": "code", - "id": "6c0b5e81-81f1-4688-a4d7-57c6adba44b4", "metadata": { "ExecuteTime": { - "end_time": "2025-08-28T15:14:26.859600Z", - "start_time": "2025-08-28T15:14:26.069276Z" + "end_time": "2025-08-29T10:08:16.435315Z", + "start_time": "2025-08-29T10:08:15.964027Z" } }, + "cell_type": "code", "source": [ "# Using 
sklearn.dummy.DummyClassifier would be clearer\n", "class RandomClassifier:\n", @@ -75,24 +70,18 @@ " probs = self.predict_proba(X)[:, 1]\n", " return (probs >= self.threshold).astype(int)" ], + "id": "eefafd6d1697fb9c", "outputs": [], - "execution_count": 22 + "execution_count": 82 }, { - "cell_type": "code", - "id": "03383363-b86d-4593-adf4-80215b6f1dcf", "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 376 - }, - "id": "03383363-b86d-4593-adf4-80215b6f1dcf", - "outputId": "b15146cf-518e-4a93-8128-6c1865a08b01", "ExecuteTime": { - "end_time": "2025-08-28T15:42:03.789703Z", - "start_time": "2025-08-28T15:41:28.346628Z" + "end_time": "2025-08-29T10:09:01.114704Z", + "start_time": "2025-08-29T10:08:16.452127Z" } }, + "cell_type": "code", "source": [ "N = [100, 5] # size of the calibration set\n", "risk = [\n", @@ -100,18 +89,20 @@ " {\"name\": \"recall\", \"risk\": recall},\n", " {\"name\": \"accuracy\", \"risk\": accuracy},\n", "]\n", - "predict_params = [np.linspace(0, 0.99, 100), np.array([0.5])]\n", + "predict_params = [np.linspace(0, 0.99, 100), np.array([0.5])]\n", "target_level = [0.1, 0.9]\n", "confidence_level = [0.8, 0.2]\n", "\n", "n_repeats = 100\n", - "invalid_experiments = []\n", "\n", - "for i, combination in enumerate(product(N, risk, predict_params, target_level, confidence_level)):\n", + "for combination in product(N, risk, predict_params, target_level, confidence_level):\n", " N, risk, predict_params, target_level, confidence_level = combination\n", + " alpha = float(Decimal(\"1\") - Decimal(str(target_level))) # to avoid floating point issues\n", + " delta = float(Decimal(\"1\") - Decimal(str(confidence_level))) # to avoid floating point issues\n", "\n", " clf = RandomClassifier()\n", " nb_errors = 0 # number of iterations where the risk is not controlled (i.e., not all the valid thresholds found by LTT are actually valid)\n", + " total_nb_valid_params = 0\n", "\n", " for _ in range(n_repeats):\n", "\n", @@ -138,323 
+129,482 @@ " controller._thresholds = predict_params\n", " controller.calibrate(X_calibrate, y_calibrate)\n", " valid_parameters = controller.valid_thresholds\n", + " total_nb_valid_params += len(valid_parameters)\n", "\n", " # The following works because the data is balanced\n", " if risk[\"risk\"] == precision or risk[\"risk\"] == accuracy:\n", " if target_level > 0.5 and len(valid_parameters) >= 1:\n", " nb_errors += 1\n", " elif risk[\"risk\"] == recall:\n", - " if any(x < 0 or x > np.round(1-target_level, 2) for x in valid_parameters) and len(valid_parameters) >= 1:\n", + " if any(x > alpha for x in valid_parameters) and len(valid_parameters) >= 1:\n", " nb_errors += 1\n", "\n", + " print(f\"\\n{N=}, {risk['name']=}, {len(predict_params)=}, {target_level=}, {confidence_level=}\")\n", + "\n", " print(f\"Proportion of times the risk is not controlled: {nb_errors/n_repeats}\")\n", - " print(f\"Risk level: {1-confidence_level}\")\n", + " print(f\"Delta: {delta}\")\n", + " print(f\"Mean number of valid thresholds found per iteration: {int(np.round(total_nb_valid_params/n_repeats))}\")\n", "\n", - " if nb_errors/n_repeats <= 1 - confidence_level:\n", - " #print(\"Valid experiment\")\n", - " pass\n", + " if nb_errors/n_repeats <= delta:\n", + " print(\"Valid experiment\")\n", " else:\n", - " print(\"Unvalid experiment\")\n", - " print(f\"{N=} {risk['name']=} {predict_params=} {target_level=} {confidence_level=}\")\n", - " invalid_experiments.append(i)" + " print(\"Invalid experiment\")" ], + "id": "1fdffae392bb7a65", "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ + "\n", + "N=100, risk['name']='precision', len(predict_params)=1, target_level=0.5, confidence_level=0.01\n", "Proportion of times the risk is not controlled: 0.0\n", - "Risk level: 0.19999999999999996\n", - "Proportion of times the risk is not controlled: 0.0\n", - "Risk level: 0.8\n", - "Proportion of times the risk is not controlled: 1.0\n", - "Risk level: 0.19999999999999996\n", - 
"Unvalid experiment\n", - "N=100 risk['name']='precision' predict_params=array([0. , 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 ,\n", - " 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2 , 0.21,\n", - " 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3 , 0.31, 0.32,\n", - " 0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4 , 0.41, 0.42, 0.43,\n", - " 0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5 , 0.51, 0.52, 0.53, 0.54,\n", - " 0.55, 0.56, 0.57, 0.58, 0.59, 0.6 , 0.61, 0.62, 0.63, 0.64, 0.65,\n", - " 0.66, 0.67, 0.68, 0.69, 0.7 , 0.71, 0.72, 0.73, 0.74, 0.75, 0.76,\n", - " 0.77, 0.78, 0.79, 0.8 , 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87,\n", - " 0.88, 0.89, 0.9 , 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98,\n", - " 0.99]) target_level=0.9 confidence_level=0.8\n", - "Proportion of times the risk is not controlled: 1.0\n", - "Risk level: 0.8\n", - "Unvalid experiment\n", - "N=100 risk['name']='precision' predict_params=array([0. , 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 ,\n", - " 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2 , 0.21,\n", - " 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3 , 0.31, 0.32,\n", - " 0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4 , 0.41, 0.42, 0.43,\n", - " 0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5 , 0.51, 0.52, 0.53, 0.54,\n", - " 0.55, 0.56, 0.57, 0.58, 0.59, 0.6 , 0.61, 0.62, 0.63, 0.64, 0.65,\n", - " 0.66, 0.67, 0.68, 0.69, 0.7 , 0.71, 0.72, 0.73, 0.74, 0.75, 0.76,\n", - " 0.77, 0.78, 0.79, 0.8 , 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87,\n", - " 0.88, 0.89, 0.9 , 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98,\n", - " 0.99]) target_level=0.9 confidence_level=0.2\n", - "Proportion of times the risk is not controlled: 0.0\n", - "Risk level: 0.19999999999999996\n", - "Proportion of times the risk is not controlled: 0.0\n", - "Risk level: 0.8\n", - "Proportion of times the risk is not controlled: 1.0\n", - "Risk level: 0.19999999999999996\n", - "Unvalid experiment\n", - "N=100 
risk['name']='precision' predict_params=array([0.5]) target_level=0.9 confidence_level=0.8\n", - "Proportion of times the risk is not controlled: 1.0\n", - "Risk level: 0.8\n", - "Unvalid experiment\n", - "N=100 risk['name']='precision' predict_params=array([0.5]) target_level=0.9 confidence_level=0.2\n", - "Proportion of times the risk is not controlled: 0.0\n", - "Risk level: 0.19999999999999996\n", - "Proportion of times the risk is not controlled: 0.0\n", - "Risk level: 0.8\n", - "Proportion of times the risk is not controlled: 1.0\n", - "Risk level: 0.19999999999999996\n", - "Unvalid experiment\n", - "N=100 risk['name']='recall' predict_params=array([0. , 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 ,\n", - " 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2 , 0.21,\n", - " 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3 , 0.31, 0.32,\n", - " 0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4 , 0.41, 0.42, 0.43,\n", - " 0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5 , 0.51, 0.52, 0.53, 0.54,\n", - " 0.55, 0.56, 0.57, 0.58, 0.59, 0.6 , 0.61, 0.62, 0.63, 0.64, 0.65,\n", - " 0.66, 0.67, 0.68, 0.69, 0.7 , 0.71, 0.72, 0.73, 0.74, 0.75, 0.76,\n", - " 0.77, 0.78, 0.79, 0.8 , 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87,\n", - " 0.88, 0.89, 0.9 , 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98,\n", - " 0.99]) target_level=0.9 confidence_level=0.8\n", - "Proportion of times the risk is not controlled: 1.0\n", - "Risk level: 0.8\n", - "Unvalid experiment\n", - "N=100 risk['name']='recall' predict_params=array([0. 
, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 ,\n", - " 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2 , 0.21,\n", - " 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3 , 0.31, 0.32,\n", - " 0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4 , 0.41, 0.42, 0.43,\n", - " 0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5 , 0.51, 0.52, 0.53, 0.54,\n", - " 0.55, 0.56, 0.57, 0.58, 0.59, 0.6 , 0.61, 0.62, 0.63, 0.64, 0.65,\n", - " 0.66, 0.67, 0.68, 0.69, 0.7 , 0.71, 0.72, 0.73, 0.74, 0.75, 0.76,\n", - " 0.77, 0.78, 0.79, 0.8 , 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87,\n", - " 0.88, 0.89, 0.9 , 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98,\n", - " 0.99]) target_level=0.9 confidence_level=0.2\n", - "Proportion of times the risk is not controlled: 0.0\n", - "Risk level: 0.19999999999999996\n", - "Proportion of times the risk is not controlled: 0.0\n", - "Risk level: 0.8\n", - "Proportion of times the risk is not controlled: 1.0\n", - "Risk level: 0.19999999999999996\n", - "Unvalid experiment\n", - "N=100 risk['name']='recall' predict_params=array([0.5]) target_level=0.9 confidence_level=0.8\n", - "Proportion of times the risk is not controlled: 1.0\n", - "Risk level: 0.8\n", - "Unvalid experiment\n", - "N=100 risk['name']='recall' predict_params=array([0.5]) target_level=0.9 confidence_level=0.2\n", - "Proportion of times the risk is not controlled: 0.0\n", - "Risk level: 0.19999999999999996\n", - "Proportion of times the risk is not controlled: 0.0\n", - "Risk level: 0.8\n", - "Proportion of times the risk is not controlled: 1.0\n", - "Risk level: 0.19999999999999996\n", - "Unvalid experiment\n", - "N=100 risk['name']='accuracy' predict_params=array([0. 
, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 ,\n", - " 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2 , 0.21,\n", - " 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3 , 0.31, 0.32,\n", - " 0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4 , 0.41, 0.42, 0.43,\n", - " 0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5 , 0.51, 0.52, 0.53, 0.54,\n", - " 0.55, 0.56, 0.57, 0.58, 0.59, 0.6 , 0.61, 0.62, 0.63, 0.64, 0.65,\n", - " 0.66, 0.67, 0.68, 0.69, 0.7 , 0.71, 0.72, 0.73, 0.74, 0.75, 0.76,\n", - " 0.77, 0.78, 0.79, 0.8 , 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87,\n", - " 0.88, 0.89, 0.9 , 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98,\n", - " 0.99]) target_level=0.9 confidence_level=0.8\n", - "Proportion of times the risk is not controlled: 1.0\n", - "Risk level: 0.8\n", - "Unvalid experiment\n", - "N=100 risk['name']='accuracy' predict_params=array([0. , 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 ,\n", - " 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2 , 0.21,\n", - " 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3 , 0.31, 0.32,\n", - " 0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4 , 0.41, 0.42, 0.43,\n", - " 0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5 , 0.51, 0.52, 0.53, 0.54,\n", - " 0.55, 0.56, 0.57, 0.58, 0.59, 0.6 , 0.61, 0.62, 0.63, 0.64, 0.65,\n", - " 0.66, 0.67, 0.68, 0.69, 0.7 , 0.71, 0.72, 0.73, 0.74, 0.75, 0.76,\n", - " 0.77, 0.78, 0.79, 0.8 , 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87,\n", - " 0.88, 0.89, 0.9 , 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98,\n", - " 0.99]) target_level=0.9 confidence_level=0.2\n", - "Proportion of times the risk is not controlled: 0.0\n", - "Risk level: 0.19999999999999996\n", - "Proportion of times the risk is not controlled: 0.0\n", - "Risk level: 0.8\n", - "Proportion of times the risk is not controlled: 1.0\n", - "Risk level: 0.19999999999999996\n", - "Unvalid experiment\n", - "N=100 risk['name']='accuracy' predict_params=array([0.5]) target_level=0.9 confidence_level=0.8\n", - 
"Proportion of times the risk is not controlled: 1.0\n", - "Risk level: 0.8\n", - "Unvalid experiment\n", - "N=100 risk['name']='accuracy' predict_params=array([0.5]) target_level=0.9 confidence_level=0.2\n", - "Proportion of times the risk is not controlled: 0.0\n", - "Risk level: 0.19999999999999996\n", - "Proportion of times the risk is not controlled: 0.0\n", - "Risk level: 0.8\n", - "Proportion of times the risk is not controlled: 0.0\n", - "Risk level: 0.19999999999999996\n", - "Proportion of times the risk is not controlled: 0.0\n", - "Risk level: 0.8\n", - "Proportion of times the risk is not controlled: 0.0\n", - "Risk level: 0.19999999999999996\n", - "Proportion of times the risk is not controlled: 0.0\n", - "Risk level: 0.8\n", - "Proportion of times the risk is not controlled: 0.46\n", - "Risk level: 0.19999999999999996\n", - "Unvalid experiment\n", - "N=5 risk['name']='precision' predict_params=array([0.5]) target_level=0.9 confidence_level=0.8\n", - "Proportion of times the risk is not controlled: 0.76\n", - "Risk level: 0.8\n", - "Proportion of times the risk is not controlled: 0.0\n", - "Risk level: 0.19999999999999996\n", - "Proportion of times the risk is not controlled: 0.0\n", - "Risk level: 0.8\n", - "Proportion of times the risk is not controlled: 0.0\n", - "Risk level: 0.19999999999999996\n", - "Proportion of times the risk is not controlled: 0.0\n", - "Risk level: 0.8\n", - "Proportion of times the risk is not controlled: 0.0\n", - "Risk level: 0.19999999999999996\n", - "Proportion of times the risk is not controlled: 0.0\n", - "Risk level: 0.8\n", - "Proportion of times the risk is not controlled: 0.8\n", - "Risk level: 0.19999999999999996\n", - "Unvalid experiment\n", - "N=5 risk['name']='recall' predict_params=array([0.5]) target_level=0.9 confidence_level=0.8\n", - "Proportion of times the risk is not controlled: 0.75\n", - "Risk level: 0.8\n", - "Proportion of times the risk is not controlled: 0.0\n", - "Risk level: 
0.19999999999999996\n", - "Proportion of times the risk is not controlled: 0.0\n", - "Risk level: 0.8\n", - "Proportion of times the risk is not controlled: 0.53\n", - "Risk level: 0.19999999999999996\n", - "Unvalid experiment\n", - "N=5 risk['name']='accuracy' predict_params=array([0. , 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 ,\n", - " 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2 , 0.21,\n", - " 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3 , 0.31, 0.32,\n", - " 0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4 , 0.41, 0.42, 0.43,\n", - " 0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5 , 0.51, 0.52, 0.53, 0.54,\n", - " 0.55, 0.56, 0.57, 0.58, 0.59, 0.6 , 0.61, 0.62, 0.63, 0.64, 0.65,\n", - " 0.66, 0.67, 0.68, 0.69, 0.7 , 0.71, 0.72, 0.73, 0.74, 0.75, 0.76,\n", - " 0.77, 0.78, 0.79, 0.8 , 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87,\n", - " 0.88, 0.89, 0.9 , 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98,\n", - " 0.99]) target_level=0.9 confidence_level=0.8\n", - "Proportion of times the risk is not controlled: 0.46\n", - "Risk level: 0.8\n", - "Proportion of times the risk is not controlled: 0.0\n", - "Risk level: 0.19999999999999996\n", - "Proportion of times the risk is not controlled: 0.0\n", - "Risk level: 0.8\n", - "Proportion of times the risk is not controlled: 0.81\n", - "Risk level: 0.19999999999999996\n", - "Unvalid experiment\n", - "N=5 risk['name']='accuracy' predict_params=array([0.5]) target_level=0.9 confidence_level=0.8\n", - "Proportion of times the risk is not controlled: 0.99\n", - "Risk level: 0.8\n", - "Unvalid experiment\n", - "N=5 risk['name']='accuracy' predict_params=array([0.5]) target_level=0.9 confidence_level=0.2\n" - ] - } - ], - "execution_count": 38 - }, - { - "cell_type": "code", - "id": "104c7232-c8b1-432e-94dd-3f65e730483f", - "metadata": { - "ExecuteTime": { - "end_time": "2025-08-28T15:42:04.124169Z", - "start_time": "2025-08-28T15:42:03.820976Z" - } - }, - "source": "print(invalid_experiments)", - 
"outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 30, 38, 42, 46, 47]\n" - ] - } - ], - "execution_count": 39 - }, - { - "metadata": { - "ExecuteTime": { - "end_time": "2025-08-28T15:40:33.298549Z", - "start_time": "2025-08-28T15:40:32.942139Z" - } - }, - "cell_type": "code", - "source": "[2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 30, 38, 42, 46, 47]", - "id": "64a8c25a0488d9dd", - "outputs": [ - { - "data": { - "text/plain": [ - "[2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 30, 38, 42, 46, 47]" - ] - }, - "execution_count": 35, - "metadata": {}, - "output_type": "execute_result" - } - ], - "execution_count": 35 - }, - { - "metadata": { - "ExecuteTime": { - "end_time": "2025-08-28T16:08:52.638117Z", - "start_time": "2025-08-28T16:08:51.776497Z" - } - }, - "cell_type": "code", - "source": "print(i)", - "id": "7d45e8b38f509cba", - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "47\n" + "Delta: 0.99\n", + "Mean number of valid thresholds found per iteration: 1\n", + "Valid experiment\n", + "\n", + "N=100, risk['name']='precision', len(predict_params)=1, target_level=0.5, confidence_level=0.01\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Delta: 0.99\n", + "Mean number of valid thresholds found per iteration: 1\n", + "Valid experiment\n", + "\n", + "N=100, risk['name']='precision', len(predict_params)=1, target_level=0.45, confidence_level=0.01\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Delta: 0.99\n", + "Mean number of valid thresholds found per iteration: 1\n", + "Valid experiment\n", + "\n", + "N=100, risk['name']='precision', len(predict_params)=1, target_level=0.45, confidence_level=0.01\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Delta: 0.99\n", + "Mean number of valid thresholds found per iteration: 1\n", + "Valid experiment\n", + "\n", + "N=100, risk['name']='precision', 
len(predict_params)=1, target_level=0.5, confidence_level=0.01\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Delta: 0.99\n", + "Mean number of valid thresholds found per iteration: 1\n", + "Valid experiment\n", + "\n", + "N=100, risk['name']='precision', len(predict_params)=1, target_level=0.5, confidence_level=0.01\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Delta: 0.99\n", + "Mean number of valid thresholds found per iteration: 1\n", + "Valid experiment\n", + "\n", + "N=100, risk['name']='precision', len(predict_params)=1, target_level=0.45, confidence_level=0.01\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Delta: 0.99\n", + "Mean number of valid thresholds found per iteration: 1\n", + "Valid experiment\n", + "\n", + "N=100, risk['name']='precision', len(predict_params)=1, target_level=0.45, confidence_level=0.01\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Delta: 0.99\n", + "Mean number of valid thresholds found per iteration: 1\n", + "Valid experiment\n", + "\n", + "N=100, risk['name']='precision', len(predict_params)=1, target_level=0.5, confidence_level=0.01\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Delta: 0.99\n", + "Mean number of valid thresholds found per iteration: 1\n", + "Valid experiment\n", + "\n", + "N=100, risk['name']='precision', len(predict_params)=1, target_level=0.5, confidence_level=0.01\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Delta: 0.99\n", + "Mean number of valid thresholds found per iteration: 1\n", + "Valid experiment\n", + "\n", + "N=100, risk['name']='precision', len(predict_params)=1, target_level=0.45, confidence_level=0.01\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Delta: 0.99\n", + "Mean number of valid thresholds found per iteration: 1\n", + "Valid experiment\n", + "\n", + "N=100, risk['name']='precision', len(predict_params)=1, target_level=0.45, confidence_level=0.01\n", + 
"Proportion of times the risk is not controlled: 0.0\n", + "Delta: 0.99\n", + "Mean number of valid thresholds found per iteration: 1\n", + "Valid experiment\n", + "\n", + "N=100, risk['name']='recall', len(predict_params)=1, target_level=0.5, confidence_level=0.01\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Delta: 0.99\n", + "Mean number of valid thresholds found per iteration: 1\n", + "Valid experiment\n", + "\n", + "N=100, risk['name']='recall', len(predict_params)=1, target_level=0.5, confidence_level=0.01\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Delta: 0.99\n", + "Mean number of valid thresholds found per iteration: 1\n", + "Valid experiment\n", + "\n", + "N=100, risk['name']='recall', len(predict_params)=1, target_level=0.45, confidence_level=0.01\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Delta: 0.99\n", + "Mean number of valid thresholds found per iteration: 1\n", + "Valid experiment\n", + "\n", + "N=100, risk['name']='recall', len(predict_params)=1, target_level=0.45, confidence_level=0.01\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Delta: 0.99\n", + "Mean number of valid thresholds found per iteration: 1\n", + "Valid experiment\n", + "\n", + "N=100, risk['name']='recall', len(predict_params)=1, target_level=0.5, confidence_level=0.01\n", + "Proportion of times the risk is not controlled: 0.32\n", + "Delta: 0.99\n", + "Mean number of valid thresholds found per iteration: 0\n", + "Valid experiment\n", + "\n", + "N=100, risk['name']='recall', len(predict_params)=1, target_level=0.5, confidence_level=0.01\n", + "Proportion of times the risk is not controlled: 0.24\n", + "Delta: 0.99\n", + "Mean number of valid thresholds found per iteration: 0\n", + "Valid experiment\n", + "\n", + "N=100, risk['name']='recall', len(predict_params)=1, target_level=0.45, confidence_level=0.01\n", + "Proportion of times the risk is not controlled: 0.62\n", + "Delta: 0.99\n", + "Mean 
number of valid thresholds found per iteration: 1\n", + "Valid experiment\n", + "\n", + "N=100, risk['name']='recall', len(predict_params)=1, target_level=0.45, confidence_level=0.01\n", + "Proportion of times the risk is not controlled: 0.62\n", + "Delta: 0.99\n", + "Mean number of valid thresholds found per iteration: 1\n", + "Valid experiment\n", + "\n", + "N=100, risk['name']='recall', len(predict_params)=1, target_level=0.5, confidence_level=0.01\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Delta: 0.99\n", + "Mean number of valid thresholds found per iteration: 1\n", + "Valid experiment\n", + "\n", + "N=100, risk['name']='recall', len(predict_params)=1, target_level=0.5, confidence_level=0.01\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Delta: 0.99\n", + "Mean number of valid thresholds found per iteration: 1\n", + "Valid experiment\n", + "\n", + "N=100, risk['name']='recall', len(predict_params)=1, target_level=0.45, confidence_level=0.01\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Delta: 0.99\n", + "Mean number of valid thresholds found per iteration: 1\n", + "Valid experiment\n", + "\n", + "N=100, risk['name']='recall', len(predict_params)=1, target_level=0.45, confidence_level=0.01\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Delta: 0.99\n", + "Mean number of valid thresholds found per iteration: 1\n", + "Valid experiment\n", + "\n", + "N=100, risk['name']='accuracy', len(predict_params)=1, target_level=0.5, confidence_level=0.01\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Delta: 0.99\n", + "Mean number of valid thresholds found per iteration: 1\n", + "Valid experiment\n", + "\n", + "N=100, risk['name']='accuracy', len(predict_params)=1, target_level=0.5, confidence_level=0.01\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Delta: 0.99\n", + "Mean number of valid thresholds found per iteration: 1\n", + "Valid experiment\n", + "\n", + 
"N=100, risk['name']='accuracy', len(predict_params)=1, target_level=0.45, confidence_level=0.01\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Delta: 0.99\n", + "Mean number of valid thresholds found per iteration: 1\n", + "Valid experiment\n", + "\n", + "N=100, risk['name']='accuracy', len(predict_params)=1, target_level=0.45, confidence_level=0.01\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Delta: 0.99\n", + "Mean number of valid thresholds found per iteration: 1\n", + "Valid experiment\n", + "\n", + "N=100, risk['name']='accuracy', len(predict_params)=1, target_level=0.5, confidence_level=0.01\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Delta: 0.99\n", + "Mean number of valid thresholds found per iteration: 1\n", + "Valid experiment\n", + "\n", + "N=100, risk['name']='accuracy', len(predict_params)=1, target_level=0.5, confidence_level=0.01\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Delta: 0.99\n", + "Mean number of valid thresholds found per iteration: 1\n", + "Valid experiment\n", + "\n", + "N=100, risk['name']='accuracy', len(predict_params)=1, target_level=0.45, confidence_level=0.01\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Delta: 0.99\n", + "Mean number of valid thresholds found per iteration: 1\n", + "Valid experiment\n", + "\n", + "N=100, risk['name']='accuracy', len(predict_params)=1, target_level=0.45, confidence_level=0.01\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Delta: 0.99\n", + "Mean number of valid thresholds found per iteration: 1\n", + "Valid experiment\n", + "\n", + "N=100, risk['name']='accuracy', len(predict_params)=1, target_level=0.5, confidence_level=0.01\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Delta: 0.99\n", + "Mean number of valid thresholds found per iteration: 1\n", + "Valid experiment\n", + "\n", + "N=100, risk['name']='accuracy', len(predict_params)=1, target_level=0.5, 
confidence_level=0.01\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Delta: 0.99\n", + "Mean number of valid thresholds found per iteration: 1\n", + "Valid experiment\n", + "\n", + "N=100, risk['name']='accuracy', len(predict_params)=1, target_level=0.45, confidence_level=0.01\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Delta: 0.99\n", + "Mean number of valid thresholds found per iteration: 1\n", + "Valid experiment\n", + "\n", + "N=100, risk['name']='accuracy', len(predict_params)=1, target_level=0.45, confidence_level=0.01\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Delta: 0.99\n", + "Mean number of valid thresholds found per iteration: 1\n", + "Valid experiment\n", + "\n", + "N=5, risk['name']='precision', len(predict_params)=1, target_level=0.5, confidence_level=0.01\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Delta: 0.99\n", + "Mean number of valid thresholds found per iteration: 1\n", + "Valid experiment\n", + "\n", + "N=5, risk['name']='precision', len(predict_params)=1, target_level=0.5, confidence_level=0.01\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Delta: 0.99\n", + "Mean number of valid thresholds found per iteration: 1\n", + "Valid experiment\n", + "\n", + "N=5, risk['name']='precision', len(predict_params)=1, target_level=0.45, confidence_level=0.01\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Delta: 0.99\n", + "Mean number of valid thresholds found per iteration: 1\n", + "Valid experiment\n", + "\n", + "N=5, risk['name']='precision', len(predict_params)=1, target_level=0.45, confidence_level=0.01\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Delta: 0.99\n", + "Mean number of valid thresholds found per iteration: 1\n", + "Valid experiment\n", + "\n", + "N=5, risk['name']='precision', len(predict_params)=1, target_level=0.5, confidence_level=0.01\n", + "Proportion of times the risk is not controlled: 
0.0\n", + "Delta: 0.99\n", + "Mean number of valid thresholds found per iteration: 0\n", + "Valid experiment\n", + "\n", + "N=5, risk['name']='precision', len(predict_params)=1, target_level=0.5, confidence_level=0.01\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Delta: 0.99\n", + "Mean number of valid thresholds found per iteration: 1\n", + "Valid experiment\n", + "\n", + "N=5, risk['name']='precision', len(predict_params)=1, target_level=0.45, confidence_level=0.01\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Delta: 0.99\n", + "Mean number of valid thresholds found per iteration: 1\n", + "Valid experiment\n", + "\n", + "N=5, risk['name']='precision', len(predict_params)=1, target_level=0.45, confidence_level=0.01\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Delta: 0.99\n", + "Mean number of valid thresholds found per iteration: 1\n", + "Valid experiment\n", + "\n", + "N=5, risk['name']='precision', len(predict_params)=1, target_level=0.5, confidence_level=0.01\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Delta: 0.99\n", + "Mean number of valid thresholds found per iteration: 1\n", + "Valid experiment\n", + "\n", + "N=5, risk['name']='precision', len(predict_params)=1, target_level=0.5, confidence_level=0.01\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Delta: 0.99\n", + "Mean number of valid thresholds found per iteration: 1\n", + "Valid experiment\n", + "\n", + "N=5, risk['name']='precision', len(predict_params)=1, target_level=0.45, confidence_level=0.01\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Delta: 0.99\n", + "Mean number of valid thresholds found per iteration: 1\n", + "Valid experiment\n", + "\n", + "N=5, risk['name']='precision', len(predict_params)=1, target_level=0.45, confidence_level=0.01\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Delta: 0.99\n", + "Mean number of valid thresholds found per iteration: 
1\n", + "Valid experiment\n", + "\n", + "N=5, risk['name']='recall', len(predict_params)=1, target_level=0.5, confidence_level=0.01\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Delta: 0.99\n", + "Mean number of valid thresholds found per iteration: 1\n", + "Valid experiment\n", + "\n", + "N=5, risk['name']='recall', len(predict_params)=1, target_level=0.5, confidence_level=0.01\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Delta: 0.99\n", + "Mean number of valid thresholds found per iteration: 1\n", + "Valid experiment\n", + "\n", + "N=5, risk['name']='recall', len(predict_params)=1, target_level=0.45, confidence_level=0.01\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Delta: 0.99\n", + "Mean number of valid thresholds found per iteration: 1\n", + "Valid experiment\n", + "\n", + "N=5, risk['name']='recall', len(predict_params)=1, target_level=0.45, confidence_level=0.01\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Delta: 0.99\n", + "Mean number of valid thresholds found per iteration: 1\n", + "Valid experiment\n", + "\n", + "N=5, risk['name']='recall', len(predict_params)=1, target_level=0.5, confidence_level=0.01\n", + "Proportion of times the risk is not controlled: 0.52\n", + "Delta: 0.99\n", + "Mean number of valid thresholds found per iteration: 1\n", + "Valid experiment\n", + "\n", + "N=5, risk['name']='recall', len(predict_params)=1, target_level=0.5, confidence_level=0.01\n", + "Proportion of times the risk is not controlled: 0.57\n", + "Delta: 0.99\n", + "Mean number of valid thresholds found per iteration: 1\n", + "Valid experiment\n", + "\n", + "N=5, risk['name']='recall', len(predict_params)=1, target_level=0.45, confidence_level=0.01\n", + "Proportion of times the risk is not controlled: 0.57\n", + "Delta: 0.99\n", + "Mean number of valid thresholds found per iteration: 1\n", + "Valid experiment\n", + "\n", + "N=5, risk['name']='recall', len(predict_params)=1, 
target_level=0.45, confidence_level=0.01\n", + "Proportion of times the risk is not controlled: 0.48\n", + "Delta: 0.99\n", + "Mean number of valid thresholds found per iteration: 0\n", + "Valid experiment\n", + "\n", + "N=5, risk['name']='recall', len(predict_params)=1, target_level=0.5, confidence_level=0.01\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Delta: 0.99\n", + "Mean number of valid thresholds found per iteration: 1\n", + "Valid experiment\n", + "\n", + "N=5, risk['name']='recall', len(predict_params)=1, target_level=0.5, confidence_level=0.01\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Delta: 0.99\n", + "Mean number of valid thresholds found per iteration: 1\n", + "Valid experiment\n", + "\n", + "N=5, risk['name']='recall', len(predict_params)=1, target_level=0.45, confidence_level=0.01\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Delta: 0.99\n", + "Mean number of valid thresholds found per iteration: 1\n", + "Valid experiment\n", + "\n", + "N=5, risk['name']='recall', len(predict_params)=1, target_level=0.45, confidence_level=0.01\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Delta: 0.99\n", + "Mean number of valid thresholds found per iteration: 1\n", + "Valid experiment\n", + "\n", + "N=5, risk['name']='accuracy', len(predict_params)=1, target_level=0.5, confidence_level=0.01\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Delta: 0.99\n", + "Mean number of valid thresholds found per iteration: 1\n", + "Valid experiment\n", + "\n", + "N=5, risk['name']='accuracy', len(predict_params)=1, target_level=0.5, confidence_level=0.01\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Delta: 0.99\n", + "Mean number of valid thresholds found per iteration: 1\n", + "Valid experiment\n", + "\n", + "N=5, risk['name']='accuracy', len(predict_params)=1, target_level=0.45, confidence_level=0.01\n", + "Proportion of times the risk is not controlled: 
0.0\n", + "Delta: 0.99\n", + "Mean number of valid thresholds found per iteration: 1\n", + "Valid experiment\n", + "\n", + "N=5, risk['name']='accuracy', len(predict_params)=1, target_level=0.45, confidence_level=0.01\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Delta: 0.99\n", + "Mean number of valid thresholds found per iteration: 1\n", + "Valid experiment\n", + "\n", + "N=5, risk['name']='accuracy', len(predict_params)=1, target_level=0.5, confidence_level=0.01\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Delta: 0.99\n", + "Mean number of valid thresholds found per iteration: 1\n", + "Valid experiment\n", + "\n", + "N=5, risk['name']='accuracy', len(predict_params)=1, target_level=0.5, confidence_level=0.01\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Delta: 0.99\n", + "Mean number of valid thresholds found per iteration: 1\n", + "Valid experiment\n", + "\n", + "N=5, risk['name']='accuracy', len(predict_params)=1, target_level=0.45, confidence_level=0.01\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Delta: 0.99\n", + "Mean number of valid thresholds found per iteration: 1\n", + "Valid experiment\n", + "\n", + "N=5, risk['name']='accuracy', len(predict_params)=1, target_level=0.45, confidence_level=0.01\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Delta: 0.99\n", + "Mean number of valid thresholds found per iteration: 1\n", + "Valid experiment\n", + "\n", + "N=5, risk['name']='accuracy', len(predict_params)=1, target_level=0.5, confidence_level=0.01\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Delta: 0.99\n", + "Mean number of valid thresholds found per iteration: 1\n", + "Valid experiment\n", + "\n", + "N=5, risk['name']='accuracy', len(predict_params)=1, target_level=0.5, confidence_level=0.01\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Delta: 0.99\n", + "Mean number of valid thresholds found per iteration: 1\n", + 
"Valid experiment\n", + "\n", + "N=5, risk['name']='accuracy', len(predict_params)=1, target_level=0.45, confidence_level=0.01\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Delta: 0.99\n", + "Mean number of valid thresholds found per iteration: 1\n", + "Valid experiment\n", + "\n", + "N=5, risk['name']='accuracy', len(predict_params)=1, target_level=0.45, confidence_level=0.01\n", + "Proportion of times the risk is not controlled: 0.0\n", + "Delta: 0.99\n", + "Mean number of valid thresholds found per iteration: 1\n", + "Valid experiment\n" ] } ], - "execution_count": 40 + "execution_count": 83 }, { "metadata": { "ExecuteTime": { - "end_time": "2025-08-28T16:09:05.979621Z", - "start_time": "2025-08-28T16:09:05.600091Z" + "end_time": "2025-08-29T10:09:01.146778Z", + "start_time": "2025-08-29T10:09:01.143637Z" } }, "cell_type": "code", - "source": "print(len(invalid_experiments))", - "id": "68ee725d094bdc8b", - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "17\n" - ] - } - ], - "execution_count": 41 - }, - { - "metadata": {}, - "cell_type": "code", - "outputs": [], - "execution_count": null, "source": "", - "id": "60c8d5117a5b6030" + "id": "4c3f437f0b2897a1", + "outputs": [], + "execution_count": null } ], "metadata": { From b96d0a11192e3552eeeedee0e3ab3cd6d78fe8f9 Mon Sep 17 00:00:00 2001 From: Valentin Laurent Date: Fri, 29 Aug 2025 14:34:14 +0200 Subject: [PATCH 26/26] ENH - Final tweaks to theoretical_validity_tests.ipynb --- ...ipynb => theoretical_validity_tests.ipynb} | 28 +++++++++++++++++-- 1 file changed, 25 insertions(+), 3 deletions(-) rename notebooks/risk_control/{risk_control_theoretical_tests_proto.ipynb => theoretical_validity_tests.ipynb} (93%) diff --git a/notebooks/risk_control/risk_control_theoretical_tests_proto.ipynb b/notebooks/risk_control/theoretical_validity_tests.ipynb similarity index 93% rename from notebooks/risk_control/risk_control_theoretical_tests_proto.ipynb rename to 
notebooks/risk_control/theoretical_validity_tests.ipynb index 18859131b..a5786c82f 100644 --- a/notebooks/risk_control/risk_control_theoretical_tests_proto.ipynb +++ b/notebooks/risk_control/theoretical_validity_tests.ipynb @@ -3,9 +3,29 @@ { "metadata": {}, "cell_type": "markdown", - "source": "# Binary classification risk control - Theoretical tests prototype", + "source": "# Binary classification risk control - Theoretical tests to validate implementation", "id": "ed592eb3f8989aa8" }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "# Protocol description\n", + "Testing theoretical guarantees of risk control in binary classification using a random classifier and synthetic data.\n", + "\n", + "Each test case looks at a combination of parameters, for which we repeat the experiment `n_repeat` times. The model is the same for all experiments (basically a random classifier), but the data is different each time.\n", + "\n", + "Each experiment consists of the following:\n", + " - We calibrate a BinaryClassificationController. It gives us the list of lambda values that control the risk according to LTT.\n", + " - Because we know that the model is random, we know the theoretical risk associated with each lambda value. So we are able to check if the lambda values given by LTT actually control the risk. If not, we count 1 \"error\". Note that *each* lambda value should control the risk, not just one of them.\n", + "\n", + "After `n_repeat` experiments, we compute the proportion of errors, which should be less than delta (1 - confidence_level).\n", + "\n", + "# Results\n", + "The risk is controlled in all the test cases. Overall, LTT seems very conservative: to achieve a high percentage of errors, we need to lower the confidence level significantly (to 0.01) and use only one threshold to avoid the Bonferroni effect. But this is likely due to the model being random, and thus having a lot of variance.
It would be interesting to see how this evolves with a better model." + ], + "id": "8c1746b673c148dd" + }, { "metadata": { "ExecuteTime": { @@ -51,7 +71,7 @@ }, "cell_type": "code", "source": [ - "# Using sklearn.dummy.DummyClassifier would be clearer\n", + "# Using sklearn.dummy.DummyClassifier would be cleaner\n", "class RandomClassifier:\n", " def __init__(self, seed=42, threshold=0.5):\n", " self.seed = seed\n", @@ -131,7 +151,9 @@ " valid_parameters = controller.valid_thresholds\n", " total_nb_valid_params += len(valid_parameters)\n", "\n", - " # The following works because the data is balanced\n", + " # In the following, we check that all the valid thresholds found by LTT actually control the risk.\n", + " # Instead of sampling a large test set, we use the fact that we know the theoretical risk of a random classifier.\n", + " # The calculations here are valid only for a balanced data generator.\n", " if risk[\"risk\"] == precision or risk[\"risk\"] == accuracy:\n", " if target_level > 0.5 and len(valid_parameters) >= 1:\n", " nb_errors += 1\n",