diff --git a/mapie/__init__.py b/mapie/__init__.py
index 35fd5b090..5ec9939e1 100644
--- a/mapie/__init__.py
+++ b/mapie/__init__.py
@@ -4,7 +4,6 @@
     regression,
     utils,
     risk_control,
-    risk_control_draft,
     calibration,
     subsample,
 )
@@ -14,7 +13,6 @@
     "regression",
     "classification",
     "risk_control",
-    "risk_control_draft",
     "calibration",
     "metrics",
     "utils",
diff --git a/mapie/control_risk/ltt.py b/mapie/control_risk/ltt.py
index e19d3b849..ce0212943 100644
--- a/mapie/control_risk/ltt.py
+++ b/mapie/control_risk/ltt.py
@@ -1,37 +1,34 @@
 import warnings
-from typing import Any, List, Optional, Tuple
+from typing import Any, List, Tuple, Union
 
 import numpy as np
 from numpy.typing import ArrayLike, NDArray
 
-from .p_values import compute_hoeffdding_bentkus_p_value
+from mapie.control_risk.p_values import compute_hoeffdding_bentkus_p_value
 
 
 def ltt_procedure(
-    r_hat: NDArray[np.float32],
-    alpha_np: NDArray[np.float32],
-    delta: Optional[float],
-    n_obs: int,
-    binary: bool = False,  # TODO: maybe should pass p_values fonction instead
-) -> Tuple[List[List[Any]], NDArray[np.float32]]:
+    r_hat: NDArray,
+    alpha_np: NDArray,
+    delta: float,
+    n_obs: Union[int, NDArray],
+    binary: bool = False,
+) -> List[List[Any]]:
    """
    Apply the Learn-Then-Test procedure for risk control.
    Note that we will do a multiple test for ``r_hat`` that are
    less than level ``alpha_np``.
    The procedure follows the instructions in [1]:
-        - Calculate p-values for each lambdas descretized
-        - Apply a family wise error rate algorithm,
-        here Bonferonni correction
-        - Return the index lambdas that give you the control
-        at alpha level
+        - Compute a p-value for each discretized lambda
+        - Apply a family-wise error rate algorithm, here the Bonferroni correction
+        - Return the indices of the lambdas that achieve control at level alpha

    Parameters
    ----------
    r_hat: NDArray of shape (n_lambdas, ).
-        Empirical risk with respect
-        to the lambdas.
-        Here lambdas are thresholds that impact decision making,
+        Empirical risk with respect to the lambdas.
+        Here, lambdas are thresholds that impact decision-making,
         therefore empirical risk.

    alpha_np: NDArray of shape (n_alpha, ).
@@ -44,34 +41,34 @@ def ltt_procedure(
         Correspond to proportion of failure we don't want to exceed.

+    n_obs: Union[int, NDArray]
+        Corresponds to the number of observations used to compute the risk.
+        In the case of a conditional loss, n_obs must be the
+        number of effective observations used to compute the empirical risk
+        for each lambda, hence of shape (n_lambdas, ).
+
+    binary: bool, default=False
+        Must be True if the loss associated with the risk is binary.
+
    Returns
    -------
    valid_index: List[List[Any]].
-        Contain the valid index that satisfy fwer control
+        Contains the valid indices that satisfy FWER control
         for each alpha (length aren't the same for each alpha).

-    p_values: NDArray of shape (n_lambda, n_alpha).
-        Contains the values of p_value for different alpha.
-
    References
    ----------
    [1] Angelopoulos, A. N., Bates, S., Candès, E. J., Jordan,
    M. I., & Lei, L. (2021). Learn then test:
    "Calibrating predictive algorithms to achieve risk control".
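+
+    Examples
+    --------
+    A minimal sketch of the intended call (values purely illustrative):
+
+    >>> import numpy as np
+    >>> r_hat = np.array([0.3, 0.2, 0.1])  # empirical risk per lambda
+    >>> valid_index = ltt_procedure(
+    ...     r_hat, alpha_np=np.array([0.25]), delta=0.1, n_obs=100
+    ... )
+    >>> # valid_index[0] lists the lambda indices deemed safe at level 0.25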
" - ) p_values = compute_hoeffdding_bentkus_p_value(r_hat, n_obs, alpha_np, binary) N = len(p_values) valid_index = [] for i in range(len(alpha_np)): l_index = np.where(p_values[:, i] <= delta/N)[0].tolist() valid_index.append(l_index) - return valid_index, p_values # TODO : p_values is not used, we could remove it - # Or return corrected p_values + return valid_index def find_lambda_control_star( diff --git a/mapie/control_risk/p_values.py b/mapie/control_risk/p_values.py index d1a420a4c..ba5ac1db3 100644 --- a/mapie/control_risk/p_values.py +++ b/mapie/control_risk/p_values.py @@ -8,11 +8,11 @@ def compute_hoeffdding_bentkus_p_value( - r_hat: NDArray[np.float32], - n_obs: int, - alpha: Union[float, NDArray[np.float32]], + r_hat: NDArray, + n_obs: Union[int, NDArray], + alpha: Union[float, NDArray], binary: bool = False, -) -> NDArray[np.float32]: +) -> NDArray: """ The method computes the p_values according to the Hoeffding_Bentkus inequality for each @@ -30,9 +30,11 @@ def compute_hoeffdding_bentkus_p_value( Here lambdas are thresholds that impact decision making and therefore empirical risk. - n_obs: int. - Correspond to the number of observations in - dataset. + n_obs: Union[int, NDArray] + Correspond to the number of observations used to compute the risk. + In the case of a conditional loss, n_obs must be the + number of effective observations used to compute the empirical risk + for each lambda, hence of shape (n_lambdas, ). alpha: Union[float, Iterable[float]]. Contains the different alphas control level. @@ -40,6 +42,11 @@ def compute_hoeffdding_bentkus_p_value( If it is a iterable, it is a NDArray of shape (n_alpha, ). + binary: bool, default=False + Must be True if the loss associated to the risk is binary. + If True, we use a tighter version of the Bentkus p-value, valid when the + loss associated to the risk is binary. See section 3.2 of [1]. + Returns ------- hb_p_values: NDArray of shape (n_lambda, n_alpha). @@ -62,9 +69,17 @@ def compute_hoeffdding_bentkus_p_value( len(r_hat), axis=0 ) + if isinstance(n_obs, int): + n_obs = np.full_like(r_hat, n_obs, dtype=float) + n_obs_repeat = np.repeat( + np.expand_dims(n_obs, axis=1), + len(alpha_np), + axis=1 + ) + hoeffding_p_value = np.exp( - -n_obs * _h1( - np.where( # TODO : shouldn't we use np.minimum ? + -n_obs_repeat * _h1( + np.where( r_hat_repeat > alpha_repeat, alpha_repeat, r_hat_repeat @@ -74,9 +89,9 @@ def compute_hoeffdding_bentkus_p_value( ) factor = 1 if binary else np.e bentkus_p_value = factor * binom.cdf( - np.ceil(n_obs * r_hat_repeat), n_obs, alpha_repeat + np.ceil(n_obs_repeat * r_hat_repeat), n_obs_repeat, alpha_repeat ) - hb_p_value = np.where( # TODO : shouldn't we use np.minimum ? + hb_p_value = np.where( bentkus_p_value > hoeffding_p_value, hoeffding_p_value, bentkus_p_value @@ -85,14 +100,11 @@ def compute_hoeffdding_bentkus_p_value( def _h1( - r_hats: NDArray[np.float32], alphas: NDArray[np.float32] -) -> NDArray[np.float32]: + r_hats: NDArray, alphas: NDArray +) -> NDArray: """ - This function allow us to compute - the tighter version of hoeffding inequality. - This function is then used in the - hoeffding_bentkus_p_value function for the - computation of p-values. + This function allow us to compute the tighter version of hoeffding inequality. + When r_hat = 0, the log is undefined, but the limit is 0, so we set the result to 0. Parameters ---------- @@ -113,12 +125,9 @@ def _h1( Returns ------- - NDArray of shape a(n_lambdas, n_alpha). + NDArray of shape (n_lambdas, n_alpha). 
""" elt1 = np.zeros_like(r_hats, dtype=float) - - # Compute only where r_hats != 0 to avoid log(0) - # TODO: check Angelopoulos implementation mask = r_hats != 0 elt1[mask] = r_hats[mask] * np.log(r_hats[mask] / alphas[mask]) elt2 = (1 - r_hats) * np.log((1 - r_hats) / (1 - alphas)) diff --git a/mapie/risk_control.py b/mapie/risk_control.py index 5489eed11..a3ea41cb0 100644 --- a/mapie/risk_control.py +++ b/mapie/risk_control.py @@ -2,7 +2,7 @@ import warnings from itertools import chain -from typing import Iterable, Optional, Sequence, Tuple, Union, cast +from typing import Iterable, Optional, Sequence, Tuple, Union, cast, Callable import numpy as np from sklearn.base import BaseEstimator, ClassifierMixin @@ -681,8 +681,8 @@ def predict( if self.metric_control == 'precision': self.n_obs = len(self.risks) self.r_hat = self.risks.mean(axis=0) - self.valid_index, self.p_values = ltt_procedure( - self.r_hat, alpha_np, delta, self.n_obs + self.valid_index = ltt_procedure( + self.r_hat, alpha_np, cast(float, delta), self.n_obs ) self._check_valid_index(alpha_np) self.lambdas_star, self.r_star = find_lambda_control_star( @@ -706,3 +706,70 @@ def predict( self.lambdas_star[np.newaxis, np.newaxis, :] ) return y_pred, y_pred_proba_array + + +class BinaryClassificationRisk: + # Any risk that can be defined in the following way will work using the binary + # Hoeffding-Bentkus p-values used in MAPIE + # Take the example of precision in the docstring to explain how the class works. + def __init__( + self, + risk_occurrence: Callable[[int, int], int], + risk_condition: Callable[[int, int], bool], + higher_is_better: bool, + ): + self.risk_occurrence = risk_occurrence + self.risk_condition = risk_condition + self.higher_is_better = higher_is_better + + def get_value_and_effective_sample_size( + self, + y_true: NDArray, # shape (n_samples,), values in {0, 1} + y_pred: NDArray, # shape (n_samples,), values in {0, 1} + ) -> Tuple[float, int]: + # float between 0 and 1, int between 0 and len(y_true) + # returns (1, -1) when the risk is not defined (condition never met) + # In this case, the corresponding lambda shouldn't be considered valid. 
+    def __init__(
+        self,
+        risk_occurrence: Callable[[int, int], int],
+        risk_condition: Callable[[int, int], bool],
+        higher_is_better: bool,
+    ):
+        self.risk_occurrence = risk_occurrence
+        self.risk_condition = risk_condition
+        self.higher_is_better = higher_is_better
+
+    def get_value_and_effective_sample_size(
+        self,
+        y_true: NDArray,  # shape (n_samples,), values in {0, 1}
+        y_pred: NDArray,  # shape (n_samples,), values in {0, 1}
+    ) -> Tuple[float, int]:
+        # Returns a float between 0 and 1 and an int between 0 and len(y_true).
+        # Returns (1, -1) when the risk is not defined (condition never met).
+        # In this case, the corresponding lambda shouldn't be considered valid.
+        # In the current LTT implementation, providing n_obs=-1 yields a
+        # p-value that can never pass the selection step, effectively
+        # invalidating the lambda.
+        risk_occurrences = np.array([
+            self.risk_occurrence(y_true_i, y_pred_i)
+            for y_true_i, y_pred_i in zip(y_true, y_pred)
+        ])
+        risk_conditions = np.array([
+            self.risk_condition(y_true_i, y_pred_i)
+            for y_true_i, y_pred_i in zip(y_true, y_pred)
+        ])
+        effective_sample_size = len(y_true) - np.sum(~risk_conditions)
+        # Casting needed for MyPy with Python 3.9
+        effective_sample_size_int = cast(int, effective_sample_size)
+        if effective_sample_size_int != 0:
+            risk_sum: int = np.sum(risk_occurrences[risk_conditions])
+            risk_value = risk_sum / effective_sample_size_int
+            return risk_value, effective_sample_size_int
+        return 1, -1
+
+
+precision = BinaryClassificationRisk(
+    risk_occurrence=lambda y_true, y_pred: int(y_pred == y_true),
+    risk_condition=lambda y_true, y_pred: y_pred == 1,
+    higher_is_better=True,
+)
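+# Illustrative check of the convention above: with y_true = [1, 0, 1] and
+# y_pred = [1, 1, 0], precision.get_value_and_effective_sample_size returns
+# (0.5, 2): two positive predictions (the effective sample size), one of
+# which is correct.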
- """ - def __init__( self, - fitted_binary_classifier: Any, - metric: str, + # X -> y_proba of shape (n_samples, 2) + predict_function: Callable[[ArrayLike], NDArray], + risk: BinaryClassificationRisk, target_level: float, confidence_level: float = 0.9, - n_jobs: Optional[int] = None, - random_state: Optional[Union[int, np.random.RandomState]] = None, - verbose: int = 0 + best_predict_param_choice: Union[str, BinaryClassificationRisk] = "auto", ): - _check_n_jobs(n_jobs) - _check_verbose(verbose) - check_random_state(random_state) - - self._classifier = fitted_binary_classifier - self._alpha = 1 - target_level + self._predict_function = predict_function + self._risk = risk + self._best_predict_param_choice = best_predict_param_choice + self._target_level = target_level self._delta = 1 - confidence_level - self._n_jobs = n_jobs # TODO : use this in the class or delete - self._random_state = random_state # TODO : use this in the class or delete - self._verbose = verbose # TODO : use this in the class or delete - self._thresholds: NDArray[np.float32] = np.arange(0, 1, 0.01) + self._thresholds: NDArray = np.linspace(0, 0.99, 100) # TODO: add a _is_calibrated attribute to check at prediction time - self.valid_thresholds: Optional[NDArray[np.float32]] = None + self.valid_thresholds: Optional[NDArray] = None self.best_threshold: Optional[float] = None def calibrate(self, X_calibrate: ArrayLike, y_calibrate: ArrayLike) -> None: - """ - Find the threshold that statistically guarantees the desired precision - level while maximizing the recall. - - Parameters - ---------- - X_calibrate: ArrayLike - Features of the calibration set. - - y_calibrate: ArrayLike - True labels of the calibration set. - - Raises - ------ - ValueError - If no thresholds that meet the target precision with the desired - confidence level are found. - """ - # TODO: Make sure this works with sklearn train_test_split/Series + # TODO: Make sure the following works with sklearn train_test_split/Series y_calibrate_ = np.asarray(y_calibrate) - predictions_proba = self._classifier.predict_proba(X_calibrate)[:, 1] + predictions_proba = self._predict_function(X_calibrate)[:, 1] + + predictions_per_threshold = ( + predictions_proba[:, np.newaxis] >= self._thresholds + ).T.astype(int) - risk_per_threshold = 1 - self._compute_precision( - predictions_proba, y_calibrate_ + risks_and_eff_sizes = np.array( + [self._risk.get_value_and_effective_sample_size( + y_calibrate_, + predictions + ) for predictions in predictions_per_threshold] ) - valid_thresholds_index, _ = ltt_procedure( - risk_per_threshold, - np.array([self._alpha]), + risks_per_threshold = risks_and_eff_sizes[:, 0] + eff_sample_sizes_per_threshold = risks_and_eff_sizes[:, 1] + + if self._risk.higher_is_better: + risks_per_threshold = 1 - risks_per_threshold + alpha = 1 - self._target_level + else: + alpha = self._target_level + + valid_thresholds_index = ltt_procedure( + risks_per_threshold, + np.array([alpha]), self._delta, - len(y_calibrate_), + eff_sample_sizes_per_threshold, True, ) self.valid_thresholds = self._thresholds[valid_thresholds_index[0]] if len(self.valid_thresholds) == 0: - # TODO: just warn, and raise error at prediction if no valid thresholds - raise ValueError("No valid thresholds found") - - # Minimum in case of precision control only - self.best_threshold = min(self.valid_thresholds) + warnings.warn( + "No predict parameters were found to control the risk at the given " + "target and confidence levels. 
" + "Try using a larger calibration set or a better model.", + ) + else: + # Minimum in case of precision control only + self.best_threshold = min(self.valid_thresholds) def predict(self, X_test: ArrayLike) -> NDArray: - """ - Predict binary labels on the test set, using the best threshold found - during calibration. - - Parameters - ---------- - X_test: ArrayLike - Features of the test set. - - Returns - ------- - ArrayLike - Predicted labels (0 or 1) for each sample in the test set. - """ - predictions_proba = self._classifier.predict_proba(X_test)[:, 1] + if self.best_threshold is None: + raise ValueError( + "No predict parameters were found to control the risk. Cannot predict." + ) + predictions_proba = self._predict_function(X_test)[:, 1] return (predictions_proba >= self.best_threshold).astype(int) - - def _compute_precision( # TODO: use sklearn or MAPIE ? - self, predictions_proba: NDArray[np.float32], y_cal: NDArray[np.float32] - ) -> NDArray[np.float32]: - """ - Compute the precision for each threshold. - """ - predictions_per_threshold = ( - predictions_proba[:, np.newaxis] >= self._thresholds - ).astype(int) - - true_positives = np.sum( - (predictions_per_threshold == 1) & (y_cal[:, np.newaxis] == 1), - axis=0, - ) - false_positives = np.sum( - (predictions_per_threshold == 1) & (y_cal[:, np.newaxis] == 0), - axis=0, - ) - - positive_predictions = true_positives + false_positives - - # Avoid division by zero - precision_per_threshold = np.ones_like(self._thresholds, dtype=float) - nonzero_mask = positive_predictions > 0 - precision_per_threshold[nonzero_mask] = ( - true_positives[nonzero_mask] / positive_predictions[nonzero_mask] - ) - - return precision_per_threshold diff --git a/mapie/tests/test_control_risk.py b/mapie/tests/test_control_risk.py index 66eaab09f..17f635741 100644 --- a/mapie/tests/test_control_risk.py +++ b/mapie/tests/test_control_risk.py @@ -2,7 +2,7 @@ Testing for control_risk module. 
 Testing for now risks for multilabel classification
 """
-from typing import List, Optional, Union
+from typing import List, Union
 
 import numpy as np
 import pytest
@@ -151,9 +151,8 @@ def test_ltt_type_output_alpha_delta(
     delta: float
 ) -> None:
     """Test type output _ltt_procedure"""
-    valid_index, p_values = ltt_procedure(r_hat, alpha, delta, n)
+    valid_index = ltt_procedure(r_hat, alpha, delta, n)
     assert isinstance(valid_index, list)
-    assert isinstance(p_values, np.ndarray)
 
 
 @pytest.mark.parametrize("valid_index", [[[0, 1]]])
@@ -183,8 +182,38 @@ def test_invalid_shape_alpha_hb() -> None:
         compute_hoeffdding_bentkus_p_value(r_hat, n, wrong_alpha_shape)
 
 
-@pytest.mark.parametrize("delta", [None])
-def test_delta_none_ltt(delta: Optional[float]) -> None:
-    """Test error message when invalid delta"""
-    with pytest.raises(ValueError, match=r".*Invalid delta"):
-        ltt_procedure(r_hat, alpha, delta, n)
+def test_hb_p_values_n_obs_int_vs_array() -> None:
+    """Test that using n_obs as an array gives the same values as an int"""
+    r_hat = np.array([0.5, 0.8])
+    n_obs = np.array([1100, 1200])
+    alpha = np.array([0.6, 0.7])
+
+    pval_0 = compute_hoeffdding_bentkus_p_value(
+        np.array([r_hat[0]]),
+        int(n_obs[0]),
+        alpha
+    )
+    pval_1 = compute_hoeffdding_bentkus_p_value(
+        np.array([r_hat[1]]),
+        int(n_obs[1]),
+        alpha
+    )
+    pval_manual = np.vstack([pval_0, pval_1])
+
+    pval_array = compute_hoeffdding_bentkus_p_value(r_hat, n_obs, alpha)
+
+    np.testing.assert_allclose(pval_manual, pval_array, rtol=1e-12)
+
+
+def test_ltt_procedure_n_obs_negative() -> None:
+    """
+    Test ltt_procedure with negative n_obs.
+    This happens when the risk, defined as the conditional expectation of
+    a loss, is undefined because the condition is never met.
+    This should return an invalid lambda.
+    """
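+    # Sketch of the mechanism: with n_obs = -1, the Hoeffding term
+    # exp(-n_obs * _h1(...)) is >= 1 and the Bentkus term is not a valid
+    # binomial probability, so the p-value can never fall below delta / N
+    # and no lambda index is selected.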
+    r_hat = np.array([0.5])
+    n_obs = np.array([-1])
+    alpha_np = np.array([0.6])
+    binary = True
+    assert ltt_procedure(r_hat, alpha_np, 0.1, n_obs, binary) == [[]]
diff --git a/mapie/tests/test_risk_control.py b/mapie/tests/test_risk_control.py
index abd1e5f09..16a072298 100644
--- a/mapie/tests/test_risk_control.py
+++ b/mapie/tests/test_risk_control.py
@@ -11,10 +11,17 @@
 from sklearn.pipeline import Pipeline, make_pipeline
 from sklearn.preprocessing import OneHotEncoder
 from sklearn.utils.validation import check_is_fitted
+from sklearn.metrics import precision_score, recall_score, accuracy_score
 from typing_extensions import TypedDict
 
 from numpy.typing import NDArray
-from mapie.risk_control import PrecisionRecallController
+from mapie.risk_control import (
+    PrecisionRecallController,
+    precision,
+    recall,
+    accuracy,
+    BinaryClassificationRisk,
+)
 
 Params = TypedDict(
     "Params",
@@ -260,7 +267,7 @@ def test_predict_output_shape(
         X,
         alpha=alpha,
         bound=args["bound"],
-        delta=.1
+        delta=delta
     )
     n_alpha = len(alpha) if hasattr(alpha, "__len__") else 1
     assert y_pred.shape == y.shape
@@ -808,3 +815,39 @@ def test_method_none_recall() -> None:
     )
     mapie_clf.fit(X_toy, y_toy)
     assert mapie_clf.method == "crc"
+
+
+# The following test is deliberately agnostic
+# to the specific binary classification risk control implementation.
+@pytest.mark.parametrize(
+    "risk_instance, metric_func, effective_sample_func",
+    [
+        (precision, precision_score, lambda y_true, y_pred: np.sum(y_pred == 1)),
+        (recall, recall_score, lambda y_true, y_pred: np.sum(y_true == 1)),
+        (accuracy, accuracy_score, lambda y_true, y_pred: len(y_true)),
+    ],
+)
+@pytest.mark.parametrize(
+    "y_true, y_pred",
+    [
+        (np.array([1, 0, 1, 0]), np.array([1, 1, 0, 0])),
+        (np.array([1, 1, 0, 0]), np.array([1, 1, 1, 0])),
+        (np.array([0, 0, 0, 0]), np.array([0, 1, 0, 1])),
+    ],
+)
+def test_binary_classification_risk(
+    risk_instance: BinaryClassificationRisk,
+    metric_func,
+    effective_sample_func,
+    y_true,
+    y_pred
+):
+    result = risk_instance.get_value_and_effective_sample_size(y_true, y_pred)
+    if effective_sample_func(y_true, y_pred) == 0:
+        assert result == (1, -1)
+    else:
+        value, n = result
+        expected_value = metric_func(y_true, y_pred)
+        expected_n = effective_sample_func(y_true, y_pred)
+        assert np.isclose(value, expected_value)
+        assert n == expected_n
diff --git a/notebooks/risk_control/theoretical_validity_tests.ipynb b/notebooks/risk_control/theoretical_validity_tests.ipynb
new file mode 100644
index 000000000..a5786c82f
--- /dev/null
+++ b/notebooks/risk_control/theoretical_validity_tests.ipynb
@@ -0,0 +1,656 @@
{
 "cells": [
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "# Binary classification risk control - Theoretical tests to validate implementation",
   "id": "ed592eb3f8989aa8"
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": [
    "# Protocol description\n",
    "Testing the theoretical guarantees of risk control in binary classification, using a random classifier and synthetic data.\n",
    "\n",
    "Each test case looks at a combination of parameters, for which we repeat the experiment `n_repeats` times. The model is the same for all experiments (essentially a random classifier), but the data is different each time.\n",
    "\n",
    "Each experiment consists of the following:\n",
    " - We calibrate a BinaryClassificationController. It gives us the list of lambda values that control the risk according to LTT.\n",
    " - Because we know that the model is random, we know the theoretical risk associated with each lambda value, so we can check whether the lambda values given by LTT actually control the risk. If not, we count 1 \"error\". Note that *each* lambda value should control the risk, not just one of them.\n",
    "\n",
    "After `n_repeats` experiments, we compute the proportion of errors, which should be less than delta (1 - confidence_level).\n",
    "\n",
    "# Results\n",
    "The risk is controlled in all the test cases. Overall, LTT seems very conservative: to achieve a high percentage of errors, we need to lower the confidence level significantly (to 0.01) and use only one threshold to avoid the Bonferroni effect. But this is likely due to the model being random, and thus having a lot of variance. It would be interesting to see how this evolves with a better model."
+ ], + "id": "8c1746b673c148dd" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-29T10:08:15.315556Z", + "start_time": "2025-08-29T10:08:14.792182Z" + } + }, + "cell_type": "code", + "source": [ + "%reload_ext autoreload\n", + "%autoreload 2" + ], + "id": "9b1422ae620955fd", + "outputs": [], + "execution_count": 80 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-29T10:08:15.956177Z", + "start_time": "2025-08-29T10:08:15.331400Z" + } + }, + "cell_type": "code", + "source": [ + "from sklearn.datasets import make_classification\n", + "import numpy as np\n", + "from mapie.risk_control import precision, accuracy, recall\n", + "from mapie.risk_control_draft import BinaryClassificationController\n", + "from itertools import product\n", + "from decimal import Decimal" + ], + "id": "faeb2f47a92dbf35", + "outputs": [], + "execution_count": 81 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-29T10:08:16.435315Z", + "start_time": "2025-08-29T10:08:15.964027Z" + } + }, + "cell_type": "code", + "source": [ + "# Using sklearn.dummy.DummyClassifier would be cleaner\n", + "class RandomClassifier:\n", + " def __init__(self, seed=42, threshold=0.5):\n", + " self.seed = seed\n", + " self.threshold = threshold\n", + "\n", + " def _get_prob(self, x):\n", + " local_seed = hash((x, self.seed)) % (2**32)\n", + " rng = np.random.RandomState(local_seed)\n", + " return np.round(rng.rand(), 2)\n", + "\n", + " def predict_proba(self, X):\n", + " probs = np.array([self._get_prob(x) for x in X])\n", + " return np.vstack([1 - probs, probs]).T\n", + "\n", + " def predict(self, X):\n", + " probs = self.predict_proba(X)[:, 1]\n", + " return (probs >= self.threshold).astype(int)" + ], + "id": "eefafd6d1697fb9c", + "outputs": [], + "execution_count": 82 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-29T10:09:01.114704Z", + "start_time": "2025-08-29T10:08:16.452127Z" + } + }, + "cell_type": "code", + "source": [ + "N = [100, 5] # size of the calibration set\n", + "risk = [\n", + " {\"name\": \"precision\", \"risk\": precision},\n", + " {\"name\": \"recall\", \"risk\": recall},\n", + " {\"name\": \"accuracy\", \"risk\": accuracy},\n", + "]\n", + "predict_params = [np.linspace(0, 0.99, 100), np.array([0.5])]\n", + "target_level = [0.1, 0.9]\n", + "confidence_level = [0.8, 0.2]\n", + "\n", + "n_repeats = 100\n", + "\n", + "for combination in product(N, risk, predict_params, target_level, confidence_level):\n", + " N, risk, predict_params, target_level, confidence_level = combination\n", + " alpha = float(Decimal(\"1\") - Decimal(str(target_level))) # to avoid floating point issues\n", + " delta = float(Decimal(\"1\") - Decimal(str(confidence_level))) # to avoid floating point issues\n", + "\n", + " clf = RandomClassifier()\n", + " nb_errors = 0 # number of iterations where the risk is not controlled (i.e., not all the valid thresholds found by LTT are actually valid)\n", + " total_nb_valid_params = 0\n", + "\n", + " for _ in range(n_repeats):\n", + "\n", + " X_calibrate, y_calibrate = make_classification(\n", + " n_samples=N,\n", + " n_features=1,\n", + " n_informative=1,\n", + " n_redundant=0,\n", + " n_repeated=0,\n", + " n_classes=2,\n", + " n_clusters_per_class=1,\n", + " weights=[0.5, 0.5],\n", + " flip_y=0,\n", + " random_state=None\n", + " )\n", + " X_calibrate = X_calibrate.squeeze()\n", + "\n", + " controller = BinaryClassificationController(\n", + " predict_function=clf.predict_proba,\n", + " risk=risk[\"risk\"],\n", + " 
    "        if risk[\"risk\"] == precision or risk[\"risk\"] == accuracy:\n",
    "            if target_level > 0.5 and len(valid_parameters) >= 1:\n",
    "                nb_errors += 1\n",
    "        elif risk[\"risk\"] == recall:\n",
    "            if any(x > alpha for x in valid_parameters) and len(valid_parameters) >= 1:\n",
    "                nb_errors += 1\n",
    "\n",
    "    print(f\"\\n{N=}, {risk['name']=}, {len(predict_params)=}, {target_level=}, {confidence_level=}\")\n",
    "\n",
    "    print(f\"Proportion of times the risk is not controlled: {nb_errors/n_repeats}\")\n",
    "    print(f\"Delta: {delta}\")\n",
    "    print(f\"Mean number of valid thresholds found per iteration: {int(np.round(total_nb_valid_params/n_repeats))}\")\n",
    "\n",
    "    if nb_errors/n_repeats <= delta:\n",
    "        print(\"Valid experiment\")\n",
    "    else:\n",
    "        print(\"Invalid experiment\")"
   ],
   "id": "1fdffae392bb7a65",
   "outputs": [],
   "execution_count": 83
  }
 ],
"metadata": { + "ExecuteTime": { + "end_time": "2025-08-29T10:09:01.146778Z", + "start_time": "2025-08-29T10:09:01.143637Z" + } + }, + "cell_type": "code", + "source": "", + "id": "4c3f437f0b2897a1", + "outputs": [], + "execution_count": null + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.17" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}