@@ -18,6 +18,7 @@
 from sklearn.pipeline import Pipeline, make_pipeline
 from sklearn.preprocessing import OneHotEncoder
 from sklearn.utils.validation import check_is_fitted
+from scipy.stats import ttest_1samp
 from typing_extensions import TypedDict

 from mapie._typing import NDArray
@@ -155,14 +156,14 @@

 COVERAGES = {
     "naive": 0.954,
-    "split": 0.962,
+    "split": 0.956,
     "jackknife": 0.956,
     "jackknife_plus": 0.952,
     "jackknife_minmax": 0.962,
     "cv": 0.954,
     "cv_plus": 0.954,
     "cv_minmax": 0.962,
-    "prefit": 0.960,
+    "prefit": 0.956,
     "cv_plus_median": 0.954,
     "jackknife_plus_ab": 0.952,
     "jackknife_minmax_ab": 0.968,
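
For orientation, each entry in COVERAGES is the expected empirical coverage of one resampling strategy on the suite's fixture data. Below is an illustrative sketch of how one such value can be produced for the prefit strategy, using only calls that appear in this diff; the sample sizes, noise, and seeds are assumptions, so the resulting coverage will not match the fixture-specific values tabulated above.

    # Illustrative sketch: computing an empirical coverage for the "prefit"
    # strategy; sample sizes, noise and random_state are assumptions.
    from sklearn.datasets import make_regression
    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import train_test_split
    from mapie.regression import MapieRegressor
    from mapie.metrics import regression_coverage_score

    X, y = make_regression(n_samples=600, noise=1.0, random_state=1)
    X_train, X_rest, y_train, y_rest = train_test_split(
        X, y, train_size=200, random_state=1
    )
    X_calib, X_test, y_calib, y_test = train_test_split(
        X_rest, y_rest, test_size=0.5, random_state=1
    )

    model = LinearRegression().fit(X_train, y_train)
    mapie_reg = MapieRegressor(estimator=model, method="base", cv="prefit")
    mapie_reg.fit(X_calib, y_calib)
    _, y_pis = mapie_reg.predict(X_test, alpha=0.05)  # target coverage: 0.95
    coverage = regression_coverage_score(y_test, y_pis[:, 0, 0], y_pis[:, 1, 0])
    print(coverage)  # compare against COVERAGES["prefit"]
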
@@ -260,7 +261,7 @@ def test_predict_output_shape(


 @pytest.mark.parametrize("delta", [0.6, 0.8])
-@pytest.mark.parametrize("n_calib", [10 + i for i in range(11)] + [50, 100])
+@pytest.mark.parametrize("n_calib", [10 + i for i in range(13)] + [50, 100])
 def test_coverage_validity(delta: float, n_calib: int) -> None:
     """
     Test that the prefit method provides valid coverage
@@ -269,33 +270,34 @@ def test_coverage_validity(delta: float, n_calib: int) -> None:
     n_split, n_train, n_test = 100, 100, 1000
     n_all = n_train + n_calib + n_test
     X, y = make_regression(n_all, random_state=random_state)
-
-    X_train, X_cal_test, y_train, y_cal_test = \
-        train_test_split(X, y, train_size=n_train, random_state=random_state)
+    Xtr, Xct, ytr, yct = train_test_split(
+        X, y, train_size=n_train, random_state=random_state
+    )

     model = LinearRegression()
-    model.fit(X_train, y_train)
+    model.fit(Xtr, ytr)

     cov_list = []
     for _ in range(n_split):
         mapie_reg = MapieRegressor(estimator=model, method="base", cv="prefit")
-        X_cal, X_test, y_cal, y_test = \
-            train_test_split(X_cal_test, y_cal_test, test_size=n_test)
-        mapie_reg.fit(X_cal, y_cal)
-        _, y_pis = mapie_reg.predict(X_test, alpha=1 - delta)
-        coverage = \
-            regression_coverage_score(y_test, y_pis[:, 0, 0], y_pis[:, 1, 0])
+        Xc, Xt, yc, yt = train_test_split(Xct, yct, test_size=n_test)
+        mapie_reg.fit(Xc, yc)
+        _, y_pis = mapie_reg.predict(Xt, alpha=1 - delta)
+        y_low, y_up = y_pis[:, 0, 0], y_pis[:, 1, 0]
+        coverage = regression_coverage_score(yt, y_low, y_up)
         cov_list.append(coverage)

     # Test that the average coverage is neither statistically lower than
     # delta nor higher than delta + 1 / (n_calib + 1).
-    from scipy.stats import ttest_1samp
     mean_low, mean_up = delta, delta + 1 / (n_calib + 1)
     _, pval_low = ttest_1samp(cov_list, popmean=mean_low, alternative='less')
     _, pval_up = ttest_1samp(cov_list, popmean=mean_up, alternative='greater')

-    np.testing.assert_array_less(0.01, pval_low)
-    np.testing.assert_array_less(0.01, pval_up)
+    # Bonferroni correction to control the family-wise error rate (FWER):
+    # this test is parametrized into 30 runs (2 deltas x 15 n_calib values).
+    p_fwer = 0.01  # probability of one or more false discoveries: 1%
+    p_bonf = p_fwer / 30
+    np.testing.assert_array_less(p_bonf, pval_low)
+    np.testing.assert_array_less(p_bonf, pval_up)


 def test_same_results_prefit_split() -> None:
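
The statistical reasoning behind this hunk: for a calibration set of size n_calib, split conformal prediction guarantees a marginal coverage between delta and delta + 1 / (n_calib + 1), so the test uses two one-sided one-sample t-tests over the n_split repetitions to check that the mean coverage is neither significantly below the lower bound nor significantly above the upper one, at a Bonferroni-corrected level. A self-contained sketch of that check, with synthetic coverages standing in for the MAPIE loop above:

    # Sketch of the two-sided validity check; the uniform coverages below
    # are a synthetic stand-in for the cov_list gathered by the test.
    import numpy as np
    from scipy.stats import ttest_1samp

    delta, n_calib, n_split = 0.8, 20, 100
    mean_low = delta                     # lower coverage bound
    mean_up = delta + 1 / (n_calib + 1)  # upper coverage bound

    rng = np.random.default_rng(0)
    cov_list = rng.uniform(mean_low, mean_up, size=n_split)

    # p-value is small only if mean coverage is significantly below mean_low...
    _, pval_low = ttest_1samp(cov_list, popmean=mean_low, alternative="less")
    # ...or significantly above mean_up.
    _, pval_up = ttest_1samp(cov_list, popmean=mean_up, alternative="greater")

    p_bonf = 0.01 / 30  # FWER of 1% split across the 30 parametrized runs
    np.testing.assert_array_less(p_bonf, pval_low)
    np.testing.assert_array_less(p_bonf, pval_up)

Small n_calib widens the admissible band 1 / (n_calib + 1), which is why the parametrization sweeps calibration sizes from 10 upward.
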
@@ -562,7 +564,7 @@ def test_results_prefit_naive() -> None:


 def test_results_prefit() -> None:
-    """Test prefit results on a standard train/validation/test split."""
+    """Test prefit results on a standard train/calibration split."""
     X_train, X_calib, y_train, y_calib = train_test_split(
         X, y, test_size=1 / 2, random_state=1
     )