@@ -18,6 +18,7 @@
 from sklearn.pipeline import Pipeline, make_pipeline
 from sklearn.preprocessing import OneHotEncoder
 from sklearn.utils.validation import check_is_fitted
+from scipy.stats import ttest_1samp
 from typing_extensions import TypedDict

 from mapie._typing import NDArray
@@ -155,14 +156,14 @@

 COVERAGES = {
     "naive": 0.954,
-    "split": 0.962,
+    "split": 0.956,
     "jackknife": 0.956,
     "jackknife_plus": 0.952,
     "jackknife_minmax": 0.962,
     "cv": 0.954,
     "cv_plus": 0.954,
     "cv_minmax": 0.962,
-    "prefit": 0.960,
+    "prefit": 0.956,
     "cv_plus_median": 0.954,
     "jackknife_plus_ab": 0.952,
     "jackknife_minmax_ab": 0.968,
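
For orientation, each entry in COVERAGES is the expected empirical coverage of one resampling strategy on the suite's fixture data. Below is an illustrative sketch of how one such value can be produced for the prefit strategy, using only calls that appear in this diff; the sample sizes, noise, and seeds are assumptions, so the resulting coverage will not match the fixture-specific values tabulated above.

    # Illustrative sketch: computing an empirical coverage for the "prefit"
    # strategy; sample sizes, noise and random_state are assumptions.
    from sklearn.datasets import make_regression
    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import train_test_split
    from mapie.regression import MapieRegressor
    from mapie.metrics import regression_coverage_score

    X, y = make_regression(n_samples=600, noise=1.0, random_state=1)
    X_train, X_rest, y_train, y_rest = train_test_split(
        X, y, train_size=200, random_state=1
    )
    X_calib, X_test, y_calib, y_test = train_test_split(
        X_rest, y_rest, test_size=0.5, random_state=1
    )

    model = LinearRegression().fit(X_train, y_train)
    mapie_reg = MapieRegressor(estimator=model, method="base", cv="prefit")
    mapie_reg.fit(X_calib, y_calib)
    _, y_pis = mapie_reg.predict(X_test, alpha=0.05)  # target coverage: 0.95
    coverage = regression_coverage_score(y_test, y_pis[:, 0, 0], y_pis[:, 1, 0])
    print(coverage)  # compare against COVERAGES["prefit"]
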
@@ -260,7 +261,7 @@ def test_predict_output_shape(


 @pytest.mark.parametrize("delta", [0.6, 0.8])
-@pytest.mark.parametrize("n_calib", [10 + i for i in range(11)] + [50, 100])
+@pytest.mark.parametrize("n_calib", [10 + i for i in range(13)] + [50, 100])
 def test_coverage_validity(delta: float, n_calib: int) -> None:
     """
     Test that the prefit method provides valid coverage
@@ -269,33 +270,34 @@ def test_coverage_validity(delta: float, n_calib: int) -> None:
     n_split, n_train, n_test = 100, 100, 1000
     n_all = n_train + n_calib + n_test
     X, y = make_regression(n_all, random_state=random_state)
-
-    X_train, X_cal_test, y_train, y_cal_test = \
-        train_test_split(X, y, train_size=n_train, random_state=random_state)
+    Xtr, Xct, ytr, yct = train_test_split(
+        X, y, train_size=n_train, random_state=random_state
+    )

     model = LinearRegression()
-    model.fit(X_train, y_train)
+    model.fit(Xtr, ytr)

     cov_list = []
     for _ in range(n_split):
         mapie_reg = MapieRegressor(estimator=model, method="base", cv="prefit")
-        X_cal, X_test, y_cal, y_test = \
-            train_test_split(X_cal_test, y_cal_test, test_size=n_test)
-        mapie_reg.fit(X_cal, y_cal)
-        _, y_pis = mapie_reg.predict(X_test, alpha=1 - delta)
-        coverage = \
-            regression_coverage_score(y_test, y_pis[:, 0, 0], y_pis[:, 1, 0])
+        Xc, Xt, yc, yt = train_test_split(Xct, yct, test_size=n_test)
+        mapie_reg.fit(Xc, yc)
+        _, y_pis = mapie_reg.predict(Xt, alpha=1 - delta)
+        y_low, y_up = y_pis[:, 0, 0], y_pis[:, 1, 0]
+        coverage = regression_coverage_score(yt, y_low, y_up)
         cov_list.append(coverage)

     # Test that the average coverage is neither statistically lower than
     # delta nor higher than delta + 1 / (n_calib + 1).
-    from scipy.stats import ttest_1samp
     mean_low, mean_up = delta, delta + 1 / (n_calib + 1)
     _, pval_low = ttest_1samp(cov_list, popmean=mean_low, alternative='less')
     _, pval_up = ttest_1samp(cov_list, popmean=mean_up, alternative='greater')

-    np.testing.assert_array_less(0.01, pval_low)
-    np.testing.assert_array_less(0.01, pval_up)
+    # Bonferroni correction to control the family-wise error rate (FWER):
+    # this test is parametrized into 30 runs (2 deltas x 15 n_calib values).
+    p_fwer = 0.01  # probability of one or more false discoveries: 1%
+    p_bonf = p_fwer / 30
+    np.testing.assert_array_less(p_bonf, pval_low)
+    np.testing.assert_array_less(p_bonf, pval_up)


 def test_same_results_prefit_split() -> None:
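
The statistical reasoning behind this hunk: for a calibration set of size n_calib, split conformal prediction guarantees a marginal coverage between delta and delta + 1 / (n_calib + 1), so the test uses two one-sided one-sample t-tests over the n_split repetitions to check that the mean coverage is neither significantly below the lower bound nor significantly above the upper one, at a Bonferroni-corrected level. A self-contained sketch of that check, with synthetic coverages standing in for the MAPIE loop above:

    # Sketch of the two-sided validity check; the uniform coverages below
    # are a synthetic stand-in for the cov_list gathered by the test.
    import numpy as np
    from scipy.stats import ttest_1samp

    delta, n_calib, n_split = 0.8, 20, 100
    mean_low = delta                     # lower coverage bound
    mean_up = delta + 1 / (n_calib + 1)  # upper coverage bound

    rng = np.random.default_rng(0)
    cov_list = rng.uniform(mean_low, mean_up, size=n_split)

    # p-value is small only if mean coverage is significantly below mean_low...
    _, pval_low = ttest_1samp(cov_list, popmean=mean_low, alternative="less")
    # ...or significantly above mean_up.
    _, pval_up = ttest_1samp(cov_list, popmean=mean_up, alternative="greater")

    p_bonf = 0.01 / 30  # FWER of 1% split across the 30 parametrized runs
    np.testing.assert_array_less(p_bonf, pval_low)
    np.testing.assert_array_less(p_bonf, pval_up)

Small n_calib widens the admissible band 1 / (n_calib + 1), which is why the parametrization sweeps calibration sizes from 10 upward.
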
@@ -562,7 +564,7 @@ def test_results_prefit_naive() -> None:


 def test_results_prefit() -> None:
-    """Test prefit results on a standard train/validation/test split."""
+    """Test prefit results on a standard train/calibration split."""
     X_train, X_calib, y_train, y_calib = train_test_split(
         X, y, test_size=1 / 2, random_state=1
     )