Skip to content

Commit 4733b0b

Browse files
Merge pull request #453 from scikit-learn-contrib/452-coverage-validity
FIX bug #452: Quantile Correction
2 parents 1110092 + 4e5b96d commit 4733b0b

File tree

6 files changed

+151
-93
lines changed

6 files changed

+151
-93
lines changed

HISTORY.rst

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,10 @@
22
History
33
=======
44

5-
0.8.3 (2024-**-**)
5+
0.8.4 (2024-**-**)
66
------------------
77

8+
* Fix the quantile formula to ensure valid coverage for any number of calibration data in `ConformityScore`.
89
* Fix conda versionning.
910
* Reduce precision for test in `MapieCalibrator`.
1011
* Fix invalid certificate when downloading data.

mapie/conformity_scores/conformity_scores.py

Lines changed: 42 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -214,7 +214,7 @@ def get_quantile(
214214
conformity_scores: NDArray,
215215
alpha_np: NDArray,
216216
axis: int,
217-
method: str
217+
reversed: bool = False
218218
) -> NDArray:
219219
"""
220220
Compute the alpha quantile of the conformity scores or the conformity
@@ -235,25 +235,32 @@ def get_quantile(
235235
axis: int
236236
The axis from which to compute the quantile.
237237
238-
method: str
239-
``"higher"`` or ``"lower"`` the method to compute the quantile.
238+
reversed: bool
239+
Boolean specifying whether we take the upper or lower quantile,
240+
if False, the alpha quantile, otherwise the (1-alpha) quantile.
240241
241242
Returns
242243
-------
243244
NDArray of shape (1, n_alpha) or (n_samples, n_alpha)
244245
The quantile of the conformity scores.
245246
"""
246-
n_ref = conformity_scores.shape[-1]
247-
quantile = np.column_stack([
247+
n_ref = conformity_scores.shape[1-axis]
248+
n_calib = np.min(np.sum(~np.isnan(conformity_scores), axis=axis))
249+
signed = 1-2*reversed
250+
251+
# Adapt alpha w.r.t upper/lower : alpha vs. 1-alpha
252+
alpha_ref = (1-2*alpha_np)*reversed + alpha_np
253+
254+
# Adjust alpha w.r.t quantile correction
255+
alpha_ref = np.ceil(alpha_ref*(n_calib+1))/n_calib
256+
257+
# Compute the target quantiles
258+
quantile = signed * np.column_stack([
248259
np_nanquantile(
249-
conformity_scores.astype(float),
250-
_alpha,
251-
axis=axis,
252-
method=method
260+
signed * conformity_scores, _alpha, axis=axis, method="lower"
253261
) if 0 < _alpha < 1
254-
else np.inf * np.ones(n_ref) if method == "higher"
255-
else - np.inf * np.ones(n_ref)
256-
for _alpha in alpha_np
262+
else np.inf * np.ones(n_ref)
263+
for _alpha in alpha_ref
257264
])
258265
return quantile
259266

@@ -281,7 +288,7 @@ def _beta_optimize(
281288
-------
282289
NDArray
283290
Array of betas minimizing the differences
284-
``(1-alpa+beta)-quantile - beta-quantile``.
291+
``(1-alpha+beta)-quantile - beta-quantile``.
285292
"""
286293
beta_np = np.full(
287294
shape=(len(lower_bounds), len(alpha_np)),
@@ -405,26 +412,34 @@ def get_bounds(
405412
X, y_pred_up, conformity_scores
406413
)
407414
bound_low = self.get_quantile(
408-
conformity_scores_low, alpha_low, axis=1, method="lower"
415+
conformity_scores_low, alpha_low, axis=1, reversed=True
409416
)
410417
bound_up = self.get_quantile(
411-
conformity_scores_up, alpha_up, axis=1, method="higher"
418+
conformity_scores_up, alpha_up, axis=1
412419
)
420+
413421
else:
414-
quantile_search = "higher" if self.sym else "lower"
415-
alpha_low = 1 - alpha_np if self.sym else beta_np
416-
alpha_up = 1 - alpha_np if self.sym else 1 - alpha_np + beta_np
422+
if self.sym:
423+
alpha_ref = 1-alpha_np
424+
quantile_ref = self.get_quantile(
425+
conformity_scores[..., np.newaxis], alpha_ref, axis=0
426+
)
427+
quantile_low, quantile_up = -quantile_ref, quantile_ref
428+
429+
else:
430+
alpha_low, alpha_up = beta_np, 1 - alpha_np + beta_np
431+
432+
quantile_low = self.get_quantile(
433+
conformity_scores[..., np.newaxis],
434+
alpha_low, axis=0, reversed=True
435+
)
436+
quantile_up = self.get_quantile(
437+
conformity_scores[..., np.newaxis],
438+
alpha_up, axis=0
439+
)
417440

418-
quantile_low = self.get_quantile(
419-
conformity_scores[..., np.newaxis],
420-
alpha_low, axis=0, method=quantile_search
421-
)
422-
quantile_up = self.get_quantile(
423-
conformity_scores[..., np.newaxis],
424-
alpha_up, axis=0, method="higher"
425-
)
426441
bound_low = self.get_estimation_distribution(
427-
X, y_pred_low, signed * quantile_low
442+
X, y_pred_low, quantile_low
428443
)
429444
bound_up = self.get_estimation_distribution(
430445
X, y_pred_up, quantile_up

mapie/regression/regression.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -628,7 +628,7 @@ def predict(
628628

629629
alpha_np = cast(NDArray, alpha)
630630
if not allow_infinite_bounds:
631-
n = len(self.conformity_scores_)
631+
n = np.sum(~np.isnan(self.conformity_scores_))
632632
check_alpha_and_n_samples(alpha_np, n)
633633

634634
y_pred, y_pred_low, y_pred_up = \

mapie/tests/test_conformity_scores.py

Lines changed: 7 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,3 @@
1-
from typing import Any
2-
31
import numpy as np
42
import pytest
53
from sklearn.linear_model import LinearRegression
@@ -382,18 +380,17 @@ def test_residual_normalised_prefit_get_estimation_distribution() -> None:
382380
@pytest.mark.parametrize("score", [AbsoluteConformityScore(),
383381
GammaConformityScore(),
384382
ResidualNormalisedScore()])
385-
@pytest.mark.parametrize("alpha", [[0.3], [0.5, 0.4]])
383+
@pytest.mark.parametrize("alpha", [[0.5], [0.5, 0.6]])
386384
def test_intervals_shape_with_every_score(
387385
score: ConformityScore,
388-
alpha: Any
386+
alpha: NDArray
389387
) -> None:
388+
estim = LinearRegression().fit(X_toy, y_toy)
390389
mapie_reg = MapieRegressor(
391-
method="base", cv="split", conformity_score=score
390+
estimator=estim, method="base", cv="prefit", conformity_score=score
392391
)
393-
X = np.concatenate((X_toy, X_toy))
394-
y = np.concatenate((y_toy, y_toy))
395-
mapie_reg = mapie_reg.fit(X, y)
396-
y_pred, intervals = mapie_reg.predict(X, alpha=alpha)
397-
n_samples = X.shape[0]
392+
mapie_reg = mapie_reg.fit(X_toy, y_toy)
393+
y_pred, intervals = mapie_reg.predict(X_toy, alpha=alpha)
394+
n_samples = X_toy.shape[0]
398395
assert y_pred.shape[0] == n_samples
399396
assert intervals.shape == (n_samples, 2, len(alpha))

mapie/tests/test_regression.py

Lines changed: 83 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
from sklearn.pipeline import Pipeline, make_pipeline
1919
from sklearn.preprocessing import OneHotEncoder
2020
from sklearn.utils.validation import check_is_fitted
21+
from scipy.stats import ttest_1samp
2122
from typing_extensions import TypedDict
2223

2324
from mapie._typing import NDArray
@@ -100,6 +101,13 @@
100101
test_size=None,
101102
random_state=random_state
102103
),
104+
"cv_plus_median": Params(
105+
method="plus",
106+
agg_function="median",
107+
cv=KFold(n_splits=3, shuffle=True, random_state=random_state),
108+
test_size=None,
109+
random_state=random_state
110+
),
103111
"cv_minmax": Params(
104112
method="minmax",
105113
agg_function="mean",
@@ -131,35 +139,35 @@
131139
}
132140

133141
WIDTHS = {
134-
"naive": 3.81,
135-
"split": 3.87,
142+
"naive": 3.80,
143+
"split": 3.89,
136144
"jackknife": 3.89,
137145
"jackknife_plus": 3.90,
138146
"jackknife_minmax": 3.96,
139-
"cv": 3.85,
140-
"cv_plus": 3.90,
141-
"cv_minmax": 4.04,
142-
"prefit": 4.81,
143-
"cv_plus_median": 3.90,
147+
"cv": 3.88,
148+
"cv_plus": 3.91,
149+
"cv_minmax": 4.07,
150+
"prefit": 3.89,
151+
"cv_plus_median": 3.91,
144152
"jackknife_plus_ab": 3.90,
145-
"jackknife_minmax_ab": 4.13,
146-
"jackknife_plus_median_ab": 3.87,
153+
"jackknife_minmax_ab": 4.14,
154+
"jackknife_plus_median_ab": 3.88,
147155
}
148156

149157
COVERAGES = {
150-
"naive": 0.952,
151-
"split": 0.952,
152-
"jackknife": 0.952,
158+
"naive": 0.954,
159+
"split": 0.956,
160+
"jackknife": 0.956,
153161
"jackknife_plus": 0.952,
154-
"jackknife_minmax": 0.952,
155-
"cv": 0.958,
156-
"cv_plus": 0.956,
157-
"cv_minmax": 0.966,
158-
"prefit": 0.980,
162+
"jackknife_minmax": 0.962,
163+
"cv": 0.954,
164+
"cv_plus": 0.954,
165+
"cv_minmax": 0.962,
166+
"prefit": 0.956,
159167
"cv_plus_median": 0.954,
160168
"jackknife_plus_ab": 0.952,
161-
"jackknife_minmax_ab": 0.970,
162-
"jackknife_plus_median_ab": 0.960,
169+
"jackknife_minmax_ab": 0.968,
170+
"jackknife_plus_median_ab": 0.952,
163171
}
164172

165173

@@ -212,15 +220,15 @@ def test_valid_agg_function(agg_function: str) -> None:
212220

213221
@pytest.mark.parametrize(
214222
"cv", [None, -1, 2, KFold(), LeaveOneOut(),
215-
ShuffleSplit(n_splits=1),
223+
ShuffleSplit(n_splits=1, test_size=0.5),
216224
PredefinedSplit(test_fold=[-1]*3+[0]*3),
217225
"prefit", "split"]
218226
)
219227
def test_valid_cv(cv: Any) -> None:
220228
"""Test that valid cv raise no errors."""
221229
model = LinearRegression()
222230
model.fit(X_toy, y_toy)
223-
mapie_reg = MapieRegressor(estimator=model, cv=cv)
231+
mapie_reg = MapieRegressor(estimator=model, cv=cv, test_size=0.5)
224232
mapie_reg.fit(X_toy, y_toy)
225233
mapie_reg.predict(X_toy, alpha=0.5)
226234

@@ -237,7 +245,7 @@ def test_too_large_cv(cv: Any) -> None:
237245

238246

239247
@pytest.mark.parametrize("strategy", [*STRATEGIES])
240-
@pytest.mark.parametrize("dataset", [(X, y), (X_toy, y_toy)])
248+
@pytest.mark.parametrize("dataset", [(X, y)])
241249
@pytest.mark.parametrize("alpha", [0.2, [0.2, 0.4], (0.2, 0.4)])
242250
def test_predict_output_shape(
243251
strategy: str, alpha: Any, dataset: Tuple[NDArray, NDArray]
@@ -252,6 +260,46 @@ def test_predict_output_shape(
252260
assert y_pis.shape == (X.shape[0], 2, n_alpha)
253261

254262

263+
@pytest.mark.parametrize("delta", [0.6, 0.8])
264+
@pytest.mark.parametrize("n_calib", [10 + i for i in range(13)] + [50, 100])
265+
def test_coverage_validity(delta: float, n_calib: int) -> None:
266+
"""
267+
Test that the prefit method provides valid coverage
268+
for different calibration data sizes and coverage targets.
269+
"""
270+
n_split, n_train, n_test = 100, 100, 1000
271+
n_all = n_train + n_calib + n_test
272+
X, y = make_regression(n_all, random_state=random_state)
273+
Xtr, Xct, ytr, yct = train_test_split(
274+
X, y, train_size=n_train, random_state=random_state
275+
)
276+
277+
model = LinearRegression()
278+
model.fit(Xtr, ytr)
279+
280+
cov_list = []
281+
for _ in range(n_split):
282+
mapie_reg = MapieRegressor(estimator=model, method="base", cv="prefit")
283+
Xc, Xt, yc, yt = train_test_split(Xct, yct, test_size=n_test)
284+
mapie_reg.fit(Xc, yc)
285+
_, y_pis = mapie_reg.predict(Xt, alpha=1-delta)
286+
y_low, y_up = y_pis[:, 0, 0], y_pis[:, 1, 0]
287+
coverage = regression_coverage_score(yt, y_low, y_up)
288+
cov_list.append(coverage)
289+
290+
# Here we are testing whether the average coverage is statistically
291+
# less than the target coverage.
292+
mean_low, mean_up = delta, delta + 1/(n_calib+1)
293+
_, pval_low = ttest_1samp(cov_list, popmean=mean_low, alternative='less')
294+
_, pval_up = ttest_1samp(cov_list, popmean=mean_up, alternative='greater')
295+
296+
# We perform a FWER controlling procedure (Bonferroni)
297+
p_fwer = 0.01 # probability of making one or more false discoveries: 1%
298+
p_bonf = p_fwer / 30 # because a total of 30 test_coverage_validity
299+
np.testing.assert_array_less(p_bonf, pval_low)
300+
np.testing.assert_array_less(p_bonf, pval_up)
301+
302+
255303
def test_same_results_prefit_split() -> None:
256304
"""
257305
Test checking that if split and prefit method have exactly
@@ -265,12 +313,12 @@ def test_same_results_prefit_split() -> None:
265313
X_train, X_calib = X[train_index], X[val_index]
266314
y_train, y_calib = y[train_index], y[val_index]
267315

268-
mapie_reg = MapieRegressor(cv=cv)
316+
mapie_reg = MapieRegressor(method='base', cv=cv)
269317
mapie_reg.fit(X, y)
270318
y_pred_1, y_pis_1 = mapie_reg.predict(X, alpha=0.1)
271319

272320
model = LinearRegression().fit(X_train, y_train)
273-
mapie_reg = MapieRegressor(estimator=model, cv="prefit")
321+
mapie_reg = MapieRegressor(estimator=model, method='base', cv="prefit")
274322
mapie_reg.fit(X_calib, y_calib)
275323
y_pred_2, y_pis_2 = mapie_reg.predict(X, alpha=0.1)
276324

@@ -334,8 +382,8 @@ def test_results_single_and_multi_jobs(strategy: str) -> None:
334382
mapie_multi = MapieRegressor(n_jobs=-1, **STRATEGIES[strategy])
335383
mapie_single.fit(X_toy, y_toy)
336384
mapie_multi.fit(X_toy, y_toy)
337-
y_pred_single, y_pis_single = mapie_single.predict(X_toy, alpha=0.2)
338-
y_pred_multi, y_pis_multi = mapie_multi.predict(X_toy, alpha=0.2)
385+
y_pred_single, y_pis_single = mapie_single.predict(X_toy, alpha=0.5)
386+
y_pred_multi, y_pis_multi = mapie_multi.predict(X_toy, alpha=0.5)
339387
np.testing.assert_allclose(y_pred_single, y_pred_multi)
340388
np.testing.assert_allclose(y_pis_single, y_pis_multi)
341389

@@ -463,7 +511,7 @@ def test_linear_data_confidence_interval(strategy: str) -> None:
463511
"""
464512
mapie = MapieRegressor(**STRATEGIES[strategy])
465513
mapie.fit(X_toy, y_toy)
466-
y_pred, y_pis = mapie.predict(X_toy, alpha=0.2)
514+
y_pred, y_pis = mapie.predict(X_toy, alpha=0.5)
467515
np.testing.assert_allclose(y_pis[:, 0, 0], y_pis[:, 1, 0])
468516
np.testing.assert_allclose(y_pred, y_pis[:, 0, 0])
469517

@@ -506,7 +554,7 @@ def test_results_prefit_naive() -> None:
506554
is equivalent to the "naive" method.
507555
"""
508556
estimator = LinearRegression().fit(X, y)
509-
mapie_reg = MapieRegressor(estimator=estimator, cv="prefit")
557+
mapie_reg = MapieRegressor(estimator=estimator, method="base", cv="prefit")
510558
mapie_reg.fit(X, y)
511559
_, y_pis = mapie_reg.predict(X, alpha=0.05)
512560
width_mean = (y_pis[:, 1, 0] - y_pis[:, 0, 0]).mean()
@@ -516,20 +564,17 @@ def test_results_prefit_naive() -> None:
516564

517565

518566
def test_results_prefit() -> None:
519-
"""Test prefit results on a standard train/validation/test split."""
520-
X_train_val, X_test, y_train_val, y_test = train_test_split(
521-
X, y, test_size=1 / 10, random_state=1
522-
)
523-
X_train, X_val, y_train, y_val = train_test_split(
524-
X_train_val, y_train_val, test_size=1 / 9, random_state=1
567+
"""Test prefit results on a standard train/calibration split."""
568+
X_train, X_calib, y_train, y_calib = train_test_split(
569+
X, y, test_size=1/2, random_state=1
525570
)
526571
estimator = LinearRegression().fit(X_train, y_train)
527-
mapie_reg = MapieRegressor(estimator=estimator, cv="prefit")
528-
mapie_reg.fit(X_val, y_val)
529-
_, y_pis = mapie_reg.predict(X_test, alpha=0.05)
572+
mapie_reg = MapieRegressor(estimator=estimator, method="base", cv="prefit")
573+
mapie_reg.fit(X_calib, y_calib)
574+
_, y_pis = mapie_reg.predict(X_calib, alpha=0.05)
530575
width_mean = (y_pis[:, 1, 0] - y_pis[:, 0, 0]).mean()
531576
coverage = regression_coverage_score(
532-
y_test, y_pis[:, 0, 0], y_pis[:, 1, 0]
577+
y_calib, y_pis[:, 0, 0], y_pis[:, 1, 0]
533578
)
534579
np.testing.assert_allclose(width_mean, WIDTHS["prefit"], rtol=1e-2)
535580
np.testing.assert_allclose(coverage, COVERAGES["prefit"], rtol=1e-2)

0 commit comments

Comments
 (0)