Commit 40fa529

Author: gmartinonQM
Commit message: enforce 80-character line length
1 parent 686c3ea commit 40fa529

File tree

12 files changed: +470 -188 lines changed


.flake8

Lines changed: 0 additions & 2 deletions
This file was deleted.
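The contents of the deleted .flake8 are not shown in this diff. A two-line flake8 config of this kind usually just overrides the default line-length limit, along the lines of the following hypothetical example (an assumption, not the actual file contents):

[flake8]
max-line-length = 100

With the file gone, flake8 falls back to its default 79-character limit, which matches the intent of this commit.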

README.rst

Lines changed: 14 additions & 4 deletions
@@ -112,11 +112,21 @@ The estimated prediction intervals can then be plotted as follows.
 order = np.argsort(X[:, 0])
 plt.plot(X[order], y_preds[order][:, 1, 1], color="C1", ls="--")
 plt.plot(X[order], y_preds[order][:, 2, 1], color="C1", ls="--")
-plt.fill_between(X[order].ravel(), y_preds[:, 1, 0][order].ravel(), y_preds[:, 2, 0][order].ravel(), alpha=0.2)
-coverage_scores = [coverage_score(y, y_preds[:, 1, i], y_preds[:, 2, i]) for i, _ in enumerate(alpha)]
+plt.fill_between(
+    X[order].ravel(),
+    y_preds[:, 1, 0][order].ravel(),
+    y_preds[:, 2, 0][order].ravel(),
+    alpha=0.2
+)
+coverage_scores = [
+    coverage_score(y, y_preds[:, 1, i], y_preds[:, 2, i])
+    for i, _ in enumerate(alpha)
+]
 plt.title(
-    f"Target and effective coverages for alpha={alpha[0]:.2f}: ({1-alpha[0]:.3f}, {coverage_scores[0]:.3f})\n" +
-    f"Target and effective coverages for alpha={alpha[1]:.2f}: ({1-alpha[1]:.3f}, {coverage_scores[1]:.3f})"
+    f"Target and effective coverages for "
+    f"alpha={alpha[0]:.2f}: ({1-alpha[0]:.3f}, {coverage_scores[0]:.3f})\n"
+    f"Target and effective coverages for "
+    f"alpha={alpha[1]:.2f}: ({1-alpha[1]:.3f}, {coverage_scores[1]:.3f})"
 )
 plt.show()
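The wrapped title relies on Python's implicit concatenation of adjacent string literals, so the rendered text is unchanged; note the old version joined the two lines with an explicit "+", which the new version drops for the same effect. A minimal, self-contained sketch (with illustrative alpha values, not the ones from the README):

# Adjacent (f-)string literals are joined at compile time, so the
# wrapped form below is identical to the original one-liner.
alpha = [0.05, 0.32]  # illustrative values
title = (
    f"Target and effective coverages for "
    f"alpha={alpha[0]:.2f}"
)
assert title == f"Target and effective coverages for alpha={alpha[0]:.2f}"
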
examples/plot_barber2020_simulations.py

Lines changed: 43 additions & 22 deletions
@@ -16,7 +16,8 @@
 available strategies as function of the dataset dimension.
 
 We then show the prediction interval coverages and widths as a function of the
-dimension values for selected strategies with standard error given by the different trials.
+dimension values for selected strategies with standard error given by
+the different trials.
 
 This simulation is carried out to emphasize the instability of the prediction
 intervals estimated by the Jackknife strategy when the dataset dimension is
@@ -42,25 +43,30 @@ def PIs_vs_dimensions(
     Compute the prediction intervals for a linear regression problem.
     Function adapted from Foygel-Barber et al. (2020).
 
-    It generates several times linear data with random noise whose signal-to-noise
-    is equal to 10 and for several given dimensions, given by the dimensions list.
+    It generates several times linear data with random noise whose
+    signal-to-noise is equal to 10 and for several given dimensions,
+    given by the dimensions list.
 
-    Here we use MAPIE, with a LinearRegression base model, to estimate the width
-    means and the coverage levels of the prediction intervals estimated by all the
-    available strategies as a function of the dataset dimension.
+    Here we use MAPIE, with a LinearRegression base model, to estimate
+    the width means and the coverage levels of the prediction intervals
+    estimated by all the available strategies as a function of
+    the dataset dimension.
 
-    This simulation is carried out to emphasize the instability of the prediction
-    intervals estimated by the Jackknife strategy when the dataset dimension is
-    equal to the number of training samples (here 100).
+    This simulation is carried out to emphasize the instability
+    of the prediction intervals estimated by the Jackknife strategy
+    when the dataset dimension is equal to the number
+    of training samples (here 100).
 
     Parameters
     ----------
     strategies : Dict[str, Dict[str, Any]]
-        List of strategies for estimating prediction intervals, with corresponding parameters.
+        List of strategies for estimating prediction intervals,
+        with corresponding parameters.
     alpha : float
         1 - (target coverage level).
     n_trial : int
-        Number of trials for each dimension for estimating prediction intervals.
+        Number of trials for each dimension for estimating
+        prediction intervals.
         For each trial, a new random noise is generated.
     dimensions : List[int]
         List of dimension values of input data.
@@ -104,8 +110,8 @@ def PIs_vs_dimensions(
                 )
                 mapie.fit(X_train, y_train)
                 y_preds = mapie.predict(X_test)[:, :, 0]
-                results[strategy][dimension]["coverage"][trial] = coverage_score(
-                    y_test, y_preds[:, 1], y_preds[:, 2]
+                results[strategy][dimension]["coverage"][trial] = (
+                    coverage_score(y_test, y_preds[:, 1], y_preds[:, 2])
                 )
                 results[strategy][dimension]["width_mean"][trial] = (
                     y_preds[:, 2] - y_preds[:, 1]
@@ -118,8 +124,9 @@ def plot_simulation_results(
     title: str
 ) -> None:
     """
-    Show the prediction interval coverages and widths as a function of dimension values
-    for selected strategies with standard error given by different trials.
+    Show the prediction interval coverages and widths as a function
+    of dimension values for selected strategies with standard error
+    given by different trials.
 
     Parameters
     ----------
@@ -138,18 +145,32 @@
         coverage_mean, coverage_SE, width_mean, width_SE = (
            np.zeros(n_dim), np.zeros(n_dim), np.zeros(n_dim), np.zeros(n_dim)
        )
-        for idim, dimension in enumerate(dimensions):
-            coverage_mean[idim] = results[strategy][dimension]["coverage"].mean()
-            coverage_SE[idim] = results[strategy][dimension]["coverage"].std()/np.sqrt(ntrial)
-            width_mean[idim] = results[strategy][dimension]["width_mean"].mean()
-            width_SE[idim] = results[strategy][dimension]["width_mean"].std()/np.sqrt(ntrial)
+        for idim, dim in enumerate(dimensions):
+            coverage_mean[idim] = (
+                results[strategy][dim]["coverage"].mean()
+            )
+            coverage_SE[idim] = (
+                results[strategy][dim]["coverage"].std()/np.sqrt(ntrial)
+            )
+            width_mean[idim] = (
+                results[strategy][dim]["width_mean"].mean()
+            )
+            width_SE[idim] = (
+                results[strategy][dim]["width_mean"].std()/np.sqrt(ntrial)
+            )
         ax1.plot(dimensions, coverage_mean, label=strategy)
         ax1.fill_between(
-            dimensions, coverage_mean - coverage_SE, coverage_mean + coverage_SE, alpha=0.25
+            dimensions,
+            coverage_mean - coverage_SE,
+            coverage_mean + coverage_SE,
+            alpha=0.25
         )
         ax2.plot(dimensions, width_mean, label=strategy)
         ax2.fill_between(
-            dimensions, width_mean - width_SE, width_mean + width_SE, alpha=0.25
+            dimensions,
+            width_mean - width_SE,
+            width_mean + width_SE,
+            alpha=0.25
         )
         ax1.axhline(1 - alpha, linestyle="dashed", c="k")
         ax1.set_ylim(0.0, 1.0)
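The plotting loop above aggregates each strategy's per-trial results into a mean and a standard error (std divided by the square root of the number of trials) for the shaded error bands. A minimal numpy sketch of that aggregation, with synthetic coverages standing in for results[strategy][dim]["coverage"]:

import numpy as np

rng = np.random.default_rng(0)
n_trials = 10
# Synthetic stand-in for the per-trial coverages of one strategy/dimension.
coverages = rng.uniform(0.85, 0.95, size=n_trials)

coverage_mean = coverages.mean()
coverage_se = coverages.std() / np.sqrt(n_trials)  # standard error across trials
print(f"coverage = {coverage_mean:.3f} +/- {coverage_se:.3f}")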

examples/plot_nested-cv.py

Lines changed: 39 additions & 26 deletions
@@ -11,36 +11,37 @@
 The model with the set of parameters that gives the best score is then used in
 MAPIE to estimate the prediction intervals associated with the predictions.
 A limitation of this method is that residuals used by MAPIE are computed on
-the validation dataset, which can be subject to overfitting as far as hyperparameter
-tuning is concerned.
+the validation dataset, which can be subject to overfitting as far as
+hyperparameter tuning is concerned.
 This fools MAPIE into being slightly too optimistic with confidence intervals.
 
 To solve this problem, an alternative option is to perform a nested
 cross-validation parameter search directly within the MAPIE estimator on each
 *out-of-fold* dataset.
-For each testing fold used by MAPIE to store residuals, an internal cross-validation
-occurs on the training fold, optimizing hyperparameters.
-This ensures that residuals seen by MAPIE are never seen by the algorithm beforehand.
-However, this method is much computationally heavier since it results in
-:math:`N * P` calculations, where *N* is the number of *out-of-fold*
-models and *P* the number of parameter search iterations, versus :math:`N + P`
-for the non-nested approach.
+For each testing fold used by MAPIE to store residuals, an internal
+cross-validation occurs on the training fold, optimizing hyperparameters.
+This ensures that residuals seen by MAPIE are never seen by the algorithm
+beforehand. However, this method is much computationally heavier since
+it results in :math:`N * P` calculations, where *N* is the number of
+*out-of-fold* models and *P* the number of parameter search iterations,
+versus :math:`N + P` for the non-nested approach.
 
 Here, we compare the two strategies on the Boston dataset. We use the Random
-Forest Regressor as a base regressor for the CV+ strategy. For the sake of light
-computation, we adopt a RandomizedSearchCV parameter search strategy with a low
-number of iterations and with a reproducible random state.
+Forest Regressor as a base regressor for the CV+ strategy. For the sake of
+light computation, we adopt a RandomizedSearchCV parameter search strategy
+with a low number of iterations and with a reproducible random state.
 
-The two approaches give slightly different predictions with the nested CV approach
-estimating slightly larger prediction interval widths by a few percents at most (apart from a
-handful of exceptions).
+The two approaches give slightly different predictions with the nested CV
+approach estimating slightly larger prediction interval widths by a
+few percents at most (apart from a handful of exceptions).
 
-For this example, the two approaches result in identical scores and identical effective
-coverages.
+For this example, the two approaches result in identical scores and identical
+effective coverages.
 
-In the general case, the recommended approach is to use nested cross-validation, since it
-does not underestimate residuals and hence prediction intervals. However, in this particular
-example, effective coverages of both nested and non-nested methods are the same.
+In the general case, the recommended approach is to use nested
+cross-validation, since it does not underestimate residuals and hence
+prediction intervals. However, in this particular example, effective
+coverages of both nested and non-nested methods are the same.
 """
 
 import matplotlib.pyplot as plt
@@ -98,8 +99,12 @@
 mapie_non_nested.fit(X_train, y_train)
 y_preds_non_nested = mapie_non_nested.predict(X_test)[:, :, 0]
 widths_non_nested = y_preds_non_nested[:, 2] - y_preds_non_nested[:, 1]
-coverage_non_nested = coverage_score(y_test, y_preds_non_nested[:, 1], y_preds_non_nested[:, 2])
-score_non_nested = mean_squared_error(y_test, y_preds_non_nested[:, 0], squared=False)
+coverage_non_nested = coverage_score(
+    y_test, y_preds_non_nested[:, 1], y_preds_non_nested[:, 2]
+)
+score_non_nested = mean_squared_error(
+    y_test, y_preds_non_nested[:, 0], squared=False
+)
 
 # Nested approach with the CV+ strategy using the Random Forest model.
 cv_obj = RandomizedSearchCV(
@@ -123,17 +128,23 @@
 mapie_nested.fit(X_train, y_train)
 y_preds_nested = mapie_nested.predict(X_test)[:, :, 0]
 widths_nested = y_preds_nested[:, 2] - y_preds_nested[:, 1]
-coverage_nested = coverage_score(y_test, y_preds_nested[:, 1], y_preds_nested[:, 2])
+coverage_nested = coverage_score(
+    y_test, y_preds_nested[:, 1], y_preds_nested[:, 2]
+)
 score_nested = mean_squared_error(y_test, y_preds_nested[:, 0], squared=False)
 
 # Print scores and effective coverages.
-print("Scores and effective coverages for the CV+ strategy using the Random Forest model.")
+print(
+    "Scores and effective coverages for the CV+ strategy using the "
+    "Random Forest model."
+)
 print(
     "Score on the test set for the non-nested and nested CV approaches: ",
     f"{score_non_nested: .3f}, {score_nested: .3f}"
 )
 print(
-    "Effective coverage on the test set for the non-nested and nested CV approaches: ",
+    "Effective coverage on the test set for the non-nested "
+    "and nested CV approaches: ",
     f"{coverage_non_nested: .3f}, {coverage_nested: .3f}"
 )
 
@@ -146,7 +157,9 @@
 ax1.set_ylim([min_x, max_x])
 ax1.scatter(widths_nested, widths_non_nested)
 ax1.plot([min_x, max_x], [min_x, max_x], ls="--", color="k")
-ax2.set_xlabel("[width(non-nested CV) - width(nested CV)] / width(non-nested CV)")
+ax2.set_xlabel(
+    "[width(non-nested CV) - width(nested CV)] / width(non-nested CV)"
+)
 ax2.set_ylabel("Counts")
 ax2.hist((widths_non_nested - widths_nested)/widths_non_nested, bins=15)
 plt.show()
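The docstring's N * P versus N + P comparison is easy to make concrete. A back-of-the-envelope sketch with illustrative numbers (not the ones used in the example):

# N: out-of-fold models fitted by MAPIE; P: parameter search iterations.
n_folds = 10
n_param_iter = 50

nested_fits = n_folds * n_param_iter      # 500 fits: one search per fold
non_nested_fits = n_folds + n_param_iter  # 60 fits: one search, then N refits
print(nested_fits, non_nested_fits)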

examples/plot_prefit_nn.py

Lines changed: 26 additions & 6 deletions
@@ -33,8 +33,12 @@ def f(x: np.ndarray) -> np.ndarray:
 y = f(X) + np.random.normal(0, sigma, n_samples)
 
 # Train/validation/test split
-X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=1/10)
-X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=1/9)
+X_train_val, X_test, y_train_val, y_test = train_test_split(
+    X, y, test_size=1/10
+)
+X_train, X_val, y_train, y_val = train_test_split(
+    X_train_val, y_train_val, test_size=1/9
+)
 
 # Train model on training set
 model = MLPRegressor(activation="relu", random_state=1)
@@ -55,13 +59,29 @@ def f(x: np.ndarray) -> np.ndarray:
 order = np.argsort(X_test)
 
 plt.scatter(X_test, y_test, color="red", alpha=0.3, label="testing", s=2)
-plt.plot(X_test[order], y_test_theoretical[order], color="gray", label="True confidence intervals")
-plt.plot(X_test[order], y_test_theoretical[order] - theoretical_semi_width, color="gray", ls="--")
-plt.plot(X_test[order], y_test_theoretical[order] + theoretical_semi_width, color="gray", ls="--")
+plt.plot(
+    X_test[order],
+    y_test_theoretical[order],
+    color="gray",
+    label="True confidence intervals"
+)
+plt.plot(
+    X_test[order],
+    y_test_theoretical[order] - theoretical_semi_width,
+    color="gray",
+    ls="--"
+)
+plt.plot(
+    X_test[order],
+    y_test_theoretical[order] + theoretical_semi_width,
+    color="gray",
+    ls="--"
+)
 plt.plot(X_test[order], y_pred[order], label="Prediction intervals")
 plt.fill_between(X_test[order], y_pred_low[order], y_pred_up[order], alpha=0.2)
 plt.title(
-    f"Target and effective coverages for alpha={alpha}: ({1 - alpha:.3f}, {coverage:.3f})"
+    f"Target and effective coverages for "
+    f"alpha={alpha}: ({1 - alpha:.3f}, {coverage:.3f})"
 )
 plt.xlabel("x")
 plt.ylabel("y")
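The theoretical_semi_width referenced above is not computed anywhere in this diff. For homoscedastic Gaussian noise with standard deviation sigma, one common way to obtain it (an assumption about this example, not something the hunk shows) is the Gaussian quantile:

from scipy.stats import norm

alpha = 0.1   # illustrative miscoverage level
sigma = 0.5   # illustrative noise standard deviation
# Central (1 - alpha) interval around the true function f(x):
theoretical_semi_width = norm.ppf(1 - alpha / 2) * sigma
print(f"{theoretical_semi_width:.3f}")  # about 1.645 * 0.5 = 0.822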

examples/plot_toy_model.py

Lines changed: 14 additions & 4 deletions
@@ -29,10 +29,20 @@
 order = np.argsort(X[:, 0])
 plt.plot(X[order], y_preds[order][:, 1, 1], color="C1", ls="--")
 plt.plot(X[order], y_preds[order][:, 2, 1], color="C1", ls="--")
-plt.fill_between(X[order].ravel(), y_preds[:, 1, 0][order].ravel(), y_preds[:, 2, 0][order].ravel(), alpha=0.2)
-coverage_scores = [coverage_score(y, y_preds[:, 1, i], y_preds[:, 2, i]) for i, _ in enumerate(alpha)]
+plt.fill_between(
+    X[order].ravel(),
+    y_preds[:, 1, 0][order].ravel(),
+    y_preds[:, 2, 0][order].ravel(),
+    alpha=0.2
+)
+coverage_scores = [
+    coverage_score(y, y_preds[:, 1, i], y_preds[:, 2, i])
+    for i, _ in enumerate(alpha)
+]
 plt.title(
-    f"Target and effective coverages for alpha={alpha[0]:.2f}: ({1-alpha[0]:.3f}, {coverage_scores[0]:.3f})\n"
-    f"Target and effective coverages for alpha={alpha[1]:.2f}: ({1-alpha[1]:.3f}, {coverage_scores[1]:.3f})"
+    f"Target and effective coverages for "
+    f"alpha={alpha[0]:.2f}: ({1-alpha[0]:.3f}, {coverage_scores[0]:.3f})\n"
+    f"Target and effective coverages for "
+    f"alpha={alpha[1]:.2f}: ({1-alpha[1]:.3f}, {coverage_scores[1]:.3f})"
 )
 plt.show()
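Across these examples the columns of y_preds follow one convention: index 0 is the point prediction, index 1 the lower interval bound, index 2 the upper bound, and coverage_score reports the fraction of true values falling inside [lower, upper]. A plain-numpy sketch of that metric (assuming this is what MAPIE's coverage_score computes):

import numpy as np

def empirical_coverage(y_true, y_low, y_up):
    # Fraction of observations inside their prediction interval.
    return float(np.mean((y_true >= y_low) & (y_true <= y_up)))

y_true = np.array([1.0, 2.0, 3.0])
y_low = np.array([0.5, 2.5, 2.0])
y_up = np.array([1.5, 3.5, 4.0])
print(empirical_coverage(y_true, y_low, y_up))  # 2 of 3 covered -> 0.666...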
