Skip to content

Commit 1110092

Browse files
Merge pull request #461 from scikit-learn-contrib/460-documentation-interrupted-due-to-inaccessibility-of-the-bostom-dataset
Switch from the Boston dataset to a toy dataset
2 parents aeb7894 + 0c42280 commit 1110092

File tree

1 file changed

+11
-14
lines changed

1 file changed

+11
-14
lines changed

examples/regression/2-advanced-analysis/plot_nested-cv.py

Lines changed: 11 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
*out-of-fold* models and *P* the number of parameter search cross-validations,
2727
versus :math:`N + P` for the non-nested approach.
2828
29-
Here, we compare the two strategies on the Boston dataset. We use the Random
29+
Here, we compare the two strategies on a toy dataset. We use the Random
3030
Forest Regressor as a base regressor for the CV+ strategy. For the sake of
3131
light computation, we adopt a RandomizedSearchCV parameter search strategy
3232
with a low number of iterations and with a reproducible random state.
@@ -45,35 +45,34 @@
4545
"""
4646
import matplotlib.pyplot as plt
4747
import numpy as np
48-
import pandas as pd
4948
from scipy.stats import randint
5049
from sklearn.ensemble import RandomForestRegressor
5150
from sklearn.metrics import mean_squared_error
5251
from sklearn.model_selection import RandomizedSearchCV, train_test_split
52+
from sklearn.datasets import make_sparse_uncorrelated
5353

5454
from mapie.metrics import regression_coverage_score
5555
from mapie.regression import MapieRegressor
5656

57-
# Load the Boston data
58-
data_url = "http://lib.stat.cmu.edu/datasets/boston"
59-
raw_df = pd.read_csv(data_url, sep=r'\s+', skiprows=22, header=None)
60-
X_boston = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
61-
y_boston = raw_df.values[1::2, 2]
57+
58+
random_state = 42
59+
60+
# Load the toy data
61+
X, y = make_sparse_uncorrelated(500, random_state=random_state)
6262

6363
# Split the data into training and test sets.
6464
X_train, X_test, y_train, y_test = train_test_split(
65-
X_boston, y_boston, test_size=0.2, random_state=42
65+
X, y, test_size=0.2, random_state=random_state
6666
)
6767

6868
# Define the Random Forest model as base regressor with parameter ranges.
69-
rf_model = RandomForestRegressor(random_state=59, verbose=0)
69+
rf_model = RandomForestRegressor(random_state=random_state, verbose=0)
7070
rf_params = {"max_depth": randint(2, 10), "n_estimators": randint(10, 100)}
7171

7272
# Cross-validation and prediction-interval parameters.
7373
cv = 10
7474
n_iter = 5
7575
alpha = 0.05
76-
random_state = 59
7776

7877
# Non-nested approach with the CV+ strategy using the Random Forest model.
7978
cv_obj = RandomizedSearchCV(
@@ -144,12 +143,10 @@
144143

145144
# Compare prediction interval widths.
146145
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(13, 6))
147-
min_x = 14.0
148-
max_x = 17.0
146+
min_x = np.min([np.min(widths_nested), np.min(widths_non_nested)])
147+
max_x = np.max([np.max(widths_nested), np.max(widths_non_nested)])
149148
ax1.set_xlabel("Prediction interval width using the nested CV approach")
150149
ax1.set_ylabel("Prediction interval width using the non-nested CV approach")
151-
ax1.set_xlim([min_x, max_x])
152-
ax1.set_ylim([min_x, max_x])
153150
ax1.scatter(widths_nested, widths_non_nested)
154151
ax1.plot([min_x, max_x], [min_x, max_x], ls="--", color="k")
155152
ax2.axvline(x=0, color="r", lw=2)

0 commit comments

Comments (0)