26 | 26 | *out-of-fold* models and *P* the number of parameter search cross-validations,
27 | 27 | versus :math:`N + P` for the non-nested approach.
28 | 28 |
29 | | -Here, we compare the two strategies on the Boston dataset. We use the Random
| 29 | +Here, we compare the two strategies on a toy dataset. We use the Random
30 | 30 | Forest Regressor as a base regressor for the CV+ strategy. For the sake of
31 | 31 | light computation, we adopt a RandomizedSearchCV parameter search strategy
32 | 32 | with a low number of iterations and with a reproducible random state.
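To make the :math:`N \times P` versus :math:`N + P` trade-off concrete, here is a rough fit count using the values this script sets further down (``cv = 10``, ``n_iter = 5``). It is a back-of-the-envelope sketch that assumes one model fit per fold per sampled candidate:

# Back-of-the-envelope fit count (a sketch; assumes one fit per fold
# per sampled candidate and ignores any final refit).
cv = 10        # CV+ folds, as set later in this script
n_iter = 5     # sampled parameter candidates, as set later in this script
N = cv                 # out-of-fold models trained by CV+
P = n_iter * cv        # fits performed by one randomized search
print("nested fits:", N * P)      # 500
print("non-nested fits:", N + P)  # 60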
45 | 45 | """ |
46 | 46 | import matplotlib.pyplot as plt |
47 | 47 | import numpy as np |
48 | | -import pandas as pd |
49 | 48 | from scipy.stats import randint |
50 | 49 | from sklearn.ensemble import RandomForestRegressor |
51 | 50 | from sklearn.metrics import mean_squared_error |
52 | 51 | from sklearn.model_selection import RandomizedSearchCV, train_test_split |
| 52 | +from sklearn.datasets import make_sparse_uncorrelated |
53 | 53 |
54 | 54 | from mapie.metrics import regression_coverage_score
55 | 55 | from mapie.regression import MapieRegressor
56 | 56 |
57 | | -# Load the Boston data
58 | | -data_url = "http://lib.stat.cmu.edu/datasets/boston"
59 | | -raw_df = pd.read_csv(data_url, sep=r'\s+', skiprows=22, header=None)
60 | | -X_boston = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
61 | | -y_boston = raw_df.values[1::2, 2]
| 57 | +
| 58 | +random_state = 42
| 59 | +
| 60 | +# Load the toy data
| 61 | +X, y = make_sparse_uncorrelated(500, random_state=random_state)
62 | 62 |
63 | 63 | # Split the data into training and test sets.
64 | 64 | X_train, X_test, y_train, y_test = train_test_split(
65 | | -    X_boston, y_boston, test_size=0.2, random_state=42
| 65 | +    X, y, test_size=0.2, random_state=random_state
66 | 66 | )
67 | 67 |
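For context, ``make_sparse_uncorrelated`` replaces the Boston data (deprecated in scikit-learn 1.0 and later removed) with a synthetic linear problem; scikit-learn documents it as producing 10 features of which only the first four are informative. A quick illustrative check, not part of the commit:

# Illustrative check of the toy data (not part of the commit).
# scikit-learn documents 10 features, only the first 4 informative.
from sklearn.datasets import make_sparse_uncorrelated

X, y = make_sparse_uncorrelated(500, random_state=42)
print(X.shape, y.shape)  # (500, 10) (500,)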
68 | 68 | # Define the Random Forest model as base regressor with parameter ranges.
69 | | -rf_model = RandomForestRegressor(random_state=59, verbose=0)
| 69 | +rf_model = RandomForestRegressor(random_state=random_state, verbose=0)
70 | 70 | rf_params = {"max_depth": randint(2, 10), "n_estimators": randint(10, 100)}
71 | 71 |
72 | 72 | # Cross-validation and prediction-interval parameters.
73 | 73 | cv = 10
74 | 74 | n_iter = 5
75 | 75 | alpha = 0.05
76 | | -random_state = 59
77 | 76 |
78 | 77 | # Non-nested approach with the CV+ strategy using the Random Forest model.
79 | 78 | cv_obj = RandomizedSearchCV(
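The diff skips from line 79 to line 144, eliding the completion of the truncated ``cv_obj = RandomizedSearchCV(`` call and the fitting of both strategies. As a hedged sketch of what the two setups typically look like with MAPIE's pre-1.0 ``MapieRegressor`` API (the exact arguments in the elided lines may differ), the non-nested variant tunes once and freezes the best estimator, while the nested variant passes the unfitted search object as the base estimator so the search re-runs inside every fold:

# Hedged sketch of the elided strategy definitions; argument choices are
# illustrative, reusing rf_model, rf_params, cv, n_iter, alpha and
# random_state defined above.

# Non-nested: tune once (P search fits), then run CV+ once with the
# frozen best estimator (N out-of-fold fits).
cv_obj = RandomizedSearchCV(
    rf_model,
    param_distributions=rf_params,
    n_iter=n_iter,
    cv=cv,
    random_state=random_state,
)
cv_obj.fit(X_train, y_train)
mapie_non_nested = MapieRegressor(cv_obj.best_estimator_, method="plus", cv=cv)
mapie_non_nested.fit(X_train, y_train)
y_pred_non_nested, y_pis_non_nested = mapie_non_nested.predict(
    X_test, alpha=alpha
)
widths_non_nested = y_pis_non_nested[:, 1, 0] - y_pis_non_nested[:, 0, 0]

# Nested: the unfitted search object is the base estimator, so the
# search re-runs inside each of the N folds (roughly N * P fits).
nested_search = RandomizedSearchCV(
    rf_model,
    param_distributions=rf_params,
    n_iter=n_iter,
    cv=cv,
    random_state=random_state,
)
mapie_nested = MapieRegressor(nested_search, method="plus", cv=cv)
mapie_nested.fit(X_train, y_train)
y_pred_nested, y_pis_nested = mapie_nested.predict(X_test, alpha=alpha)
widths_nested = y_pis_nested[:, 1, 0] - y_pis_nested[:, 0, 0]

The ``widths_nested`` and ``widths_non_nested`` arrays sketched here are the ones the plotting hunk below compares.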
144 | 143 |
145 | 144 | # Compare prediction interval widths.
146 | 145 | fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(13, 6))
147 | | -min_x = 14.0
148 | | -max_x = 17.0
| 146 | +min_x = np.min([np.min(widths_nested), np.min(widths_non_nested)])
| 147 | +max_x = np.max([np.max(widths_nested), np.max(widths_non_nested)])
149 | 148 | ax1.set_xlabel("Prediction interval width using the nested CV approach")
150 | 149 | ax1.set_ylabel("Prediction interval width using the non-nested CV approach")
151 | | -ax1.set_xlim([min_x, max_x])
152 | | -ax1.set_ylim([min_x, max_x])
153 | 150 | ax1.scatter(widths_nested, widths_non_nested)
154 | 151 | ax1.plot([min_x, max_x], [min_x, max_x], ls="--", color="k")
155 | 152 | ax2.axvline(x=0, color="r", lw=2)