r"""
=============================================
Tutorial for tabular regression with Mondrian
=============================================

In this tutorial, we compare the prediction intervals estimated by MAPIE
using classical split conformal prediction with those obtained using
Mondrian conformal prediction, on a simple one-dimensional ground-truth
function. The function is a noisy sinusoid, and the data are divided into
10 groups, each with its own noise level. The goal is to estimate
prediction intervals for new data points and to compare the coverage of
the intervals group by group.
Throughout this tutorial, we will answer the following questions:

- How to use MAPIE to estimate prediction intervals for a regression problem?
- How to use Mondrian conformal prediction intervals for regression?
- How to compare the coverage of the prediction intervals by groups?
"""

import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

from mapie.metrics import regression_coverage_score_v2
from mapie.mondrian import MondrianCP
from mapie.regression import MapieRegressor

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
warnings.filterwarnings("ignore")


##############################################################################
# 1. Create a noisy dataset with 10 groups, each with a different noise level
# ----------------------------------------------------------------------------


n_points = 100000
np.random.seed(0)
X = np.linspace(0, 10, n_points).reshape(-1, 1)
group_size = n_points // 10
groups = np.repeat(np.arange(10), group_size)

# Standard deviation of the Gaussian noise added to each group.
noise_levels = [0.1, 0.5, 1.0, 0.4, 0.2, 0.3, 0.6, 0.7, 0.8, 0.9]

y = np.concatenate(
    [
        np.sin(X[groups == group, 0] * 2)
        + np.random.normal(0, noise_level, group_size)
        for group, noise_level in enumerate(noise_levels)
    ], axis=0
)
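

##############################################################################
# As a quick sanity check, we can verify the heteroscedastic structure of the
# data by measuring, for each group, the empirical standard deviation of the
# residuals around the true function :math:`\sin(2x)`.


for group, noise_level in enumerate(noise_levels):
    residuals_group = y[groups == group] - np.sin(X[groups == group, 0] * 2)
    print(
        f"Group {group}: noise std {noise_level:.1f}, "
        f"empirical std {residuals_group.std():.2f}"
    )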


##############################################################################
# We plot the dataset with the groups as colors.


plt.scatter(X, y, c=groups)
plt.show()


##############################################################################
# 2. Split the dataset into a training set, a calibration set, and a test set


# Splitting X, y, and groups in a single call keeps the rows of the three
# arrays aligned across the train, calibration, and test sets.
(X_train_temp, X_test, y_train_temp, y_test,
 groups_train_temp, groups_test) = train_test_split(
    X, y, groups, test_size=0.2, random_state=0
)
(X_cal, X_train, y_cal, y_train,
 groups_cal, groups_train) = train_test_split(
    X_train_temp, y_train_temp, groups_train_temp,
    test_size=0.5, random_state=0
)


##############################################################################
# We plot the training set, the calibration set, and the test set.


f, ax = plt.subplots(1, 3, figsize=(15, 5))
ax[0].scatter(X_train, y_train, c=groups_train)
ax[0].set_title("Train set")
ax[1].scatter(X_cal, y_cal, c=groups_cal)
ax[1].set_title("Calibration set")
ax[2].scatter(X_test, y_test, c=groups_test)
ax[2].set_title("Test set")
plt.show()


##############################################################################
# 3. Fit a random forest regressor on the training set


# random_state is fixed so that the tutorial is reproducible.
rf = RandomForestRegressor(n_estimators=100, random_state=0)
rf.fit(X_train, y_train)


##############################################################################
# 4. Fit a MapieRegressor and a MondrianCP on the calibration set


mapie_regressor = MapieRegressor(rf, cv="prefit")
mondrian_regressor = MondrianCP(MapieRegressor(rf, cv="prefit"))
mapie_regressor.fit(X_cal, y_cal)
mondrian_regressor.fit(X_cal, y_cal, groups=groups_cal)
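

##############################################################################
# Conceptually, Mondrian conformal prediction computes one conformity-score
# quantile per group instead of a single global quantile. The sketch below
# illustrates this idea with absolute residuals as conformity scores. It is
# a simplified illustration of the principle, not MAPIE's internal
# implementation: in particular, it ignores the finite-sample correction
# applied to the quantile level.


# Per-group 90% quantile of the absolute calibration residuals.
conformity_scores = np.abs(y_cal - rf.predict(X_cal))
quantiles_by_group = {
    group: np.quantile(conformity_scores[groups_cal == group], 0.9)
    for group in np.unique(groups_cal)
}
print(quantiles_by_group)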


##############################################################################
# 5. Predict the prediction intervals on the test set with both methods


_, y_pss_split = mapie_regressor.predict(X_test, alpha=.1)
_, y_pss_mondrian = mondrian_regressor.predict(
    X_test, groups=groups_test, alpha=.1
)
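

##############################################################################
# Before looking at coverage, we can compare the width of the intervals
# produced by each method. The intervals returned by MAPIE have shape
# ``(n_samples, 2, n_alpha)``, so with a single ``alpha`` the lower and
# upper bounds sit at indices ``[:, 0, 0]`` and ``[:, 1, 0]``. Mondrian
# widths should track the noise level of each group, whereas split conformal
# widths are essentially constant across groups.


for group in np.unique(groups_test):
    mask = groups_test == group
    width_split = (y_pss_split[mask, 1, 0] - y_pss_split[mask, 0, 0]).mean()
    width_mondrian = (
        y_pss_mondrian[mask, 1, 0] - y_pss_mondrian[mask, 0, 0]
    ).mean()
    print(
        f"Group {group}: split width {width_split:.2f}, "
        f"Mondrian width {width_mondrian:.2f}"
    )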


##############################################################################
# 6. Compare the coverage by groups, plotting both methods side by side


coverages = {}
for group in np.unique(groups_test):
    coverages[group] = {}
    coverages[group]["split"] = regression_coverage_score_v2(
        y_test[groups_test == group], y_pss_split[groups_test == group]
    )
    coverages[group]["mondrian"] = regression_coverage_score_v2(
        y_test[groups_test == group], y_pss_mondrian[groups_test == group]
    )
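

##############################################################################
# Both methods are expected to reach the target 90% coverage *marginally*,
# i.e. averaged over the whole test set; the difference between them shows
# up group by group. As a quick check, we first print the global coverage
# of each method.


print(
    "Global coverage (split):",
    float(regression_coverage_score_v2(y_test, y_pss_split)),
)
print(
    "Global coverage (Mondrian):",
    float(regression_coverage_score_v2(y_test, y_pss_mondrian)),
)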


# Plot the coverage by groups, with both methods side by side.
plt.bar(
    np.arange(len(coverages)) * 2,
    [float(coverages[group]["split"]) for group in coverages],
    label="Split"
)
plt.bar(
    np.arange(len(coverages)) * 2 + 1,
    [float(coverages[group]["mondrian"]) for group in coverages],
    label="Mondrian"
)
plt.xticks(
    np.arange(len(coverages)) * 2 + .5,
    [f"Group {group}" for group in coverages],
    rotation=45
)
plt.hlines(0.9, -1, 21, label="90% coverage", color="black", linestyle="--")
plt.ylabel("Coverage")
plt.legend(loc="upper left", bbox_to_anchor=(1, 1))
plt.show()