Commit 8773dfc

UPD: odc
1 parent 147142d commit 8773dfc

2 files changed: +182 -1 lines changed


doc/index.rst

Lines changed: 1 addition & 1 deletion
@@ -55,7 +55,7 @@
    :caption: MONDRIAN

    theoretical_description_mondrian
-   tutorial_mondrian_regression
+   examples_mondrian/1-quickstart/plot_main-tutorial-mondrian-regression

 .. toctree::
    :maxdepth: 2
Lines changed: 181 additions & 0 deletions
@@ -0,0 +1,181 @@
r"""
=============================================
Tutorial for tabular regression with Mondrian
=============================================

In this tutorial, we compare the prediction intervals estimated by MAPIE on
a simple, one-dimensional, ground-truth function: classical split conformal
prediction intervals versus Mondrian conformal prediction intervals.
The function is a sinusoidal function with added noise, and the data are
split into 10 groups. The goal is to estimate prediction intervals for new
data points and to compare the coverage of those intervals group by group.
Throughout this tutorial, we will answer the following questions:

- How to use MAPIE to estimate prediction intervals for a regression problem?
- How to use Mondrian conformal prediction intervals for regression?
- How to compare the coverage of the prediction intervals by groups?
"""
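
##############################################################################
# As a reminder of what is being compared: split conformal prediction targets
# marginal coverage, :math:`\mathbb{P}(Y \in \hat{C}(X)) \geq 1 - \alpha`,
# whereas Mondrian conformal prediction calibrates a separate quantile for
# each group and therefore targets group-conditional coverage,
# :math:`\mathbb{P}(Y \in \hat{C}(X) \mid G = g) \geq 1 - \alpha` for every
# group :math:`g`.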

import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

from mapie.metrics import regression_coverage_score_v2
from mapie.mondrian import MondrianCP
from mapie.regression import MapieRegressor

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
warnings.filterwarnings("ignore")


##############################################################################
# 1. Create the noisy dataset with 10 groups, each group having a different
#    noise level
# ---------------------------------------------------------------------------

n_points = 100000
np.random.seed(0)
X = np.linspace(0, 10, n_points).reshape(-1, 1)
group_size = n_points // 10
groups = np.concatenate([np.array([i] * group_size) for i in range(10)])

noise_0_1 = np.random.normal(0, 0.1, group_size)
noise_1_2 = np.random.normal(0, 0.5, group_size)
noise_2_3 = np.random.normal(0, 1, group_size)
noise_3_4 = np.random.normal(0, .4, group_size)
noise_4_5 = np.random.normal(0, .2, group_size)
noise_5_6 = np.random.normal(0, .3, group_size)
noise_6_7 = np.random.normal(0, .6, group_size)
noise_7_8 = np.random.normal(0, .7, group_size)
noise_8_9 = np.random.normal(0, .8, group_size)
noise_9_10 = np.random.normal(0, .9, group_size)

# Each group gets the same sinusoidal signal but its own noise level.
y = np.concatenate(
    [
        np.sin(X[groups == 0, 0] * 2) + noise_0_1,
        np.sin(X[groups == 1, 0] * 2) + noise_1_2,
        np.sin(X[groups == 2, 0] * 2) + noise_2_3,
        np.sin(X[groups == 3, 0] * 2) + noise_3_4,
        np.sin(X[groups == 4, 0] * 2) + noise_4_5,
        np.sin(X[groups == 5, 0] * 2) + noise_5_6,
        np.sin(X[groups == 6, 0] * 2) + noise_6_7,
        np.sin(X[groups == 7, 0] * 2) + noise_7_8,
        np.sin(X[groups == 8, 0] * 2) + noise_8_9,
        np.sin(X[groups == 9, 0] * 2) + noise_9_10,
    ], axis=0
)


##############################################################################
# We plot the dataset, colored by group.

plt.scatter(X, y, c=groups)
plt.show()


##############################################################################
# 2. Split the dataset into a training set, a calibration set, and a test
#    set.

# Splitting X, y, and groups in a single call keeps the three arrays aligned.
X_train_temp, X_test, y_train_temp, y_test, groups_train_temp, groups_test = (
    train_test_split(X, y, groups, test_size=0.2, random_state=0)
)
X_cal, X_train, y_cal, y_train, groups_cal, groups_train = train_test_split(
    X_train_temp, y_train_temp, groups_train_temp,
    test_size=0.5, random_state=0
)


##############################################################################
# We plot the training set, the calibration set, and the test set.

f, ax = plt.subplots(1, 3, figsize=(15, 5))
ax[0].scatter(X_train, y_train, c=groups_train)
ax[0].set_title("Train set")
ax[1].scatter(X_cal, y_cal, c=groups_cal)
ax[1].set_title("Calibration set")
ax[2].scatter(X_test, y_test, c=groups_test)
ax[2].set_title("Test set")
plt.show()


##############################################################################
# 3. Fit a random forest regressor on the training set.

rf = RandomForestRegressor(n_estimators=100)
rf.fit(X_train, y_train)


##############################################################################
# 4. Fit a MapieRegressor and a MondrianCP on the calibration set.

mapie_regressor = MapieRegressor(rf, cv="prefit")
mondrian_regressor = MondrianCP(MapieRegressor(rf, cv="prefit"))
mapie_regressor.fit(X_cal, y_cal)
# MondrianCP additionally needs the group of each calibration point.
mondrian_regressor.fit(X_cal, y_cal, groups=groups_cal)
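

##############################################################################
# To build intuition about what was just fitted, here is a minimal numpy
# sketch of the two calibrations (a simplified view, not MAPIE's exact
# implementation, which also applies a finite-sample correction to the
# quantile level).

abs_residuals = np.abs(y_cal - rf.predict(X_cal))
# Split conformal: a single global quantile of the conformity scores,
# hence one interval width shared by all groups.
global_q = np.quantile(abs_residuals, 0.9)
# Mondrian: one quantile per group, hence a group-specific interval width.
group_qs = {
    g: np.quantile(abs_residuals[groups_cal == g], 0.9)
    for g in np.unique(groups_cal)
}
print(f"Global quantile: {global_q:.3f}")
print({g: round(float(q), 3) for g, q in group_qs.items()})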


##############################################################################
# 5. Predict the prediction intervals on the test set with both methods.

_, y_pss_split = mapie_regressor.predict(X_test, alpha=.1)
_, y_pss_mondrian = mondrian_regressor.predict(
    X_test, groups=groups_test, alpha=.1
)
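

##############################################################################
# Before looking at per-group results, we can check the coverage obtained
# marginally over the whole test set, which both methods should bring close
# to the requested 90%.

print(
    "Global coverage (split):",
    float(regression_coverage_score_v2(y_test, y_pss_split))
)
print(
    "Global coverage (Mondrian):",
    float(regression_coverage_score_v2(y_test, y_pss_mondrian))
)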


##############################################################################
# 6. Compare the coverage by group, plotting both methods side by side.

coverages = {}
for group in np.unique(groups_test):
    coverages[group] = {}
    coverages[group]["split"] = regression_coverage_score_v2(
        y_test[groups_test == group], y_pss_split[groups_test == group]
    )
    coverages[group]["mondrian"] = regression_coverage_score_v2(
        y_test[groups_test == group], y_pss_mondrian[groups_test == group]
    )


# Plot the coverage by group, both methods side by side.
plt.bar(
    np.arange(len(coverages)) * 2,
    [float(coverages[group]["split"]) for group in coverages],
    label="Split"
)
plt.bar(
    np.arange(len(coverages)) * 2 + 1,
    [float(coverages[group]["mondrian"]) for group in coverages],
    label="Mondrian"
)
plt.xticks(
    np.arange(len(coverages)) * 2 + .5,
    [f"Group {group}" for group in coverages],
    rotation=45
)
plt.hlines(0.9, -1, 21, label="90% coverage", color="black", linestyle="--")
plt.ylabel("Coverage")
plt.legend(loc='upper left', bbox_to_anchor=(1, 1))
plt.show()
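

##############################################################################
# Finally, we can compare the interval widths. Assuming MAPIE's usual
# ``(n_samples, 2, n_alpha)`` shape for the returned intervals, the split
# method uses a single width everywhere, while Mondrian adapts the width to
# each group's noise level.

width_split = y_pss_split[:, 1, 0] - y_pss_split[:, 0, 0]
width_mondrian = y_pss_mondrian[:, 1, 0] - y_pss_mondrian[:, 0, 0]
for g in np.unique(groups_test):
    print(
        f"Group {g}: split width = {width_split[groups_test == g].mean():.3f}"
        f", Mondrian width = {width_mondrian[groups_test == g].mean():.3f}"
    )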
