@@ -13,7 +13,7 @@ kernelspec:
13
13
(GLM-out-of-sample-predictions)=
14
14
# Out-Of-Sample Predictions
15
15
16
- :::{post} December, 2022
16
+ :::{post} December, 2023
17
17
:tags: generalized linear model, logistic regression, out of sample predictions, patsy
18
18
:category: beginner
19
19
:::
@@ -23,13 +23,11 @@ import arviz as az
23
23
import matplotlib.pyplot as plt
24
24
import numpy as np
25
25
import pandas as pd
26
- import patsy
27
26
import pymc as pm
28
27
import seaborn as sns
29
28
30
29
from scipy.special import expit as inverse_logit
31
- from sklearn.metrics import RocCurveDisplay, accuracy_score, auc, roc_curve
32
- from sklearn.model_selection import train_test_split
30
+ from sklearn.metrics import RocCurveDisplay, auc, roc_curve
33
31
```
34
32
35
33
``` {code-cell} ipython3
@@ -55,7 +53,7 @@ beta_x2 = -1
55
53
beta_interaction = 2
56
54
z = intercept + beta_x1 * x1 + beta_x2 * x2 + beta_interaction * x1 * x2
57
55
p = inverse_logit(z)
58
- # note binimial with n=1 is equal to a bernoulli
56
+ # note binomial with n=1 is equal to a Bernoulli
59
57
y = rng.binomial(n=1, p=p, size=n)
60
58
df = pd.DataFrame(dict(x1=x1, x2=x2, y=y))
61
59
df.head()
@@ -81,26 +79,33 @@ ax.set(title="Sample Data", xlim=(-9, 9), ylim=(-9, 9));
81
79
## Prepare Data for Modeling
82
80
83
81
``` {code-cell} ipython3
84
- y, x = patsy.dmatrices("y ~ x1 * x2", data=df)
85
- y = np.asarray(y).flatten()
86
- labels = x.design_info.column_names
87
- x = np.asarray(x)
82
+ labels = ["Intercept", "x1", "x2", "x1:x2"]
83
+ df["Intercept"] = np.ones(len(df))
84
+ df["x1:x2"] = df["x1"] * df["x2"]
85
+ # reorder columns to be in the same order as labels
86
+ df = df[labels]
87
+ x = df.to_numpy()
88
88
```
89
89
90
90
Now we do a train-test split.
91
91
92
92
``` {code-cell} ipython3
93
- x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.7)
93
+ indices = rng.permutation(x.shape[0])
94
+ train_prop = 0.7
95
+ train_size = int(train_prop * x.shape[0])
96
+ training_idx, test_idx = indices[:train_size], indices[train_size:]
97
+ x_train, x_test = x[training_idx, :], x[test_idx, :]
98
+ y_train, y_test = y[training_idx], y[test_idx]
94
99
```
95
100
96
101
## Define and Fit the Model
97
102
98
103
We now specify the model in PyMC.
99
104
100
105
``` {code-cell} ipython3
101
- COORDS = {"coeffs": labels}
106
+ coords = {"coeffs": labels}
102
107
103
- with pm.Model(coords=COORDS ) as model:
108
+ with pm.Model(coords=coords ) as model:
104
109
# data containers
105
110
X = pm.MutableData("X", x_train)
106
111
y = pm.MutableData("y", y_train)
@@ -152,15 +157,15 @@ with model:
152
157
``` {code-cell} ipython3
153
158
# Compute the point prediction by taking the mean and defining the category via a threshold.
154
159
p_test_pred = idata.posterior_predictive["obs"].mean(dim=["chain", "draw"])
155
- y_test_pred = (p_test_pred >= 0.5).astype("int")
160
+ y_test_pred = (p_test_pred >= 0.5).astype("int").to_numpy()
156
161
```
157
162
158
163
## Evaluate Model
159
164
160
165
First let us compute the accuracy on the test set.
161
166
162
167
``` {code-cell} ipython3
163
- print(f"accuracy = {accuracy_score(y_true= y_test, y_pred =y_test_pred): 0.3f}")
168
+ print(f"accuracy = {np.mean( y_test= =y_test_pred): 0.3f}")
164
169
```
165
170
166
171
Next, we plot the [ roc curve] ( https://en.wikipedia.org/wiki/Receiver_operating_characteristic ) and compute the [ auc] ( https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve ) .
@@ -218,8 +223,13 @@ x1_grid, x2_grid, x_grid = make_grid()
218
223
219
224
with model:
220
225
# Create features on the grid.
221
- x_grid_ext = patsy.dmatrix(formula_like="x1 * x2", data=dict(x1=x_grid[:, 0], x2=x_grid[:, 1]))
222
- x_grid_ext = np.asarray(x_grid_ext)
226
+ x_grid_ext = np.hstack(
227
+ (
228
+ np.ones((x_grid.shape[0], 1)),
229
+ x_grid,
230
+ (x_grid[:, 0] * x_grid[:, 1]).reshape(-1, 1),
231
+ )
232
+ )
223
233
# set the observed variables
224
234
pm.set_data({"X": x_grid_ext})
225
235
# calculate pushforward values of `p`
@@ -278,12 +288,17 @@ Note that we have computed the model decision boundary by using the mean of the
278
288
- [ Bayesian Analysis with Python (Second edition) - Chapter 4] ( https://github.com/aloctavodia/BAP/blob/master/code/Chp4/04_Generalizing_linear_models.ipynb )
279
289
- [ Statistical Rethinking] ( https://xcelab.net/rm/statistical-rethinking/ )
280
290
291
+ +++
292
+
293
+
294
+
281
295
+++
282
296
283
297
## Authors
284
298
- Created by [ Juan Orduz] ( https://github.com/juanitorduz ) .
285
299
- Updated by [ Benjamin T. Vincent] ( https://github.com/drbenvincent ) to PyMC v4 in June 2022
286
300
- Re-executed by [ Benjamin T. Vincent] ( https://github.com/drbenvincent ) with PyMC v5 in December 2022
301
+ - Updated by [ Christian Luhmann] ( https://github.com/cluhmann ) in December 2023 ([ pymc-examples #616 ] ( https://github.com/pymc-devs/pymc-examples/pull/616 ) )
287
302
288
303
+++
289
304
0 commit comments