@@ -117,7 +117,7 @@ small_sacramento = sacramento.sample(n=30)
117
117
118
118
small_plot = (
119
119
alt.Chart(small_sacramento)
120
- .mark_circle()
120
+ .mark_circle(opacity=1 )
121
121
.encode(
122
122
x=alt.X("sqft")
123
123
.scale(zero=False)
@@ -129,7 +129,50 @@ small_plot = (
129
129
)
130
130
)
131
131
132
- small_plot += small_plot.transform_regression("sqft", "price").mark_line()
132
+
133
+ # create df_lines with one fake/empty line (for starting at 2nd color later)
134
+ df_lines = {"x": [500, 500], "y": [100000, 100000], "number": ["-1", "-1"]}
135
+
136
+ # set the domains (range of x values) of lines
137
+ min_x = small_sacramento["sqft"].min()
138
+ max_x = small_sacramento["sqft"].max()
139
+
140
+ # add the line of best fit
141
+ from sklearn.linear_model import LinearRegression
142
+ lm = LinearRegression()
143
+ lm.fit(small_sacramento[["sqft"]], small_sacramento[["price"]])
144
+ pred_min = float(lm.predict(pd.DataFrame({"sqft": [min_x]})))
145
+ pred_max = float(lm.predict(pd.DataFrame({"sqft": [max_x]})))
146
+
147
+ df_lines["x"].extend([min_x, max_x])
148
+ df_lines["y"].extend([pred_min, pred_max])
149
+ df_lines["number"].extend(["0", "0"])
150
+
151
+ # add other similar looking lines
152
+ intercept_l = [-64542.23, -6900, -64542.23]
153
+ slope_l = [190, 175, 160]
154
+ for i in range(len(slope_l)):
155
+ df_lines["x"].extend([min_x, max_x])
156
+ df_lines["y"].extend([
157
+ intercept_l[i] + slope_l[i] * min_x,
158
+ intercept_l[i] + slope_l[i] * max_x,
159
+ ])
160
+ df_lines["number"].extend([f"{i+1}", f"{i+1}"])
161
+
162
+ df_lines = pd.DataFrame(df_lines)
163
+
164
+ # plot the bogus line to skip the same color as the scatter
165
+ small_plot += alt.Chart(
166
+ df_lines[df_lines["number"] == "-1"]
167
+ ).mark_line().encode(
168
+ x="x", y="y", color=alt.Color("number", legend=None)
169
+ )
170
+ # plot the real line with 2nd color
171
+ small_plot += alt.Chart(
172
+ df_lines[df_lines["number"] == "0"]
173
+ ).mark_line().encode(
174
+ x="x", y="y", color=alt.Color("number", legend=None)
175
+ )
133
176
134
177
small_plot
135
178
```
@@ -189,11 +232,11 @@ prediction = float(lm.predict(pd.DataFrame({"sqft": [2000]})))
189
232
190
233
# the vertical dotted line
191
234
line_df = pd.DataFrame({"x": [2000]})
192
- rule = alt.Chart(line_df).mark_rule(strokeDash=[2, 4] ).encode(x="x")
235
+ rule = alt.Chart(line_df).mark_rule(strokeDash=[6], size=1.5 ).encode(x="x")
193
236
194
237
# the red point
195
238
point_df = pd.DataFrame({"x": [2000], "y": [prediction]})
196
- point = alt.Chart(point_df).mark_circle(color="red", size=100 ).encode(x="x", y="y")
239
+ point = alt.Chart(point_df).mark_circle(color="red", size=80, opacity=1 ).encode(x="x", y="y")
197
240
198
241
# overlay all plots
199
242
small_plot_2000_pred = (
@@ -204,7 +247,7 @@ small_plot_2000_pred = (
204
247
+ alt.Chart(
205
248
pd.DataFrame(
206
249
{
207
- "x": [2350 ],
250
+ "x": [2450 ],
208
251
"y": [prediction - 41000],
209
252
"prediction": ["$" + "{0:,.0f}".format(prediction)],
210
253
}
@@ -242,32 +285,11 @@ Some plausible examples are shown in {numref}`fig:08-several-lines`.
242
285
``` {code-cell} ipython3
243
286
:tags: [remove-cell]
244
287
245
- intercept_l = [-64542.23, -6900, -64542.23]
246
- slope_l = [190, 175, 160]
247
- line_color_l = ["green", "purple", "red"]
248
-
249
- # set the domains (range of x values) of lines
250
- min_x = small_sacramento["sqft"].min()
251
- max_x = small_sacramento["sqft"].max()
252
-
253
288
several_lines_plot = small_plot.copy()
254
289
255
- for i in range(len(slope_l)):
256
- several_lines_plot += (
257
- alt.Chart(
258
- pd.DataFrame(
259
- {
260
- "x": [min_x, max_x],
261
- "y": [
262
- intercept_l[i] + slope_l[i] * min_x,
263
- intercept_l[i] + slope_l[i] * max_x,
264
- ],
265
- }
266
- )
267
- )
268
- .mark_line(color=line_color_l[i])
269
- .encode(x="x", y="y")
270
- )
290
+ several_lines_plot += alt.Chart(
291
+ df_lines[df_lines["number"] != "0"]
292
+ ).mark_line().encode(x="x", y="y", color=alt.Color("number",legend=None))
271
293
272
294
several_lines_plot
273
295
```
@@ -292,7 +314,7 @@ Scatter plot of sale price versus size with many possible lines that could be dr
292
314
Simple linear regression chooses the straight line of best fit by choosing
293
315
the line that minimizes the **average squared vertical distance** between itself and
294
316
each of the observed data points in the training data. {numref}`fig:08-verticalDistToMin` illustrates
295
- these vertical distances as red lines. Finally, to assess the predictive
317
+ these vertical distances as lines. Finally, to assess the predictive
296
318
accuracy of a simple linear regression model,
297
319
we use RMSPE&mdash;the same measure of predictive performance we used with K-NN regression.
298
320
@@ -313,7 +335,7 @@ v_lines = []
313
335
for i in range(len(small_sacramento)):
314
336
sqft_val = small_sacramento.iloc[i]["sqft"]
315
337
line_df = small_sacramento_pred.query("sqft == @sqft_val")
316
- v_lines.append(alt.Chart(line_df).mark_line(color="red ").encode(x="sqft", y="value"))
338
+ v_lines.append(alt.Chart(line_df).mark_line(color="black ").encode(x="sqft", y="value"))
317
339
318
340
error_plot = alt.layer(*v_lines, small_plot).configure_circle(opacity=1)
319
341
error_plot
@@ -328,7 +350,7 @@ glue("fig:08-verticalDistToMin", error_plot)
328
350
:::{glue:figure} fig:08-verticalDistToMin
329
351
:name: fig:08-verticalDistToMin
330
352
331
- Scatter plot of sale price versus size with red lines denoting the vertical distances between the predicted values and the observed data points.
353
+ Scatter plot of sale price versus size with lines denoting the vertical distances between the predicted values and the observed data points.
332
354
:::
333
355
334
356
+++
@@ -482,7 +504,7 @@ so that we can qualitatively assess if the model seems to fit the data well.
482
504
sqft_prediction_grid = sacramento[["sqft"]].agg(["min", "max"])
483
505
sqft_prediction_grid["predicted"] = lm.predict(sqft_prediction_grid)
484
506
485
- all_points = alt.Chart(sacramento).mark_circle(opacity=0.4 ).encode(
507
+ all_points = alt.Chart(sacramento).mark_circle().encode(
486
508
x=alt.X("sqft")
487
509
.scale(zero=False)
488
510
.title("House size (square feet)"),
@@ -966,7 +988,7 @@ lm_plot_outlier += lm_plot_outlier.transform_regression("sqft", "price").mark_li
966
988
967
989
outlier_pt = (
968
990
alt.Chart(sacramento_outlier)
969
- .mark_circle(color="red ", size=100)
991
+ .mark_circle(color="#d62728 ", size=100)
970
992
.encode(x="sqft", y="price")
971
993
)
972
994
@@ -987,7 +1009,7 @@ outlier_line = (
987
1009
)
988
1010
)
989
1011
.transform_regression("sqft", "price")
990
- .mark_line(color="red ")
1012
+ .mark_line(color="#d62728 ")
991
1013
)
992
1014
993
1015
lm_plot_outlier += outlier_pt + outlier_line
@@ -1051,7 +1073,7 @@ outlier_line = (
1051
1073
)
1052
1074
)
1053
1075
.transform_regression("sqft", "price")
1054
- .mark_line(color="red ")
1076
+ .mark_line(color="#d62728 ")
1055
1077
)
1056
1078
1057
1079
lm_plot_outlier_large += outlier_pt + outlier_line
0 commit comments