Commit 21170d6

vertical line, mark rule, color consistency improvements
1 parent 1cedf29 commit 21170d6

4 files changed: 78 additions & 55 deletions

source/inference.md

Lines changed: 11 additions & 10 deletions
@@ -694,7 +694,7 @@ reliable—is there any way to improve the estimate? One way to improve a
 point estimate is to take a *larger* sample. To illustrate what effect this
 has, we will take many samples of size 20, 50, 100, and 500, and plot the
 sampling distribution of the sample mean. We indicate the mean of the sampling
-distribution with a orange vertical line.
+distribution with a vertical line.
 
 ```{code-cell} ipython3
 :tags: [remove-input]
@@ -721,10 +721,10 @@ glue(
         alt.X("price", bin=alt.Bin(maxbins=30)),
         alt.Y("count()")
     ),
-    base.mark_rule(color="#f58518", size=3).encode(
+    base.mark_rule(color="black", size=1.5, strokeDash=[6]).encode(
         x="mean(price)"
     ),
-    base.mark_text(align="left", color="#f58518", size=12, fontWeight="bold", dx=10).transform_aggregate(
+    base.mark_text(align="left", color="black", size=12, fontWeight="bold", dx=10).transform_aggregate(
         mean_price="mean(price)",
     ).transform_calculate(
         label="'Mean = ' + round(datum.mean_price * 10) / 10"
@@ -755,7 +755,7 @@ glue(
 :name: fig:11-example-means7
 :figclass: caption-hack
 
-Comparison of sampling distributions, with mean highlighted as a vertical orange line.
+Comparison of sampling distributions, with mean highlighted as a vertical line.
 ```
 
 +++
@@ -1154,17 +1154,17 @@ sampling_distribution.encoding.x["bin"]["extent"] = (90, 250)
 alt.vconcat(
     alt.layer(
         sampling_distribution,
-        alt.Chart(sample_estimates).mark_rule(color="#f58518", size=2).encode(x="mean(mean_price)"),
-        alt.Chart(sample_estimates).mark_text(color="#f58518", size=12, align="left", dx=16, fontWeight="bold").encode(
+        alt.Chart(sample_estimates).mark_rule(color="black", size=1.5, strokeDash=[6]).encode(x="mean(mean_price)"),
+        alt.Chart(sample_estimates).mark_text(color="black", size=12, align="left", dx=16, fontWeight="bold").encode(
             x="mean(mean_price)",
             y=alt.value(7),
             text=alt.value(f"Mean = {sampling_distribution['data']['mean_price'].mean().round(1)}")
         )
     ).properties(title="Sampling distribution", height=150),
     alt.layer(
         boot_est_dist,
-        alt.Chart(boot20000_means).mark_rule(color="#f58518", size=2).encode(x="mean(mean_price)"),
-        alt.Chart(boot20000_means).mark_text(color="#f58518", size=12, align="left", dx=18, fontWeight="bold").encode(
+        alt.Chart(boot20000_means).mark_rule(color="black", size=1.5, strokeDash=[6]).encode(x="mean(mean_price)"),
+        alt.Chart(boot20000_means).mark_text(color="black", size=12, align="left", dx=18, fontWeight="bold").encode(
             x="mean(mean_price)",
             y=alt.value(7),
             text=alt.value(f"Mean = {boot_est_dist['data']['mean_price'].mean().round(1)}")
@@ -1275,14 +1275,15 @@ the middle 95\% of the sample mean prices in the bootstrap distribution. We can
 visualize the interval on our distribution in {numref}`fig:11-bootstrapping9`.
 
 ```{code-cell} ipython3
+:tags: [remove-input]
 # Create the annotation for the 2.5th percentile
-rule_025 = alt.Chart().mark_rule(color="#f58518", size=3, strokeDash=[5]).encode(
+rule_025 = alt.Chart().mark_rule(color="black", size=1.5, strokeDash=[6]).encode(
     x=alt.datum(ci_bounds[0.025])
 ).properties(
     width=500
 )
 text_025 = rule_025.mark_text(
-    color="#f58518",
+    color="black",
     size=12,
     fontWeight="bold",
     dy=-160
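
For reference, the annotation style these hunks converge on (a dashed black `mark_rule` at the mean plus a bold `mark_text` label) can be reproduced on its own. The sketch below is a minimal, self-contained illustration; the toy `df` DataFrame, its values, and the layer names are assumptions for the example, not code from `source/inference.md`.

```python
import altair as alt
import pandas as pd

# Toy price data (illustrative values only).
df = pd.DataFrame({"price": [34.0, 52.5, 76.0, 110.0, 89.5, 63.0, 41.0, 95.0]})
base = alt.Chart(df)

histogram = base.mark_bar().encode(
    alt.X("price", bin=alt.Bin(maxbins=30)),
    alt.Y("count()"),
)

# Dashed black rule at the mean (the style this commit standardizes on).
mean_rule = base.mark_rule(color="black", size=1.5, strokeDash=[6]).encode(
    x="mean(price)"
)

# Bold label just to the right of the rule, computed from the data.
mean_label = base.mark_text(
    align="left", color="black", size=12, fontWeight="bold", dx=10
).transform_aggregate(
    mean_price="mean(price)"
).transform_calculate(
    label="'Mean = ' + round(datum.mean_price * 10) / 10"
).encode(x="mean_price:Q", y=alt.value(10), text="label:N")

alt.layer(histogram, mean_rule, mean_label)
```

Computing the label via `transform_aggregate` and `transform_calculate`, as the book's code does, keeps the annotation in sync with the data instead of hard-coding a number.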

source/regression1.md

Lines changed: 7 additions & 7 deletions
@@ -257,7 +257,7 @@ the sale price?
 ```{code-cell} ipython3
 :tags: [remove-output]
 
-small_plot = alt.Chart(small_sacramento).mark_circle().encode(
+small_plot = alt.Chart(small_sacramento).mark_circle(opacity=1).encode(
     x=alt.X("sqft")
     .scale(zero=False)
     .title("House size (square feet)"),
@@ -268,7 +268,7 @@ small_plot = alt.Chart(small_sacramento).mark_circle().encode(
 
 # add an overlay to the base plot
 line_df = pd.DataFrame({"x": [2000]})
-rule = alt.Chart(line_df).mark_rule(strokeDash=[2, 4]).encode(x="x")
+rule = alt.Chart(line_df).mark_rule(strokeDash=[6], size=1.5, color="black").encode(x="x")
 
 small_plot + rule
 ```
@@ -315,7 +315,7 @@ for i in range(5):
         "sqft": [nearest_neighbors.iloc[i, 4], 2000],
         "price": [nearest_neighbors.iloc[i, 6]] * 2
     })
-    h_lines.append(alt.Chart(h_line_df).mark_line(color="orange").encode(x="sqft", y="price"))
+    h_lines.append(alt.Chart(h_line_df).mark_line(color="black").encode(x="sqft", y="price"))
 
 nn_plot = alt.layer(*h_lines, small_plot, rule)
 ```
@@ -352,7 +352,7 @@ prediction
 
 nn_plot_pred = nn_plot + alt.Chart(
     pd.DataFrame({"sqft": [2000], "price": [prediction]})
-).mark_circle(size=40).encode(x="sqft", y="price", color=alt.value("red"))
+).mark_circle(size=80, opacity=1, color="#d62728").encode(x="sqft", y="price")
 ```
 
 ```{code-cell} ipython3
@@ -493,15 +493,15 @@ sacr_new_preds_hid = pd.concat(
 sacr_new_preds_melted_df = sacr_new_preds_hid.melt(id_vars=["sqft"])
 errors_plot = (
     small_plot
-    + alt.Chart(sacr_full_preds_hid).mark_line().encode(x="sqft", y="predicted")
+    + alt.Chart(sacr_full_preds_hid).mark_line(color="#ff7f0e").encode(x="sqft", y="predicted")
     + alt.Chart(sacr_new_preds_hid)
     .mark_circle(opacity=1)
     .encode(x="sqft", y="price")
 )
 v_lines = []
 for i in pts["sqft"]:
     line_df = sacr_new_preds_melted_df.query("sqft == @i")
-    v_lines.append(alt.Chart(line_df).mark_line(color="red").encode(x="sqft", y="value"))
+    v_lines.append(alt.Chart(line_df).mark_line(color="black").encode(x="sqft", y="value"))
 
 errors_plot = alt.layer(*v_lines, errors_plot)
 errors_plot
@@ -516,7 +516,7 @@ glue("fig:07-verticalerrors", errors_plot, display=False)
 :::{glue:figure} fig:07-verticalerrors
 :name: fig:07-verticalerrors
 
-Scatter plot of price (USD) versus house size (square feet) with example predictions (blue line) and the error in those predictions compared with true response values for three selected observations (vertical red lines).
+Scatter plot of price (USD) versus house size (square feet) with example predictions (orange line) and the error in those predictions compared with true response values for three selected observations (vertical lines).
 :::
 
 +++
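
For reference, the `h_lines`/`v_lines` pattern recolored in this file (one small two-point `mark_line` chart per observation, layered with the base scatter through `alt.layer`) also works standalone. Below is a minimal sketch under assumed toy data; the `homes` DataFrame, its column names, and the predicted values are illustrative, not the Sacramento data used in the chapter.

```python
import altair as alt
import pandas as pd

# Illustrative observations with made-up predicted prices.
homes = pd.DataFrame({
    "sqft": [1100, 1800, 2600],
    "price": [210000, 295000, 405000],
    "predicted": [230000, 280000, 390000],
})

scatter = alt.Chart(homes).mark_circle(opacity=1).encode(
    x=alt.X("sqft").scale(zero=False).title("House size (square feet)"),
    y=alt.Y("price").title("Price (USD)"),
)

# One short vertical segment per observation, from observed to predicted price,
# drawn in black as in this commit.
segments = []
for _, row in homes.iterrows():
    seg_df = pd.DataFrame({
        "sqft": [row["sqft"], row["sqft"]],
        "price": [row["price"], row["predicted"]],
    })
    segments.append(
        alt.Chart(seg_df).mark_line(color="black").encode(x="sqft", y="price")
    )

alt.layer(*segments, scatter)
```

Passing the segments first and the scatter last to `alt.layer` keeps the points drawn on top of the lines, matching `alt.layer(*v_lines, errors_plot)` above.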

source/regression2.md

Lines changed: 58 additions & 36 deletions
@@ -117,7 +117,7 @@ small_sacramento = sacramento.sample(n=30)
 
 small_plot = (
     alt.Chart(small_sacramento)
-    .mark_circle()
+    .mark_circle(opacity=1)
     .encode(
         x=alt.X("sqft")
         .scale(zero=False)
         .title("House size (square feet)"),
@@ -129,7 +129,50 @@ small_plot = (
     )
 )
 
-small_plot += small_plot.transform_regression("sqft", "price").mark_line()
+
+# create df_lines with one fake/empty line (for starting at 2nd color later)
+df_lines = {"x": [500, 500], "y": [100000, 100000], "number": ["-1", "-1"]}
+
+# set the domains (range of x values) of lines
+min_x = small_sacramento["sqft"].min()
+max_x = small_sacramento["sqft"].max()
+
+# add the line of best fit
+from sklearn.linear_model import LinearRegression
+lm = LinearRegression()
+lm.fit(small_sacramento[["sqft"]], small_sacramento[["price"]])
+pred_min = float(lm.predict(pd.DataFrame({"sqft": [min_x]})))
+pred_max = float(lm.predict(pd.DataFrame({"sqft": [max_x]})))
+
+df_lines["x"].extend([min_x, max_x])
+df_lines["y"].extend([pred_min, pred_max])
+df_lines["number"].extend(["0", "0"])
+
+# add other similar looking lines
+intercept_l = [-64542.23, -6900, -64542.23]
+slope_l = [190, 175, 160]
+for i in range(len(slope_l)):
+    df_lines["x"].extend([min_x, max_x])
+    df_lines["y"].extend([
+        intercept_l[i] + slope_l[i] * min_x,
+        intercept_l[i] + slope_l[i] * max_x,
+    ])
+    df_lines["number"].extend([f"{i+1}", f"{i+1}"])
+
+df_lines = pd.DataFrame(df_lines)
+
+# plot the bogus line to skip the same color as the scatter
+small_plot += alt.Chart(
+    df_lines[df_lines["number"] == "-1"]
+).mark_line().encode(
+    x="x", y="y", color=alt.Color("number", legend=None)
+)
+# plot the real line with 2nd color
+small_plot += alt.Chart(
+    df_lines[df_lines["number"] == "0"]
+).mark_line().encode(
+    x="x", y="y", color=alt.Color("number", legend=None)
+)
 
 small_plot
 ```
@@ -189,11 +232,11 @@ prediction = float(lm.predict(pd.DataFrame({"sqft": [2000]})))
 
 # the vertical dotted line
 line_df = pd.DataFrame({"x": [2000]})
-rule = alt.Chart(line_df).mark_rule(strokeDash=[2, 4]).encode(x="x")
+rule = alt.Chart(line_df).mark_rule(strokeDash=[6], size=1.5).encode(x="x")
 
 # the red point
 point_df = pd.DataFrame({"x": [2000], "y": [prediction]})
-point = alt.Chart(point_df).mark_circle(color="red", size=100).encode(x="x", y="y")
+point = alt.Chart(point_df).mark_circle(color="red", size=80, opacity=1).encode(x="x", y="y")
 
 # overlay all plots
 small_plot_2000_pred = (
@@ -204,7 +247,7 @@ small_plot_2000_pred = (
     + alt.Chart(
         pd.DataFrame(
             {
-                "x": [2350],
+                "x": [2450],
                 "y": [prediction - 41000],
                 "prediction": ["$" + "{0:,.0f}".format(prediction)],
             }
@@ -242,32 +285,11 @@ Some plausible examples are shown in {numref}`fig:08-several-lines`.
 ```{code-cell} ipython3
 :tags: [remove-cell]
 
-intercept_l = [-64542.23, -6900, -64542.23]
-slope_l = [190, 175, 160]
-line_color_l = ["green", "purple", "red"]
-
-# set the domains (range of x values) of lines
-min_x = small_sacramento["sqft"].min()
-max_x = small_sacramento["sqft"].max()
-
 several_lines_plot = small_plot.copy()
 
-for i in range(len(slope_l)):
-    several_lines_plot += (
-        alt.Chart(
-            pd.DataFrame(
-                {
-                    "x": [min_x, max_x],
-                    "y": [
-                        intercept_l[i] + slope_l[i] * min_x,
-                        intercept_l[i] + slope_l[i] * max_x,
-                    ],
-                }
-            )
-        )
-        .mark_line(color=line_color_l[i])
-        .encode(x="x", y="y")
-    )
+several_lines_plot += alt.Chart(
+    df_lines[df_lines["number"] != "0"]
+).mark_line().encode(x="x", y="y", color=alt.Color("number",legend=None))
 
 several_lines_plot
 ```
@@ -292,7 +314,7 @@ Scatter plot of sale price versus size with many possible lines that could be dr
 Simple linear regression chooses the straight line of best fit by choosing
 the line that minimizes the **average squared vertical distance** between itself and
 each of the observed data points in the training data. {numref}`fig:08-verticalDistToMin` illustrates
-these vertical distances as red lines. Finally, to assess the predictive
+these vertical distances as lines. Finally, to assess the predictive
 accuracy of a simple linear regression model,
 we use RMSPE—the same measure of predictive performance we used with K-NN regression.
 
@@ -313,7 +335,7 @@ v_lines = []
 for i in range(len(small_sacramento)):
     sqft_val = small_sacramento.iloc[i]["sqft"]
     line_df = small_sacramento_pred.query("sqft == @sqft_val")
-    v_lines.append(alt.Chart(line_df).mark_line(color="red").encode(x="sqft", y="value"))
+    v_lines.append(alt.Chart(line_df).mark_line(color="black").encode(x="sqft", y="value"))
 
 error_plot = alt.layer(*v_lines, small_plot).configure_circle(opacity=1)
 error_plot
@@ -328,7 +350,7 @@ glue("fig:08-verticalDistToMin", error_plot)
 :::{glue:figure} fig:08-verticalDistToMin
 :name: fig:08-verticalDistToMin
 
-Scatter plot of sale price versus size with red lines denoting the vertical distances between the predicted values and the observed data points.
+Scatter plot of sale price versus size with lines denoting the vertical distances between the predicted values and the observed data points.
 :::
 
 +++
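
For reference, the *average squared vertical distance* described above and pictured in this figure is the ordinary least squares objective. In standard notation (the symbols below are the usual ones, not taken from the book's source):

$$
\min_{\beta_0,\, \beta_1} \; \frac{1}{n} \sum_{i=1}^{n} \bigl(y_i - (\beta_0 + \beta_1 x_i)\bigr)^2,
$$

where $x_i$ is the house size of observation $i$, $y_i$ its observed sale price, and $\beta_0 + \beta_1 x_i$ the line's prediction; each vertical segment in the figure has length $|y_i - (\beta_0 + \beta_1 x_i)|$.
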
@@ -482,7 +504,7 @@ so that we can qualitatively assess if the model seems to fit the data well.
 sqft_prediction_grid = sacramento[["sqft"]].agg(["min", "max"])
 sqft_prediction_grid["predicted"] = lm.predict(sqft_prediction_grid)
 
-all_points = alt.Chart(sacramento).mark_circle(opacity=0.4).encode(
+all_points = alt.Chart(sacramento).mark_circle().encode(
     x=alt.X("sqft")
     .scale(zero=False)
     .title("House size (square feet)"),
@@ -966,7 +988,7 @@ lm_plot_outlier += lm_plot_outlier.transform_regression("sqft", "price").mark_li
 
 outlier_pt = (
     alt.Chart(sacramento_outlier)
-    .mark_circle(color="red", size=100)
+    .mark_circle(color="#d62728", size=100)
     .encode(x="sqft", y="price")
 )
 
@@ -987,7 +1009,7 @@ outlier_line = (
         )
     )
     .transform_regression("sqft", "price")
-    .mark_line(color="red")
+    .mark_line(color="#d62728")
 )
 
 lm_plot_outlier += outlier_pt + outlier_line
@@ -1051,7 +1073,7 @@ outlier_line = (
         )
     )
     .transform_regression("sqft", "price")
-    .mark_line(color="red")
+    .mark_line(color="#d62728")
 )
 
 lm_plot_outlier_large += outlier_pt + outlier_line

source/viz.md

Lines changed: 2 additions & 2 deletions
@@ -1474,7 +1474,7 @@ so we are including it here already.
 ```
 
 ```{code-cell} ipython3
-v_line = alt.Chart(morley_df).mark_rule(strokeDash=[5], size=2).encode(
+v_line = alt.Chart(morley_df).mark_rule(strokeDash=[6], size=1.5).encode(
     x=alt.datum(792.458)
 )
 
@@ -1672,7 +1672,7 @@ morley_hist_rel = alt.Chart(morley_df).mark_bar().encode(
 )
 
 # Recreating v_line to indicate that the speed of light is at 0% relative error
-v_line = alt.Chart(morley_df).mark_rule(strokeDash=[5], size=2).encode(
+v_line = alt.Chart(morley_df).mark_rule(strokeDash=[6], size=1.5).encode(
     x=alt.datum(0)
 )
 
