Skip to content

Commit 610ae81

Browse files
improvements to consistency in visualizations in reg1,2
1 parent 71a0f67 commit 610ae81

File tree

2 files changed

+33
-28
lines changed

2 files changed

+33
-28
lines changed

source/regression1.Rmd

Lines changed: 16 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -244,13 +244,17 @@ nearest_neighbors <- small_sacramento |>
244244
nearest_neighbors
245245
```
246246

247-
```{r 07-knn3-example, echo = FALSE, fig.height = 3.5, fig.width = 4.5, fig.cap = "Scatter plot of price (USD) versus house size (square feet) with lines to 5 nearest neighbors."}
247+
```{r 07-knn3-example, echo = FALSE, fig.height = 3.5, fig.width = 4.5, fig.cap = "Scatter plot of price (USD) versus house size (square feet) with lines to 5 nearest neighbors (highlighted in orange)."}
248248
nearest_neighbors <- mutate(nearest_neighbors, twothou = rep(2000, 5))
249249
250250
nn_plot <- small_plot +
251251
geom_segment(
252252
data = nearest_neighbors,
253-
aes(x = twothou, xend = sqft, y = price, yend = price), color = "orange"
253+
aes(x = twothou, xend = sqft, y = price, yend = price), color = "black"
254+
) +
255+
geom_point(
256+
data = nearest_neighbors,
257+
aes(x = sqft, y = price), color = "darkorange"
254258
)
255259
256260
nn_plot
@@ -347,13 +351,13 @@ different from the true values, then RMSPE will be quite large. When we
347351
use cross-validation, we will choose the $K$ that gives
348352
us the smallest RMSPE.
349353

350-
```{r 07-verticalerrors, echo = FALSE, message = FALSE, warning = FALSE, fig.cap = "Scatter plot of price (USD) versus house size (square feet) with example predictions (blue line) and the error in those predictions compared with true response values for three selected observations (vertical red lines).", fig.height = 3.5, fig.width = 4.5}
354+
```{r 07-verticalerrors, echo = FALSE, message = FALSE, warning = FALSE, fig.cap = "Scatter plot of price (USD) versus house size (square feet) with example predictions (blue line) and the error in those predictions compared with true response values (vertical lines).", fig.height = 3.5, fig.width = 4.5}
351355
# save the seed
352356
seedval <- .Random.seed
353357
354358
# (synthetic) new prediction points
355-
pts <- tibble(sqft = c(1250, 1850, 2250), price = c(250000, 200000, 500000))
356-
finegrid <- tibble(sqft = seq(from = 900, to = 3900, by = 10))
359+
pts <- tibble(sqft = c(1500, 1810, 2200), price = c(300000, 160000, 500000))
360+
finegrid <- tibble(sqft = seq(from = 500, to = 4100, by = 10))
357361
358362
# fit the model
359363
sacr_recipe_hid <- recipe(price ~ sqft, data = small_sacramento) |>
@@ -375,8 +379,8 @@ sacr_full_preds_hid <- sacr_fit_hid |>
375379
bind_cols(finegrid)
376380
377381
sacr_new_preds_hid <- sacr_fit_hid |>
378-
predict(pts) |>
379-
bind_cols(pts)
382+
predict(small_sacramento) |>
383+
bind_cols(small_sacramento)
380384
381385
# plot the vertical prediction errors
382386
errors_plot <- ggplot(small_sacramento, aes(x = sqft, y = price)) +
@@ -386,11 +390,11 @@ errors_plot <- ggplot(small_sacramento, aes(x = sqft, y = price)) +
386390
scale_y_continuous(labels = dollar_format()) +
387391
geom_line(data = sacr_full_preds_hid,
388392
aes(x = sqft, y = .pred),
389-
color = "blue") +
393+
color = "steelblue", linewidth=1) +
390394
geom_segment(
391395
data = sacr_new_preds_hid,
392396
aes(x = sqft, xend = sqft, y = price, yend = .pred),
393-
color = "red") +
397+
color = "black") +
394398
geom_point(data = sacr_new_preds_hid,
395399
aes(x = sqft, y = price),
396400
color = "black")
@@ -543,7 +547,7 @@ for (i in 1:6) {
543547
xlab("House size (square feet)") +
544548
ylab("Price (USD)") +
545549
scale_y_continuous(labels = dollar_format()) +
546-
geom_line(data = sacr_preds, aes(x = sqft, y = .pred), color = "blue") +
550+
geom_line(data = sacr_preds, aes(x = sqft, y = .pred), color = "steelblue", linewidth=1) +
547551
ggtitle(paste0("K = ", gridvals[[i]])) +
548552
theme(text = element_text(size = 20), axis.title=element_text(size=20))
549553
}
@@ -676,7 +680,8 @@ plot_final <- ggplot(sacramento, aes(x = sqft, y = price)) +
676680
geom_point(alpha = 0.4) +
677681
geom_line(data = sacr_preds,
678682
mapping = aes(x = sqft, y = .pred),
679-
color = "blue") +
683+
color = "steelblue",
684+
linewidth = 1) +
680685
xlab("House size (square feet)") +
681686
ylab("Price (USD)") +
682687
scale_y_continuous(labels = dollar_format()) +

source/regression2.Rmd

Lines changed: 17 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@ small_plot <- ggplot(small_sacramento, aes(x = sqft, y = price)) +
117117
xlab("House size (square feet)") +
118118
ylab("Price (USD)") +
119119
scale_y_continuous(labels = dollar_format()) +
120-
geom_smooth(method = "lm", se = FALSE)
120+
geom_smooth(method = "lm", se = FALSE, color = "steelblue")
121121
122122
small_plot
123123
```
@@ -154,10 +154,9 @@ small_model <- lm(price ~ sqft, data = small_sacramento)
154154
prediction <- predict(small_model, data.frame(sqft = 2000))
155155
156156
small_plot +
157-
geom_vline(xintercept = 2000, linetype = "dotted") +
158-
geom_point(aes(x = 2000,
159-
y = prediction[[1]],
160-
color = "red", size = 2.5)) +
157+
geom_vline(xintercept = 2000, linetype = "dashed") +
158+
geom_point(aes(x = 2000, y = prediction[[1]]),
159+
color = "red", size = 2.5) +
161160
annotate("text",
162161
x = 2350,
163162
y = prediction[[1]]-30000,
@@ -179,15 +178,15 @@ Some plausible examples are shown in Figure \@ref(fig:08-several-lines).
179178

180179
```{r 08-several-lines, echo = FALSE, message = FALSE, warning = FALSE, fig.height = 3.5, fig.width = 4.5, fig.cap = "Scatter plot of sale price versus size with many possible lines that could be drawn through the data points."}
181180
small_plot +
182-
geom_abline(intercept = -64542.23, slope = 190, color = "green") +
183-
geom_abline(intercept = -6900, slope = 175, color = "purple") +
184-
geom_abline(intercept = -64542.23, slope = 160, color = "red")
181+
geom_abline(intercept = -64542.23, slope = 190, color = "darkorange", linewidth = 1) +
182+
geom_abline(intercept = -6900, slope = 175, color = "purple", linewidth = 1) +
183+
geom_abline(intercept = -64542.23, slope = 160, color = "firebrick", linewidth = 1)
185184
```
186185

187186
Simple linear regression chooses the straight line of best fit by choosing
188187
the line that minimizes the **average squared vertical distance** between itself and
189-
each of the observed data points in the training data. Figure \@ref(fig:08-verticalDistToMin) illustrates
190-
these vertical distances as red lines. Finally, to assess the predictive
188+
each of the observed data points in the training data (equivalent to minimizing the RMSE).
189+
Figure \@ref(fig:08-verticalDistToMin) illustrates these vertical distances as red lines. Finally, to assess the predictive
191190
accuracy of a simple linear regression model,
192191
we use RMSPE&mdash;the same measure of predictive performance we used with K-NN regression.
193192
\index{RMSPE}
@@ -199,7 +198,7 @@ small_sacramento <- small_sacramento |>
199198
small_plot +
200199
geom_segment(data = small_sacramento,
201200
aes(xend = sqft, yend = predicted),
202-
colour = "red")
201+
colour = "black")
203202
```
204203

205204
## Linear regression in R
@@ -317,7 +316,8 @@ lm_plot_final <- ggplot(sacramento, aes(x = sqft, y = price)) +
317316
geom_point(alpha = 0.4) +
318317
geom_line(data = sacr_preds,
319318
mapping = aes(x = sqft, y = .pred),
320-
color = "blue") +
319+
color = "steelblue",
320+
linewidth = 1) +
321321
xlab("House size (square feet)") +
322322
ylab("Price (USD)") +
323323
scale_y_continuous(labels = dollar_format()) +
@@ -390,7 +390,7 @@ knn_plot_final <- ggplot(sacr_preds, aes(x = sqft, y = price)) +
390390
xlab("House size (square feet)") +
391391
ylab("Price (USD)") +
392392
scale_y_continuous(labels = dollar_format()) +
393-
geom_line(data = sacr_preds, aes(x = sqft, y = .pred), color = "blue") +
393+
geom_line(data = sacr_preds, aes(x = sqft, y = .pred), color = "steelblue", linewidth = 1) +
394394
ggtitle("K-NN regression") +
395395
annotate("text", x = 3500, y = 100000, label = paste("RMSPE =", sacr_rmspe)) +
396396
theme(text = element_text(size = 18), axis.title=element_text(size=18))
@@ -663,7 +663,7 @@ lm_plot_outlier <- ggplot(sacramento_train_small, aes(x = sqft, y = price)) +
663663
xlab("House size (square feet)") +
664664
ylab("Price (USD)") +
665665
scale_y_continuous(labels = dollar_format()) +
666-
geom_smooth(method = "lm", se = FALSE) +
666+
geom_smooth(method = "lm", se = FALSE, color = "steelblue") +
667667
geom_smooth(data = sacramento_train_small |>
668668
add_row(sqft = 5000, price = 50000),
669669
method = "lm", se = FALSE, color = "red")
@@ -693,7 +693,7 @@ lm_plot_outlier_large <- ggplot(sacramento_train, aes(x = sqft, y = price)) +
693693
xlab("House size (square feet)") +
694694
ylab("Price (USD)") +
695695
scale_y_continuous(labels = dollar_format()) +
696-
geom_smooth(method = "lm", se = FALSE) +
696+
geom_smooth(method = "lm", se = FALSE, color = "steelblue") +
697697
geom_smooth(data = sacramento_train |>
698698
add_row(sqft = 5000, price = 50000),
699699
method = "lm",
@@ -853,7 +853,7 @@ curve_plt <- ggplot(df, aes(x = x, y = y)) +
853853
geom_point() +
854854
xlab("x") +
855855
ylab("y") +
856-
geom_smooth(method = "lm", se = FALSE)
856+
geom_smooth(method = "lm", se = FALSE, color = "steelblue")
857857
curve_plt
858858
```
859859

@@ -880,7 +880,7 @@ curve_plt2 <- ggplot(df, aes(x = z, y = y)) +
880880
geom_point() +
881881
xlab(expression(paste("z = x"^"3"))) +
882882
ylab("y") +
883-
geom_smooth(method = "lm", se = FALSE)
883+
geom_smooth(method = "lm", se = FALSE, color = "steelblue")
884884
885885
curve_plt2
886886
```

0 commit comments

Comments
 (0)