Skip to content

Commit a8b24f6

Browse files
committed
changing fig sizes for classification and regression
1 parent 9ce7713 commit a8b24f6

File tree

2 files changed

+21
-18
lines changed

2 files changed

+21
-18
lines changed

classification2.Rmd

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -188,7 +188,7 @@ tumor cell concavity versus smoothness colored by diagnosis in Figure \@ref(fig:
188188
You will also notice that we set the random seed here at the beginning of the analysis
189189
using the `set.seed` function, as described in Section \@ref(randomseeds).
190190

191-
```{r 06-precode, fig.height = 4, fig.width = 5, fig.cap="Scatter plot of tumor cell concavity versus smoothness colored by diagnosis label.", message = F, warning = F}
191+
```{r 06-precode, fig.height = 3.5, fig.width = 4.5, fig.cap="Scatter plot of tumor cell concavity versus smoothness colored by diagnosis label.", message = F, warning = F}
192192
# load packages
193193
library(tidyverse)
194194
library(tidymodels)
@@ -754,7 +754,7 @@ We can select the best value of the number of neighbors (i.e., the one that resu
754754
in the highest classifier accuracy estimate) by plotting the accuracy versus $K$
755755
in Figure \@ref(fig:06-find-k).
756756

757-
```{r 06-find-k, fig.height = 4, fig.width = 5, fig.cap= "Plot of estimated accuracy versus the number of neighbors."}
757+
```{r 06-find-k, fig.height = 3.5, fig.width = 4, fig.cap= "Plot of estimated accuracy versus the number of neighbors."}
758758
accuracy_vs_k <- ggplot(accuracies, aes(x = neighbors, y = mean)) +
759759
geom_point() +
760760
geom_line() +
@@ -800,7 +800,7 @@ we vary $K$ from 1 to almost the number of observations in the data set.
800800
set.seed(1)
801801
```
802802

803-
```{r 06-lots-of-ks, message = FALSE, fig.height = 4, fig.width = 5, fig.cap="Plot of accuracy estimate versus number of neighbors for many K values."}
803+
```{r 06-lots-of-ks, message = FALSE, fig.height = 3.5, fig.width = 4, fig.cap="Plot of accuracy estimate versus number of neighbors for many K values."}
804804
k_lots <- tibble(neighbors = seq(from = 1, to = 385, by = 10))
805805
806806
knn_results <- workflow() |>
@@ -894,7 +894,8 @@ for (i in 1:length(ks)) {
894894
labs(color = "Diagnosis") +
895895
ggtitle(paste("K = ", ks[[i]])) +
896896
scale_color_manual(labels = c("Malignant", "Benign"),
897-
values = c("orange2", "steelblue2"))
897+
values = c("orange2", "steelblue2")) +
898+
theme(text = element_text(size = 16))
898899
}
899900
900901
p_no_legend <- lapply(plots, function(x) x + theme(legend.position = "none"))
@@ -1004,7 +1005,7 @@ variables there are, the more (random) influence they have, and the more they
10041005
corrupt the set of nearest neighbors that vote on the class of the new
10051006
observation to predict.
10061007

1007-
```{r 06-performance-irrelevant-features, echo = FALSE, warning = FALSE, fig.retina = 2, out.width = "100%", fig.cap = "Effect of inclusion of irrelevant predictors."}
1008+
```{r 06-performance-irrelevant-features, echo = FALSE, warning = FALSE, fig.retina = 2, out.width = "75%", fig.cap = "Effect of inclusion of irrelevant predictors."}
10081009
# get accuracies after including k irrelevant features
10091010
ks <- c(0, 5, 10, 15, 20, 40)
10101011
fixedaccs <- list()
@@ -1093,7 +1094,7 @@ variables, the number of neighbors does not increase smoothly; but the general t
10931094
Figure \@ref(fig:06-fixed-irrelevant-features) corroborates
10941095
this evidence; if we fix the number of neighbors to $K=3$, the accuracy falls off more quickly.
10951096

1096-
```{r 06-neighbors-irrelevant-features, echo = FALSE, warning = FALSE, fig.retina = 2, out.width = "100%", fig.cap = "Tuned number of neighbors for varying number of irrelevant predictors."}
1097+
```{r 06-neighbors-irrelevant-features, echo = FALSE, warning = FALSE, fig.retina = 2, out.width = "75%", fig.cap = "Tuned number of neighbors for varying number of irrelevant predictors."}
10971098
plt_irrelevant_nghbrs <- ggplot(res) +
10981099
geom_line(mapping = aes(x=ks, y=nghbrs)) +
10991100
labs(x = "Number of Irrelevant Predictors",
@@ -1102,7 +1103,7 @@ plt_irrelevant_nghbrs <- ggplot(res) +
11021103
plt_irrelevant_nghbrs
11031104
```
11041105

1105-
```{r 06-fixed-irrelevant-features, echo = FALSE, warning = FALSE, fig.retina = 2, out.width = "100%", fig.cap = "Accuracy versus number of irrelevant predictors for tuned and untuned number of neighbors."}
1106+
```{r 06-fixed-irrelevant-features, echo = FALSE, warning = FALSE, fig.retina = 2, out.width = "75%", fig.cap = "Accuracy versus number of irrelevant predictors for tuned and untuned number of neighbors."}
11061107
res_tmp <- res %>% pivot_longer(cols=c("accs", "fixedaccs"),
11071108
names_to="Type",
11081109
values_to="accuracy")
@@ -1338,7 +1339,7 @@ where the elbow occurs, and whether adding a variable provides a meaningful incr
13381339
> part of tuning your classifier, you *cannot use your test data* for this
13391340
> process!
13401341
1341-
```{r 06-fwdsel-3, echo = FALSE, warning = FALSE, fig.retina = 2, out.width = "100%", fig.cap = "Estimated accuracy versus the number of predictors for the sequence of models built using forward selection."}
1342+
```{r 06-fwdsel-3, echo = FALSE, warning = FALSE, fig.retina = 2, out.width = "75%", fig.cap = "Estimated accuracy versus the number of predictors for the sequence of models built using forward selection."}
13421343
fwd_sel_accuracies_plot <- accuracies |>
13431344
ggplot(aes(x = size, y = accuracy)) +
13441345
geom_line() +

regression1.Rmd

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,7 @@ want to predict (sale price) on the y-axis.
125125
> (from the `scales` package)
126126
> to the `labels` argument of the `scale_y_continuous` function.
127127
128-
```{r 07-edaRegr, message = FALSE, fig.height = 4, fig.width = 5, fig.cap = "Scatter plot of price (USD) versus house size (square feet)."}
128+
```{r 07-edaRegr, message = FALSE, fig.height = 3.5, fig.width = 4.5, fig.cap = "Scatter plot of price (USD) versus house size (square feet)."}
129129
eda <- ggplot(sacramento, aes(x = sqft, y = price)) +
130130
geom_point(alpha = 0.4) +
131131
xlab("House size (square feet)") +
@@ -179,7 +179,7 @@ you can see that we have no
179179
observations of a house of size *exactly* 2,000 square feet. How can we predict
180180
the sale price?
181181

182-
```{r 07-small-eda-regr, fig.height = 4, fig.width = 5, fig.cap = "Scatter plot of price (USD) versus house size (square feet) with vertical line indicating 2,000 square feet on x-axis."}
182+
```{r 07-small-eda-regr, fig.height = 3.5, fig.width = 4.5, fig.cap = "Scatter plot of price (USD) versus house size (square feet) with vertical line indicating 2,000 square feet on x-axis."}
183183
small_plot <- ggplot(small_sacramento, aes(x = sqft, y = price)) +
184184
geom_point() +
185185
xlab("House size (square feet)") +
@@ -207,7 +207,7 @@ nearest_neighbors <- small_sacramento |>
207207
nearest_neighbors
208208
```
209209

210-
```{r 07-knn3-example, echo = FALSE, fig.height = 4, fig.width = 5, fig.cap = "Scatter plot of price (USD) versus house size (square feet) with lines to 5 nearest neighbors."}
210+
```{r 07-knn3-example, echo = FALSE, fig.height = 3.5, fig.width = 4.5, fig.cap = "Scatter plot of price (USD) versus house size (square feet) with lines to 5 nearest neighbors."}
211211
nearest_neighbors <- mutate(nearest_neighbors, twothou = rep(2000, 5))
212212
213213
nn_plot <- small_plot +
@@ -234,7 +234,7 @@ prediction <- nearest_neighbors |>
234234
prediction
235235
```
236236

237-
```{r 07-predictedViz-knn, echo = FALSE, fig.height = 4, fig.width = 5, fig.cap = "Scatter plot of price (USD) versus house size (square feet) with predicted price for a 2,000 square-foot house based on 5 nearest neighbors represented as a red dot."}
237+
```{r 07-predictedViz-knn, echo = FALSE, fig.height = 3.5, fig.width = 4.5, fig.cap = "Scatter plot of price (USD) versus house size (square feet) with predicted price for a 2,000 square-foot house based on 5 nearest neighbors represented as a red dot."}
238238
nn_plot +
239239
geom_point(aes(x = 2000, y = prediction[[1]]), color = "red", size = 2.5)
240240
```
@@ -305,7 +305,7 @@ different from the true values, then RMSPE will be quite large. When we
305305
use cross-validation, we will choose the $K$ that gives
306306
us the smallest RMSPE.
307307

308-
```{r 07-verticalerrors, echo = FALSE, message = FALSE, warning = FALSE, fig.cap = "Scatter plot of price (USD) versus house size (square feet) with example predictions (blue line) and the error in those predictions compared with true response values for three selected observations (vertical red lines).", fig.height = 4, fig.width = 5}
308+
```{r 07-verticalerrors, echo = FALSE, message = FALSE, warning = FALSE, fig.cap = "Scatter plot of price (USD) versus house size (square feet) with example predictions (blue line) and the error in those predictions compared with true response values for three selected observations (vertical red lines).", fig.height = 3.5, fig.width = 4.5}
309309
# save the seed
310310
seedval <- .Random.seed
311311
@@ -434,7 +434,7 @@ sacr_results <- sacr_wkflw |>
434434
sacr_results
435435
```
436436

437-
```{r 07-choose-k-knn-plot, echo = FALSE, fig.height = 4, fig.width = 5, fig.cap = "Effect of the number of neighbors on the RMSPE."}
437+
```{r 07-choose-k-knn-plot, echo = FALSE, fig.height = 3.5, fig.width = 4.5, fig.cap = "Effect of the number of neighbors on the RMSPE."}
438438
sacr_tunek_plot <- ggplot(sacr_results, aes(x = neighbors, y = mean)) +
439439
geom_point() +
440440
geom_line() +
@@ -499,7 +499,8 @@ for (i in 1:6) {
499499
ylab("Price (USD)") +
500500
scale_y_continuous(labels = dollar_format()) +
501501
geom_line(data = sacr_preds, aes(x = sqft, y = .pred), color = "blue") +
502-
ggtitle(paste0("K = ", gridvals[[i]]))
502+
ggtitle(paste0("K = ", gridvals[[i]])) +
503+
theme(text = element_text(size = 16))
503504
} else {
504505
plots[[i]] <- ggplot(sacr_preds, aes(x = sqft, y = price)) +
505506
geom_point(alpha = 0.4) +
@@ -510,7 +511,8 @@ for (i in 1:6) {
510511
mapping = aes(x = sqft),
511512
yintercept = mean(sacr_preds$price),
512513
color = "blue") +
513-
ggtitle(paste0("K = ", gridvals[[i]]))
514+
ggtitle(paste0("K = ", gridvals[[i]])) +
515+
theme(text = element_text(size = 16))
514516
}
515517
}
516518
@@ -618,7 +620,7 @@ the range of house sizes we might encounter in the Sacramento area&mdash;from 50
618620
You have already seen a few plots like this in this chapter, but here we also provide the code that generated it
619621
as a learning challenge.
620622

621-
```{r 07-predict-all, warning = FALSE, fig.height = 4, fig.width = 5, fig.cap = "Predicted values of house price (blue line) for the final KNN regression model."}
623+
```{r 07-predict-all, warning = FALSE, fig.height = 3.5, fig.width = 4.5, fig.cap = "Predicted values of house price (blue line) for the final KNN regression model."}
622624
sacr_preds <- tibble(sqft = seq(from = 500, to = 5000, by = 10))
623625
624626
sacr_preds <- sacr_fit |>
@@ -665,7 +667,7 @@ visualizing the data, before we start modeling the data. Figure \@ref(fig:07-bed
665667
shows that the number of bedrooms might provide useful information
666668
to help predict the sale price of a house.
667669

668-
```{r 07-bedscatter, fig.height = 5, fig.width = 6, fig.cap = "Scatter plot of the sale price of houses versus the number of bedrooms."}
670+
```{r 07-bedscatter, fig.height = 3.5, fig.width = 4.5, fig.cap = "Scatter plot of the sale price of houses versus the number of bedrooms."}
669671
plot_beds <- sacramento |>
670672
ggplot(aes(x = beds, y = price)) +
671673
geom_point(alpha = 0.4) +

0 commit comments

Comments
 (0)