changing fig sizes for classification and regression

leem44 · leem44 · commit a8b24f6c6af6 · 2021-10-26T20:36:21.000-07:00
diff --git a/classification2.Rmd b/classification2.Rmd
@@ -188,7 +188,7 @@ tumor cell concavity versus smoothness colored by diagnosis in Figure \@ref(fig:
 You will also notice that we set the random seed here at the beginning of the analysis
 using the `set.seed` function, as described in Section \@ref(randomseeds).
 
-```{r 06-precode, fig.height = 4, fig.width = 5, fig.cap="Scatter plot of tumor cell concavity versus smoothness colored by diagnosis label.", message = F, warning = F}
+```{r 06-precode, fig.height = 3.5, fig.width = 4.5, fig.cap="Scatter plot of tumor cell concavity versus smoothness colored by diagnosis label.", message = F, warning = F}
 # load packages
 library(tidyverse)
 library(tidymodels)
@@ -754,7 +754,7 @@ We can select the best value of the number of neighbors (i.e., the one that resu
 in the highest classifier accuracy estimate) by plotting the accuracy versus $K$ 
 in Figure \@ref(fig:06-find-k).
 
-```{r 06-find-k, fig.height = 4, fig.width = 5, fig.cap= "Plot of estimated accuracy versus the number of neighbors."}
+```{r 06-find-k,  fig.height = 3.5, fig.width = 4, fig.cap= "Plot of estimated accuracy versus the number of neighbors."}
 accuracy_vs_k <- ggplot(accuracies, aes(x = neighbors, y = mean)) +
   geom_point() +
   geom_line() +
@@ -800,7 +800,7 @@ we vary $K$ from 1 to almost the number of observations in the data set.
 set.seed(1)
 ```
 
-```{r 06-lots-of-ks, message = FALSE, fig.height = 4, fig.width = 5, fig.cap="Plot of accuracy estimate versus number of neighbors for many K values."}
+```{r 06-lots-of-ks, message = FALSE, fig.height = 3.5, fig.width = 4, fig.cap="Plot of accuracy estimate versus number of neighbors for many K values."}
 k_lots <- tibble(neighbors = seq(from = 1, to = 385, by = 10))
 
 knn_results <- workflow() |>
@@ -894,7 +894,8 @@ for (i in 1:length(ks)) {
     labs(color = "Diagnosis") +
     ggtitle(paste("K = ", ks[[i]])) +
     scale_color_manual(labels = c("Malignant", "Benign"), 
-                       values = c("orange2", "steelblue2")) 
+                       values = c("orange2", "steelblue2"))  +
+  theme(text = element_text(size = 16))
   }
 
 p_no_legend <- lapply(plots, function(x) x + theme(legend.position = "none"))
@@ -1004,7 +1005,7 @@ variables there are, the more (random) influence they have, and the more they
 corrupt the set of nearest neighbors that vote on the class of the new
 observation to predict.  
 
-```{r 06-performance-irrelevant-features, echo = FALSE, warning = FALSE, fig.retina = 2, out.width = "100%", fig.cap = "Effect of inclusion of irrelevant predictors."}
+```{r 06-performance-irrelevant-features, echo = FALSE, warning = FALSE, fig.retina = 2, out.width = "75%", fig.cap = "Effect of inclusion of irrelevant predictors."}
 # get accuracies after including k irrelevant features
 ks <- c(0, 5, 10, 15, 20, 40)
 fixedaccs <- list()
@@ -1093,7 +1094,7 @@ variables, the number of neighbors does not increase smoothly; but the general t
 Figure \@ref(fig:06-fixed-irrelevant-features) corroborates
 this evidence; if we fix the number of neighbors to $K=3$, the accuracy falls off more quickly.
 
-```{r 06-neighbors-irrelevant-features, echo = FALSE, warning = FALSE, fig.retina = 2, out.width = "100%", fig.cap = "Tuned number of neighbors for varying number of irrelevant predictors."}
+```{r 06-neighbors-irrelevant-features, echo = FALSE, warning = FALSE, fig.retina = 2, out.width = "75%", fig.cap = "Tuned number of neighbors for varying number of irrelevant predictors."}
 plt_irrelevant_nghbrs <- ggplot(res) +
               geom_line(mapping = aes(x=ks, y=nghbrs)) +
               labs(x = "Number of Irrelevant Predictors", 
@@ -1102,7 +1103,7 @@ plt_irrelevant_nghbrs <- ggplot(res) +
 plt_irrelevant_nghbrs
 ```
 
-```{r 06-fixed-irrelevant-features, echo = FALSE, warning = FALSE, fig.retina = 2, out.width = "100%", fig.cap = "Accuracy versus number of irrelevant predictors for tuned and untuned number of neighbors."}
+```{r 06-fixed-irrelevant-features, echo = FALSE, warning = FALSE, fig.retina = 2, out.width = "75%", fig.cap = "Accuracy versus number of irrelevant predictors for tuned and untuned number of neighbors."}
 res_tmp <- res %>% pivot_longer(cols=c("accs", "fixedaccs"), 
                                 names_to="Type", 
                                 values_to="accuracy")
@@ -1338,7 +1339,7 @@ where the elbow occurs, and whether adding a variable provides a meaningful incr
 > part of tuning your classifier, you *cannot use your test data* for this
 > process! 
 
-```{r 06-fwdsel-3, echo = FALSE, warning = FALSE, fig.retina = 2, out.width = "100%", fig.cap = "Estimated accuracy versus the number of predictors for the sequence of models built using forward selection."}
+```{r 06-fwdsel-3, echo = FALSE, warning = FALSE, fig.retina = 2, out.width = "75%", fig.cap = "Estimated accuracy versus the number of predictors for the sequence of models built using forward selection."}
 fwd_sel_accuracies_plot <- accuracies |>
   ggplot(aes(x = size, y = accuracy)) +
   geom_line() +
diff --git a/regression1.Rmd b/regression1.Rmd
@@ -125,7 +125,7 @@ want to predict (sale price) on the y-axis.
 > (from the `scales` package)
 > to the `labels` argument of the `scale_y_continuous` function.
 
-```{r 07-edaRegr, message = FALSE, fig.height = 4, fig.width = 5, fig.cap = "Scatter plot of price (USD) versus house size (square feet)."}
+```{r 07-edaRegr, message = FALSE, fig.height = 3.5, fig.width = 4.5, fig.cap = "Scatter plot of price (USD) versus house size (square feet)."}
 eda <- ggplot(sacramento, aes(x = sqft, y = price)) +
   geom_point(alpha = 0.4) +
   xlab("House size (square feet)") +
@@ -179,7 +179,7 @@ you can see that we have no
 observations of a house of size *exactly* 2,000 square feet. How can we predict
 the sale price? 
 
-```{r 07-small-eda-regr, fig.height = 4, fig.width = 5, fig.cap = "Scatter plot of price (USD) versus house size (square feet) with vertical line indicating 2,000 square feet on x-axis."}
+```{r 07-small-eda-regr, fig.height = 3.5, fig.width = 4.5, fig.cap = "Scatter plot of price (USD) versus house size (square feet) with vertical line indicating 2,000 square feet on x-axis."}
 small_plot <- ggplot(small_sacramento, aes(x = sqft, y = price)) +
   geom_point() +
   xlab("House size (square feet)") +
@@ -207,7 +207,7 @@ nearest_neighbors <- small_sacramento |>
 nearest_neighbors
 ```
 
-```{r 07-knn3-example, echo = FALSE, fig.height = 4, fig.width = 5, fig.cap = "Scatter plot of price (USD) versus house size (square feet) with lines to 5 nearest neighbors."}
+```{r 07-knn3-example, echo = FALSE, fig.height = 3.5, fig.width = 4.5, fig.cap = "Scatter plot of price (USD) versus house size (square feet) with lines to 5 nearest neighbors."}
 nearest_neighbors <- mutate(nearest_neighbors, twothou = rep(2000, 5))
 
 nn_plot <- small_plot +
@@ -234,7 +234,7 @@ prediction <- nearest_neighbors |>
 prediction
 ```
 
-```{r 07-predictedViz-knn, echo = FALSE, fig.height = 4, fig.width = 5, fig.cap = "Scatter plot of price (USD) versus house size (square feet) with predicted price for a 2,000 square-foot house based on 5 nearest neighbors represented as a red dot."}
+```{r 07-predictedViz-knn, echo = FALSE, fig.height = 3.5, fig.width = 4.5, fig.cap = "Scatter plot of price (USD) versus house size (square feet) with predicted price for a 2,000 square-foot house based on 5 nearest neighbors represented as a red dot."}
 nn_plot +
   geom_point(aes(x = 2000, y = prediction[[1]]), color = "red", size = 2.5)
 ```
@@ -305,7 +305,7 @@ different from the true values, then RMSPE will be quite large. When we
 use cross-validation, we will choose the $K$ that gives
 us the smallest RMSPE.
 
-```{r 07-verticalerrors, echo = FALSE, message = FALSE, warning = FALSE, fig.cap = "Scatter plot of price (USD) versus house size (square feet) with example predictions (blue line) and the error in those predictions compared with true response values for three selected observations (vertical red lines).", fig.height = 4, fig.width = 5}
+```{r 07-verticalerrors, echo = FALSE, message = FALSE, warning = FALSE, fig.cap = "Scatter plot of price (USD) versus house size (square feet) with example predictions (blue line) and the error in those predictions compared with true response values for three selected observations (vertical red lines).", fig.height = 3.5, fig.width = 4.5}
 # save the seed
 seedval <- .Random.seed
 
@@ -434,7 +434,7 @@ sacr_results <- sacr_wkflw |>
 sacr_results
 ```
 
-```{r 07-choose-k-knn-plot, echo = FALSE, fig.height = 4, fig.width = 5, fig.cap = "Effect of the number of neighbors on the RMSPE."} 
+```{r 07-choose-k-knn-plot, echo = FALSE, fig.height = 3.5, fig.width = 4.5, fig.cap = "Effect of the number of neighbors on the RMSPE."} 
 sacr_tunek_plot <- ggplot(sacr_results, aes(x = neighbors, y = mean)) +
   geom_point() +
   geom_line() +
@@ -499,7 +499,8 @@ for (i in 1:6) {
         ylab("Price (USD)") +
         scale_y_continuous(labels = dollar_format()) +
         geom_line(data = sacr_preds, aes(x = sqft, y = .pred), color = "blue") +
-        ggtitle(paste0("K = ", gridvals[[i]]))
+        ggtitle(paste0("K = ", gridvals[[i]])) +
+  theme(text = element_text(size = 16))
   } else {
       plots[[i]] <- ggplot(sacr_preds, aes(x = sqft, y = price)) +
         geom_point(alpha = 0.4) +
@@ -510,7 +511,8 @@ for (i in 1:6) {
                    mapping = aes(x = sqft), 
                    yintercept = mean(sacr_preds$price), 
                    color = "blue") +
-        ggtitle(paste0("K = ", gridvals[[i]]))
+        ggtitle(paste0("K = ", gridvals[[i]])) +
+  theme(text = element_text(size = 16))
   }
 }
 
@@ -618,7 +620,7 @@ the range of house sizes we might encounter in the Sacramento area&mdash;from 50
 You have already seen a few plots like this in this chapter, but here we also provide the code that generated it
 as a learning challenge.
 
-```{r 07-predict-all, warning = FALSE, fig.height = 4, fig.width = 5, fig.cap = "Predicted values of house price (blue line) for the final KNN regression model."}
+```{r 07-predict-all, warning = FALSE, fig.height = 3.5, fig.width = 4.5, fig.cap = "Predicted values of house price (blue line) for the final KNN regression model."}
 sacr_preds <- tibble(sqft = seq(from = 500, to = 5000, by = 10))
 
 sacr_preds <- sacr_fit |>
@@ -665,7 +667,7 @@ visualizing the data, before we start modeling the data. Figure \@ref(fig:07-bed
 shows that the number of bedrooms might provide useful information
 to help predict the sale price of a house.
 
-```{r 07-bedscatter, fig.height = 5, fig.width = 6, fig.cap = "Scatter plot of the sale price of houses versus the number of bedrooms."}
+```{r 07-bedscatter, fig.height = 3.5, fig.width = 4.5, fig.cap = "Scatter plot of the sale price of houses versus the number of bedrooms."}
 plot_beds <- sacramento |>
              ggplot(aes(x = beds, y = price)) +
              geom_point(alpha = 0.4) +