
Commit 6edfb12

fixing figure caption
1 parent 534cc78 commit 6edfb12

1 file changed (+15 -13 lines)

regression2.Rmd

Lines changed: 15 additions & 13 deletions
@@ -72,7 +72,7 @@ to draw the straight line of best fit through our existing data points.
 The small subset of data as well as the line of best fit are shown
 in Figure \@ref(fig:08-lin-reg1).
 
-```{r 08-lin-reg1, message = FALSE, warning = FALSE, echo = FALSE, fig.height = 4, fig.width = 5, fig.cap = "Scatter plot of sale price versus size with line of best fit for subset of the Sacramento housing data."}
+```{r 08-lin-reg1, message = FALSE, warning = FALSE, echo = FALSE, fig.height = 3.5, fig.width = 4.5, fig.cap = "Scatter plot of sale price versus size with line of best fit for subset of the Sacramento housing data."}
 library(tidyverse)
 library(tidymodels)
 library(scales)
@@ -122,7 +122,7 @@ above to evaluate the predicted sale price given the value we have for the
 predictor variable&mdash;here 2,000 square feet. Figure
 \@ref(fig:08-lin-reg2) demonstrates this process.
 
-```{r 08-lin-reg2, message = FALSE, warning = FALSE, echo = FALSE, fig.height = 4, fig.width = 5, fig.cap = "Scatter plot of sale price versus size with line of best fit and a red dot at the predicted sale price for a 2000 square foot home."}
+```{r 08-lin-reg2, message = FALSE, warning = FALSE, echo = FALSE, fig.height = 3.5, fig.width = 4.5, fig.cap = "Scatter plot of sale price versus size with line of best fit and a red dot at the predicted sale price for a 2000 square foot home."}
 small_model <- lm(price ~ sqft, data = small_sacramento)
 prediction <- predict(small_model, data.frame(sqft = 2000))
@@ -150,7 +150,7 @@ exactly does simple linear regression choose the line of best fit? Many
 different lines could be drawn through the data points.
 Some plausible examples are shown in Figure \@ref(fig:08-several-lines).
 
-```{r 08-several-lines, echo = FALSE, message = FALSE, warning = FALSE, fig.height = 4, fig.width = 5, fig.cap = "Scatter plot of sale price versus size with many possible lines that could be drawn through the data points."}
+```{r 08-several-lines, echo = FALSE, message = FALSE, warning = FALSE, fig.height = 3.5, fig.width = 4.5, fig.cap = "Scatter plot of sale price versus size with many possible lines that could be drawn through the data points."}
 small_plot +
   geom_abline(intercept = -64542.23, slope = 190, color = "green") +
   geom_abline(intercept = -6900, slope = 175, color = "purple") +
@@ -165,7 +165,7 @@ accuracy of a simple linear regression model,
 we use RMSPE&mdash;the same measure of predictive performance we used with KNN regression.
 \index{RMSPE}
 
-```{r 08-verticalDistToMin, echo = FALSE, message = FALSE, warning = FALSE, fig.height = 4, fig.width = 5, fig.cap = "Scatter plot of sale price versus size with red lines denoting the vertical distances between the predicted values and the observed data points."}
+```{r 08-verticalDistToMin, echo = FALSE, message = FALSE, warning = FALSE, fig.height = 3.5, fig.width = 4.5, fig.cap = "Scatter plot of sale price versus size with red lines denoting the vertical distances between the predicted values and the observed data points."}
 small_sacramento <- small_sacramento |>
   mutate(predicted = predict(small_model))
@@ -206,7 +206,7 @@ sacramento_test <- testing(sacramento_split)
 
 Now that we have our training data, we will create the model specification
 and recipe, and fit our simple linear regression model:
-```{r 08-fitLM, fig.height = 4, fig.width = 5}
+```{r 08-fitLM, fig.height = 3.5, fig.width = 4.5}
 lm_spec <- linear_reg() |>
   set_engine("lm") |>
   set_mode("regression")
@@ -268,7 +268,7 @@ linear regression predicted line of best fit. By default `geom_smooth` adds some
 to the plot that we are not interested in at this point; we provide the argument `se = FALSE` to
 tell `geom_smooth` not to show that information. Figure \@ref(fig:08-lm-predict-all) displays the result.
 
-```{r 08-lm-predict-all, fig.height = 4, fig.width = 5, warning = FALSE, message = FALSE, fig.cap = "Scatter plot of sale price versus size with line of best fit for the full Sacramento housing data."}
+```{r 08-lm-predict-all, fig.height = 3.5, fig.width = 4.5, warning = FALSE, message = FALSE, fig.cap = "Scatter plot of sale price versus size with line of best fit for the full Sacramento housing data."}
 lm_plot_final <- ggplot(sacramento_train, aes(x = sqft, y = price)) +
   geom_point(alpha = 0.4) +
   xlab("House size (square feet)") +
@@ -344,7 +344,8 @@ knn_plot_final <- ggplot(sacr_preds, aes(x = sqft, y = price)) +
   scale_y_continuous(labels = dollar_format()) +
   geom_line(data = sacr_preds, aes(x = sqft, y = .pred), color = "blue") +
   ggtitle("KNN regression") +
-  annotate("text", x = 3500, y = 100000, label = paste("RMSPE =", sacr_rmspe))
+  annotate("text", x = 3500, y = 100000, label = paste("RMSPE =", sacr_rmspe)) +
+  theme(text = element_text(size = 14))
 
 lm_rmspe <- lm_test_results |>
   filter(.metric == "rmse") |>
@@ -353,7 +354,8 @@ lm_rmspe <- lm_test_results |>
 
 lm_plot_final <- lm_plot_final +
   annotate("text", x = 3500, y = 100000, label = paste("RMSPE =", lm_rmspe)) +
-  ggtitle("linear regression")
+  ggtitle("linear regression") +
+  theme(text = element_text(size = 14))
 
 grid.arrange(lm_plot_final, knn_plot_final, ncol = 2)
 ```
@@ -597,7 +599,7 @@ the data point is an *outlier*. In blue we plot the original line of best fit, a
 we plot the new line of best fit including the outlier. You can see how different the red line
 is from the blue line, which is entirely caused by that one extra outlier data point.
 
-```{r 08-lm-outlier, fig.height = 4, fig.width = 5, message = FALSE, warning = FALSE, echo = FALSE, fig.cap = "Scatter plot of a subset of the data, with outlier highlighted in red."}
+```{r 08-lm-outlier, fig.height = 3.5, fig.width = 4.5, message = FALSE, warning = FALSE, echo = FALSE, fig.cap = "Scatter plot of a subset of the data, with outlier highlighted in red."}
 sacramento_train_small <- sacramento_train |> sample_n(100)
 sacramento_outlier <- tibble(sqft = 5000, price = 50000)
@@ -626,7 +628,7 @@ changes much less when adding the outlier.
 Nevertheless, it is still important when working with linear regression to critically
 think about how much any individual data point is influencing the model.
 
-```{r 08-lm-outlier-2, fig.height = 4, fig.width = 5, warning = FALSE, message = FALSE, echo = FALSE, fig.cap = "Scatter plot of the full data, with outlier highlighted in red."}
+```{r 08-lm-outlier-2, fig.height = 3.5, fig.width = 4.5, warning = FALSE, message = FALSE, echo = FALSE, fig.cap = "Scatter plot of the full data, with outlier highlighted in red."}
 sacramento_outlier <- tibble(sqft = 5000, price = 50000)
 
 lm_plot_outlier_large <- ggplot(sacramento_train, aes(x = sqft, y = price)) +
@@ -660,7 +662,7 @@ Since the two people are each slightly inaccurate, the two measurements might
 not agree exactly, but they are very strongly linearly related to each other,
 as shown in Figure \@ref(fig:08-lm-multicol).
 
-```{r 08-lm-multicol, fig.height = 4, fig.width = 5, warning = FALSE, echo = FALSE, fig.cap = "Scatter plot of the with possible outlier highlighted in red."}
+```{r 08-lm-multicol, fig.height = 3.5, fig.width = 4.5, warning = FALSE, echo = FALSE, fig.cap = "Scatter plot of house size (in square inches) versus house size (in square feet)."}
 sacramento_train <- sacramento_train |>
   mutate(sqft1 = sqft + 100 * sample(1000000,
     size=nrow(sacramento_train),
@@ -793,7 +795,7 @@ df <- df |>
 df
 ```
 
-```{r 08-predictor-design, message = FALSE, warning = FALSE, echo = FALSE, fig.height = 4, fig.width = 5, fig.cap = "Example of a data set with a nonlinear relationship between the predictor and the response."}
+```{r 08-predictor-design, message = FALSE, warning = FALSE, echo = FALSE, fig.height = 3.5, fig.width = 4.5, fig.cap = "Example of a data set with a nonlinear relationship between the predictor and the response."}
 curve_plt <- ggplot(df, aes(x = x, y = y)) +
   geom_point() +
   xlab("x") +
@@ -820,7 +822,7 @@ Note that none of the `y` response values have changed between Figures \@ref(fig
 and \@ref(fig:08-predictor-design-2); the only change is that the `x` values
 have been replaced by `z` values.
 
-```{r 08-predictor-design-2, message = FALSE, warning = FALSE, echo = FALSE, fig.height = 4, fig.width = 5, fig.cap = "Relationship between the transformed predictor and the response."}
+```{r 08-predictor-design-2, message = FALSE, warning = FALSE, echo = FALSE, fig.height = 3.5, fig.width = 4.5, fig.cap = "Relationship between the transformed predictor and the response."}
 curve_plt2 <- ggplot(df, aes(x = z, y = y)) +
   geom_point() +
   xlab(paste0("z = ", expression(x^3))) +
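
Every hunk in this commit edits the same knitr chunk options: `fig.height` and `fig.width` (figure dimensions in inches) shrink from 4 and 5 to 3.5 and 4.5, one `fig.cap` is corrected, and two plots gain a `theme()` call. A minimal sketch of this chunk-header pattern follows; the chunk name, caption, and plotted data here are hypothetical, not taken from this commit:

```{r example-fig, echo = FALSE, fig.height = 3.5, fig.width = 4.5, fig.cap = "Hypothetical caption for the rendered figure."}
# fig.height and fig.width set the rendered figure size in inches;
# fig.cap supplies the caption that bookdown's \@ref(fig:example-fig)
# cross-reference syntax resolves against the chunk label.
plot(cars)
```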
