
Commit f53e6ec

merge dev

committed (2 parents: 061fef3 + 83c635d)

File tree

9 files changed (+68, -55 lines)


classification1.Rmd

Lines changed: 6 additions & 3 deletions
@@ -1021,7 +1021,7 @@ ggarrange(unscaled, scaled, ncol = 2, common.legend = TRUE, legend = "bottom")
 
 ```
 
-```{r 05-scaling-plt-zoomed, fig.height = 5, fig.width = 10, echo = FALSE, fig.cap = "Close up of three nearest neighbors for unstandardized data."}
+```{r 05-scaling-plt-zoomed, fig.height = 4.5, fig.width = 9, echo = FALSE, fig.cap = "Close up of three nearest neighbors for unstandardized data."}
 library(ggforce)
 ggplot(unscaled_cancer, aes(x = Area,
                             y = Smoothness,
@@ -1056,7 +1056,8 @@ ggplot(unscaled_cancer, aes(x = Area,
            ), color = "black") +
   facet_zoom(x = ( Area > 380 & Area < 420) ,
              y = (Smoothness > 0.08 & Smoothness < 0.14), zoom.size = 2) +
-  theme_bw() + theme(legend.position="bottom", text = element_text(size = 16))
+  theme_bw() +
+  theme(text = element_text(size = 14), legend.position="bottom")
 ```
 
 ### Balancing
@@ -1394,7 +1395,9 @@ wkflw_plot <-
                  color = Class),
              alpha = 0.02,
              size = 5) +
-  labs(color = "Diagnosis") +
+  labs(color = "Diagnosis",
+       x = "Area (standardized)",
+       y = "Smoothness (standardized)") +
   scale_color_manual(labels = c("Malignant", "Benign"),
                      values = c("orange2", "steelblue2"))
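Most of the figure changes in this commit follow the same recipe: shrink `fig.height`/`fig.width` in the chunk header and compensate by enlarging the plot text with `theme()`. A minimal, self-contained sketch of that pattern (the data frame `toy` is hypothetical, standing in for the chapter's standardized cancer data):

```r
library(ggplot2)

# hypothetical stand-in for the chapter's standardized cancer data
set.seed(1)
toy <- data.frame(Area = rnorm(100),
                  Smoothness = rnorm(100),
                  Class = sample(c("Malignant", "Benign"), 100, replace = TRUE))

ggplot(toy, aes(x = Area, y = Smoothness, color = Class)) +
  geom_point() +
  labs(color = "Diagnosis",
       x = "Area (standardized)",
       y = "Smoothness (standardized)") +
  theme_bw() +
  # a larger base font offsets the smaller fig.height/fig.width in the chunk header
  theme(text = element_text(size = 14), legend.position = "bottom")
```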

classification2.Rmd

Lines changed: 18 additions & 13 deletions
@@ -206,7 +206,7 @@ tumor cell concavity versus smoothness colored by diagnosis in Figure \@ref(fig:
 You will also notice that we set the random seed here at the beginning of the analysis
 using the `set.seed` function, as described in Section \@ref(randomseeds).
 
-```{r 06-precode, fig.height = 4, fig.width = 5, fig.cap="Scatter plot of tumor cell concavity versus smoothness colored by diagnosis label.", message = F, warning = F}
+```{r 06-precode, fig.height = 3.5, fig.width = 4.5, fig.cap="Scatter plot of tumor cell concavity versus smoothness colored by diagnosis label.", message = F, warning = F}
 # load packages
 library(tidyverse)
 library(tidymodels)
@@ -778,7 +778,7 @@ We can select the best value of the number of neighbors (i.e., the one that resu
 in the highest classifier accuracy estimate) by plotting the accuracy versus $K$
 in Figure \@ref(fig:06-find-k).
 
-```{r 06-find-k, fig.height = 4, fig.width = 5, fig.cap= "Plot of estimated accuracy versus the number of neighbors."}
+```{r 06-find-k, fig.height = 3.5, fig.width = 4, fig.cap= "Plot of estimated accuracy versus the number of neighbors."}
 accuracy_vs_k <- ggplot(accuracies, aes(x = neighbors, y = mean)) +
   geom_point() +
   geom_line() +
@@ -824,7 +824,7 @@ we vary $K$ from 1 to almost the number of observations in the data set.
 set.seed(1)
 ```
 
-```{r 06-lots-of-ks, message = FALSE, fig.height = 4, fig.width = 5, fig.cap="Plot of accuracy estimate versus number of neighbors for many K values."}
+```{r 06-lots-of-ks, message = FALSE, fig.height = 3.5, fig.width = 4, fig.cap="Plot of accuracy estimate versus number of neighbors for many K values."}
 k_lots <- tibble(neighbors = seq(from = 1, to = 385, by = 10))
 
 knn_results <- workflow() |>
@@ -918,7 +918,8 @@ for (i in 1:length(ks)) {
     labs(color = "Diagnosis") +
     ggtitle(paste("K = ", ks[[i]])) +
     scale_color_manual(labels = c("Malignant", "Benign"),
-                       values = c("orange2", "steelblue2"))
+                       values = c("orange2", "steelblue2")) +
+    theme(text = element_text(size = 18))
 }
 
 p_no_legend <- lapply(plots, function(x) x + theme(legend.position = "none"))
@@ -1028,7 +1029,7 @@ variables there are, the more (random) influence they have, and the more they
 corrupt the set of nearest neighbors that vote on the class of the new
 observation to predict.
 
-```{r 06-performance-irrelevant-features, echo = FALSE, warning = FALSE, fig.retina = 2, out.width = "100%", fig.cap = "Effect of inclusion of irrelevant predictors."}
+```{r 06-performance-irrelevant-features, echo = FALSE, warning = FALSE, fig.retina = 2, out.width = "60%", fig.cap = "Effect of inclusion of irrelevant predictors."}
 # get accuracies after including k irrelevant features
 ks <- c(0, 5, 10, 15, 20, 40)
 fixedaccs <- list()
@@ -1101,7 +1102,8 @@ res <- tibble(ks = ks, accs = accs, fixedaccs = fixedaccs, nghbrs = nghbrs)
 plt_irrelevant_accuracies <- ggplot(res) +
   geom_line(mapping = aes(x=ks, y=accs)) +
   labs(x = "Number of Irrelevant Predictors",
-       y = "Model Accuracy Estimate")
+       y = "Model Accuracy Estimate") +
+  theme(text = element_text(size = 18))
 
 plt_irrelevant_accuracies
 ```
@@ -1117,24 +1119,26 @@ variables, the number of neighbors does not increase smoothly; but the general t
 Figure \@ref(fig:06-fixed-irrelevant-features) corroborates
 this evidence; if we fix the number of neighbors to $K=3$, the accuracy falls off more quickly.
 
-```{r 06-neighbors-irrelevant-features, echo = FALSE, warning = FALSE, fig.retina = 2, out.width = "100%", fig.cap = "Tuned number of neighbors for varying number of irrelevant predictors."}
+```{r 06-neighbors-irrelevant-features, echo = FALSE, warning = FALSE, fig.retina = 2, out.width = "60%", fig.cap = "Tuned number of neighbors for varying number of irrelevant predictors."}
 plt_irrelevant_nghbrs <- ggplot(res) +
   geom_line(mapping = aes(x=ks, y=nghbrs)) +
   labs(x = "Number of Irrelevant Predictors",
-       y = "Number of neighbors")
+       y = "Number of neighbors") +
+  theme(text = element_text(size = 18))
 
 plt_irrelevant_nghbrs
 ```
 
-```{r 06-fixed-irrelevant-features, echo = FALSE, warning = FALSE, fig.retina = 2, out.width = "100%", fig.cap = "Accuracy versus number of irrelevant predictors for tuned and untuned number of neighbors."}
-res_tmp <- res |> pivot_longer(cols=c("accs", "fixedaccs"),
+```{r 06-fixed-irrelevant-features, echo = FALSE, warning = FALSE, fig.retina = 2, out.width = "75%", fig.cap = "Accuracy versus number of irrelevant predictors for tuned and untuned number of neighbors."}
+res_tmp <- res %>% pivot_longer(cols=c("accs", "fixedaccs"),
                                 names_to="Type",
                                 values_to="accuracy")
 
 plt_irrelevant_nghbrs <- ggplot(res_tmp) +
   geom_line(mapping = aes(x=ks, y=accuracy, color=Type)) +
   labs(x = "Number of Irrelevant Predictors", y = "Accuracy") +
-  scale_color_discrete(labels= c("Tuned K", "K = 3"))
+  scale_color_discrete(labels= c("Tuned K", "K = 3")) +
+  theme(text = element_text(size = 16))
 
 plt_irrelevant_nghbrs
 ```
@@ -1362,11 +1366,12 @@ where the elbow occurs, and whether adding a variable provides a meaningful incr
 > part of tuning your classifier, you *cannot use your test data* for this
 > process!
 
-```{r 06-fwdsel-3, echo = FALSE, warning = FALSE, fig.retina = 2, out.width = "100%", fig.cap = "Estimated accuracy versus the number of predictors for the sequence of models built using forward selection."}
+```{r 06-fwdsel-3, echo = FALSE, warning = FALSE, fig.retina = 2, out.width = "60%", fig.cap = "Estimated accuracy versus the number of predictors for the sequence of models built using forward selection."}
 fwd_sel_accuracies_plot <- accuracies |>
   ggplot(aes(x = size, y = accuracy)) +
   geom_line() +
-  labs(x = "Number of Predictors", y = "Estimated Accuracy")
+  labs(x = "Number of Predictors", y = "Estimated Accuracy") +
+  theme(text = element_text(size = 18))
 
 fwd_sel_accuracies_plot
 ```
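The `accuracies` tibble plotted in the `06-find-k` chunk above is produced by cross-validation earlier in the chapter and is not part of this diff. As a hedged sketch of that tuning step, assuming a `cancer_train` data frame with a `Class` label and a standardizing recipe named `knn_recipe` as in the book's source (the grid and fold counts here are illustrative):

```r
library(tidymodels)

# sketch only: cancer_train and knn_recipe are assumed from earlier in the chapter
knn_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = tune()) |>
  set_engine("kknn") |>
  set_mode("classification")

cancer_vfold <- vfold_cv(cancer_train, v = 5, strata = Class)
k_vals <- tibble(neighbors = seq(from = 1, to = 100, by = 5))

accuracies <- workflow() |>
  add_recipe(knn_recipe) |>
  add_model(knn_spec) |>
  tune_grid(resamples = cancer_vfold, grid = k_vals) |>
  collect_metrics() |>
  filter(.metric == "accuracy")

# the 06-find-k chunk then plots mean accuracy against the number of neighbors
ggplot(accuracies, aes(x = neighbors, y = mean)) +
  geom_point() +
  geom_line() +
  theme(text = element_text(size = 18))
```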

img/tidy_data.key

320 KB
Binary file not shown.

img/tidy_data/tidy_data.001.jpeg

461 KB

inference.Rmd

Lines changed: 13 additions & 13 deletions
@@ -287,7 +287,7 @@ We have created this particular example
 such that we *do* have access to the full population, which lets us visualize the
 sampling distribution directly for learning purposes.
 
-```{r 11-example-proportions7, echo = TRUE, message = FALSE, warning = FALSE,fig.cap = "Sampling distribution of the sample proportion for sample size 40.", fig.retina = 2, out.width = "100%"}
+```{r 11-example-proportions7, echo = TRUE, message = FALSE, warning = FALSE,fig.cap = "Sampling distribution of the sample proportion for sample size 40.", fig.height = 3.3, fig.width = 4.2}
 sampling_distribution <- ggplot(sample_estimates, aes(x = sample_proportion)) +
   geom_histogram(fill = "dodgerblue3", color = "lightgrey", bins = 12) +
   ylab("Count") +
@@ -335,7 +335,7 @@ We can visualize the population distribution of the price per night with a histo
 options(pillar.sigfig = 5)
 ```
 
-```{r 11-example-means2, echo = TRUE, message = FALSE, warning = FALSE, fig.cap = "Population distribution of price per night (Canadian dollars) for all Airbnb listings in Vancouver, Canada.", fig.retina = 2, out.width = "100%"}
+```{r 11-example-means2, echo = TRUE, message = FALSE, warning = FALSE, fig.cap = "Population distribution of price per night (Canadian dollars) for all Airbnb listings in Vancouver, Canada.", fig.height = 3.5, fig.width = 4.5}
 population_distribution <- ggplot(airbnb, aes(x = price)) +
   geom_histogram(fill = "dodgerblue3", color = "lightgrey") +
   ylab("Count") +
@@ -380,7 +380,7 @@ We can create a histogram to visualize the distribution of observations in the
 sample (Figure \@ref(fig:11-example-means-sample-hist)), and calculate the mean
 of our sample.
 
-```{r 11-example-means-sample-hist, echo = TRUE, message = FALSE, warning = FALSE, fig.cap = "Distribution of price per night (Canadian dollars) for sample of 40 Airbnb listings.", fig.retina = 2, out.width = "100%"}
+```{r 11-example-means-sample-hist, echo = TRUE, message = FALSE, warning = FALSE, fig.cap = "Distribution of price per night (Canadian dollars) for sample of 40 Airbnb listings.", fig.height = 3.5, fig.width = 4.5}
 sample_distribution <- ggplot(one_sample, aes(price)) +
   geom_histogram(fill = "dodgerblue3", color = "lightgrey") +
   ylab("Count") +
@@ -422,7 +422,7 @@ samples
 Now we can calculate the sample mean for each replicate and plot the sampling
 distribution of sample means for samples of size 40.
 
-```{r 11-example-means4, echo = TRUE, message = FALSE, warning = FALSE, fig.cap= "Sampling distribution of the sample means for sample size of 40.", fig.retina = 2, out.width = "100%"}
+```{r 11-example-means4, echo = TRUE, message = FALSE, warning = FALSE, fig.cap= "Sampling distribution of the sample means for sample size of 40.", fig.height = 3.5, fig.width = 4.5}
 sample_estimates <- samples |>
   group_by(replicate) |>
   summarize(sample_mean = mean(price))
@@ -468,15 +468,15 @@ Notice that the mean of the sample means is \$`r round(mean(sample_estimates$sam
 was \$`r round(mean(airbnb$price),2)`.
 -->
 
-```{r 11-example-means5, echo = FALSE, message = FALSE, warning = FALSE, fig.cap = "Comparison of population distribution, sample distribution, and sampling distribution."}
+```{r 11-example-means5, echo = FALSE, message = FALSE, warning = FALSE, fig.height = 5.5, fig.width = 4, fig.cap = "Comparison of population distribution, sample distribution, and sampling distribution."}
 grid.arrange(population_distribution +
                ggtitle("Population") +
                xlim(min(airbnb$price), 600),
              sample_distribution +
                ggtitle("Sample (n = 40)") +
                xlim(min(airbnb$price), 600),
              sampling_distribution_40 +
-               ggtitle("Sampling distribution of the mean for samples of size 40") +
+               ggtitle("Sampling distribution of the mean \n for samples of size 40") +
                xlim(min(airbnb$price), 600),
              nrow = 3
 )
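For context, the `samples` and `sample_estimates` objects resized in the chunks above come from repeated sampling of the population. A minimal sketch, assuming the chapter's `airbnb` data frame and `rep_sample_n()` from the infer package (the axis labels are illustrative):

```r
library(tidyverse)
library(infer)

# draw 20,000 samples of size 40 from the population and compute each sample mean
samples <- rep_sample_n(airbnb, size = 40, reps = 20000)

sample_estimates <- samples |>
  group_by(replicate) |>
  summarize(sample_mean = mean(price))

# sampling distribution of the sample mean for samples of size 40
sampling_distribution_40 <- ggplot(sample_estimates, aes(x = sample_mean)) +
  geom_histogram(fill = "dodgerblue3", color = "lightgrey") +
  xlab("Sample mean price per night (dollars)") +
  ylab("Count")

sampling_distribution_40
```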
@@ -664,7 +664,7 @@ see that the sample’s distribution looks like that of the population for a
 large enough sample.
 
 
-```{r 11-example-bootstrapping0, echo = FALSE, message = FALSE, warning = FALSE, fig.cap = "Comparison of samples of different sizes from the population."}
+```{r 11-example-bootstrapping0, echo = FALSE, message = FALSE, warning = FALSE, fig.height = 7, fig.cap = "Comparison of samples of different sizes from the population."}
 sample_10 <- airbnb |>
   rep_sample_n(10)
 sample_distribution_10 <- ggplot(sample_10, aes(price)) +
@@ -773,7 +773,7 @@ one_sample <- one_sample |>
   ungroup() |> select(-replicate)
 ```
 
-```{r 11-bootstrapping1, echo = TRUE, message = FALSE, warning = FALSE, fig.cap = "Histogram of price per night (Canadian dollars) for one sample of size 40.", fig.retina = 2, out.width = "100%"}
+```{r 11-bootstrapping1, echo = TRUE, message = FALSE, warning = FALSE, fig.cap = "Histogram of price per night (Canadian dollars) for one sample of size 40.", fig.height = 3.5, fig.width = 4.5}
 one_sample
 
 one_sample_dist <- ggplot(one_sample, aes(price)) +
@@ -799,7 +799,7 @@ we change the argument for `replace` from its default value of `FALSE` to `TRUE`
 \index{bootstrap!in R}
 \index{rep\_sample\_n!bootstrap}
 
-```{r 11-bootstrapping3, echo = TRUE, message = FALSE, warning = FALSE, fig.cap = "Bootstrap distribution.", fig.retina = 2, out.width = "100%"}
+```{r 11-bootstrapping3, echo = TRUE, message = FALSE, warning = FALSE, fig.cap = "Bootstrap distribution.", fig.height = 3.5, fig.width = 4.5}
 boot1 <- one_sample |>
   rep_sample_n(size = 40, replace = TRUE, reps = 1)
 boot1_dist <- ggplot(boot1, aes(price)) +
@@ -865,7 +865,7 @@ generate a bootstrap distribution of our point estimates. The bootstrap
 distribution (Figure \@ref(fig:11-bootstrapping5)) suggests how we might expect
 our point estimate to behave if we took another sample.
 
-```{r 11-bootstrapping5, echo = TRUE, message = FALSE, warning = FALSE, fig.cap = "Distribution of the bootstrap sample means.", out.width = "100%"}
+```{r 11-bootstrapping5, echo = TRUE, message = FALSE, warning = FALSE, fig.cap = "Distribution of the bootstrap sample means.", fig.height = 3.5, fig.width = 4.5}
 boot20000_means <- boot20000 |>
   group_by(replicate) |>
   summarize(mean = mean(price))
@@ -884,7 +884,7 @@ boot_est_dist
 Let's compare the bootstrap distribution&mdash;which we construct by taking many samples from our original sample of size 40&mdash;with
 the true sampling distribution&mdash;which corresponds to taking many samples from the population.
 
-```{r 11-bootstrapping6, echo = F, message = FALSE, warning = FALSE, fig.cap = "Comparison of the distribution of the bootstrap sample means and sampling distribution.", out.height = "70%"}
+```{r 11-bootstrapping6, echo = F, message = FALSE, warning = FALSE, fig.cap = "Comparison of the distribution of the bootstrap sample means and sampling distribution.", fig.height = 3.5}
 samples <- rep_sample_n(airbnb, size = 40, reps = 20000)
 
 sample_estimates <- samples |>
@@ -1125,11 +1125,11 @@ the middle 95\% of the sample mean prices in the bootstrap distribution. We can
 visualize the interval on our distribution in Figure
 \@ref(fig:11-bootstrapping9).
 
-```{r 11-bootstrapping9, echo = F, message = FALSE, warning = FALSE, fig.cap = "Distribution of the bootstrap sample means with percentile lower and upper bounds.", out.width = "100%"}
+```{r 11-bootstrapping9, echo = F, message = FALSE, warning = FALSE, fig.cap = "Distribution of the bootstrap sample means with percentile lower and upper bounds.", fig.height=4, fig.width = 6.5}
 boot_est_dist +
   geom_vline(xintercept = bounds, col = "#E69F00", size = 2, linetype = 2) +
   annotate("text",
-    x = bounds[1], max_count(boot_est_dist), hjust = 0.5, vjust = 2,
+    x = bounds[1], max_count(boot_est_dist), hjust = 0.6, vjust = 2,
     label = paste("2.5th percentile =", round(bounds[1], 2))
   ) +
   annotate("text",
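Finally, the `bounds` drawn and annotated in the `11-bootstrapping9` chunk are the 2.5th and 97.5th percentiles of the bootstrap sample means (`max_count()` is a plotting helper defined elsewhere in the book's source). A hedged sketch of how those pieces are typically computed, assuming `one_sample` from earlier in the chapter:

```r
library(tidyverse)
library(infer)

# resample the original sample of size 40 with replacement, 20,000 times
boot20000 <- one_sample |>
  rep_sample_n(size = 40, replace = TRUE, reps = 20000)

# bootstrap distribution: one mean per bootstrap replicate
boot20000_means <- boot20000 |>
  group_by(replicate) |>
  summarize(mean = mean(price))

# percentile bounds enclosing the middle 95% of the bootstrap sample means
bounds <- boot20000_means |>
  pull(mean) |>
  quantile(c(0.025, 0.975))

bounds
```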
