
Commit 83c635d

Merge pull request #368 from UBC-DSCI/patch-fig-sizes
Patch fig sizes
2 parents ba89491 + d019d48 commit 83c635d

20 files changed: +389 -244 lines changed

README.md

Lines changed: 5 additions & 2 deletions

@@ -142,13 +142,16 @@ bookdown::gitbook:
 #### Figures
 - make sure all figures get (capitalized) labels ("Figure \\@ref(blah)", not "figure below" or "figure above")
 - make sure all figures get captions
-- specify image widths in terms of linewidth percent (e.g. `out.width="70%"`)
+- specify image widths of pngs and jpegs in terms of linewidth percent
+  (e.g. `out.width="70%"`),
+  for plots we create in R use `fig.width` and `fig.height`.
 - center align all images via `fig.align = "center"`
 - make sure we have permission for every figure/logo that we use
 - Make sure all figures follow the visualization principles in Chapter 4
 - Make sure axes are set appropriately to not inflate/deflate differences artificially *where it does not compromise clarity* (e.g. in the classification
   chapter there are a few examples where zoomed-in accuracy axes are better than using the full range 0 to 1)
--
+- Fig size for bar charts should be: `fig.width=5, fig.height=3` (an exception are figs 1.7 & 1.8 so that we can read the axis labels)
+- cropping width for syntax diagrams is 1625 (done using `image_crop`)
 
 #### Tables
 - make sure all tables get capitalized labels ("Table \\@ref(blah)", not "table below" or "table above")
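
For reference, a chunk that follows these figure conventions could look like the sketch below; the chunk labels, the `cancer` data frame, the image path, and the captions are hypothetical and only illustrate the options named above.

```{r example-bar-chart, fig.width = 5, fig.height = 3, fig.align = "center", fig.cap = "Counts of each diagnosis class."}
# R-generated bar charts use fig.width = 5, fig.height = 3 per the convention above
ggplot(cancer, aes(x = Class)) +
  geom_bar()
```

while a static png or jpeg is sized relative to the line width instead:

```{r example-syntax-diagram, echo = FALSE, out.width = "70%", fig.align = "center", fig.cap = "Syntax diagram."}
# external images use out.width (a linewidth percent), not fig.width/fig.height
knitr::include_graphics("img/example_syntax_diagram.png")
```
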

build_html.sh

Lines changed: 1 addition & 1 deletion

@@ -1,2 +1,2 @@
 # Script to generate HTML book
-docker run --rm -m 5g -v $(pwd):/home/rstudio/introduction-to-datascience ubcdsci/intro-to-ds:v0.19.0 /bin/bash -c "cd /home/rstudio/introduction-to-datascience; Rscript _build_html.r"
+docker run --rm -m 5g -v $(pwd):/home/rstudio/introduction-to-datascience ubcdsci/intro-to-ds:v0.21.0 /bin/bash -c "cd /home/rstudio/introduction-to-datascience; Rscript _build_html.r"

build_pdf.sh

Lines changed: 1 addition & 1 deletion

@@ -24,7 +24,7 @@ cp -r data/ pdf/data
 cp -r img/ pdf/img
 
 # Build the book with bookdown
-docker run --rm -m 5g -v $(pwd):/home/rstudio/introduction-to-datascience ubcdsci/intro-to-ds:v0.19.0 /bin/bash -c "cd /home/rstudio/introduction-to-datascience/pdf; Rscript _build_pdf.r"
+docker run --rm -m 5g -v $(pwd):/home/rstudio/introduction-to-datascience ubcdsci/intro-to-ds:v0.21.0 /bin/bash -c "cd /home/rstudio/introduction-to-datascience/pdf; Rscript _build_pdf.r"
 
 # clean files in pdf dir
 rm -rf pdf/references.bib

classification1.Rmd

Lines changed: 24 additions & 20 deletions

@@ -5,6 +5,7 @@ library(formatR)
 library(plotly)
 library(knitr)
 library(kableExtra)
+library(ggpubr)
 
 knitr::opts_chunk$set(echo = TRUE,
                       fig.align = "center")
@@ -209,7 +210,7 @@ for light orange and `"steelblue2"` for light blue—and
 We also make the category labels ("B" and "M") more readable by
 changing them to "Benign" and "Malignant" using the `labels` argument.
 
-```{r 05-scatter, fig.height = 4, fig.width = 5, fig.cap= "Scatter plot of concavity versus perimeter colored by diagnosis label."}
+```{r 05-scatter, fig.height = 3.5, fig.width = 4.5, fig.cap= "Scatter plot of concavity versus perimeter colored by diagnosis label."}
 perim_concav <- cancer %>%
   ggplot(aes(x = Perimeter, y = Concavity, color = Class)) +
   geom_point(alpha = 0.6) +
@@ -285,7 +286,7 @@ new observation, with standardized perimeter of `r new_point[1]` and standardize
 diagnosis "Class" is unknown. This new observation is depicted by the red, diamond point in
 Figure \@ref(fig:05-knn-1).
 
-```{r 05-knn-1, echo = FALSE, fig.height = 4, fig.width = 5, fig.cap="Scatter plot of concavity versus perimeter with new observation represented as a red diamond."}
+```{r 05-knn-1, echo = FALSE, fig.height = 3.5, fig.width = 4.5, fig.cap="Scatter plot of concavity versus perimeter with new observation represented as a red diamond."}
 perim_concav_with_new_point <- bind_rows(cancer,
   tibble(Perimeter = new_point[1],
          Concavity = new_point[2],
@@ -317,7 +318,7 @@ then the perimeter and concavity values are similar, and so we may expect that
 they would have the same diagnosis.
 
 
-```{r 05-knn-2, echo = FALSE, fig.height = 4, fig.width = 5, fig.cap="Scatter plot of concavity versus perimeter. The new observation is represented as a red diamond with a line to the one nearest neighbor, which has a malignant label."}
+```{r 05-knn-2, echo = FALSE, fig.height = 3.5, fig.width = 4.5, fig.cap="Scatter plot of concavity versus perimeter. The new observation is represented as a red diamond with a line to the one nearest neighbor, which has a malignant label."}
 perim_concav_with_new_point +
   geom_segment(aes(
     x = new_point[1],
@@ -342,7 +343,7 @@ Does this seem like the right prediction to make for this observation? Probably
 not, if you consider the other nearby points...
 
 
-```{r 05-knn-4, echo = FALSE, fig.height = 4, fig.width = 5, fig.cap="Scatter plot of concavity versus perimeter. The new observation is represented as a red diamond with a line to the one nearest neighbor, which has a benign label."}
+```{r 05-knn-4, echo = FALSE, fig.height = 3.5, fig.width = 4.5, fig.cap="Scatter plot of concavity versus perimeter. The new observation is represented as a red diamond with a line to the one nearest neighbor, which has a benign label."}
 
 perim_concav_with_new_point2 <- bind_rows(cancer,
   tibble(Perimeter = new_point[1],
@@ -382,7 +383,7 @@ see that the diagnoses of 2 of the 3 nearest neighbors to our new observation
 are malignant. Therefore we take majority vote and classify our new red, diamond
 observation as malignant.
 
-```{r 05-knn-5, echo = FALSE, fig.height = 4, fig.width = 5, fig.cap="Scatter plot of concavity versus perimeter with three nearest neighbors."}
+```{r 05-knn-5, echo = FALSE, fig.height = 3.5, fig.width = 4.5, fig.cap="Scatter plot of concavity versus perimeter with three nearest neighbors."}
 perim_concav_with_new_point2 +
   geom_segment(aes(
     x = new_point[1], y = new_point[2],
@@ -432,7 +433,7 @@ You will see in the `mutate` \index{mutate} step below, we compute the straight-
 distance using the formula above: we square the differences between the two observations' perimeter
 and concavity coordinates, add the squared differences, and then take the square root.
 
-```{r 05-multiknn-1, echo = FALSE, fig.height = 4, fig.width = 5, fig.cap="Scatter plot of concavity versus perimeter with new observation represented as a red diamond."}
+```{r 05-multiknn-1, echo = FALSE, fig.height = 3.5, fig.width = 4.5, fig.cap="Scatter plot of concavity versus perimeter with new observation represented as a red diamond."}
 perim_concav <- bind_rows(cancer,
   tibble(Perimeter = new_point[1],
          Concavity = new_point[2],
@@ -514,7 +515,7 @@ The result of this computation shows that 3 of the 5 nearest neighbors to our ne
 malignant (`M`); since this is the majority, we classify our new observation as malignant.
 These 5 neighbors are circled in Figure \@ref(fig:05-multiknn-3).
 
-```{r 05-multiknn-3, echo = FALSE, fig.height = 4, fig.width = 5, fig.cap="Scatter plot of concavity versus perimeter with 5 nearest neighbors circled."}
+```{r 05-multiknn-3, echo = FALSE, fig.height = 3.5, fig.width = 4.5, fig.cap="Scatter plot of concavity versus perimeter with 5 nearest neighbors circled."}
 perim_concav + annotate("path",
   x = new_point[1] + 1.4 * cos(seq(0, 2 * pi,
                                    length.out = 100
@@ -903,7 +904,7 @@ Standardizing your data should be a part of the preprocessing you do
 before predictive modeling and you should always think carefully about your problem domain and
 whether you need to standardize your data.
 
-```{r 05-scaling-plt, echo = FALSE, fig.height = 4, fig.width = 10, fig.cap = "Comparison of K = 3 nearest neighbors with standardized and unstandardized data."}
+```{r 05-scaling-plt, echo = FALSE, fig.height = 4, fig.cap = "Comparison of K = 3 nearest neighbors with standardized and unstandardized data."}
 
 attrs <- c("Area", "Smoothness")
 
@@ -994,10 +995,11 @@ scaled <- ggplot(scaled_cancer, aes(x = Area,
   yend = unlist(neighbors_scaled[3, attrs[2]])
   ), color = "black", size = 0.5)
 
-gridExtra::grid.arrange(unscaled, scaled, ncol = 2)
+ggarrange(unscaled, scaled, ncol = 2, common.legend = TRUE, legend = "bottom")
+
 ```
 
-```{r 05-scaling-plt-zoomed, fig.height = 4, fig.width = 10, echo = FALSE, fig.cap = "Close up of three nearest neighbors for unstandardized data."}
+```{r 05-scaling-plt-zoomed, fig.height = 4.5, fig.width = 9, echo = FALSE, fig.cap = "Close up of three nearest neighbors for unstandardized data."}
 library(ggforce)
 ggplot(unscaled_cancer, aes(x = Area,
                             y = Smoothness,
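
Aside: the replacement of `gridExtra::grid.arrange` with `ggpubr::ggarrange` above is what lets the standardized and unstandardized panels share one legend. A minimal sketch of the pattern, using two made-up panels on a built-in dataset (only the `ggarrange` call with `common.legend` and `legend` mirrors the change):

```r
library(ggplot2)
library(ggpubr)

# two panels that use the same color mapping, each drawn with its own legend
p1 <- ggplot(mtcars, aes(x = wt, y = mpg, color = factor(cyl))) + geom_point()
p2 <- ggplot(mtcars, aes(x = hp, y = mpg, color = factor(cyl))) + geom_point()

# ggarrange collapses the duplicate legends into a single shared one at the bottom
ggarrange(p1, p2, ncol = 2, common.legend = TRUE, legend = "bottom")
```
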
@@ -1029,11 +1031,11 @@ ggplot(unscaled_cancer, aes(x = Area,
   x = unlist(new_obs[1]), y = unlist(new_obs[2]),
   xend = unlist(neighbors[3, attrs[1]]),
   yend = unlist(neighbors[3, attrs[2]])
-  ), color = "black") + theme_light() +
-  # facet_zoom( xlim = c(399.7, 401.6), ylim = c(0.08, 0.14), zoom.size = 2) +
+  ), color = "black") +
   facet_zoom(x = ( Area > 380 & Area < 420) ,
              y = (Smoothness > 0.08 & Smoothness < 0.14), zoom.size = 2) +
-  theme_bw()
+  theme_bw() +
+  theme(text = element_text(size = 14), legend.position="bottom")
 ```
 
 ### Balancing
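
Aside: `ggforce::facet_zoom`, used in the chunk above, draws the full scatter plot alongside a second panel zoomed to a region selected by logical conditions on the axes. A rough, self-contained sketch on a built-in dataset (the variables and limits here are invented; only the `facet_zoom` usage mirrors the chunk):

```r
library(ggplot2)
library(ggforce)

ggplot(iris, aes(x = Petal.Length, y = Petal.Width, color = Species)) +
  geom_point() +
  # second panel zoomed to the rectangle where both conditions hold;
  # zoom.size sets the zoomed panel's size relative to the original
  facet_zoom(x = (Petal.Length > 4 & Petal.Length < 5),
             y = (Petal.Width > 1.2 & Petal.Width < 1.8),
             zoom.size = 2) +
  theme_bw()
```
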
@@ -1058,14 +1060,14 @@ function, which takes two arguments: a data frame-like object,
 and the number of rows to select from the top (`n`).
 The new imbalanced data is shown in Figure \@ref(fig:05-unbalanced).
 
-```{r 05-unbalanced-seed, echo = FALSE, fig.height = 4, fig.width = 5, warning = FALSE, message = FALSE}
+```{r 05-unbalanced-seed, echo = FALSE, fig.height = 3.5, fig.width = 4.5, warning = FALSE, message = FALSE}
 # hidden seed here for reproducibility
 # randomness shouldn't affect much in this use of step_upsample,
 # but just in case...
 set.seed(3)
 ```
 
-```{r 05-unbalanced, fig.height = 4, fig.width = 5, fig.cap = "Imbalanced data."}
+```{r 05-unbalanced, fig.height = 3.5, fig.width = 4.5, fig.cap = "Imbalanced data."}
 rare_cancer <- bind_rows(
   filter(cancer, Class == "B"),
   cancer |> filter(Class == "M") |> slice_head(n = 3)
@@ -1093,7 +1095,7 @@ benign, and the benign vote will always win. For example, Figure \@ref(fig:05-up
 shows what happens for a new tumor observation that is quite close to three observations
 in the training data that were tagged as malignant.
 
-```{r 05-upsample, echo=FALSE, fig.height = 4, fig.width = 5, fig.cap = "Imbalanced data with 7 nearest neighbors to a new observation highlighted."}
+```{r 05-upsample, echo=FALSE, fig.height = 3.5, fig.width = 4.5, fig.cap = "Imbalanced data with 7 nearest neighbors to a new observation highlighted."}
 new_point <- c(2, 2)
 attrs <- c("Perimeter", "Concavity")
 my_distances <- table_with_distances(rare_cancer[, attrs], new_point)
@@ -1145,7 +1147,7 @@ each area of the plot to the predictions the $K$-nearest neighbor
 classifier would make. We can see that the decision is
 always "benign," corresponding to the blue color.
 
-```{r 05-upsample-2, echo = FALSE, fig.height = 4, fig.width = 5, fig.cap = "Imbalanced data with background color indicating the decision of the classifier and the points represent the labeled data."}
+```{r 05-upsample-2, echo = FALSE, fig.height = 3.5, fig.width = 4.5, fig.cap = "Imbalanced data with background color indicating the decision of the classifier and the points represent the labeled data."}
 
 knn_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = 7) |>
   set_engine("kknn") |>
@@ -1223,7 +1225,7 @@ classifier would make. We can see that the decision is more reasonable; when the
 to those labeled malignant, the classifier predicts a malignant tumor, and vice versa when they are
 closer to the benign tumor observations.
 
-```{r 05-upsample-plot, echo = FALSE, fig.height = 4, fig.width = 5, fig.cap = "Upsampled data with background color indicating the decision of the classifier."}
+```{r 05-upsample-plot, echo = FALSE, fig.height = 3.5, fig.width = 4.5, fig.cap = "Upsampled data with background color indicating the decision of the classifier."}
 knn_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = 7) |>
   set_engine("kknn") |>
   set_mode("classification")
@@ -1333,7 +1335,7 @@ predict the label of each, and visualize the predictions with a colored scatter
 > textbook. It is included for those readers who would like to use similar
 > visualizations in their own data analyses.
 
-```{r 05-workflow-plot-show, fig.height = 4, fig.width = 5, fig.cap = "Scatter plot of smoothness versus area where background color indicates the decision of the classifier."}
+```{r 05-workflow-plot-show, fig.height = 3.5, fig.width = 4.6, fig.cap = "Scatter plot of smoothness versus area where background color indicates the decision of the classifier."}
 # create the grid of area/smoothness vals, and arrange in a data frame
 are_grid <- seq(min(unscaled_cancer$Area),
                 max(unscaled_cancer$Area),
@@ -1367,7 +1369,9 @@ wkflw_plot <-
   color = Class),
   alpha = 0.02,
   size = 5) +
-  labs(color = "Diagnosis") +
+  labs(color = "Diagnosis",
+       x = "Area (standardized)",
+       y = "Smoothness (standardized)") +
   scale_color_manual(labels = c("Malignant", "Benign"),
                      values = c("orange2", "steelblue2"))

classification2.Rmd

Lines changed: 25 additions & 15 deletions

@@ -2,6 +2,7 @@
 
 ```{r classification2-setup, echo = FALSE, message = FALSE, warning = FALSE}
 library(gridExtra)
+library(cowplot)
 
 knitr::opts_chunk$set(fig.align = "center")
 ```
@@ -187,7 +188,7 @@ tumor cell concavity versus smoothness colored by diagnosis in Figure \@ref(fig:
 You will also notice that we set the random seed here at the beginning of the analysis
 using the `set.seed` function, as described in Section \@ref(randomseeds).
 
-```{r 06-precode, fig.height = 4, fig.width = 5, fig.cap="Scatter plot of tumor cell concavity versus smoothness colored by diagnosis label.", message = F, warning = F}
+```{r 06-precode, fig.height = 3.5, fig.width = 4.5, fig.cap="Scatter plot of tumor cell concavity versus smoothness colored by diagnosis label.", message = F, warning = F}
 # load packages
 library(tidyverse)
 library(tidymodels)
@@ -753,7 +754,7 @@ We can select the best value of the number of neighbors (i.e., the one that resu
 in the highest classifier accuracy estimate) by plotting the accuracy versus $K$
 in Figure \@ref(fig:06-find-k).
 
-```{r 06-find-k, fig.height = 4, fig.width = 5, fig.cap= "Plot of estimated accuracy versus the number of neighbors."}
+```{r 06-find-k, fig.height = 3.5, fig.width = 4, fig.cap= "Plot of estimated accuracy versus the number of neighbors."}
 accuracy_vs_k <- ggplot(accuracies, aes(x = neighbors, y = mean)) +
   geom_point() +
   geom_line() +
@@ -799,7 +800,7 @@ we vary $K$ from 1 to almost the number of observations in the data set.
 set.seed(1)
 ```
 
-```{r 06-lots-of-ks, message = FALSE, fig.height = 4, fig.width = 5, fig.cap="Plot of accuracy estimate versus number of neighbors for many K values."}
+```{r 06-lots-of-ks, message = FALSE, fig.height = 3.5, fig.width = 4, fig.cap="Plot of accuracy estimate versus number of neighbors for many K values."}
 k_lots <- tibble(neighbors = seq(from = 1, to = 385, by = 10))
 
 knn_results <- workflow() |>
@@ -848,7 +849,7 @@ a balance between the two. You can see these two effects in Figure
 \@ref(fig:06-decision-grid-K), which shows how the classifier changes as
 we set the number of neighbors $K$ to 1, 7, 20, and 300.
 
-```{r 06-decision-grid-K, echo = FALSE, message = FALSE, fig.height = 7, fig.width = 10, fig.cap = "Effect of K in overfitting and underfitting."}
+```{r 06-decision-grid-K, echo = FALSE, message = FALSE, fig.height = 10, fig.width = 10, fig.cap = "Effect of K in overfitting and underfitting."}
 ks <- c(1, 7, 20, 300)
 plots <- list()
 
@@ -893,9 +894,14 @@ for (i in 1:length(ks)) {
   labs(color = "Diagnosis") +
   ggtitle(paste("K = ", ks[[i]])) +
   scale_color_manual(labels = c("Malignant", "Benign"),
-                     values = c("orange2", "steelblue2"))
-}
-grid.arrange(grobs = plots)
+                     values = c("orange2", "steelblue2")) +
+  theme(text = element_text(size = 18))
+}
+
+p_no_legend <- lapply(plots, function(x) x + theme(legend.position = "none"))
+legend <- get_legend(plots[[1]] + theme(legend.position = "bottom"))
+p_grid <- plot_grid(plotlist = p_no_legend, ncol = 2)
+plot_grid(p_grid, legend, ncol = 1, rel_heights = c(1, 0.2))
 ```
 
 ## Summary
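
Aside: the added `cowplot` code implements the shared-legend idea for the grid of decision-boundary plots: remove each panel's legend, extract one legend, and stack it under the grid. A minimal sketch of that pattern with a made-up list of panels (the `get_legend`/`plot_grid` calls mirror the change; everything else is illustrative):

```r
library(ggplot2)
library(cowplot)

# a hypothetical list of panels that all share the same color mapping
plots <- lapply(c("wt", "hp", "disp", "qsec"), function(v) {
  ggplot(mtcars, aes(x = .data[[v]], y = mpg, color = factor(cyl))) +
    geom_point() +
    ggtitle(v)
})

# strip the per-panel legends and keep one copy, placed at the bottom
p_no_legend <- lapply(plots, function(x) x + theme(legend.position = "none"))
legend <- get_legend(plots[[1]] + theme(legend.position = "bottom"))

# arrange the panels in a 2x2 grid, then stack the shared legend underneath
p_grid <- plot_grid(plotlist = p_no_legend, ncol = 2)
plot_grid(p_grid, legend, ncol = 1, rel_heights = c(1, 0.2))
```
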
@@ -999,7 +1005,7 @@ variables there are, the more (random) influence they have, and the more they
 corrupt the set of nearest neighbors that vote on the class of the new
 observation to predict.
 
-```{r 06-performance-irrelevant-features, echo = FALSE, warning = FALSE, fig.retina = 2, out.width = "100%", fig.cap = "Effect of inclusion of irrelevant predictors."}
+```{r 06-performance-irrelevant-features, echo = FALSE, warning = FALSE, fig.retina = 2, out.width = "60%", fig.cap = "Effect of inclusion of irrelevant predictors."}
 # get accuracies after including k irrelevant features
 ks <- c(0, 5, 10, 15, 20, 40)
 fixedaccs <- list()
@@ -1072,7 +1078,8 @@ res <- tibble(ks = ks, accs = accs, fixedaccs = fixedaccs, nghbrs = nghbrs)
 plt_irrelevant_accuracies <- ggplot(res) +
   geom_line(mapping = aes(x=ks, y=accs)) +
   labs(x = "Number of Irrelevant Predictors",
-       y = "Model Accuracy Estimate")
+       y = "Model Accuracy Estimate") +
+  theme(text = element_text(size = 18))
 
 plt_irrelevant_accuracies
 ```
@@ -1088,24 +1095,26 @@ variables, the number of neighbors does not increase smoothly; but the general t
 Figure \@ref(fig:06-fixed-irrelevant-features) corroborates
 this evidence; if we fix the number of neighbors to $K=3$, the accuracy falls off more quickly.
 
-```{r 06-neighbors-irrelevant-features, echo = FALSE, warning = FALSE, fig.retina = 2, out.width = "100%", fig.cap = "Tuned number of neighbors for varying number of irrelevant predictors."}
+```{r 06-neighbors-irrelevant-features, echo = FALSE, warning = FALSE, fig.retina = 2, out.width = "60%", fig.cap = "Tuned number of neighbors for varying number of irrelevant predictors."}
 plt_irrelevant_nghbrs <- ggplot(res) +
   geom_line(mapping = aes(x=ks, y=nghbrs)) +
   labs(x = "Number of Irrelevant Predictors",
-       y = "Number of neighbors")
+       y = "Number of neighbors") +
+  theme(text = element_text(size = 18))
 
 plt_irrelevant_nghbrs
 ```
 
-```{r 06-fixed-irrelevant-features, echo = FALSE, warning = FALSE, fig.retina = 2, out.width = "100%", fig.cap = "Accuracy versus number of irrelevant predictors for tuned and untuned number of neighbors."}
+```{r 06-fixed-irrelevant-features, echo = FALSE, warning = FALSE, fig.retina = 2, out.width = "75%", fig.cap = "Accuracy versus number of irrelevant predictors for tuned and untuned number of neighbors."}
 res_tmp <- res %>% pivot_longer(cols=c("accs", "fixedaccs"),
                                 names_to="Type",
                                 values_to="accuracy")
 
 plt_irrelevant_nghbrs <- ggplot(res_tmp) +
   geom_line(mapping = aes(x=ks, y=accuracy, color=Type)) +
   labs(x = "Number of Irrelevant Predictors", y = "Accuracy") +
-  scale_color_discrete(labels= c("Tuned K", "K = 3"))
+  scale_color_discrete(labels= c("Tuned K", "K = 3")) +
+  theme(text = element_text(size = 16))
 
 plt_irrelevant_nghbrs
 ```
@@ -1333,11 +1342,12 @@ where the elbow occurs, and whether adding a variable provides a meaningful incr
 > part of tuning your classifier, you *cannot use your test data* for this
 > process!
 
-```{r 06-fwdsel-3, echo = FALSE, warning = FALSE, fig.retina = 2, out.width = "100%", fig.cap = "Estimated accuracy versus the number of predictors for the sequence of models built using forward selection."}
+```{r 06-fwdsel-3, echo = FALSE, warning = FALSE, fig.retina = 2, out.width = "60%", fig.cap = "Estimated accuracy versus the number of predictors for the sequence of models built using forward selection."}
 fwd_sel_accuracies_plot <- accuracies |>
   ggplot(aes(x = size, y = accuracy)) +
   geom_line() +
-  labs(x = "Number of Predictors", y = "Estimated Accuracy")
+  labs(x = "Number of Predictors", y = "Estimated Accuracy") +
+  theme(text = element_text(size = 18))
 
 fwd_sel_accuracies_plot
 ```
