
Commit 5e066ac

Merge branch 'dev' into html-landing-page
2 parents: 66e810a + 8baeda9


102 files changed: 1676 additions & 872 deletions


CNAME

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+datasciencebook.ca

Dockerfile

Lines changed: 7 additions & 0 deletions
@@ -39,6 +39,7 @@ RUN Rscript -e "reticulate::install_miniconda()"
 RUN Rscript -e "reticulate::conda_install('r-reticulate', 'python-kaleido')"
 RUN Rscript -e "reticulate::conda_install('r-reticulate', 'plotly', channel = 'plotly')"
 
+RUN Rscript -e "devtools::install_github('mountainMath/cancensus@5a5d61759d477986d40dd87fa9a6532ff6037efe')"
 RUN Rscript -e "devtools::install_github('ttimbers/[email protected]')"
 
 # install LaTeX packages
@@ -100,3 +101,9 @@ RUN tlmgr install amsmath \
 RUN sed -i 's/256MiB/4GiB/' /etc/ImageMagick-6/policy.xml
 RUN sed -i 's/512MiB/4GiB/' /etc/ImageMagick-6/policy.xml
 RUN sed -i 's/1GiB/4GiB/' /etc/ImageMagick-6/policy.xml
+
+# install version of tinytex with fixed index double-compile (no release for this yet, so install from commit hash)
+RUN Rscript -e "remove.packages('xfun')"
+RUN Rscript -e "devtools::install_github('yihui/[email protected]')"
+RUN Rscript -e "remove.packages('tinytex')"
+RUN Rscript -e "devtools::install_github('yihui/tinytex@5d211d43944d322fca49e5f0d97f34b9c46ff9ab')"

acknowledgements.Rmd

Lines changed: 2 additions & 2 deletions
@@ -1,7 +1,7 @@
 # Acknowledgments {-}
 
 We'd like to thank everyone that has contributed to the development of
-[*Data Science: A First Introduction*](https://ubc-dsci.github.io/introduction-to-datascience/).
+[*Data Science: A First Introduction*](https://datasciencebook.ca).
 This is an open source textbook that began as a collection of course readings
 for DSCI 100, a new introductory data science course
 at the University of British Columbia (UBC).
@@ -19,7 +19,7 @@ Rohan Alexander, Isabella Ghement, Virgilio Gómez Rubio, Albert Kim, Adam Loy,
 The book was improved substantially by their insights.
 We would like to give special thanks to Jim Zidek
 for his support and encouragement throughout the process, and to
-Roger Peng for graciously offering to write the foreword.
+Roger Peng for graciously offering to write the Foreword.
 
 Finally, we owe a debt of gratitude to all of the students of DSCI 100 over the past
 few years. They provided invaluable feedback on the book and worksheets;

build_html.sh

Lines changed: 1 addition & 1 deletion
@@ -1,2 +1,2 @@
 # Script to generate HTML book
-docker run --rm -m 5g -v $(pwd):/home/rstudio/introduction-to-datascience ubcdsci/intro-to-ds:v0.21.0 /bin/bash -c "cd /home/rstudio/introduction-to-datascience; Rscript _build_html.r"
+docker run --rm -m 5g -v $(pwd):/home/rstudio/introduction-to-datascience ubcdsci/intro-to-ds:v0.23.0 /bin/bash -c "cd /home/rstudio/introduction-to-datascience; Rscript _build_html.r"

build_pdf.sh

Lines changed: 3 additions & 1 deletion
@@ -3,6 +3,7 @@
 # Copy files
 cp references.bib pdf/
 cp authors.Rmd pdf/
+cp foreword-text.Rmd pdf/
 cp preface-text.Rmd pdf/
 cp acknowledgements.Rmd pdf/
 cp intro.Rmd pdf/
@@ -24,11 +25,12 @@ cp -r data/ pdf/data
 cp -r img/ pdf/img
 
 # Build the book with bookdown
-docker run --rm -m 5g -v $(pwd):/home/rstudio/introduction-to-datascience ubcdsci/intro-to-ds:v0.21.0 /bin/bash -c "cd /home/rstudio/introduction-to-datascience/pdf; Rscript _build_pdf.r"
+docker run --rm -m 5g -v $(pwd):/home/rstudio/introduction-to-datascience ubcdsci/intro-to-ds:v0.23.0 /bin/bash -c "cd /home/rstudio/introduction-to-datascience/pdf; Rscript _build_pdf.r"
 
 # clean files in pdf dir
 rm -rf pdf/references.bib
 rm -rf pdf/authors.Rmd
+rm -rf pdf/foreword-text.Rmd
 rm -rf pdf/preface-text.Rmd
 rm -rf pdf/acknowledgements.Rmd
 rm -rf pdf/intro.Rmd

classification1.Rmd

Lines changed: 14 additions & 9 deletions
@@ -455,7 +455,7 @@ You will see in the `mutate` \index{mutate} step below, we compute the straight-
 distance using the formula above: we square the differences between the two observations' perimeter
 and concavity coordinates, add the squared differences, and then take the square root.
 
-```{r 05-multiknn-1, echo = FALSE, fig.height = 3.5, fig.width = 4.5, fig.cap="Scatter plot of concavity versus perimeter with new observation represented as a red diamond."}
+```{r 05-multiknn-1, echo = FALSE, fig.height = 3.5, fig.width = 4.5, fig.pos = "H", out.extra="", fig.cap="Scatter plot of concavity versus perimeter with new observation represented as a red diamond."}
 perim_concav <- bind_rows(cancer,
 tibble(Perimeter = new_point[1],
 Concavity = new_point[2],
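
The straight-line distance computation described in the context lines above boils down to a single `mutate` call. Below is a minimal sketch, assuming the `cancer` data frame (with `Perimeter` and `Concavity` columns) and a `new_point` vector as in the chapter; the values of `new_point` here are hypothetical and this is not the book's exact code.

```r
# Illustrative sketch: straight-line (Euclidean) distance from each observation
# to a new point, using the squared-difference / sum / square-root steps
# described above. `cancer` is assumed to exist as in the chapter.
library(tidyverse)

new_point <- c(2, 4)  # hypothetical (Perimeter, Concavity) of the new observation

cancer |>
  mutate(dist_from_new = sqrt((Perimeter - new_point[1])^2 +
                              (Concavity - new_point[2])^2)) |>
  arrange(dist_from_new)
```
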
@@ -1096,7 +1096,7 @@ The new imbalanced data is shown in Figure \@ref(fig:05-unbalanced).
 set.seed(3)
 ```
 
-```{r 05-unbalanced, fig.height = 3.5, fig.width = 4.5, fig.cap = "Imbalanced data."}
+```{r 05-unbalanced, fig.height = 3.5, fig.width = 4.5, fig.pos = "H", out.extra="", fig.cap = "Imbalanced data."}
 rare_cancer <- bind_rows(
 filter(cancer, Class == "B"),
 cancer |> filter(Class == "M") |> slice_head(n = 3)
@@ -1255,7 +1255,7 @@ classifier would make. We can see that the decision is more reasonable; when the
 to those labeled malignant, the classifier predicts a malignant tumor, and vice versa when they are
 closer to the benign tumor observations.
 
-```{r 05-upsample-plot, echo = FALSE, fig.height = 3.5, fig.width = 4.5, fig.cap = "Upsampled data with background color indicating the decision of the classifier."}
+```{r 05-upsample-plot, echo = FALSE, fig.height = 3.5, fig.width = 4.5, fig.pos = "H", out.extra="", fig.cap = "Upsampled data with background color indicating the decision of the classifier."}
 knn_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = 7) |>
 set_engine("kknn") |>
 set_mode("classification")
@@ -1415,9 +1415,14 @@ wkflw_plot
 ## Exercises
 
 Practice exercises for the material covered in this chapter
-can be found in the accompanying [worksheet](https://github.com/UBC-DSCI/data-science-a-first-intro-worksheets/blob/main/worksheet_classification1/worksheet_classification1.ipynb).
-The worksheet tries to provide automated feedback
-and help guide you through the problems.
-To make sure this functionality works as intended,
-please follow the instructions for computer setup needed to run the worksheets
-found in Chapter \@ref(move-to-your-own-machine).
+can be found in the accompanying
+[worksheets repository](https://github.com/UBC-DSCI/data-science-a-first-intro-worksheets#readme)
+in the "Classification I: training and predicting" row.
+You can launch an interactive version of the worksheet in your browser by clicking the "launch binder" button.
+You can also preview a non-interactive version of the worksheet by clicking "view worksheet."
+If you instead decide to download the worksheet and run it on your own machine,
+make sure to follow the instructions for computer setup
+found in Chapter \@ref(move-to-your-own-machine). This will ensure that the automated feedback
+and guidance that the worksheets provide will function as intended.
+
+

classification2.Rmd

Lines changed: 31 additions & 26 deletions
@@ -643,7 +643,7 @@ Here, $C=5$ different chunks of the data set are used,
 resulting in 5 different choices for the **validation set**; we call this
 *5-fold* cross-validation.
 
-```{r 06-cv-image, echo = FALSE, message = FALSE, warning = FALSE, fig.cap = "5-fold cross-validation.", fig.retina = 2, out.width = "100%"}
+```{r 06-cv-image, echo = FALSE, message = FALSE, warning = FALSE, fig.cap = "5-fold cross-validation.", fig.pos = "H", out.extra="", fig.retina = 2, out.width = "100%"}
 knitr::include_graphics("img/cv.png")
 ```
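
The 5-fold cross-validation described in the context above maps onto `tidymodels` roughly as in the minimal sketch below. It assumes a training data frame `cancer_train` with a `Class` label and a K-NN workflow `knn_wkflw` defined elsewhere in the chapter; these names are placeholders, not the book's exact code.

```r
# Minimal sketch of 5-fold cross-validation with tidymodels (illustrative only).
# `cancer_train` and `knn_wkflw` are assumed placeholders for objects built
# elsewhere in the chapter.
library(tidymodels)

set.seed(1)
cancer_vfold <- vfold_cv(cancer_train, v = 5, strata = Class)  # C = 5 chunks

knn_wkflw |>
  fit_resamples(resamples = cancer_vfold) |>
  collect_metrics()
```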

@@ -863,24 +863,7 @@ regardless of what the new observation looks like. In general, if the model
 *isn't influenced enough* by the training data, it is said to **underfit** the
 data.
 
-**Overfitting:** \index{overfitting!classification} In contrast, when we decrease the number of neighbors, each
-individual data point has a stronger and stronger vote regarding nearby points.
-Since the data themselves are noisy, this causes a more "jagged" boundary
-corresponding to a *less simple* model. If you take this case to the extreme,
-setting $K = 1$, then the classifier is essentially just matching each new
-observation to its closest neighbor in the training data set. This is just as
-problematic as the large $K$ case, because the classifier becomes unreliable on
-new data: if we had a different training set, the predictions would be
-completely different. In general, if the model *is influenced too much* by the
-training data, it is said to **overfit** the data.
-
-Both overfitting and underfitting are problematic and will lead to a model
-that does not generalize well to new data. When fitting a model, we need to strike
-a balance between the two. You can see these two effects in Figure
-\@ref(fig:06-decision-grid-K), which shows how the classifier changes as
-we set the number of neighbors $K$ to 1, 7, 20, and 300.
-
-```{r 06-decision-grid-K, echo = FALSE, message = FALSE, fig.height = 10, fig.width = 10, fig.cap = "Effect of K in overfitting and underfitting."}
+```{r 06-decision-grid-K, echo = FALSE, message = FALSE, fig.height = 10, fig.width = 10, fig.pos = "H", out.extra="", fig.cap = "Effect of K in overfitting and underfitting."}
 ks <- c(1, 7, 20, 300)
 plots <- list()
@@ -935,6 +918,23 @@ p_grid <- plot_grid(plotlist = p_no_legend, ncol = 2)
 plot_grid(p_grid, legend, ncol = 1, rel_heights = c(1, 0.2))
 ```
 
+**Overfitting:** \index{overfitting!classification} In contrast, when we decrease the number of neighbors, each
+individual data point has a stronger and stronger vote regarding nearby points.
+Since the data themselves are noisy, this causes a more "jagged" boundary
+corresponding to a *less simple* model. If you take this case to the extreme,
+setting $K = 1$, then the classifier is essentially just matching each new
+observation to its closest neighbor in the training data set. This is just as
+problematic as the large $K$ case, because the classifier becomes unreliable on
+new data: if we had a different training set, the predictions would be
+completely different. In general, if the model *is influenced too much* by the
+training data, it is said to **overfit** the data.
+
+Both overfitting and underfitting are problematic and will lead to a model
+that does not generalize well to new data. When fitting a model, we need to strike
+a balance between the two. You can see these two effects in Figure
+\@ref(fig:06-decision-grid-K), which shows how the classifier changes as
+we set the number of neighbors $K$ to 1, 7, 20, and 300.
+
 ## Summary
 
 Classification algorithms use one or more quantitative variables to predict the
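
The overfitting/underfitting text moved in the hunk above turns on the choice of $K$. A minimal sketch of refitting the classifier at the four values it mentions is given below, reusing the `nearest_neighbor` specification visible in the diff; the `cancer_train` data frame and the `Class ~ Perimeter + Concavity` recipe are assumed placeholders rather than the book's exact code.

```r
# Minimal sketch (illustrative only): refit the K-NN classifier at K = 1, 7, 20, 300
# to see the under/overfitting behavior described above. `cancer_train` and the
# recipe formula are assumed placeholders for objects defined in the chapter.
library(tidymodels)

ks <- c(1, 7, 20, 300)
fits <- list()
for (i in seq_along(ks)) {
  knn_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = ks[i]) |>
    set_engine("kknn") |>
    set_mode("classification")

  fits[[i]] <- workflow() |>
    add_recipe(recipe(Class ~ Perimeter + Concavity, data = cancer_train)) |>
    add_model(knn_spec) |>
    fit(data = cancer_train)
}
```
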
@@ -948,7 +948,7 @@ can tune the classifier (e.g., select the number of neighbors $K$ in $K$-NN)
 by maximizing estimated accuracy via cross-validation. The overall
 process is summarized in Figure \@ref(fig:06-overview).
 
-```{r 06-overview, echo = FALSE, message = FALSE, warning = FALSE, fig.cap = "Overview of KNN classification.", fig.retina = 2, out.width = "100%"}
+```{r 06-overview, echo = FALSE, message = FALSE, warning = FALSE, fig.pos = "H", out.extra="", fig.cap = "Overview of KNN classification.", fig.retina = 2, out.width = "100%"}
 knitr::include_graphics("img/train-test-overview.jpeg")
 ```
 
@@ -1386,12 +1386,17 @@ fwd_sel_accuracies_plot
 ## Exercises
 
 Practice exercises for the material covered in this chapter
-can be found in the accompanying [worksheet](https://github.com/UBC-DSCI/data-science-a-first-intro-worksheets/blob/main/worksheet_classification2/worksheet_classification2.ipynb).
-The worksheet tries to provide automated feedback
-and help guide you through the problems.
-To make sure this functionality works as intended,
-please follow the instructions for computer setup needed to run the worksheets
-found in Chapter \@ref(move-to-your-own-machine).
+can be found in the accompanying
+[worksheets repository](https://github.com/UBC-DSCI/data-science-a-first-intro-worksheets#readme)
+in the "Classification II: evaluation and tuning" row.
+You can launch an interactive version of the worksheet in your browser by clicking the "launch binder" button.
+You can also preview a non-interactive version of the worksheet by clicking "view worksheet."
+If you instead decide to download the worksheet and run it on your own machine,
+make sure to follow the instructions for computer setup
+found in Chapter \@ref(move-to-your-own-machine). This will ensure that the automated feedback
+and guidance that the worksheets provide will function as intended.
+
+
 
 ## Additional resources
 - The [`tidymodels` website](https://tidymodels.org/packages) is an excellent
