
Commit 296ab28

Fixed tidymodels width issue and converted outstanding magrittr pipes (%>%) to base R pipes (|>).
1 parent ea2487b commit 296ab28
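
For readers unfamiliar with the second change: for simple left-to-right chains like the ones in this commit, the magrittr and base R pipes are interchangeable. A minimal sketch (not from the commit; `dplyr` and the built-in `mtcars` data are used only for illustration):

```r
library(dplyr)

# before: magrittr pipe
mtcars %>% filter(cyl == 4) %>% summarize(mean_mpg = mean(mpg))

# after: base R pipe (available since R 4.1), as used throughout this commit
mtcars |> filter(cyl == 4) |> summarize(mean_mpg = mean(mpg))
```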

File tree

4 files changed, +122 -61 lines changed

classification1.Rmd

Lines changed: 43 additions & 17 deletions
@@ -6,13 +6,31 @@ library(plotly)
 library(knitr)
 library(kableExtra)
 library(ggpubr)
+library(stringr)
 
 knitr::opts_chunk$set(echo = TRUE,
                       fig.align = "center")
 options(knitr.table.format = function() {
   if (knitr::is_latex_output()) 'latex' else 'pandoc'
 })
 reticulate::use_miniconda('r-reticulate')
+
+print_tidymodels <- function(tidymodels_object) {
+  if(!is_latex_output()) {
+    tidymodels_object
+  } else {
+    output <- capture.output(tidymodels_object)
+
+    for (i in seq_along(output)) {
+      if (nchar(output[i]) <= 80) {
+        cat(output[i], sep = "\n")
+      } else {
+        cat(str_sub(output[i], start = 1, end = 80), sep = "\n")
+        cat(str_sub(output[i], start = 81, end = nchar(output[i])), sep = "\n")
+      }
+    }
+  }
+}
 ```
 
 ## Overview
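
The `print_tidymodels` helper added above is the width fix named in the commit message: when knitting to LaTeX, any output line longer than 80 characters is printed as two lines. A standalone sketch of that wrapping step (assuming `stringr` is attached, which the new `library(stringr)` line provides):

```r
library(stringr)

line <- strrep("x", 95)  # a 95-character line of output
cat(str_sub(line, start = 1, end = 80), sep = "\n")            # first 80 characters
cat(str_sub(line, start = 81, end = nchar(line)), sep = "\n")  # remaining 15
```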
@@ -211,7 +229,7 @@ We also make the category labels ("B" and "M") more readable by
 changing them to "Benign" and "Malignant" using the `labels` argument.
 
 ```{r 05-scatter, fig.height = 3.5, fig.width = 4.5, fig.cap= "Scatter plot of concavity versus perimeter colored by diagnosis label."}
-perim_concav <- cancer %>%
+perim_concav <- cancer |>
   ggplot(aes(x = Perimeter, y = Concavity, color = Class)) +
   geom_point(alpha = 0.6) +
   labs(x = "Perimeter (standardized)",
@@ -290,7 +308,7 @@ Figure \@ref(fig:05-knn-1).
 perim_concav_with_new_point <- bind_rows(cancer,
                                          tibble(Perimeter = new_point[1],
                                                 Concavity = new_point[2],
-                                                Class = "unknown")) %>%
+                                                Class = "unknown")) |>
   ggplot(aes(x = Perimeter,
              y = Concavity,
              color = Class,
@@ -348,7 +366,7 @@ not, if you consider the other nearby points...
 perim_concav_with_new_point2 <- bind_rows(cancer,
                                           tibble(Perimeter = new_point[1],
                                                  Concavity = new_point[2],
-                                                 Class = "unknown")) %>%
+                                                 Class = "unknown")) |>
   ggplot(aes(x = Perimeter,
              y = Concavity,
              color = Class,
@@ -496,11 +514,11 @@ math_table <- tibble(Perimeter = round(tab[1:5,1],2),
                      Concavity = round(tab[1:5,2],2),
                      dist = round(neighbors[1:5, "Distance"], 2)
                      )
-math_table <- math_table %>%
+math_table <- math_table |>
   mutate(Distance = paste0("$\\sqrt{(", new_obs_Perimeter, " - ", ifelse(Perimeter < 0, "(", ""), Perimeter, ifelse(Perimeter < 0,")",""), ")^2",
                            " + ",
-                           "(", new_obs_Concavity, " - ", ifelse(Concavity < 0,"(",""), Concavity, ifelse(Concavity < 0,")",""), ")^2} = ", dist, "$")) %>%
-  select(-dist) %>%
+                           "(", new_obs_Concavity, " - ", ifelse(Concavity < 0,"(",""), Concavity, ifelse(Concavity < 0,")",""), ")^2} = ", dist, "$")) |>
+  select(-dist) |>
   mutate(Class= tab[1:5, "Class"])
 ```
 
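For reference, the `paste0` call in this hunk assembles, for each of the five nearest neighbors, the two-dimensional Euclidean distance in LaTeX form:

$$\text{Distance} = \sqrt{(\text{Perimeter}_{\text{new}} - \text{Perimeter}_i)^2 + (\text{Concavity}_{\text{new}} - \text{Concavity}_i)^2}$$
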
@@ -586,10 +604,10 @@ my_distances_3 <- table_with_distances(cancer[, attrs],
                                        new_obs_3[, attrs])
 neighbors_3 <- cancer[order(my_distances_3$Distance), ]
 
-data <- neighbors_3 %>% select(Perimeter, Concavity, Symmetry) %>% slice(1:5)
+data <- neighbors_3 |> select(Perimeter, Concavity, Symmetry) |> slice(1:5)
 
 # add to the df
-scaled_cancer_3 <- bind_rows(cancer, new_obs_3) %>%
+scaled_cancer_3 <- bind_rows(cancer, new_obs_3) |>
   mutate(Class = fct_recode(Class, "Benign" = "B", "Malignant"= "M"))
 
 plot_3d <- scaled_cancer_3 |>
@@ -598,7 +616,7 @@ plot_3d <- scaled_cancer_3 |>
        xaxis = list(title = "Perimeter"),
        yaxis = list(title = "Concavity"),
        zaxis = list(title = "Symmetry")
-  )) %>%
+  )) |>
   add_trace(x = ~Perimeter,
             y = ~Concavity,
             z = ~Symmetry,
@@ -628,23 +646,23 @@ x5 <- c(pull(new_obs_3[1]), data$Perimeter[5])
 y5 <- c(pull(new_obs_3[2]), data$Concavity[5])
 z5 <- c(pull(new_obs_3[3]), data$Symmetry[5])
 
-plot_3d <- plot_3d %>%
+plot_3d <- plot_3d |>
   add_trace(x = x1, y = y1, z = z1, type = "scatter3d", mode = "lines",
-            name = "lines", showlegend = FALSE, color = I("steelblue2")) %>%
+            name = "lines", showlegend = FALSE, color = I("steelblue2")) |>
   add_trace(x = x2, y = y2, z = z2, type = "scatter3d", mode = "lines",
-            name = "lines", showlegend = FALSE, color = I("steelblue2")) %>%
+            name = "lines", showlegend = FALSE, color = I("steelblue2")) |>
   add_trace(x = x3, y = y3, z = z3, type = "scatter3d", mode = "lines",
-            name = "lines", showlegend = FALSE, color = I("steelblue2")) %>%
+            name = "lines", showlegend = FALSE, color = I("steelblue2")) |>
   add_trace(x = x4, y = y4, z = z4, type = "scatter3d", mode = "lines",
-            name = "lines", showlegend = FALSE, color = I("orange2")) %>%
+            name = "lines", showlegend = FALSE, color = I("orange2")) |>
   add_trace(x = x5, y = y5, z = z5, type = "scatter3d", mode = "lines",
             name = "lines", showlegend = FALSE, color = I("steelblue2"))
 
 if(!is_latex_output()){
   plot_3d
 } else {
   # scene = list(camera = list(eye = list(x=2, y=2, z = 1.5)))
-  # plot_3d <- plot_3d %>% layout(scene = scene)
+  # plot_3d <- plot_3d |> layout(scene = scene)
   # save_image(plot_3d, "img/plot3d_knn_classification.png", scale = 10)
   # cannot adjust size of points in this plot for pdf
   # so using a screenshot for now instead
@@ -731,12 +749,16 @@ data frame, `Class ~ Perimeter + Concavity` and `Class ~ .` are equivalent.
 In general, you can choose individual predictors using the `+` symbol, or you can specify to
 use *all* predictors using the `.` symbol.
 
-```{r 05-tidymodels-4b, tidy = TRUE, tidy.opts=list(width.cutoff = 60)}
+```{r 05-tidymodels-4b, results = 'hide', echo = TRUE}
 knn_fit <- knn_spec |>
   fit(Class ~ ., data = cancer_train)
 knn_fit
 ```
 
+```{r echo = FALSE}
+print_tidymodels(knn_fit)
+```
+
 Here you can see the final trained model summary. It confirms that the computational engine used
 to train the model was `kknn::train.kknn`. It also shows the fraction of errors made by
 the nearest neighbor model, but we will ignore this for now and discuss it in more detail
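
As the surrounding text notes, the two formula styles are interchangeable when `Perimeter` and `Concavity` are the only other columns; a sketch using the chapter's `knn_spec` and `cancer_train` objects:

```r
# equivalent fits: name the predictors explicitly, or use `.` for all of them
knn_fit <- knn_spec |>
  fit(Class ~ Perimeter + Concavity, data = cancer_train)

knn_fit <- knn_spec |>
  fit(Class ~ ., data = cancer_train)
```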
@@ -1295,7 +1317,7 @@ and finally we will use the `fit` function to run the whole workflow on the `uns
 Note another difference from earlier here: we do not include a formula in the `fit` function. This \index{tidymodels!fit}
 is again because we included the formula in the recipe, so there is no need to respecify it:
 
-```{r 05-workflow-add, tidy = TRUE, tidy.opts=list(width.cutoff = 60)}
+```{r 05-workflow-add, results = 'hide', echo = TRUE}
 knn_fit <- workflow() |>
   add_recipe(uc_recipe) |>
   add_model(knn_spec) |>
@@ -1304,6 +1326,10 @@ knn_fit <- workflow() |>
 knn_fit
 ```
 
+```{r echo = FALSE}
+print_tidymodels(knn_fit)
+```
+
 As before, the fit object lists the function that trains the model as well as the "best" settings
 for the number of neighbors and weight function (for now, these are just the values we chose
 manually when we created `knn_spec` above). But now the fit object also includes information about
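
The reason no formula appears in `fit` here is that the recipe added to the workflow already carries it. A sketch of the full pattern (assuming the data frame is `unscaled_cancer`, which the truncated hunk header above appears to reference):

```r
# the recipe holds the formula ...
uc_recipe <- recipe(Class ~ ., data = unscaled_cancer)

# ... so the workflow's fit() only needs the data
knn_fit <- workflow() |>
  add_recipe(uc_recipe) |>
  add_model(knn_spec) |>
  fit(data = unscaled_cancer)
```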

classification2.Rmd

Lines changed: 58 additions & 35 deletions
@@ -5,6 +5,23 @@ library(gridExtra)
 library(cowplot)
 
 knitr::opts_chunk$set(fig.align = "center")
+
+print_tidymodels <- function(tidymodels_object) {
+  if(!is_latex_output()) {
+    tidymodels_object
+  } else {
+    output <- capture.output(tidymodels_object)
+
+    for (i in seq_along(output)) {
+      if (nchar(output[i]) <= 80) {
+        cat(output[i], sep = "\n")
+      } else {
+        cat(str_sub(output[i], start = 1, end = 80), sep = "\n")
+        cat(str_sub(output[i], start = 81, end = nchar(output[i])), sep = "\n")
+      }
+    }
+  }
+}
 ```
 
 ## Overview
@@ -324,7 +341,7 @@ use `fit` with the training data `cancer_train` to build the classifier.
 set.seed(1)
 ```
 
-```{r 06-create-K-nearest neighbor-classifier}
+```{r 06-create-K-nearest neighbor-classifier, results = 'hide', echo = TRUE}
 knn_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = 3) |>
   set_engine("kknn") |>
   set_mode("classification")
@@ -337,6 +354,10 @@ knn_fit <- workflow() |>
 knn_fit
 ```
 
+```{r echo = FALSE}
+print_tidymodels(knn_fit)
+```
+
 ### Predict the labels in the test set
 
 Now that we have a $K$-nearest neighbors classifier object, we can use it to
@@ -530,26 +551,28 @@ cancer_validation <- testing(cancer_split)
 
 # recreate the standardization recipe from before
 # (since it must be based on the training data)
-cancer_recipe <- recipe(Class ~ Smoothness + Concavity, data = cancer_subtrain) %>%
-  step_scale(all_predictors()) %>%
+cancer_recipe <- recipe(Class ~ Smoothness + Concavity,
+                        data = cancer_subtrain) |>
+  step_scale(all_predictors()) |>
   step_center(all_predictors())
 
 # fit the knn model (we can reuse the old knn_spec model from before)
-knn_fit <- workflow() %>%
-  add_recipe(cancer_recipe) %>%
-  add_model(knn_spec) %>%
+knn_fit <- workflow() |>
+  add_recipe(cancer_recipe) |>
+  add_model(knn_spec) |>
   fit(data = cancer_subtrain)
 
 # get predictions on the validation data
-validation_predicted <- predict(knn_fit, cancer_validation) %>%
+validation_predicted <- predict(knn_fit, cancer_validation) |>
   bind_cols(cancer_validation)
 
 # compute the accuracy
-acc <- validation_predicted %>%
-  metrics(truth = Class, estimate = .pred_class) %>%
-  filter(.metric == "accuracy") %>%
-  select(.estimate) %>%
+acc <- validation_predicted |>
+  metrics(truth = Class, estimate = .pred_class) |>
+  filter(.metric == "accuracy") |>
+  select(.estimate) |>
   pull()
+
 acc
 ```
 
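A side note on the accuracy chain converted above: yardstick also ships a dedicated `accuracy()` metric, so the `metrics()`/`filter()`/`select()`/`pull()` sequence could be collapsed. A sketch (not part of the commit):

```r
acc <- validation_predicted |>
  accuracy(truth = Class, estimate = .pred_class) |>
  pull(.estimate)
```
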
@@ -699,13 +722,13 @@ vfold_metrics
 ### Parameter value selection
 
 Using 5- and 10-fold cross-validation, we have estimated that the prediction
-accuracy of our classifier is somewhere around `r round(100*(vfold_metrics %>% filter(.metric == "accuracy"))$mean,0)`%.
+accuracy of our classifier is somewhere around `r round(100*(vfold_metrics |> filter(.metric == "accuracy"))$mean,0)`%.
 Whether that is good or not
 depends entirely on the downstream application of the data analysis. In the
 present situation, we are trying to predict a tumor diagnosis, with expensive,
 damaging chemo/radiation therapy or patient death as potential consequences of
 misprediction. Hence, we might like to
-do better than `r round(100*(vfold_metrics %>% filter(.metric == "accuracy"))$mean,0)`% for this application.
+do better than `r round(100*(vfold_metrics |> filter(.metric == "accuracy"))$mean,0)`% for this application.
 
 In order to improve our classifier, we have one choice of parameter: the number of
 neighbors, $K$. Since cross-validation helps us evaluate the accuracy of our
@@ -764,13 +787,13 @@ accuracy_vs_k
 ```
 
 Setting the number of
-neighbors to $K =$ `r (accuracies %>% arrange(desc(mean)) %>% head(1))$neighbors`
-provides the highest accuracy (`r (accuracies %>% arrange(desc(mean)) %>% slice(1) %>% pull(mean) %>% round(4))*100`%). But there is no exact or perfect answer here;
+neighbors to $K =$ `r (accuracies |> arrange(desc(mean)) |> head(1))$neighbors`
+provides the highest accuracy (`r (accuracies |> arrange(desc(mean)) |> slice(1) |> pull(mean) |> round(4))*100`%). But there is no exact or perfect answer here;
 any selection from $K = 3$ and $15$ would be reasonably justified, as all
 of these differ in classifier accuracy by a small amount. Remember: the
 values you see on this plot are *estimates* of the true accuracy of our
 classifier. Although the
-$K =$ `r (accuracies %>% arrange(desc(mean)) %>% head(1))$neighbors` value is
+$K =$ `r (accuracies |> arrange(desc(mean)) |> head(1))$neighbors` value is
 higher than the others on this plot,
 that doesn't mean the classifier is actually more accurate with this parameter
 value! Generally, when selecting $K$ (and other parameters for other predictive
@@ -780,12 +803,12 @@ models), we are looking for a value where:
 - changing the value to a nearby one (e.g., adding or subtracting 1) doesn't decrease accuracy too much, so that our choice is reliable in the presence of uncertainty
 - the cost of training the model is not prohibitive (e.g., in our situation, if $K$ is too large, predicting becomes expensive!)
 
-We know that $K =$ `r (accuracies %>% arrange(desc(mean)) %>% head(1))$neighbors`
+We know that $K =$ `r (accuracies |> arrange(desc(mean)) |> head(1))$neighbors`
 provides the highest estimated accuracy. Further, Figure \@ref(fig:06-find-k) shows that the estimated accuracy
-changes by only a small amount if we increase or decrease $K$ near $K =$ `r (accuracies %>% arrange(desc(mean)) %>% head(1))$neighbors`.
-And finally, $K =$ `r (accuracies %>% arrange(desc(mean)) %>% head(1))$neighbors` does not create a prohibitively expensive
+changes by only a small amount if we increase or decrease $K$ near $K =$ `r (accuracies |> arrange(desc(mean)) |> head(1))$neighbors`.
+And finally, $K =$ `r (accuracies |> arrange(desc(mean)) |> head(1))$neighbors` does not create a prohibitively expensive
 computational cost of training. Considering these three points, we would indeed select
-$K =$ `r (accuracies %>% arrange(desc(mean)) %>% head(1))$neighbors` for the classifier.
+$K =$ `r (accuracies |> arrange(desc(mean)) |> head(1))$neighbors` for the classifier.
 
 ### Under/Overfitting
 
@@ -981,13 +1004,13 @@ cancer_irrelevant <- cancer |> select(Class, Smoothness, Concavity, Perimeter)
 for (i in 1:500) {
   # create column
   col = (sample(2, size=nrow(cancer_irrelevant), replace=TRUE)-1)
-  cancer_irrelevant <- cancer_irrelevant %>%
+  cancer_irrelevant <- cancer_irrelevant |>
     add_column( !!paste("Irrelevant", i, sep="") := col)
 }
 ```
 
 ```{r 06-irrelevant-printdata, warning = FALSE}
-cancer_irrelevant %>%
+cancer_irrelevant |>
   select(Class, Smoothness, Concavity, Perimeter, Irrelevant1, Irrelevant2)
 ```
 
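The loop above uses tidy-evaluation name injection: `!!` unquotes the string built by `paste`, and `:=` allows that injected name on the left-hand side of `add_column`. A self-contained sketch of the idiom:

```r
library(tibble)

df <- tibble(x = 1:3)
name <- paste("Irrelevant", 1, sep = "")      # "Irrelevant1"
df <- df |> add_column(!!name := c(0, 1, 0))  # adds a column named Irrelevant1
```
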
@@ -1051,23 +1074,23 @@ for (i in 1:length(ks)) {
     head(1)
   fixedaccs[[i]] <- res$mean
 }
-accs <- accs %>% unlist()
-nghbrs <- nghbrs %>% unlist()
-fixedaccs <- fixedaccs %>% unlist()
+accs <- accs |> unlist()
+nghbrs <- nghbrs |> unlist()
+fixedaccs <- fixedaccs |> unlist()
 
 ## get accuracy if we always just guess the most frequent label
-#base_acc <- cancer_irrelevant %>%
-#  group_by(Class) %>%
-#  summarize(n = n()) %>%
-#  mutate(frac = n/sum(n)) %>%
-#  summarize(mx = max(frac)) %>%
+#base_acc <- cancer_irrelevant |>
+#  group_by(Class) |>
+#  summarize(n = n()) |>
+#  mutate(frac = n/sum(n)) |>
+#  summarize(mx = max(frac)) |>
 #  select(mx)
-#base_acc <- base_acc$mx %>% unlist()
+#base_acc <- base_acc$mx |> unlist()
 
 # plot
 res <- tibble(ks = ks, accs = accs, fixedaccs = fixedaccs, nghbrs = nghbrs)
-#res <- res %>% mutate(base_acc = base_acc)
-#plt_irrelevant_accuracies <- res %>%
+#res <- res |> mutate(base_acc = base_acc)
+#plt_irrelevant_accuracies <- res |>
 #  ggplot() +
 #  geom_line(mapping = aes(x=ks, y=accs, linetype="Tuned KNN")) +
 #  geom_hline(data=res, mapping=aes(yintercept=base_acc, linetype="Always Predict Benign")) +
@@ -1103,7 +1126,7 @@ plt_irrelevant_nghbrs
 ```
 
 ```{r 06-fixed-irrelevant-features, echo = FALSE, warning = FALSE, fig.retina = 2, out.width = "100%", fig.cap = "Accuracy versus number of irrelevant predictors for tuned and untuned number of neighbors."}
-res_tmp <- res %>% pivot_longer(cols=c("accs", "fixedaccs"),
+res_tmp <- res |> pivot_longer(cols=c("accs", "fixedaccs"),
                                 names_to="Type",
                                 values_to="accuracy")
 
@@ -1307,7 +1330,7 @@ for (i in 1:n_total) {
     models[[j]] <- model_string
   }
   jstar <- which.max(unlist(accs))
-  accuracies <- accuracies %>%
+  accuracies <- accuracies |>
     add_row(size = i,
             model_string = models[[jstar]],
             accuracy = accs[[jstar]])

regression1.Rmd

Lines changed: 5 additions & 1 deletion
@@ -391,7 +391,7 @@ Then we create a 5-fold cross validation object, and put the recipe and model sp
 in a workflow.
 \index{tidymodels}\index{recipe}\index{workflow}
 
-```{r 07-choose-k-knn}
+```{r 07-choose-k-knn, results = 'hide', echo = TRUE}
 sacr_recipe <- recipe(price ~ sqft, data = sacramento_train) |>
   step_scale(all_predictors()) |>
   step_center(all_predictors())
@@ -410,6 +410,10 @@ sacr_wkflw <- workflow() |>
 sacr_wkflw
 ```
 
+```{r echo = FALSE}
+print_tidymodels(sacr_wkflw)
+```
+
 Next we run cross validation for a grid of numbers of neighbors ranging from 1 to 200.
 The following code tunes
 the model and returns the RMSPE for each number of neighbors. In the output of the `sacr_results`
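
The tuning step the prose describes would look roughly like the sketch below (`sacr_vfold` is an assumed name for the 5-fold cross-validation object the text mentions, and the model spec in `sacr_wkflw` is assumed to set `neighbors = tune()`; the grid of 1 to 200 neighbors matches the prose):

```r
gridvals <- tibble(neighbors = seq(from = 1, to = 200))

sacr_results <- sacr_wkflw |>
  tune_grid(resamples = sacr_vfold, grid = gridvals) |>
  collect_metrics() |>
  filter(.metric == "rmse")
```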
