Skip to content

Commit efe655e

Browse files
Merge pull request #366 from UBC-DSCI/patch-tidymodels-output
Fixed tidymodels width issue and converted outstanding magrittr pipes…
2 parents 83c635d + cb642a0 commit efe655e

File tree

6 files changed

+167
-83
lines changed

6 files changed

+167
-83
lines changed

classification1.Rmd

Lines changed: 43 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,31 @@ library(plotly)
66
library(knitr)
77
library(kableExtra)
88
library(ggpubr)
9+
library(stringr)
910
1011
knitr::opts_chunk$set(echo = TRUE,
1112
fig.align = "center")
1213
options(knitr.table.format = function() {
1314
if (knitr::is_latex_output()) 'latex' else 'pandoc'
1415
})
1516
reticulate::use_miniconda('r-reticulate')
17+
18+
print_tidymodels <- function(tidymodels_object) {
19+
if(!is_latex_output()) {
20+
tidymodels_object
21+
} else {
22+
output <- capture.output(tidymodels_object)
23+
24+
for (i in seq_along(output)) {
25+
if (nchar(output[i]) <= 80) {
26+
cat(output[i], sep = "\n")
27+
} else {
28+
cat(str_sub(output[i], start = 1, end = 80), sep = "\n")
29+
cat(str_sub(output[i], start = 81, end = nchar(output[i])), sep = "\n")
30+
}
31+
}
32+
}
33+
}
1634
```
1735

1836
## Overview
@@ -211,7 +229,7 @@ We also make the category labels ("B" and "M") more readable by
211229
changing them to "Benign" and "Malignant" using the `labels` argument.
212230

213231
```{r 05-scatter, fig.height = 3.5, fig.width = 4.5, fig.cap= "Scatter plot of concavity versus perimeter colored by diagnosis label."}
214-
perim_concav <- cancer %>%
232+
perim_concav <- cancer |>
215233
ggplot(aes(x = Perimeter, y = Concavity, color = Class)) +
216234
geom_point(alpha = 0.6) +
217235
labs(x = "Perimeter (standardized)",
@@ -290,7 +308,7 @@ Figure \@ref(fig:05-knn-1).
290308
perim_concav_with_new_point <- bind_rows(cancer,
291309
tibble(Perimeter = new_point[1],
292310
Concavity = new_point[2],
293-
Class = "unknown")) %>%
311+
Class = "unknown")) |>
294312
ggplot(aes(x = Perimeter,
295313
y = Concavity,
296314
color = Class,
@@ -348,7 +366,7 @@ not, if you consider the other nearby points...
348366
perim_concav_with_new_point2 <- bind_rows(cancer,
349367
tibble(Perimeter = new_point[1],
350368
Concavity = new_point[2],
351-
Class = "unknown")) %>%
369+
Class = "unknown")) |>
352370
ggplot(aes(x = Perimeter,
353371
y = Concavity,
354372
color = Class,
@@ -496,11 +514,11 @@ math_table <- tibble(Perimeter = round(tab[1:5,1],2),
496514
Concavity = round(tab[1:5,2],2),
497515
dist = round(neighbors[1:5, "Distance"], 2)
498516
)
499-
math_table <- math_table %>%
517+
math_table <- math_table |>
500518
mutate(Distance = paste0("$\\sqrt{(", new_obs_Perimeter, " - ", ifelse(Perimeter < 0, "(", ""), Perimeter, ifelse(Perimeter < 0,")",""), ")^2",
501519
" + ",
502-
"(", new_obs_Concavity, " - ", ifelse(Concavity < 0,"(",""), Concavity, ifelse(Concavity < 0,")",""), ")^2} = ", dist, "$")) %>%
503-
select(-dist) %>%
520+
"(", new_obs_Concavity, " - ", ifelse(Concavity < 0,"(",""), Concavity, ifelse(Concavity < 0,")",""), ")^2} = ", dist, "$")) |>
521+
select(-dist) |>
504522
mutate(Class= tab[1:5, "Class"])
505523
```
506524

@@ -586,10 +604,10 @@ my_distances_3 <- table_with_distances(cancer[, attrs],
586604
new_obs_3[, attrs])
587605
neighbors_3 <- cancer[order(my_distances_3$Distance), ]
588606
589-
data <- neighbors_3 %>% select(Perimeter, Concavity, Symmetry) %>% slice(1:5)
607+
data <- neighbors_3 |> select(Perimeter, Concavity, Symmetry) |> slice(1:5)
590608
591609
# add to the df
592-
scaled_cancer_3 <- bind_rows(cancer, new_obs_3) %>%
610+
scaled_cancer_3 <- bind_rows(cancer, new_obs_3) |>
593611
mutate(Class = fct_recode(Class, "Benign" = "B", "Malignant"= "M"))
594612
595613
plot_3d <- scaled_cancer_3 |>
@@ -598,7 +616,7 @@ plot_3d <- scaled_cancer_3 |>
598616
xaxis = list(title = "Perimeter"),
599617
yaxis = list(title = "Concavity"),
600618
zaxis = list(title = "Symmetry")
601-
)) %>%
619+
)) |>
602620
add_trace(x = ~Perimeter,
603621
y = ~Concavity,
604622
z = ~Symmetry,
@@ -628,23 +646,23 @@ x5 <- c(pull(new_obs_3[1]), data$Perimeter[5])
628646
y5 <- c(pull(new_obs_3[2]), data$Concavity[5])
629647
z5 <- c(pull(new_obs_3[3]), data$Symmetry[5])
630648
631-
plot_3d <- plot_3d %>%
649+
plot_3d <- plot_3d |>
632650
add_trace(x = x1, y = y1, z = z1, type = "scatter3d", mode = "lines",
633-
name = "lines", showlegend = FALSE, color = I("steelblue2")) %>%
651+
name = "lines", showlegend = FALSE, color = I("steelblue2")) |>
634652
add_trace(x = x2, y = y2, z = z2, type = "scatter3d", mode = "lines",
635-
name = "lines", showlegend = FALSE, color = I("steelblue2")) %>%
653+
name = "lines", showlegend = FALSE, color = I("steelblue2")) |>
636654
add_trace(x = x3, y = y3, z = z3, type = "scatter3d", mode = "lines",
637-
name = "lines", showlegend = FALSE, color = I("steelblue2")) %>%
655+
name = "lines", showlegend = FALSE, color = I("steelblue2")) |>
638656
add_trace(x = x4, y = y4, z = z4, type = "scatter3d", mode = "lines",
639-
name = "lines", showlegend = FALSE, color = I("orange2")) %>%
657+
name = "lines", showlegend = FALSE, color = I("orange2")) |>
640658
add_trace(x = x5, y = y5, z = z5, type = "scatter3d", mode = "lines",
641659
name = "lines", showlegend = FALSE, color = I("steelblue2"))
642660
643661
if(!is_latex_output()){
644662
plot_3d
645663
} else {
646664
# scene = list(camera = list(eye = list(x=2, y=2, z = 1.5)))
647-
# plot_3d <- plot_3d %>% layout(scene = scene)
665+
# plot_3d <- plot_3d |> layout(scene = scene)
648666
# save_image(plot_3d, "img/plot3d_knn_classification.png", scale = 10)
649667
# cannot adjust size of points in this plot for pdf
650668
# so using a screenshot for now instead
@@ -731,12 +749,16 @@ data frame, `Class ~ Perimeter + Concavity` and `Class ~ .` are equivalent.
731749
In general, you can choose individual predictors using the `+` symbol, or you can specify to
732750
use *all* predictors using the `.` symbol.
733751

734-
```{r 05-tidymodels-4b, tidy = TRUE, tidy.opts=list(width.cutoff = 60)}
752+
```{r 05-tidymodels-4b, results = 'hide', echo = TRUE}
735753
knn_fit <- knn_spec |>
736754
fit(Class ~ ., data = cancer_train)
737755
knn_fit
738756
```
739757

758+
```{r echo = FALSE}
759+
print_tidymodels(knn_fit)
760+
```
761+
740762
Here you can see the final trained model summary. It confirms that the computational engine used
741763
to train the model was `kknn::train.kknn`. It also shows the fraction of errors made by
742764
the nearest neighbor model, but we will ignore this for now and discuss it in more detail
@@ -1296,7 +1318,7 @@ and finally we will use the `fit` function to run the whole workflow on the `uns
12961318
Note another difference from earlier here: we do not include a formula in the `fit` function. This \index{tidymodels!fit}
12971319
is again because we included the formula in the recipe, so there is no need to respecify it:
12981320

1299-
```{r 05-workflow-add, tidy = TRUE, tidy.opts=list(width.cutoff = 60)}
1321+
```{r 05-workflow-add, results = 'hide', echo = TRUE}
13001322
knn_fit <- workflow() |>
13011323
add_recipe(uc_recipe) |>
13021324
add_model(knn_spec) |>
@@ -1305,6 +1327,10 @@ knn_fit <- workflow() |>
13051327
knn_fit
13061328
```
13071329

1330+
```{r echo = FALSE}
1331+
print_tidymodels(knn_fit)
1332+
```
1333+
13081334
As before, the fit object lists the function that trains the model as well as the "best" settings
13091335
for the number of neighbors and weight function (for now, these are just the values we chose
13101336
manually when we created `knn_spec` above). But now the fit object also includes information about

classification2.Rmd

Lines changed: 58 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,26 @@
33
```{r classification2-setup, echo = FALSE, message = FALSE, warning = FALSE}
44
library(gridExtra)
55
library(cowplot)
6+
library(stringr)
67
78
knitr::opts_chunk$set(fig.align = "center")
9+
10+
print_tidymodels <- function(tidymodels_object) {
11+
if(!is_latex_output()) {
12+
tidymodels_object
13+
} else {
14+
output <- capture.output(tidymodels_object)
15+
16+
for (i in seq_along(output)) {
17+
if (nchar(output[i]) <= 80) {
18+
cat(output[i], sep = "\n")
19+
} else {
20+
cat(str_sub(output[i], start = 1, end = 80), sep = "\n")
21+
cat(str_sub(output[i], start = 81, end = nchar(output[i])), sep = "\n")
22+
}
23+
}
24+
}
25+
}
826
```
927

1028
## Overview
@@ -324,7 +342,7 @@ use `fit` with the training data `cancer_train` to build the classifier.
324342
set.seed(1)
325343
```
326344

327-
```{r 06-create-K-nearest neighbor-classifier}
345+
```{r 06-create-K-nearest neighbor-classifier, results = 'hide', echo = TRUE}
328346
knn_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = 3) |>
329347
set_engine("kknn") |>
330348
set_mode("classification")
@@ -337,6 +355,10 @@ knn_fit <- workflow() |>
337355
knn_fit
338356
```
339357

358+
```{r echo = FALSE}
359+
print_tidymodels(knn_fit)
360+
```
361+
340362
### Predict the labels in the test set
341363

342364
Now that we have a $K$-nearest neighbors classifier object, we can use it to
@@ -530,26 +552,28 @@ cancer_validation <- testing(cancer_split)
530552
531553
# recreate the standardization recipe from before
532554
# (since it must be based on the training data)
533-
cancer_recipe <- recipe(Class ~ Smoothness + Concavity, data = cancer_subtrain) %>%
534-
step_scale(all_predictors()) %>%
555+
cancer_recipe <- recipe(Class ~ Smoothness + Concavity,
556+
data = cancer_subtrain) |>
557+
step_scale(all_predictors()) |>
535558
step_center(all_predictors())
536559
537560
# fit the knn model (we can reuse the old knn_spec model from before)
538-
knn_fit <- workflow() %>%
539-
add_recipe(cancer_recipe) %>%
540-
add_model(knn_spec) %>%
561+
knn_fit <- workflow() |>
562+
add_recipe(cancer_recipe) |>
563+
add_model(knn_spec) |>
541564
fit(data = cancer_subtrain)
542565
543566
# get predictions on the validation data
544-
validation_predicted <- predict(knn_fit, cancer_validation) %>%
567+
validation_predicted <- predict(knn_fit, cancer_validation) |>
545568
bind_cols(cancer_validation)
546569
547570
# compute the accuracy
548-
acc <- validation_predicted %>%
549-
metrics(truth = Class, estimate = .pred_class) %>%
550-
filter(.metric == "accuracy") %>%
551-
select(.estimate) %>%
571+
acc <- validation_predicted |>
572+
metrics(truth = Class, estimate = .pred_class) |>
573+
filter(.metric == "accuracy") |>
574+
select(.estimate) |>
552575
pull()
576+
553577
acc
554578
```
555579

@@ -699,13 +723,13 @@ vfold_metrics
699723
### Parameter value selection
700724

701725
Using 5- and 10-fold cross-validation, we have estimated that the prediction
702-
accuracy of our classifier is somewhere around `r round(100*(vfold_metrics %>% filter(.metric == "accuracy"))$mean,0)`%.
726+
accuracy of our classifier is somewhere around `r round(100*(vfold_metrics |> filter(.metric == "accuracy"))$mean,0)`%.
703727
Whether that is good or not
704728
depends entirely on the downstream application of the data analysis. In the
705729
present situation, we are trying to predict a tumor diagnosis, with expensive,
706730
damaging chemo/radiation therapy or patient death as potential consequences of
707731
misprediction. Hence, we might like to
708-
do better than `r round(100*(vfold_metrics %>% filter(.metric == "accuracy"))$mean,0)`% for this application.
732+
do better than `r round(100*(vfold_metrics |> filter(.metric == "accuracy"))$mean,0)`% for this application.
709733

710734
In order to improve our classifier, we have one choice of parameter: the number of
711735
neighbors, $K$. Since cross-validation helps us evaluate the accuracy of our
@@ -764,13 +788,13 @@ accuracy_vs_k
764788
```
765789

766790
Setting the number of
767-
neighbors to $K =$ `r (accuracies %>% arrange(desc(mean)) %>% head(1))$neighbors`
768-
provides the highest accuracy (`r (accuracies %>% arrange(desc(mean)) %>% slice(1) %>% pull(mean) %>% round(4))*100`%). But there is no exact or perfect answer here;
791+
neighbors to $K =$ `r (accuracies |> arrange(desc(mean)) |> head(1))$neighbors`
792+
provides the highest accuracy (`r (accuracies |> arrange(desc(mean)) |> slice(1) |> pull(mean) |> round(4))*100`%). But there is no exact or perfect answer here;
769793
any selection from $K = 3$ and $15$ would be reasonably justified, as all
770794
of these differ in classifier accuracy by a small amount. Remember: the
771795
values you see on this plot are *estimates* of the true accuracy of our
772796
classifier. Although the
773-
$K =$ `r (accuracies %>% arrange(desc(mean)) %>% head(1))$neighbors` value is
797+
$K =$ `r (accuracies |> arrange(desc(mean)) |> head(1))$neighbors` value is
774798
higher than the others on this plot,
775799
that doesn't mean the classifier is actually more accurate with this parameter
776800
value! Generally, when selecting $K$ (and other parameters for other predictive
@@ -780,12 +804,12 @@ models), we are looking for a value where:
780804
- changing the value to a nearby one (e.g., adding or subtracting 1) doesn't decrease accuracy too much, so that our choice is reliable in the presence of uncertainty
781805
- the cost of training the model is not prohibitive (e.g., in our situation, if $K$ is too large, predicting becomes expensive!)
782806

783-
We know that $K =$ `r (accuracies %>% arrange(desc(mean)) %>% head(1))$neighbors`
807+
We know that $K =$ `r (accuracies |> arrange(desc(mean)) |> head(1))$neighbors`
784808
provides the highest estimated accuracy. Further, Figure \@ref(fig:06-find-k) shows that the estimated accuracy
785-
changes by only a small amount if we increase or decrease $K$ near $K =$ `r (accuracies %>% arrange(desc(mean)) %>% head(1))$neighbors`.
786-
And finally, $K =$ `r (accuracies %>% arrange(desc(mean)) %>% head(1))$neighbors` does not create a prohibitively expensive
809+
changes by only a small amount if we increase or decrease $K$ near $K =$ `r (accuracies |> arrange(desc(mean)) |> head(1))$neighbors`.
810+
And finally, $K =$ `r (accuracies |> arrange(desc(mean)) |> head(1))$neighbors` does not create a prohibitively expensive
787811
computational cost of training. Considering these three points, we would indeed select
788-
$K =$ `r (accuracies %>% arrange(desc(mean)) %>% head(1))$neighbors` for the classifier.
812+
$K =$ `r (accuracies |> arrange(desc(mean)) |> head(1))$neighbors` for the classifier.
789813

790814
### Under/Overfitting
791815

@@ -982,13 +1006,13 @@ cancer_irrelevant <- cancer |> select(Class, Smoothness, Concavity, Perimeter)
9821006
for (i in 1:500) {
9831007
# create column
9841008
col = (sample(2, size=nrow(cancer_irrelevant), replace=TRUE)-1)
985-
cancer_irrelevant <- cancer_irrelevant %>%
1009+
cancer_irrelevant <- cancer_irrelevant |>
9861010
add_column( !!paste("Irrelevant", i, sep="") := col)
9871011
}
9881012
```
9891013

9901014
```{r 06-irrelevant-printdata, warning = FALSE}
991-
cancer_irrelevant %>%
1015+
cancer_irrelevant |>
9921016
select(Class, Smoothness, Concavity, Perimeter, Irrelevant1, Irrelevant2)
9931017
```
9941018

@@ -1052,23 +1076,23 @@ for (i in 1:length(ks)) {
10521076
head(1)
10531077
fixedaccs[[i]] <- res$mean
10541078
}
1055-
accs <- accs %>% unlist()
1056-
nghbrs <- nghbrs %>% unlist()
1057-
fixedaccs <- fixedaccs %>% unlist()
1079+
accs <- accs |> unlist()
1080+
nghbrs <- nghbrs |> unlist()
1081+
fixedaccs <- fixedaccs |> unlist()
10581082
10591083
## get accuracy if we always just guess the most frequent label
1060-
#base_acc <- cancer_irrelevant %>%
1061-
# group_by(Class) %>%
1062-
# summarize(n = n()) %>%
1063-
# mutate(frac = n/sum(n)) %>%
1064-
# summarize(mx = max(frac)) %>%
1084+
#base_acc <- cancer_irrelevant |>
1085+
# group_by(Class) |>
1086+
# summarize(n = n()) |>
1087+
# mutate(frac = n/sum(n)) |>
1088+
# summarize(mx = max(frac)) |>
10651089
# select(mx)
1066-
#base_acc <- base_acc$mx %>% unlist()
1090+
#base_acc <- base_acc$mx |> unlist()
10671091
10681092
# plot
10691093
res <- tibble(ks = ks, accs = accs, fixedaccs = fixedaccs, nghbrs = nghbrs)
1070-
#res <- res %>% mutate(base_acc = base_acc)
1071-
#plt_irrelevant_accuracies <- res %>%
1094+
#res <- res |> mutate(base_acc = base_acc)
1095+
#plt_irrelevant_accuracies <- res |>
10721096
# ggplot() +
10731097
# geom_line(mapping = aes(x=ks, y=accs, linetype="Tuned KNN")) +
10741098
# geom_hline(data=res, mapping=aes(yintercept=base_acc, linetype="Always Predict Benign")) +
@@ -1311,7 +1335,7 @@ for (i in 1:n_total) {
13111335
models[[j]] <- model_string
13121336
}
13131337
jstar <- which.max(unlist(accs))
1314-
accuracies <- accuracies %>%
1338+
accuracies <- accuracies |>
13151339
add_row(size = i,
13161340
model_string = models[[jstar]],
13171341
accuracy = accs[[jstar]])

pdf/krantz.cls

Lines changed: 0 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1437,30 +1437,9 @@
14371437
\par}\fi}
14381438

14391439
\newcommand\listoffigures{%
1440-
\if@twocolumn
1441-
\@restonecoltrue\onecolumn
1442-
\else
1443-
\@restonecolfalse
1444-
\fi
1445-
\chapter*{\listfigurename}%
1446-
\@mkboth{\MakeUppercase\listfigurename}%
1447-
{\MakeUppercase\listfigurename}%
1448-
\@starttoc{lof}%
1449-
\if@restonecol\twocolumn\fi
14501440
}
14511441
\newcommand*\l@figure{\@dottedtocline{1}{1.5em}{2.3em}}
14521442
\newcommand\listoftables{%
1453-
\if@twocolumn
1454-
\@restonecoltrue\onecolumn
1455-
\else
1456-
\@restonecolfalse
1457-
\fi
1458-
\chapter*{\listtablename}%
1459-
\@mkboth{%
1460-
\MakeUppercase\listtablename}%
1461-
{\MakeUppercase\listtablename}%
1462-
\@starttoc{lot}%
1463-
\if@restonecol\twocolumn\fi
14641443
}
14651444
\let\l@table\l@figure
14661445
\newdimen\bibindent

0 commit comments

Comments
 (0)