
Commit 296ab28

Fixed tidymodels width issue and converted outstanding magrittr pipes (%>%) to base R pipes (|>).
1 parent ea2487b commit 296ab28
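
For readers unfamiliar with the second change: for simple left-to-right chains like the ones in this commit, the magrittr and base R pipes are interchangeable. A minimal sketch (not from the commit; `dplyr` and the built-in `mtcars` data are used only for illustration):

```r
library(dplyr)

# before: magrittr pipe
mtcars %>% filter(cyl == 4) %>% summarize(mean_mpg = mean(mpg))

# after: base R pipe (available since R 4.1), as used throughout this commit
mtcars |> filter(cyl == 4) |> summarize(mean_mpg = mean(mpg))
```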

File tree

4 files changed, +122 -61 lines changed

classification1.Rmd

Lines changed: 43 additions & 17 deletions
@@ -6,13 +6,31 @@ library(plotly)
 library(knitr)
 library(kableExtra)
 library(ggpubr)
+library(stringr)
 
 knitr::opts_chunk$set(echo = TRUE,
                       fig.align = "center")
 options(knitr.table.format = function() {
   if (knitr::is_latex_output()) 'latex' else 'pandoc'
 })
 reticulate::use_miniconda('r-reticulate')
+
+print_tidymodels <- function(tidymodels_object) {
+  if(!is_latex_output()) {
+    tidymodels_object
+  } else {
+    output <- capture.output(tidymodels_object)
+
+    for (i in seq_along(output)) {
+      if (nchar(output[i]) <= 80) {
+        cat(output[i], sep = "\n")
+      } else {
+        cat(str_sub(output[i], start = 1, end = 80), sep = "\n")
+        cat(str_sub(output[i], start = 81, end = nchar(output[i])), sep = "\n")
+      }
+    }
+  }
+}
 ```
 
 ## Overview
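
The `print_tidymodels` helper added above is the width fix named in the commit message: when knitting to LaTeX, any output line longer than 80 characters is printed as two lines. A standalone sketch of that wrapping step (assuming `stringr` is attached, which the new `library(stringr)` line provides):

```r
library(stringr)

line <- strrep("x", 95)  # a 95-character line of output
cat(str_sub(line, start = 1, end = 80), sep = "\n")            # first 80 characters
cat(str_sub(line, start = 81, end = nchar(line)), sep = "\n")  # remaining 15
```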
@@ -211,7 +229,7 @@ We also make the category labels ("B" and "M") more readable by
 changing them to "Benign" and "Malignant" using the `labels` argument.
 
 ```{r 05-scatter, fig.height = 3.5, fig.width = 4.5, fig.cap= "Scatter plot of concavity versus perimeter colored by diagnosis label."}
-perim_concav <- cancer %>%
+perim_concav <- cancer |>
   ggplot(aes(x = Perimeter, y = Concavity, color = Class)) +
   geom_point(alpha = 0.6) +
   labs(x = "Perimeter (standardized)",
@@ -290,7 +308,7 @@ Figure \@ref(fig:05-knn-1).
 perim_concav_with_new_point <- bind_rows(cancer,
                                          tibble(Perimeter = new_point[1],
                                                 Concavity = new_point[2],
-                                                Class = "unknown")) %>%
+                                                Class = "unknown")) |>
   ggplot(aes(x = Perimeter,
              y = Concavity,
              color = Class,
@@ -348,7 +366,7 @@ not, if you consider the other nearby points...
 perim_concav_with_new_point2 <- bind_rows(cancer,
                                           tibble(Perimeter = new_point[1],
                                                  Concavity = new_point[2],
-                                                 Class = "unknown")) %>%
+                                                 Class = "unknown")) |>
   ggplot(aes(x = Perimeter,
              y = Concavity,
              color = Class,
@@ -496,11 +514,11 @@ math_table <- tibble(Perimeter = round(tab[1:5,1],2),
                      Concavity = round(tab[1:5,2],2),
                      dist = round(neighbors[1:5, "Distance"], 2)
                      )
-math_table <- math_table %>%
+math_table <- math_table |>
   mutate(Distance = paste0("$\\sqrt{(", new_obs_Perimeter, " - ", ifelse(Perimeter < 0, "(", ""), Perimeter, ifelse(Perimeter < 0,")",""), ")^2",
                            " + ",
-                           "(", new_obs_Concavity, " - ", ifelse(Concavity < 0,"(",""), Concavity, ifelse(Concavity < 0,")",""), ")^2} = ", dist, "$")) %>%
-  select(-dist) %>%
+                           "(", new_obs_Concavity, " - ", ifelse(Concavity < 0,"(",""), Concavity, ifelse(Concavity < 0,")",""), ")^2} = ", dist, "$")) |>
+  select(-dist) |>
   mutate(Class= tab[1:5, "Class"])
 ```
 
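For reference, the `paste0` call in this hunk assembles, for each of the five nearest neighbors, the two-dimensional Euclidean distance in LaTeX form:

$$\text{Distance} = \sqrt{(\text{Perimeter}_{\text{new}} - \text{Perimeter}_i)^2 + (\text{Concavity}_{\text{new}} - \text{Concavity}_i)^2}$$
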
@@ -586,10 +604,10 @@ my_distances_3 <- table_with_distances(cancer[, attrs],
                                        new_obs_3[, attrs])
 neighbors_3 <- cancer[order(my_distances_3$Distance), ]
 
-data <- neighbors_3 %>% select(Perimeter, Concavity, Symmetry) %>% slice(1:5)
+data <- neighbors_3 |> select(Perimeter, Concavity, Symmetry) |> slice(1:5)
 
 # add to the df
-scaled_cancer_3 <- bind_rows(cancer, new_obs_3) %>%
+scaled_cancer_3 <- bind_rows(cancer, new_obs_3) |>
   mutate(Class = fct_recode(Class, "Benign" = "B", "Malignant"= "M"))
 
 plot_3d <- scaled_cancer_3 |>
@@ -598,7 +616,7 @@ plot_3d <- scaled_cancer_3 |>
        xaxis = list(title = "Perimeter"),
        yaxis = list(title = "Concavity"),
        zaxis = list(title = "Symmetry")
-  )) %>%
+  )) |>
   add_trace(x = ~Perimeter,
             y = ~Concavity,
             z = ~Symmetry,
@@ -628,23 +646,23 @@ x5 <- c(pull(new_obs_3[1]), data$Perimeter[5])
 y5 <- c(pull(new_obs_3[2]), data$Concavity[5])
 z5 <- c(pull(new_obs_3[3]), data$Symmetry[5])
 
-plot_3d <- plot_3d %>%
+plot_3d <- plot_3d |>
   add_trace(x = x1, y = y1, z = z1, type = "scatter3d", mode = "lines",
-            name = "lines", showlegend = FALSE, color = I("steelblue2")) %>%
+            name = "lines", showlegend = FALSE, color = I("steelblue2")) |>
   add_trace(x = x2, y = y2, z = z2, type = "scatter3d", mode = "lines",
-            name = "lines", showlegend = FALSE, color = I("steelblue2")) %>%
+            name = "lines", showlegend = FALSE, color = I("steelblue2")) |>
   add_trace(x = x3, y = y3, z = z3, type = "scatter3d", mode = "lines",
-            name = "lines", showlegend = FALSE, color = I("steelblue2")) %>%
+            name = "lines", showlegend = FALSE, color = I("steelblue2")) |>
   add_trace(x = x4, y = y4, z = z4, type = "scatter3d", mode = "lines",
-            name = "lines", showlegend = FALSE, color = I("orange2")) %>%
+            name = "lines", showlegend = FALSE, color = I("orange2")) |>
   add_trace(x = x5, y = y5, z = z5, type = "scatter3d", mode = "lines",
             name = "lines", showlegend = FALSE, color = I("steelblue2"))
 
 if(!is_latex_output()){
   plot_3d
 } else {
   # scene = list(camera = list(eye = list(x=2, y=2, z = 1.5)))
-  # plot_3d <- plot_3d %>% layout(scene = scene)
+  # plot_3d <- plot_3d |> layout(scene = scene)
   # save_image(plot_3d, "img/plot3d_knn_classification.png", scale = 10)
   # cannot adjust size of points in this plot for pdf
   # so using a screenshot for now instead
@@ -731,12 +749,16 @@ data frame, `Class ~ Perimeter + Concavity` and `Class ~ .` are equivalent.
 In general, you can choose individual predictors using the `+` symbol, or you can specify to
 use *all* predictors using the `.` symbol.
 
-```{r 05-tidymodels-4b, tidy = TRUE, tidy.opts=list(width.cutoff = 60)}
+```{r 05-tidymodels-4b, results = 'hide', echo = TRUE}
 knn_fit <- knn_spec |>
   fit(Class ~ ., data = cancer_train)
 knn_fit
 ```
 
+```{r echo = FALSE}
+print_tidymodels(knn_fit)
+```
+
 Here you can see the final trained model summary. It confirms that the computational engine used
 to train the model was `kknn::train.kknn`. It also shows the fraction of errors made by
 the nearest neighbor model, but we will ignore this for now and discuss it in more detail
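
As the surrounding text notes, the two formula styles are interchangeable when `Perimeter` and `Concavity` are the only other columns; a sketch using the chapter's `knn_spec` and `cancer_train` objects:

```r
# equivalent fits: name the predictors explicitly, or use `.` for all of them
knn_fit <- knn_spec |>
  fit(Class ~ Perimeter + Concavity, data = cancer_train)

knn_fit <- knn_spec |>
  fit(Class ~ ., data = cancer_train)
```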
@@ -1295,7 +1317,7 @@ and finally we will use the `fit` function to run the whole workflow on the `uns
 Note another difference from earlier here: we do not include a formula in the `fit` function. This \index{tidymodels!fit}
 is again because we included the formula in the recipe, so there is no need to respecify it:
 
-```{r 05-workflow-add, tidy = TRUE, tidy.opts=list(width.cutoff = 60)}
+```{r 05-workflow-add, results = 'hide', echo = TRUE}
 knn_fit <- workflow() |>
   add_recipe(uc_recipe) |>
   add_model(knn_spec) |>
@@ -1304,6 +1326,10 @@ knn_fit <- workflow() |>
 knn_fit
 ```
 
+```{r echo = FALSE}
+print_tidymodels(knn_fit)
+```
+
 As before, the fit object lists the function that trains the model as well as the "best" settings
 for the number of neighbors and weight function (for now, these are just the values we chose
 manually when we created `knn_spec` above). But now the fit object also includes information about
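
The reason no formula appears in `fit` here is that the recipe added to the workflow already carries it. A sketch of the full pattern (assuming the data frame is `unscaled_cancer`, which the truncated hunk header above appears to reference):

```r
# the recipe holds the formula ...
uc_recipe <- recipe(Class ~ ., data = unscaled_cancer)

# ... so the workflow's fit() only needs the data
knn_fit <- workflow() |>
  add_recipe(uc_recipe) |>
  add_model(knn_spec) |>
  fit(data = unscaled_cancer)
```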

classification2.Rmd

Lines changed: 58 additions & 35 deletions
@@ -5,6 +5,23 @@ library(gridExtra)
 library(cowplot)
 
 knitr::opts_chunk$set(fig.align = "center")
+
+print_tidymodels <- function(tidymodels_object) {
+  if(!is_latex_output()) {
+    tidymodels_object
+  } else {
+    output <- capture.output(tidymodels_object)
+
+    for (i in seq_along(output)) {
+      if (nchar(output[i]) <= 80) {
+        cat(output[i], sep = "\n")
+      } else {
+        cat(str_sub(output[i], start = 1, end = 80), sep = "\n")
+        cat(str_sub(output[i], start = 81, end = nchar(output[i])), sep = "\n")
+      }
+    }
+  }
+}
 ```
 
 ## Overview
@@ -324,7 +341,7 @@ use `fit` with the training data `cancer_train` to build the classifier.
 set.seed(1)
 ```
 
-```{r 06-create-K-nearest neighbor-classifier}
+```{r 06-create-K-nearest neighbor-classifier, results = 'hide', echo = TRUE}
 knn_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = 3) |>
   set_engine("kknn") |>
   set_mode("classification")
@@ -337,6 +354,10 @@ knn_fit <- workflow() |>
 knn_fit
 ```
 
+```{r echo = FALSE}
+print_tidymodels(knn_fit)
+```
+
 ### Predict the labels in the test set
 
 Now that we have a $K$-nearest neighbors classifier object, we can use it to
@@ -530,26 +551,28 @@ cancer_validation <- testing(cancer_split)
 
 # recreate the standardization recipe from before
 # (since it must be based on the training data)
-cancer_recipe <- recipe(Class ~ Smoothness + Concavity, data = cancer_subtrain) %>%
-  step_scale(all_predictors()) %>%
+cancer_recipe <- recipe(Class ~ Smoothness + Concavity,
+                        data = cancer_subtrain) |>
+  step_scale(all_predictors()) |>
   step_center(all_predictors())
 
 # fit the knn model (we can reuse the old knn_spec model from before)
-knn_fit <- workflow() %>%
-  add_recipe(cancer_recipe) %>%
-  add_model(knn_spec) %>%
+knn_fit <- workflow() |>
+  add_recipe(cancer_recipe) |>
+  add_model(knn_spec) |>
   fit(data = cancer_subtrain)
 
 # get predictions on the validation data
-validation_predicted <- predict(knn_fit, cancer_validation) %>%
+validation_predicted <- predict(knn_fit, cancer_validation) |>
   bind_cols(cancer_validation)
 
 # compute the accuracy
-acc <- validation_predicted %>%
-  metrics(truth = Class, estimate = .pred_class) %>%
-  filter(.metric == "accuracy") %>%
-  select(.estimate) %>%
+acc <- validation_predicted |>
+  metrics(truth = Class, estimate = .pred_class) |>
+  filter(.metric == "accuracy") |>
+  select(.estimate) |>
   pull()
+
 acc
 ```
 
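A side note on the accuracy chain converted above: yardstick also ships a dedicated `accuracy()` metric, so the `metrics()`/`filter()`/`select()`/`pull()` sequence could be collapsed. A sketch (not part of the commit):

```r
acc <- validation_predicted |>
  accuracy(truth = Class, estimate = .pred_class) |>
  pull(.estimate)
```
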
@@ -699,13 +722,13 @@ vfold_metrics
 ### Parameter value selection
 
 Using 5- and 10-fold cross-validation, we have estimated that the prediction
-accuracy of our classifier is somewhere around `r round(100*(vfold_metrics %>% filter(.metric == "accuracy"))$mean,0)`%.
+accuracy of our classifier is somewhere around `r round(100*(vfold_metrics |> filter(.metric == "accuracy"))$mean,0)`%.
 Whether that is good or not
 depends entirely on the downstream application of the data analysis. In the
 present situation, we are trying to predict a tumor diagnosis, with expensive,
 damaging chemo/radiation therapy or patient death as potential consequences of
 misprediction. Hence, we might like to
-do better than `r round(100*(vfold_metrics %>% filter(.metric == "accuracy"))$mean,0)`% for this application.
+do better than `r round(100*(vfold_metrics |> filter(.metric == "accuracy"))$mean,0)`% for this application.
 
 In order to improve our classifier, we have one choice of parameter: the number of
 neighbors, $K$. Since cross-validation helps us evaluate the accuracy of our
@@ -764,13 +787,13 @@ accuracy_vs_k
 ```
 
 Setting the number of
-neighbors to $K =$ `r (accuracies %>% arrange(desc(mean)) %>% head(1))$neighbors`
-provides the highest accuracy (`r (accuracies %>% arrange(desc(mean)) %>% slice(1) %>% pull(mean) %>% round(4))*100`%). But there is no exact or perfect answer here;
+neighbors to $K =$ `r (accuracies |> arrange(desc(mean)) |> head(1))$neighbors`
+provides the highest accuracy (`r (accuracies |> arrange(desc(mean)) |> slice(1) |> pull(mean) |> round(4))*100`%). But there is no exact or perfect answer here;
 any selection from $K = 3$ and $15$ would be reasonably justified, as all
 of these differ in classifier accuracy by a small amount. Remember: the
 values you see on this plot are *estimates* of the true accuracy of our
 classifier. Although the
-$K =$ `r (accuracies %>% arrange(desc(mean)) %>% head(1))$neighbors` value is
+$K =$ `r (accuracies |> arrange(desc(mean)) |> head(1))$neighbors` value is
 higher than the others on this plot,
 that doesn't mean the classifier is actually more accurate with this parameter
 value! Generally, when selecting $K$ (and other parameters for other predictive
@@ -780,12 +803,12 @@ models), we are looking for a value where:
 - changing the value to a nearby one (e.g., adding or subtracting 1) doesn't decrease accuracy too much, so that our choice is reliable in the presence of uncertainty
 - the cost of training the model is not prohibitive (e.g., in our situation, if $K$ is too large, predicting becomes expensive!)
 
-We know that $K =$ `r (accuracies %>% arrange(desc(mean)) %>% head(1))$neighbors`
+We know that $K =$ `r (accuracies |> arrange(desc(mean)) |> head(1))$neighbors`
 provides the highest estimated accuracy. Further, Figure \@ref(fig:06-find-k) shows that the estimated accuracy
-changes by only a small amount if we increase or decrease $K$ near $K =$ `r (accuracies %>% arrange(desc(mean)) %>% head(1))$neighbors`.
-And finally, $K =$ `r (accuracies %>% arrange(desc(mean)) %>% head(1))$neighbors` does not create a prohibitively expensive
+changes by only a small amount if we increase or decrease $K$ near $K =$ `r (accuracies |> arrange(desc(mean)) |> head(1))$neighbors`.
+And finally, $K =$ `r (accuracies |> arrange(desc(mean)) |> head(1))$neighbors` does not create a prohibitively expensive
 computational cost of training. Considering these three points, we would indeed select
-$K =$ `r (accuracies %>% arrange(desc(mean)) %>% head(1))$neighbors` for the classifier.
+$K =$ `r (accuracies |> arrange(desc(mean)) |> head(1))$neighbors` for the classifier.
 
 ### Under/Overfitting
 
@@ -981,13 +1004,13 @@ cancer_irrelevant <- cancer |> select(Class, Smoothness, Concavity, Perimeter)
 for (i in 1:500) {
   # create column
   col = (sample(2, size=nrow(cancer_irrelevant), replace=TRUE)-1)
-  cancer_irrelevant <- cancer_irrelevant %>%
+  cancer_irrelevant <- cancer_irrelevant |>
     add_column( !!paste("Irrelevant", i, sep="") := col)
 }
 ```
 
 ```{r 06-irrelevant-printdata, warning = FALSE}
-cancer_irrelevant %>%
+cancer_irrelevant |>
   select(Class, Smoothness, Concavity, Perimeter, Irrelevant1, Irrelevant2)
 ```
 
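The loop above uses tidy-evaluation name injection: `!!` unquotes the string built by `paste`, and `:=` allows that injected name on the left-hand side of `add_column`. A self-contained sketch of the idiom:

```r
library(tibble)

df <- tibble(x = 1:3)
name <- paste("Irrelevant", 1, sep = "")      # "Irrelevant1"
df <- df |> add_column(!!name := c(0, 1, 0))  # adds a column named Irrelevant1
```
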
@@ -1051,23 +1074,23 @@ for (i in 1:length(ks)) {
     head(1)
   fixedaccs[[i]] <- res$mean
 }
-accs <- accs %>% unlist()
-nghbrs <- nghbrs %>% unlist()
-fixedaccs <- fixedaccs %>% unlist()
+accs <- accs |> unlist()
+nghbrs <- nghbrs |> unlist()
+fixedaccs <- fixedaccs |> unlist()
 
 ## get accuracy if we always just guess the most frequent label
-#base_acc <- cancer_irrelevant %>%
-#  group_by(Class) %>%
-#  summarize(n = n()) %>%
-#  mutate(frac = n/sum(n)) %>%
-#  summarize(mx = max(frac)) %>%
+#base_acc <- cancer_irrelevant |>
+#  group_by(Class) |>
+#  summarize(n = n()) |>
+#  mutate(frac = n/sum(n)) |>
+#  summarize(mx = max(frac)) |>
 #  select(mx)
-#base_acc <- base_acc$mx %>% unlist()
+#base_acc <- base_acc$mx |> unlist()
 
 # plot
 res <- tibble(ks = ks, accs = accs, fixedaccs = fixedaccs, nghbrs = nghbrs)
-#res <- res %>% mutate(base_acc = base_acc)
-#plt_irrelevant_accuracies <- res %>%
+#res <- res |> mutate(base_acc = base_acc)
+#plt_irrelevant_accuracies <- res |>
 #  ggplot() +
 #  geom_line(mapping = aes(x=ks, y=accs, linetype="Tuned KNN")) +
 #  geom_hline(data=res, mapping=aes(yintercept=base_acc, linetype="Always Predict Benign")) +
@@ -1103,7 +1126,7 @@ plt_irrelevant_nghbrs
 ```
 
 ```{r 06-fixed-irrelevant-features, echo = FALSE, warning = FALSE, fig.retina = 2, out.width = "100%", fig.cap = "Accuracy versus number of irrelevant predictors for tuned and untuned number of neighbors."}
-res_tmp <- res %>% pivot_longer(cols=c("accs", "fixedaccs"),
+res_tmp <- res |> pivot_longer(cols=c("accs", "fixedaccs"),
                                 names_to="Type",
                                 values_to="accuracy")
 
@@ -1307,7 +1330,7 @@ for (i in 1:n_total) {
     models[[j]] <- model_string
   }
   jstar <- which.max(unlist(accs))
-  accuracies <- accuracies %>%
+  accuracies <- accuracies |>
     add_row(size = i,
             model_string = models[[jstar]],
             accuracy = accs[[jstar]])

regression1.Rmd

Lines changed: 5 additions & 1 deletion
@@ -391,7 +391,7 @@ Then we create a 5-fold cross validation object, and put the recipe and model sp
 in a workflow.
 \index{tidymodels}\index{recipe}\index{workflow}
 
-```{r 07-choose-k-knn}
+```{r 07-choose-k-knn, results = 'hide', echo = TRUE}
 sacr_recipe <- recipe(price ~ sqft, data = sacramento_train) |>
   step_scale(all_predictors()) |>
   step_center(all_predictors())
@@ -410,6 +410,10 @@ sacr_wkflw <- workflow() |>
 sacr_wkflw
 ```
 
+```{r echo = FALSE}
+print_tidymodels(sacr_wkflw)
+```
+
 Next we run cross validation for a grid of numbers of neighbors ranging from 1 to 200.
 The following code tunes
 the model and returns the RMSPE for each number of neighbors. In the output of the `sacr_results`
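
The tuning step the prose describes would look roughly like the sketch below (`sacr_vfold` is an assumed name for the 5-fold cross-validation object the text mentions, and the model spec in `sacr_wkflw` is assumed to set `neighbors = tune()`; the grid of 1 to 200 neighbors matches the prose):

```r
gridvals <- tibble(neighbors = seq(from = 1, to = 200))

sacr_results <- sacr_wkflw |>
  tune_grid(resamples = sacr_vfold, grid = gridvals) |>
  collect_metrics() |>
  filter(.metric == "rmse")
```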
