@@ -190,29 +190,33 @@ glimpse(cancer)
190
190
```
191
191
192
192
From the summary of the data above, we can see that ` Class ` is of type character
193
- (denoted by ` <chr> ` ). Since we will be working with ` Class ` as a
194
- categorical statistical variable, we will convert it to a factor using the
195
- function ` as_factor ` . \index{factor!as\_ factor}
196
-
193
+ (denoted by ` <chr> ` ). We can use the ` distinct ` \index{distinct} function to see all the unique
194
+ values present in that column. We see that there are two diagnoses: benign, represented by "B",
195
+ and malignant, represented by "M".
196
+ ``` {r 05-distinct}
197
+ cancer |>
198
+ distinct(Class)
199
+ ```
200
+ Since we will be working with ` Class ` as a categorical
201
+ variable, it is a good idea to convert it to a factor type using the ` as_factor ` function. \index{factor!as\_ factor}
202
+ We will also improve the readability of our analysis by renaming "M" to
203
+ "Malignant" and "B" to "Benign" using the ` fct_recode ` function. The ` fct_recode ` function \index{factor!fct\_ recode}
204
+ is used to replace the names of factor values with other names. The arguments of ` fct_recode ` are the column that you
205
+ want to modify, followed by any number of arguments of the form ` "new name" = "old name" ` to specify the renaming scheme.
206
+
197
207
``` {r 05-class}
198
208
cancer <- cancer |>
199
- mutate(Class = as_factor(Class))
209
+ mutate(Class = as_factor(Class)) |>
210
+ mutate(Class = fct_recode(Class, "Malignant" = "M", "Benign" = "B"))
200
211
glimpse(cancer)
201
212
```
202
213
203
- Recall that factors have what are called "levels", which you can think of as categories. We
204
- can verify the levels of the ` Class ` column by using the ` levels ` \index{levels}\index{factor!levels} function.
205
- This function should return the name of each category in that column. Given
206
- that we only have two different values in our ` Class ` column (B for benign and M
207
- for malignant), we only expect to get two names back. Note that the ` levels ` function requires a * vector* argument;
208
- so we use the ` pull ` function to extract a single column (` Class ` ) and
209
- pass that into the ` levels ` function to see the categories
210
- in the ` Class ` column.
214
+ Let's use the ` distinct ` function once more to verify that we have successfully converted the
215
+ ` Class ` column to a factor variable and renamed its values to "Benign" and "Malignant".
211
216
212
- ``` {r 05-levels }
217
+ ``` {r 05-distinct2 }
213
218
cancer |>
214
- pull(Class) |>
215
- levels()
219
+ distinct(Class)
216
220
```
217
221
218
222
### Exploring the cancer data
@@ -238,8 +242,6 @@ perimeter and concavity variables. Rather than use `ggplot's` default palette,
238
242
we select our own colorblind-friendly colors&mdash ; ` "orange2" `
239
243
for light orange and ` "steelblue2" ` for light blue&mdash ; and
240
244
pass them as the ` values ` argument to the ` scale_color_manual ` function.
241
- We also make the category labels ("B" and "M") more readable by
242
- changing them to "Benign" and "Malignant" using the ` labels ` argument.
243
245
244
246
``` {r 05-scatter, fig.height = 3.5, fig.width = 4.5, fig.cap= "Scatter plot of concavity versus perimeter colored by diagnosis label."}
245
247
perim_concav <- cancer |>
@@ -248,8 +250,7 @@ perim_concav <- cancer |>
248
250
labs(x = "Perimeter (standardized)",
249
251
y = "Concavity (standardized)",
250
252
color = "Diagnosis") +
251
- scale_color_manual(labels = c("Malignant", "Benign"),
252
- values = c("orange2", "steelblue2")) +
253
+ scale_color_manual(values = c("orange2", "steelblue2")) +
253
254
theme(text = element_text(size = 12))
254
255
perim_concav
255
256
```
@@ -333,13 +334,10 @@ perim_concav_with_new_point <- bind_rows(cancer,
333
334
labs(color = "Diagnosis", x = "Perimeter (standardized)",
334
335
y = "Concavity (standardized)") +
335
336
scale_color_manual(name = "Diagnosis",
336
- labels = c("Benign", "Malignant", "Unknown"),
337
337
values = c("steelblue2", "orange2", "red")) +
338
338
scale_shape_manual(name = "Diagnosis",
339
- labels = c("Benign", "Malignant", "Unknown"),
340
339
values= c(16, 16, 18))+
341
340
scale_size_manual(name = "Diagnosis",
342
- labels = c("Benign", "Malignant", "Unknown"),
343
341
values= c(2, 2, 2.5))
344
342
perim_concav_with_new_point
345
343
```
@@ -391,13 +389,10 @@ perim_concav_with_new_point2 <- bind_rows(cancer,
391
389
x = "Perimeter (standardized)",
392
390
y = "Concavity (standardized)") +
393
391
scale_color_manual(name = "Diagnosis",
394
- labels = c("Benign", "Malignant", "Unknown"),
395
392
values = c("steelblue2", "orange2", "red")) +
396
393
scale_shape_manual(name = "Diagnosis",
397
- labels = c("Benign", "Malignant", "Unknown"),
398
394
values= c(16, 16, 18))+
399
395
scale_size_manual(name = "Diagnosis",
400
- labels = c("Benign", "Malignant", "Unknown"),
401
396
values= c(2, 2, 2.5))
402
397
perim_concav_with_new_point2 +
403
398
geom_segment(aes(
@@ -488,13 +483,10 @@ perim_concav <- bind_rows(cancer,
488
483
breaks = seq(-2, 4, 1)) +
489
484
labs(color = "Diagnosis") +
490
485
scale_color_manual(name = "Diagnosis",
491
- labels = c("Benign", "Malignant", "Unknown"),
492
486
values = c("steelblue2", "orange2", "red")) +
493
487
scale_shape_manual(name = "Diagnosis",
494
- labels = c("Benign", "Malignant", "Unknown"),
495
488
values= c(16, 16, 18))+
496
489
scale_size_manual(name = "Diagnosis",
497
- labels = c("Benign", "Malignant", "Unknown"),
498
490
values= c(2, 2, 2.5))
499
491
500
492
perim_concav
@@ -545,7 +537,7 @@ kable(math_table, booktabs = TRUE,
545
537
```
546
538
547
539
The result of this computation shows that 3 of the 5 nearest neighbors to our new observation are
548
- malignant ( ` M ` ) ; since this is the majority, we classify our new observation as malignant.
540
+ malignant; since this is the majority, we classify our new observation as malignant.
549
541
These 5 neighbors are circled in Figure \@ ref(fig:05-multiknn-3).
550
542
551
543
``` {r 05-multiknn-3, echo = FALSE, fig.height = 3.5, fig.width = 4.5, fig.cap="Scatter plot of concavity versus perimeter with 5 nearest neighbors circled."}
@@ -602,7 +594,8 @@ cancer |>
602
594
slice(1:5) # take the first 5 rows
603
595
```
604
596
605
- Based on $K=5$ nearest neighbors with these three predictors we would classify the new observation as malignant since 4 out of 5 of the nearest neighbors are malignant class.
597
+ Based on $K=5$ nearest neighbors with these three predictors, we would classify
598
+ the new observation as malignant since 4 out of 5 of the nearest neighbors are from the malignant class.
606
599
Figure \@ ref(fig:05-more) shows what the data look like when we visualize them
607
600
as a 3-dimensional scatter with lines from the new observation to its five nearest neighbors.
608
601
@@ -621,8 +614,7 @@ neighbors_3 <- cancer[order(my_distances_3$Distance), ]
621
614
data <- neighbors_3 |> select(Perimeter, Concavity, Symmetry) |> slice(1:5)
622
615
623
616
# add to the df
624
- scaled_cancer_3 <- bind_rows(cancer, new_obs_3) |>
625
- mutate(Class = fct_recode(Class, "Benign" = "B", "Malignant"= "M"))
617
+ scaled_cancer_3 <- bind_rows(cancer, new_obs_3)
626
618
627
619
plot_3d <- scaled_cancer_3 |>
628
620
plot_ly() |>
@@ -637,7 +629,7 @@ plot_3d <- scaled_cancer_3 |>
637
629
color = ~Class,
638
630
opacity = 0.4,
639
631
size = 2,
640
- colors = c("orange2 ", "steelblue2 ", "red"),
632
+ colors = c("steelblue2 ", "orange2 ", "red"),
641
633
symbol = ~Class, symbols = c('circle','circle','diamond'))
642
634
643
635
x1 <- c(pull(new_obs_3[1]), data$Perimeter[1])
@@ -662,15 +654,15 @@ z5 <- c(pull(new_obs_3[3]), data$Symmetry[5])
662
654
663
655
plot_3d <- plot_3d |>
664
656
add_trace(x = x1, y = y1, z = z1, type = "scatter3d", mode = "lines",
665
- name = "lines", showlegend = FALSE, color = I("steelblue2 ")) |>
657
+ name = "lines", showlegend = FALSE, color = I("orange2 ")) |>
666
658
add_trace(x = x2, y = y2, z = z2, type = "scatter3d", mode = "lines",
667
- name = "lines", showlegend = FALSE, color = I("steelblue2 ")) |>
659
+ name = "lines", showlegend = FALSE, color = I("orange2 ")) |>
668
660
add_trace(x = x3, y = y3, z = z3, type = "scatter3d", mode = "lines",
669
- name = "lines", showlegend = FALSE, color = I("steelblue2")) |>
670
- add_trace(x = x4, y = y4, z = z4, type = "scatter3d", mode = "lines",
671
661
name = "lines", showlegend = FALSE, color = I("orange2")) |>
662
+ add_trace(x = x4, y = y4, z = z4, type = "scatter3d", mode = "lines",
663
+ name = "lines", showlegend = FALSE, color = I("steelblue2")) |>
672
664
add_trace(x = x5, y = y5, z = z5, type = "scatter3d", mode = "lines",
673
- name = "lines", showlegend = FALSE, color = I("steelblue2 "))
665
+ name = "lines", showlegend = FALSE, color = I("orange2 "))
674
666
675
667
if(!is_latex_output()){
676
668
plot_3d
@@ -786,7 +778,7 @@ Finally, we make the prediction on the new observation by calling the `predict`
786
778
passing both the fit object we just created and the new observation itself. As above,
787
779
when we ran the $K$-nearest neighbors
788
780
classification algorithm manually, the ` knn_fit ` object classifies the new observation as
789
- malignant ("M") . Note that the ` predict ` function outputs a data frame with a single
781
+ malignant. Note that the ` predict ` function outputs a data frame with a single
790
782
variable named ` .pred_class ` .
791
783
792
784
``` {r 05-predict}
@@ -837,12 +829,15 @@ is said to be *standardized*, \index{standardization!K-nearest neighbors} and al
837
829
and a standard deviation of 1. To illustrate the effect that standardization can have on the $K$-nearest
838
830
neighbor algorithm, we will read in the original, unstandardized Wisconsin breast
839
831
cancer data set; we have been using a standardized version of the data set up
840
- until now. To keep things simple, we will just use the ` Area ` , ` Smoothness ` , and ` Class `
832
+ until now. As before, we will convert the ` Class ` variable to the factor type
833
+ and rename the values to "Malignant" and "Benign".
834
+ To keep things simple, we will just use the ` Area ` , ` Smoothness ` , and ` Class `
841
835
variables:
842
836
843
837
``` {r 05-scaling-1, message = FALSE}
844
838
unscaled_cancer <- read_csv("data/unscaled_wdbc.csv") |>
845
839
mutate(Class = as_factor(Class)) |>
840
+ mutate(Class = fct_recode(Class, "Benign" = "B", "Malignant" = "M")) |>
846
841
select(Class, Area, Smoothness)
847
842
unscaled_cancer
848
843
```
@@ -972,13 +967,10 @@ unscaled <- ggplot(unscaled_cancer, aes(x = Area,
972
967
shape = Class, size = Class)) +
973
968
geom_point(alpha = 0.6) +
974
969
scale_color_manual(name = "Diagnosis",
975
- labels = c("Benign", "Malignant", "Unknown"),
976
970
values = c("steelblue2", "orange2", "red")) +
977
971
scale_shape_manual(name = "Diagnosis",
978
- labels = c("Benign", "Malignant", "Unknown"),
979
972
values= c(16, 16, 18)) +
980
973
scale_size_manual(name = "Diagnosis",
981
- labels = c("Benign", "Malignant", "Unknown"),
982
974
values=c(2,2,2.5)) +
983
975
ggtitle("Unstandardized Data") +
984
976
geom_segment(aes(
@@ -1015,13 +1007,10 @@ scaled <- ggplot(scaled_cancer, aes(x = Area,
1015
1007
size = Class)) +
1016
1008
geom_point(alpha = 0.6) +
1017
1009
scale_color_manual(name = "Diagnosis",
1018
- labels = c("Benign", "Malignant", "Unknown"),
1019
1010
values = c("steelblue2", "orange2", "red")) +
1020
1011
scale_shape_manual(name = "Diagnosis",
1021
- labels = c("Benign", "Malignant", "Unknown"),
1022
1012
values= c(16, 16, 18)) +
1023
1013
scale_size_manual(name = "Diagnosis",
1024
- labels = c("Benign", "Malignant", "Unknown"),
1025
1014
values=c(2,2,2.5)) +
1026
1015
ggtitle("Standardized Data") +
1027
1016
labs(x = "Area (standardized)", y = "Smoothness (standardized)") +
@@ -1055,13 +1044,10 @@ ggplot(unscaled_cancer, aes(x = Area,
1055
1044
shape = Class)) +
1056
1045
geom_point(size = 2.5, alpha = 0.6) +
1057
1046
scale_color_manual(name = "Diagnosis",
1058
- labels = c("Benign", "Malignant", "Unknown"),
1059
1047
values = c("steelblue2", "orange2", "red")) +
1060
1048
scale_shape_manual(name = "Diagnosis",
1061
- labels = c("Benign", "Malignant", "Unknown"),
1062
1049
values= c(16, 16, 18)) +
1063
1050
scale_size_manual(name = "Diagnosis",
1064
- labels = c("Benign", "Malignant", "Unknown"),
1065
1051
values = c(1, 1, 2.5)) +
1066
1052
ggtitle("Unstandardized Data") +
1067
1053
geom_segment(aes(
@@ -1119,8 +1105,8 @@ set.seed(3)
1119
1105
1120
1106
``` {r 05-unbalanced, fig.height = 3.5, fig.width = 4.5, fig.pos = "H", out.extra="", fig.cap = "Imbalanced data."}
1121
1107
rare_cancer <- bind_rows(
1122
- filter(cancer, Class == "B "),
1123
- cancer |> filter(Class == "M ") |> slice_head(n = 3)
1108
+ filter(cancer, Class == "Benign "),
1109
+ cancer |> filter(Class == "Malignant ") |> slice_head(n = 3)
1124
1110
) |>
1125
1111
select(Class, Perimeter, Concavity)
1126
1112
@@ -1130,8 +1116,7 @@ rare_plot <- rare_cancer |>
1130
1116
labs(x = "Perimeter (standardized)",
1131
1117
y = "Concavity (standardized)",
1132
1118
color = "Diagnosis") +
1133
- scale_color_manual(labels = c("Malignant", "Benign"),
1134
- values = c("orange2", "steelblue2")) +
1119
+ scale_color_manual(values = c("orange2", "steelblue2")) +
1135
1120
theme(text = element_text(size = 12))
1136
1121
1137
1122
rare_plot
@@ -1164,18 +1149,15 @@ rare_plot <- bind_rows(rare_cancer,
1164
1149
x = "Perimeter (standardized)",
1165
1150
y = "Concavity (standardized)") +
1166
1151
scale_color_manual(name = "Diagnosis",
1167
- labels = c("Benign", "Malignant", "Unknown"),
1168
1152
values = c("steelblue2", "orange2", "red")) +
1169
1153
scale_shape_manual(name = "Diagnosis",
1170
- labels = c("Benign", "Malignant", "Unknown"),
1171
1154
values= c(16, 16, 18))+
1172
1155
scale_size_manual(name = "Diagnosis",
1173
- labels = c("Benign", "Malignant", "Unknown"),
1174
1156
values= c(2, 2, 2.5))
1175
1157
1176
1158
for (i in 1:7) {
1177
1159
clr <- "steelblue2"
1178
- if (neighbors$Class[i] == "M ") {
1160
+ if (neighbors$Class[i] == "Malignant ") {
1179
1161
clr <- "orange2"
1180
1162
}
1181
1163
rare_plot <- rare_plot +
@@ -1236,8 +1218,7 @@ rare_plot <-
1236
1218
labs(color = "Diagnosis",
1237
1219
x = "Perimeter (standardized)",
1238
1220
y = "Concavity (standardized)") +
1239
- scale_color_manual(labels = c("Malignant", "Benign"),
1240
- values = c("orange2", "steelblue2"))
1221
+ scale_color_manual(values = c("orange2", "steelblue2"))
1241
1222
1242
1223
rare_plot
1243
1224
```
@@ -1308,8 +1289,7 @@ upsampled_plot <-
1308
1289
labs(color = "Diagnosis",
1309
1290
x = "Perimeter (standardized)",
1310
1291
y = "Concavity (standardized)") +
1311
- scale_color_manual(labels = c("Malignant", "Benign"),
1312
- values = c("orange2", "steelblue2"))
1292
+ scale_color_manual(values = c("orange2", "steelblue2"))
1313
1293
1314
1294
upsampled_plot
1315
1295
```
@@ -1324,7 +1304,8 @@ First we will load the data, create a model, and specify a recipe for how the da
1324
1304
# load the unscaled cancer data
1325
1305
# and make sure the response variable, Class, is a factor
1326
1306
unscaled_cancer <- read_csv("data/unscaled_wdbc.csv") |>
1327
- mutate(Class = as_factor(Class))
1307
+ mutate(Class = as_factor(Class)) |>
1308
+ mutate(Class = fct_recode(Class, "Malignant" = "M", "Benign" = "B"))
1328
1309
1329
1310
# create the KNN model
1330
1311
knn_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = 7) |>
@@ -1431,8 +1412,7 @@ wkflw_plot <-
1431
1412
labs(color = "Diagnosis",
1432
1413
x = "Area",
1433
1414
y = "Smoothness") +
1434
- scale_color_manual(labels = c("Malignant", "Benign"),
1435
- values = c("orange2", "steelblue2")) +
1415
+ scale_color_manual(values = c("orange2", "steelblue2")) +
1436
1416
theme(text = element_text(size = 12))
1437
1417
1438
1418
wkflw_plot
0 commit comments