Skip to content

Commit 51b9470

Browse files
committed
2 parents 9047e7d + f59da1d commit 51b9470

File tree

2 files changed

+382
-393
lines changed

2 files changed

+382
-393
lines changed

classification1.Rmd

Lines changed: 36 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ manipulate factors in R. As we learned in Chapter \@ref(viz), factors are a spec
9999
R that are often used for class label data.
100100

101101

102-
```{r 05-load-libraries}
102+
```{r 05-load-libraries, warning = FALSE, message = FALSE}
103103
library(tidyverse)
104104
library(forcats)
105105
```
@@ -173,16 +173,17 @@ column of a data frame into a vector.
173173

174174
```{r 05-levels}
175175
cancer |>
176-
select(Class) |>
177-
pull() |> # turns a data frame into a vector
176+
pull(Class) |> # turns a data frame into a vector
178177
levels()
179178
```
180179

181180
### Exploring the cancer data
182181

183182
Before we start doing any modelling, let's explore our data set. Below we use
184-
the `group_by` and `summarize` functions we used before to see that we have
185-
357 (63\%) benign and 212 (37\%) malignant tumour observations.
183+
the `group_by`, `summarize` and `n` functions to find the number and percentage
184+
of benign and malignant tumour observations in our data set. The `n` function within
185+
the `summarize` function counts the number of observations in each `Class` group.
186+
We have 357 (63\%) benign and 212 (37\%) malignant tumour observations.
186187

187188
```{r 05-tally}
188189
num_obs <- nrow(cancer)
@@ -277,7 +278,7 @@ Figure \@ref(fig:05-knn-1).
277278

278279
```{r 05-knn-1, echo = FALSE, fig.height = 4, fig.width = 5, fig.cap="Scatter plot of concavity versus perimeter with new observation labelled in red"}
279280
perim_concav +
280-
geom_point(aes(x = new_point[1], y = new_point[2]), color = "red", size = 2.5)
281+
geom_point(aes(x = new_point[1], y = new_point[2]), color = "red", size = 2.5, pch = 17)
281282
```
282283
</center>
283284

@@ -291,7 +292,8 @@ they would have the same diagnosis.
291292
```{r 05-knn-2, echo = FALSE, fig.height = 4, fig.width = 5, fig.cap="Scatter plot of concavity versus perimeter, with malignant nearest neighbour to a new observation highlighted"}
292293
perim_concav + geom_point(aes(x = new_point[1], y = new_point[2]),
293294
color = "red",
294-
size = 2.5
295+
size = 2.5,
296+
pch = 17
295297
) +
296298
geom_segment(aes(
297299
x = new_point[1],
@@ -320,7 +322,8 @@ not, if you consider the other nearby points...
320322
```{r 05-knn-4, echo = FALSE, fig.height = 4, fig.width = 5, fig.cap="Scatter plot of concavity versus perimeter, with benign nearest neighbour to a new observation highlighted"}
321323
perim_concav + geom_point(aes(x = new_point[1], y = new_point[2]),
322324
color = "red",
323-
size = 2.5
325+
size = 2.5,
326+
pch = 17
324327
) +
325328
geom_segment(aes(
326329
x = new_point[1],
@@ -344,7 +347,8 @@ observation as malignant.
344347
```{r 05-knn-5, echo = FALSE, fig.height = 4, fig.width = 5, fig.cap="Scatter plot of concavity versus perimeter with three nearest neighbours"}
345348
perim_concav + geom_point(aes(x = new_point[1], y = new_point[2]),
346349
color = "red",
347-
size = 2.5
350+
size = 2.5,
351+
pch = 17
348352
) +
349353
geom_segment(aes(
350354
x = new_point[1], y = new_point[2],
@@ -403,7 +407,7 @@ perim_concav <- cancer |>
403407
scale_y_continuous(name = "Concavity", breaks = seq(-2, 4, 1)) +
404408
labs(color = "Diagnosis") +
405409
scale_color_manual(labels = c("Malignant", "Benign"), values = c("orange2", "steelblue2")) +
406-
geom_point(aes(x = new_point[1], y = new_point[2]), color = "red", size = 2.5)
410+
geom_point(aes(x = new_point[1], y = new_point[2]), color = "red", size = 2.5, pch = 17)
407411
perim_concav
408412
```
409413
</center>
@@ -445,7 +449,7 @@ math_table <- math_table %>%
445449
```
446450

447451
```{r 05-multiknn-mathtable, echo = FALSE}
448-
kable(math_table, booktabs = TRUE, caption = "Evaluating the distances from the new observation to each of its 5 nearest neighbours", escape = FALSE)
452+
knitr::kable(math_table, booktabs = TRUE, caption = "Evaluating the distances from the new observation to each of its 5 nearest neighbours", escape = FALSE)
449453
```
450454

451455
The result of this computation shows that 3 of the 5 nearest neighbours to our new observation are
@@ -489,7 +493,7 @@ the data look like when we visualize them as a 3-dimensional scatter.
489493
In this case, the formula above is just the straight line distance in this 3-dimensional space.
490494

491495

492-
```{r 05-more, echo = FALSE, fig.cap = "3D scatter plot of the symmetry, concavity, and perimeter variables."}
496+
```{r 05-more, echo = FALSE, message = FALSE, fig.cap = "3D scatter plot of the symmetry, concavity, and perimeter variables."}
493497
library(plotly)
494498
cancer |>
495499
plot_ly(
@@ -539,7 +543,7 @@ in this collection will help keep our code simple, readable and accurate; the
539543
less we have to code ourselves, the fewer mistakes we are likely to make. We
540544
start off by loading `tidymodels`.
541545

542-
```{r 05-tidymodels}
546+
```{r 05-tidymodels, warning = FALSE, message = FALSE}
543547
library(tidymodels)
544548
```
545549

@@ -751,11 +755,14 @@ neighbours <- unscaled_cancer[order(my_distances$Distance), ]
751755
unscaled_cancer <- bind_rows(unscaled_cancer, new_obs)
752756
753757
# plot the scatter
754-
unscaled <- ggplot(unscaled_cancer, aes(x = Area, y = Smoothness, color = Class)) +
755-
geom_point(alpha = 0.6) +
756-
labs(color = "Diagnosis") +
757-
scale_color_manual(labels = c("Benign", "Malignant"), values = c("steelblue2", "orange2", "red")) +
758-
labs(color = "Diagnosis") +
758+
unscaled <- ggplot(unscaled_cancer, aes(x = Area, y = Smoothness, group = Class, color = Class, shape = Class)) +
759+
geom_point(alpha = 0.6) +
760+
scale_color_manual(name = "Diagnosis",
761+
labels = c("Benign", "Malignant", "Unknown"),
762+
values = c("steelblue2", "orange2", "red")) +
763+
scale_shape_manual(name = "Diagnosis",
764+
labels = c("Benign", "Malignant", "Unknown"),
765+
values= c(16, 16, 17)) +
759766
ggtitle("Nonstandardized Data") +
760767
geom_segment(aes(
761768
x = unlist(new_obs[1]), y = unlist(new_obs[2]),
@@ -782,11 +789,14 @@ neighbours_scaled <- scaled_cancer[order(my_distances_scaled$Distance), ]
782789
scaled_cancer <- bind_rows(scaled_cancer, new_obs_scaled)
783790
784791
# plot the scatter
785-
scaled <- ggplot(scaled_cancer, aes(x = Area, y = Smoothness, color = Class)) +
786-
geom_point(alpha = 0.6) +
787-
labs(color = "Diagnosis") +
788-
scale_color_manual(labels = c("Benign", "Malignant"), values = c("steelblue2", "orange2", "red")) +
789-
labs(color = "Diagnosis") +
792+
scaled <- ggplot(scaled_cancer, aes(x = Area, y = Smoothness, group = Class, color = Class, shape = Class)) +
793+
geom_point(alpha = 0.6) +
794+
scale_color_manual(name = "Diagnosis",
795+
labels = c("Benign", "Malignant", "Unknown"),
796+
values = c("steelblue2", "orange2", "red")) +
797+
scale_shape_manual(name = "Diagnosis",
798+
labels = c("Benign", "Malignant", "Unknown"),
799+
values= c(16, 16, 17)) +
790800
ggtitle("Standardized Data") +
791801
# coord_equal(ratio = 1) +
792802
geom_segment(aes(
@@ -889,7 +899,8 @@ for (i in 1:7) {
889899
}
890900
rare_plot + geom_point(aes(x = new_point[1], y = new_point[2]),
891901
color = "red",
892-
size = 2.5
902+
size = 2.5,
903+
pch = 17
893904
)
894905
```
895906
</center>
@@ -936,7 +947,7 @@ step to the earlier `uc_recipe` recipe with the `step_upsample` function.
936947
We show below how to do this, and also
937948
use the `group_by` and `summarize` functions to see that our classes are now balanced:
938949

939-
```{r 05-upsample-cancer}
950+
```{r 05-upsample-cancer, warning=FALSE}
940951
ups_recipe <- recipe(Class ~ ., data = rare_cancer) |>
941952
step_upsample(Class, over_ratio = 1, skip = FALSE) |>
942953
prep()

0 commit comments

Comments
 (0)