
Commit f53e6ec

merge dev

committed (2 parents: 061fef3 + 83c635d)

File tree

9 files changed (+68, -55 lines)


classification1.Rmd

Lines changed: 6 additions & 3 deletions
@@ -1021,7 +1021,7 @@ ggarrange(unscaled, scaled, ncol = 2, common.legend = TRUE, legend = "bottom")
 
 ```
 
-```{r 05-scaling-plt-zoomed, fig.height = 5, fig.width = 10, echo = FALSE, fig.cap = "Close up of three nearest neighbors for unstandardized data."}
+```{r 05-scaling-plt-zoomed, fig.height = 4.5, fig.width = 9, echo = FALSE, fig.cap = "Close up of three nearest neighbors for unstandardized data."}
 library(ggforce)
 ggplot(unscaled_cancer, aes(x = Area,
                             y = Smoothness,
@@ -1056,7 +1056,8 @@ ggplot(unscaled_cancer, aes(x = Area,
            ), color = "black") +
   facet_zoom(x = ( Area > 380 & Area < 420) ,
              y = (Smoothness > 0.08 & Smoothness < 0.14), zoom.size = 2) +
-  theme_bw() + theme(legend.position="bottom", text = element_text(size = 16))
+  theme_bw() +
+  theme(text = element_text(size = 14), legend.position="bottom")
 ```
 
 ### Balancing
@@ -1394,7 +1395,9 @@ wkflw_plot <-
                  color = Class),
              alpha = 0.02,
              size = 5) +
-  labs(color = "Diagnosis") +
+  labs(color = "Diagnosis",
+       x = "Area (standardized)",
+       y = "Smoothness (standardized)") +
   scale_color_manual(labels = c("Malignant", "Benign"),
                      values = c("orange2", "steelblue2"))
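Most of the figure changes in this commit follow the same recipe: shrink `fig.height`/`fig.width` in the chunk header and compensate by enlarging the plot text with `theme()`. A minimal, self-contained sketch of that pattern (the data frame `toy` is hypothetical, standing in for the chapter's standardized cancer data):

```r
library(ggplot2)

# hypothetical stand-in for the chapter's standardized cancer data
set.seed(1)
toy <- data.frame(Area = rnorm(100),
                  Smoothness = rnorm(100),
                  Class = sample(c("Malignant", "Benign"), 100, replace = TRUE))

ggplot(toy, aes(x = Area, y = Smoothness, color = Class)) +
  geom_point() +
  labs(color = "Diagnosis",
       x = "Area (standardized)",
       y = "Smoothness (standardized)") +
  theme_bw() +
  # a larger base font offsets the smaller fig.height/fig.width in the chunk header
  theme(text = element_text(size = 14), legend.position = "bottom")
```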

classification2.Rmd

Lines changed: 18 additions & 13 deletions
@@ -206,7 +206,7 @@ tumor cell concavity versus smoothness colored by diagnosis in Figure \@ref(fig:
 You will also notice that we set the random seed here at the beginning of the analysis
 using the `set.seed` function, as described in Section \@ref(randomseeds).
 
-```{r 06-precode, fig.height = 4, fig.width = 5, fig.cap="Scatter plot of tumor cell concavity versus smoothness colored by diagnosis label.", message = F, warning = F}
+```{r 06-precode, fig.height = 3.5, fig.width = 4.5, fig.cap="Scatter plot of tumor cell concavity versus smoothness colored by diagnosis label.", message = F, warning = F}
 # load packages
 library(tidyverse)
 library(tidymodels)
@@ -778,7 +778,7 @@ We can select the best value of the number of neighbors (i.e., the one that resu
 in the highest classifier accuracy estimate) by plotting the accuracy versus $K$
 in Figure \@ref(fig:06-find-k).
 
-```{r 06-find-k, fig.height = 4, fig.width = 5, fig.cap= "Plot of estimated accuracy versus the number of neighbors."}
+```{r 06-find-k, fig.height = 3.5, fig.width = 4, fig.cap= "Plot of estimated accuracy versus the number of neighbors."}
 accuracy_vs_k <- ggplot(accuracies, aes(x = neighbors, y = mean)) +
   geom_point() +
   geom_line() +
@@ -824,7 +824,7 @@ we vary $K$ from 1 to almost the number of observations in the data set.
 set.seed(1)
 ```
 
-```{r 06-lots-of-ks, message = FALSE, fig.height = 4, fig.width = 5, fig.cap="Plot of accuracy estimate versus number of neighbors for many K values."}
+```{r 06-lots-of-ks, message = FALSE, fig.height = 3.5, fig.width = 4, fig.cap="Plot of accuracy estimate versus number of neighbors for many K values."}
 k_lots <- tibble(neighbors = seq(from = 1, to = 385, by = 10))
 
 knn_results <- workflow() |>
@@ -918,7 +918,8 @@ for (i in 1:length(ks)) {
     labs(color = "Diagnosis") +
     ggtitle(paste("K = ", ks[[i]])) +
     scale_color_manual(labels = c("Malignant", "Benign"),
-                       values = c("orange2", "steelblue2"))
+                       values = c("orange2", "steelblue2")) +
+    theme(text = element_text(size = 18))
 }
 
 p_no_legend <- lapply(plots, function(x) x + theme(legend.position = "none"))
@@ -1028,7 +1029,7 @@ variables there are, the more (random) influence they have, and the more they
 corrupt the set of nearest neighbors that vote on the class of the new
 observation to predict.
 
-```{r 06-performance-irrelevant-features, echo = FALSE, warning = FALSE, fig.retina = 2, out.width = "100%", fig.cap = "Effect of inclusion of irrelevant predictors."}
+```{r 06-performance-irrelevant-features, echo = FALSE, warning = FALSE, fig.retina = 2, out.width = "60%", fig.cap = "Effect of inclusion of irrelevant predictors."}
 # get accuracies after including k irrelevant features
 ks <- c(0, 5, 10, 15, 20, 40)
 fixedaccs <- list()
@@ -1101,7 +1102,8 @@ res <- tibble(ks = ks, accs = accs, fixedaccs = fixedaccs, nghbrs = nghbrs)
 plt_irrelevant_accuracies <- ggplot(res) +
   geom_line(mapping = aes(x=ks, y=accs)) +
   labs(x = "Number of Irrelevant Predictors",
-       y = "Model Accuracy Estimate")
+       y = "Model Accuracy Estimate") +
+  theme(text = element_text(size = 18))
 
 plt_irrelevant_accuracies
 ```
@@ -1117,24 +1119,26 @@ variables, the number of neighbors does not increase smoothly; but the general t
 Figure \@ref(fig:06-fixed-irrelevant-features) corroborates
 this evidence; if we fix the number of neighbors to $K=3$, the accuracy falls off more quickly.
 
-```{r 06-neighbors-irrelevant-features, echo = FALSE, warning = FALSE, fig.retina = 2, out.width = "100%", fig.cap = "Tuned number of neighbors for varying number of irrelevant predictors."}
+```{r 06-neighbors-irrelevant-features, echo = FALSE, warning = FALSE, fig.retina = 2, out.width = "60%", fig.cap = "Tuned number of neighbors for varying number of irrelevant predictors."}
 plt_irrelevant_nghbrs <- ggplot(res) +
   geom_line(mapping = aes(x=ks, y=nghbrs)) +
   labs(x = "Number of Irrelevant Predictors",
-       y = "Number of neighbors")
+       y = "Number of neighbors") +
+  theme(text = element_text(size = 18))
 
 plt_irrelevant_nghbrs
 ```
 
-```{r 06-fixed-irrelevant-features, echo = FALSE, warning = FALSE, fig.retina = 2, out.width = "100%", fig.cap = "Accuracy versus number of irrelevant predictors for tuned and untuned number of neighbors."}
-res_tmp <- res |> pivot_longer(cols=c("accs", "fixedaccs"),
+```{r 06-fixed-irrelevant-features, echo = FALSE, warning = FALSE, fig.retina = 2, out.width = "75%", fig.cap = "Accuracy versus number of irrelevant predictors for tuned and untuned number of neighbors."}
+res_tmp <- res %>% pivot_longer(cols=c("accs", "fixedaccs"),
                                 names_to="Type",
                                 values_to="accuracy")
 
 plt_irrelevant_nghbrs <- ggplot(res_tmp) +
   geom_line(mapping = aes(x=ks, y=accuracy, color=Type)) +
   labs(x = "Number of Irrelevant Predictors", y = "Accuracy") +
-  scale_color_discrete(labels= c("Tuned K", "K = 3"))
+  scale_color_discrete(labels= c("Tuned K", "K = 3")) +
+  theme(text = element_text(size = 16))
 
 plt_irrelevant_nghbrs
 ```
@@ -1362,11 +1366,12 @@ where the elbow occurs, and whether adding a variable provides a meaningful incr
 > part of tuning your classifier, you *cannot use your test data* for this
 > process!
 
-```{r 06-fwdsel-3, echo = FALSE, warning = FALSE, fig.retina = 2, out.width = "100%", fig.cap = "Estimated accuracy versus the number of predictors for the sequence of models built using forward selection."}
+```{r 06-fwdsel-3, echo = FALSE, warning = FALSE, fig.retina = 2, out.width = "60%", fig.cap = "Estimated accuracy versus the number of predictors for the sequence of models built using forward selection."}
 fwd_sel_accuracies_plot <- accuracies |>
   ggplot(aes(x = size, y = accuracy)) +
   geom_line() +
-  labs(x = "Number of Predictors", y = "Estimated Accuracy")
+  labs(x = "Number of Predictors", y = "Estimated Accuracy") +
+  theme(text = element_text(size = 18))
 
 fwd_sel_accuracies_plot
 ```
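The `accuracies` tibble plotted in the `06-find-k` chunk above is produced by cross-validation earlier in the chapter and is not part of this diff. As a hedged sketch of that tuning step, assuming a `cancer_train` data frame with a `Class` label and a standardizing recipe named `knn_recipe` as in the book's source (the grid and fold counts here are illustrative):

```r
library(tidymodels)

# sketch only: cancer_train and knn_recipe are assumed from earlier in the chapter
knn_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = tune()) |>
  set_engine("kknn") |>
  set_mode("classification")

cancer_vfold <- vfold_cv(cancer_train, v = 5, strata = Class)
k_vals <- tibble(neighbors = seq(from = 1, to = 100, by = 5))

accuracies <- workflow() |>
  add_recipe(knn_recipe) |>
  add_model(knn_spec) |>
  tune_grid(resamples = cancer_vfold, grid = k_vals) |>
  collect_metrics() |>
  filter(.metric == "accuracy")

# the 06-find-k chunk then plots mean accuracy against the number of neighbors
ggplot(accuracies, aes(x = neighbors, y = mean)) +
  geom_point() +
  geom_line() +
  theme(text = element_text(size = 18))
```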

img/tidy_data.key

320 KB
Binary file not shown.

img/tidy_data/tidy_data.001.jpeg

461 KB

inference.Rmd

Lines changed: 13 additions & 13 deletions
@@ -287,7 +287,7 @@ We have created this particular example
 such that we *do* have access to the full population, which lets us visualize the
 sampling distribution directly for learning purposes.
 
-```{r 11-example-proportions7, echo = TRUE, message = FALSE, warning = FALSE,fig.cap = "Sampling distribution of the sample proportion for sample size 40.", fig.retina = 2, out.width = "100%"}
+```{r 11-example-proportions7, echo = TRUE, message = FALSE, warning = FALSE,fig.cap = "Sampling distribution of the sample proportion for sample size 40.", fig.height = 3.3, fig.width = 4.2}
 sampling_distribution <- ggplot(sample_estimates, aes(x = sample_proportion)) +
   geom_histogram(fill = "dodgerblue3", color = "lightgrey", bins = 12) +
   ylab("Count") +
@@ -335,7 +335,7 @@ We can visualize the population distribution of the price per night with a histo
 options(pillar.sigfig = 5)
 ```
 
-```{r 11-example-means2, echo = TRUE, message = FALSE, warning = FALSE, fig.cap = "Population distribution of price per night (Canadian dollars) for all Airbnb listings in Vancouver, Canada.", fig.retina = 2, out.width = "100%"}
+```{r 11-example-means2, echo = TRUE, message = FALSE, warning = FALSE, fig.cap = "Population distribution of price per night (Canadian dollars) for all Airbnb listings in Vancouver, Canada.", fig.height = 3.5, fig.width = 4.5}
 population_distribution <- ggplot(airbnb, aes(x = price)) +
   geom_histogram(fill = "dodgerblue3", color = "lightgrey") +
   ylab("Count") +
@@ -380,7 +380,7 @@ We can create a histogram to visualize the distribution of observations in the
 sample (Figure \@ref(fig:11-example-means-sample-hist)), and calculate the mean
 of our sample.
 
-```{r 11-example-means-sample-hist, echo = TRUE, message = FALSE, warning = FALSE, fig.cap = "Distribution of price per night (Canadian dollars) for sample of 40 Airbnb listings.", fig.retina = 2, out.width = "100%"}
+```{r 11-example-means-sample-hist, echo = TRUE, message = FALSE, warning = FALSE, fig.cap = "Distribution of price per night (Canadian dollars) for sample of 40 Airbnb listings.", fig.height = 3.5, fig.width = 4.5}
 sample_distribution <- ggplot(one_sample, aes(price)) +
   geom_histogram(fill = "dodgerblue3", color = "lightgrey") +
   ylab("Count") +
@@ -422,7 +422,7 @@ samples
 Now we can calculate the sample mean for each replicate and plot the sampling
 distribution of sample means for samples of size 40.
 
-```{r 11-example-means4, echo = TRUE, message = FALSE, warning = FALSE, fig.cap= "Sampling distribution of the sample means for sample size of 40.", fig.retina = 2, out.width = "100%"}
+```{r 11-example-means4, echo = TRUE, message = FALSE, warning = FALSE, fig.cap= "Sampling distribution of the sample means for sample size of 40.", fig.height = 3.5, fig.width = 4.5}
 sample_estimates <- samples |>
   group_by(replicate) |>
   summarize(sample_mean = mean(price))
@@ -468,15 +468,15 @@ Notice that the mean of the sample means is \$`r round(mean(sample_estimates$sam
 was \$`r round(mean(airbnb$price),2)`.
 -->
 
-```{r 11-example-means5, echo = FALSE, message = FALSE, warning = FALSE, fig.cap = "Comparison of population distribution, sample distribution, and sampling distribution."}
+```{r 11-example-means5, echo = FALSE, message = FALSE, warning = FALSE, fig.height = 5.5, fig.width = 4, fig.cap = "Comparison of population distribution, sample distribution, and sampling distribution."}
 grid.arrange(population_distribution +
                ggtitle("Population") +
                xlim(min(airbnb$price), 600),
              sample_distribution +
                ggtitle("Sample (n = 40)") +
                xlim(min(airbnb$price), 600),
              sampling_distribution_40 +
-               ggtitle("Sampling distribution of the mean for samples of size 40") +
+               ggtitle("Sampling distribution of the mean \n for samples of size 40") +
                xlim(min(airbnb$price), 600),
              nrow = 3
 )
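For context, the `samples` and `sample_estimates` objects resized in the chunks above come from repeated sampling of the population. A minimal sketch, assuming the chapter's `airbnb` data frame and `rep_sample_n()` from the infer package (the axis labels are illustrative):

```r
library(tidyverse)
library(infer)

# draw 20,000 samples of size 40 from the population and compute each sample mean
samples <- rep_sample_n(airbnb, size = 40, reps = 20000)

sample_estimates <- samples |>
  group_by(replicate) |>
  summarize(sample_mean = mean(price))

# sampling distribution of the sample mean for samples of size 40
sampling_distribution_40 <- ggplot(sample_estimates, aes(x = sample_mean)) +
  geom_histogram(fill = "dodgerblue3", color = "lightgrey") +
  xlab("Sample mean price per night (dollars)") +
  ylab("Count")

sampling_distribution_40
```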
@@ -664,7 +664,7 @@ see that the sample’s distribution looks like that of the population for a
 large enough sample.
 
 
-```{r 11-example-bootstrapping0, echo = FALSE, message = FALSE, warning = FALSE, fig.cap = "Comparison of samples of different sizes from the population."}
+```{r 11-example-bootstrapping0, echo = FALSE, message = FALSE, warning = FALSE, fig.height = 7, fig.cap = "Comparison of samples of different sizes from the population."}
 sample_10 <- airbnb |>
   rep_sample_n(10)
 sample_distribution_10 <- ggplot(sample_10, aes(price)) +
@@ -773,7 +773,7 @@ one_sample <- one_sample |>
   ungroup() |> select(-replicate)
 ```
 
-```{r 11-bootstrapping1, echo = TRUE, message = FALSE, warning = FALSE, fig.cap = "Histogram of price per night (Canadian dollars) for one sample of size 40.", fig.retina = 2, out.width = "100%"}
+```{r 11-bootstrapping1, echo = TRUE, message = FALSE, warning = FALSE, fig.cap = "Histogram of price per night (Canadian dollars) for one sample of size 40.", fig.height = 3.5, fig.width = 4.5}
 one_sample
 
 one_sample_dist <- ggplot(one_sample, aes(price)) +
@@ -799,7 +799,7 @@ we change the argument for `replace` from its default value of `FALSE` to `TRUE`
 \index{bootstrap!in R}
 \index{rep\_sample\_n!bootstrap}
 
-```{r 11-bootstrapping3, echo = TRUE, message = FALSE, warning = FALSE, fig.cap = "Bootstrap distribution.", fig.retina = 2, out.width = "100%"}
+```{r 11-bootstrapping3, echo = TRUE, message = FALSE, warning = FALSE, fig.cap = "Bootstrap distribution.", fig.height = 3.5, fig.width = 4.5}
 boot1 <- one_sample |>
   rep_sample_n(size = 40, replace = TRUE, reps = 1)
 boot1_dist <- ggplot(boot1, aes(price)) +
@@ -865,7 +865,7 @@ generate a bootstrap distribution of our point estimates. The bootstrap
 distribution (Figure \@ref(fig:11-bootstrapping5)) suggests how we might expect
 our point estimate to behave if we took another sample.
 
-```{r 11-bootstrapping5, echo = TRUE, message = FALSE, warning = FALSE, fig.cap = "Distribution of the bootstrap sample means.", out.width = "100%"}
+```{r 11-bootstrapping5, echo = TRUE, message = FALSE, warning = FALSE, fig.cap = "Distribution of the bootstrap sample means.", fig.height = 3.5, fig.width = 4.5}
 boot20000_means <- boot20000 |>
   group_by(replicate) |>
   summarize(mean = mean(price))
@@ -884,7 +884,7 @@ boot_est_dist
 Let's compare the bootstrap distribution&mdash;which we construct by taking many samples from our original sample of size 40&mdash;with
 the true sampling distribution&mdash;which corresponds to taking many samples from the population.
 
-```{r 11-bootstrapping6, echo = F, message = FALSE, warning = FALSE, fig.cap = "Comparison of the distribution of the bootstrap sample means and sampling distribution.", out.height = "70%"}
+```{r 11-bootstrapping6, echo = F, message = FALSE, warning = FALSE, fig.cap = "Comparison of the distribution of the bootstrap sample means and sampling distribution.", fig.height = 3.5}
 samples <- rep_sample_n(airbnb, size = 40, reps = 20000)
 
 sample_estimates <- samples |>
@@ -1125,11 +1125,11 @@ the middle 95\% of the sample mean prices in the bootstrap distribution. We can
 visualize the interval on our distribution in Figure
 \@ref(fig:11-bootstrapping9).
 
-```{r 11-bootstrapping9, echo = F, message = FALSE, warning = FALSE, fig.cap = "Distribution of the bootstrap sample means with percentile lower and upper bounds.", out.width = "100%"}
+```{r 11-bootstrapping9, echo = F, message = FALSE, warning = FALSE, fig.cap = "Distribution of the bootstrap sample means with percentile lower and upper bounds.", fig.height=4, fig.width = 6.5}
 boot_est_dist +
   geom_vline(xintercept = bounds, col = "#E69F00", size = 2, linetype = 2) +
   annotate("text",
-    x = bounds[1], max_count(boot_est_dist), hjust = 0.5, vjust = 2,
+    x = bounds[1], max_count(boot_est_dist), hjust = 0.6, vjust = 2,
     label = paste("2.5th percentile =", round(bounds[1], 2))
   ) +
   annotate("text",
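Finally, the `bounds` drawn and annotated in the `11-bootstrapping9` chunk are the 2.5th and 97.5th percentiles of the bootstrap sample means (`max_count()` is a plotting helper defined elsewhere in the book's source). A hedged sketch of how those pieces are typically computed, assuming `one_sample` from earlier in the chapter:

```r
library(tidyverse)
library(infer)

# resample the original sample of size 40 with replacement, 20,000 times
boot20000 <- one_sample |>
  rep_sample_n(size = 40, replace = TRUE, reps = 20000)

# bootstrap distribution: one mean per bootstrap replicate
boot20000_means <- boot20000 |>
  group_by(replicate) |>
  summarize(mean = mean(price))

# percentile bounds enclosing the middle 95% of the bootstrap sample means
bounds <- boot20000_means |>
  pull(mean) |>
  quantile(c(0.025, 0.975))

bounds
```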
