Skip to content

Commit 30a1b88

Browse files
committed
create error trend figures
1 parent ffa1622 commit 30a1b88

File tree

2 files changed

+168
-28
lines changed

2 files changed

+168
-28
lines changed

source/R/plot_cross_validation.R

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,11 @@ plot_cross_validation <- function(
2525
),
2626
size = 2.5, max.overlaps = max.overlaps
2727
) +
28+
coord_cartesian(xlim = c(0, 1), ylim = c(0, 1)) +
2829
labs(x = "Proportion of occupied grid cells\nin ABV dataset",
2930
y = "Proportion of occupied grid cells\nin cube dataset",
30-
shape = "Rarity") +
31+
shape = "Rarity",
32+
colour = toupper(measure)) +
3133
scale_colour_viridis_c(option = "turbo") +
3234
theme_minimal()
3335
}

source/dataset_bias_cv.Rmd

Lines changed: 165 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ knitr::opts_chunk$set(echo = TRUE)
1919
```{r, warning=FALSE, message=FALSE}
2020
# Load packages
2121
library(tidyverse) # Data wrangling and visualisation
22+
library(cowplot) # Nice figures
2223
library(dubicube) # Cross-validation
2324
library(sf) # Spatial objects
2425
library(targets)
@@ -369,15 +370,16 @@ p_prevalence <- prevalence_df %>%
369370
ggplot(aes(x = abv, y = birdcube)) +
370371
geom_abline(slope = 1, intercept = 0, colour = "firebrick",
371372
linewidth = 1) +
372-
annotate("label", x = 0.8, y = 0.6, size = 3,
373+
annotate("label", x = 0.8, y = 0.1, size = 4,
373374
label = "Higher prevalence in ABV",
374375
color = "black") +
375-
annotate("label", x = 0.4, y = 0.6, size = 3,
376+
annotate("label", x = 0.2, y = 0.95, size = 4,
376377
label = "Higher prevalence in cube",
377378
color = "black") +
378379
geom_smooth(method = "loess", formula = "y ~ x",
379380
colour = "darkgrey", linetype = "dashed") +
380381
geom_point(aes(shape = rarity), size = 2) +
382+
coord_cartesian(xlim = c(0, 1), ylim = c(0, 1)) +
381383
labs(x = "Proportion of occupied grid cells\nin ABV dataset",
382384
y = "Proportion of occupied grid cells\nin cube dataset",
383385
shape = "Rarity") +
@@ -423,7 +425,8 @@ prevalence_cv %>%
423425
prevalence_df,
424426
measure = "max_error",
425427
quant = quantile
426-
)
428+
) +
429+
theme_bw(base_size = 12)
427430
```
428431

429432
We look at the `r (1 - quantile) * 100` % species with the highest maximum absolute error.
@@ -473,7 +476,8 @@ prevalence_cv %>%
473476
prevalence_df,
474477
measure = "max_rel_error",
475478
quant = quantile
476-
)
479+
) +
480+
theme_bw(base_size = 12)
477481
```
478482

479483
We look at the `r (1 - quantile) * 100` % species with the highest maximum relative error.
@@ -516,12 +520,14 @@ birdcube_dataset_filtered %>%
516520

517521
```{r, warning=FALSE, out.width="90%", message=FALSE}
518522
# Mean relative error
519-
plot_cross_validation(
523+
p_mre <- plot_cross_validation(
520524
prevalence_cv,
521525
prevalence_df,
522526
measure = "mre",
523527
quant = quantile
524-
)
528+
) +
529+
theme_bw(base_size = 12)
530+
p_mre
525531
```
526532

527533
```{r}
@@ -604,12 +610,14 @@ birdcube_dataset_filtered %>%
604610

605611
```{r, warning=FALSE, out.width="90%", message=FALSE}
606612
# Root mean squared error
607-
plot_cross_validation(
613+
p_rmse <- plot_cross_validation(
608614
prevalence_cv,
609615
prevalence_df,
610616
measure = "rmse",
611617
quant = quantile
612-
)
618+
) +
619+
theme_bw(base_size = 12)
620+
p_rmse
613621
```
614622

615623
```{r}
@@ -688,6 +696,40 @@ birdcube_dataset_filtered %>%
688696
facet_wrap(~species, ncol = 1, scales = "free")
689697
```
690698

699+
```{r, echo=FALSE}
700+
# ----- Combine plots side by side -----
701+
bottom_row <- plot_grid(
702+
p_rmse +
703+
theme(plot.margin = margin(t = 12, r = -5, b = 5, l = 10)) +
704+
guides(shape = "none"),
705+
p_mre +
706+
theme(axis.title.y = element_blank(),
707+
plot.margin = margin(t = 12, r = 5, b = 5, l = 40)) +
708+
guides(shape = "none"),
709+
labels = c("B.", "C."),
710+
label_size = 20,
711+
ncol = 2,
712+
rel_widths = c(1, 1)
713+
)
714+
715+
# ----- Combine top + bottom -----
716+
final_plot <- plot_grid(
717+
p_prevalence +
718+
theme(axis.title.x = element_blank(),
719+
plot.margin = margin(t = 10, r = 60, b = 10, l = 80)),
720+
bottom_row,
721+
labels = "A.",
722+
label_size = 20,
723+
ncol = 1,
724+
rel_heights = c(1.2, 1) # top vs bottom height ratio
725+
)
726+
727+
# ----- Save to file -----
728+
ggsave(file.path(out_path, "prevalence_panels.png"),
729+
final_plot,
730+
width = 12, height = 10, dpi = 300)
731+
```
732+
691733
## Trends in error: RMSE
692734

693735
We look at trends in CV error measures related to:
@@ -700,16 +742,26 @@ We look at trends in CV error measures related to:
700742
### Differences in rarity
701743

702744
```{r}
703-
prevalence_cv %>%
704-
ggplot(aes(x = rarity, y = rmse, colour = rarity)) +
745+
p_rmse_rarity <- prevalence_cv %>%
746+
mutate(
747+
rarity = fct_recode(
748+
rarity,
749+
"Extremely\ncommon" = "Extremely common"
750+
)
751+
) %>%
752+
ggplot(aes(x = rarity, y = rmse)) +
705753
geom_boxplot() +
754+
labs(x = "Rarity", y = "RMSE") +
706755
theme_bw(base_size = 12)
756+
p_rmse_rarity
707757
```
708758

709759
### Number of datasets
710760

761+
We cannot compute the evenness for species only found in a single dataset.
762+
711763
```{r}
712-
trend_dataset <- birdcube_dataset_filtered %>%
764+
trend_dataset_rmse <- birdcube_dataset_filtered %>%
713765
left_join(prevalence_cv %>% distinct(species, rarity, rmse),
714766
by = join_by(species, rarity)) %>%
715767
select(mgrscode, year, species, datasetname, rarity, rmse) %>%
@@ -729,7 +781,7 @@ trend_dataset <- birdcube_dataset_filtered %>%
729781
Does RMSE change with number of datasets?
730782

731783
```{r}
732-
trend_dataset %>%
784+
trend_dataset_rmse %>%
733785
ggplot(aes(x = n_datasets, y = rmse, colour = rarity)) +
734786
geom_point() +
735787
geom_smooth(method = "lm", formula = "y ~ x") +
@@ -738,7 +790,7 @@ trend_dataset %>%
738790

739791
```{r}
740792
grouped_lm(
741-
data = trend_dataset,
793+
data = trend_dataset_rmse,
742794
group_var = "rarity",
743795
x_var = "n_datasets",
744796
y_var = "rmse"
@@ -759,7 +811,7 @@ $$
759811
where $D_j$ the total number of datasets where species $j$ is present, and $p_i$ the proportion of entries (rows) of species $j$ in dataset $i$.
760812

761813
```{r}
762-
trend_dataset %>%
814+
trend_dataset_rmse %>%
763815
ggplot(aes(x = neff_datasets, y = rmse, colour = rarity)) +
764816
geom_point() +
765817
geom_smooth(method = "lm", formula = "y ~ x") +
@@ -768,7 +820,7 @@ trend_dataset %>%
768820

769821
```{r}
770822
grouped_lm(
771-
data = trend_dataset,
823+
data = trend_dataset_rmse,
772824
group_var = "rarity",
773825
x_var = "neff_datasets",
774826
y_var = "rmse"
@@ -786,17 +838,36 @@ J_{j} = \frac{- \sum_{i=1}^{D_j}p_{ij}\ln(p_{ij}))}{\ln(D_j)}
786838
$$
787839
<!-- spell-check: ignore:end -->
788840

841+
789842
```{r}
790-
trend_dataset %>%
843+
trend_dataset_rmse %>%
791844
ggplot(aes(x = evenness, y = rmse, colour = rarity)) +
792845
geom_point() +
793846
geom_smooth(method = "lm", formula = "y ~ x") +
847+
labs(x = "Dataset evenness", y = "RMSE", colour = "Rarity") +
794848
theme_bw(base_size = 12)
795849
```
796850

851+
```{r, echo=FALSE}
852+
p_rmse_evenness <- trend_dataset_rmse %>%
853+
ggplot(aes(x = evenness, y = rmse, colour = rarity)) +
854+
geom_point() +
855+
geom_smooth(method = "lm", formula = "y ~ x") +
856+
labs(x = "Dataset evenness", y = "RMSE", colour = "Rarity") +
857+
theme_bw(base_size = 12) +
858+
theme(
859+
legend.position = c(0.05, 0.05), # bottom-left inside
860+
legend.justification = c(0, 0), # anchor legend at bottom-left
861+
legend.background = element_rect(
862+
fill = "white", colour = "black", linewidth = 0.3
863+
),
864+
legend.key = element_blank()
865+
)
866+
```
867+
797868
```{r}
798869
grouped_lm(
799-
data = trend_dataset,
870+
data = trend_dataset_rmse,
800871
group_var = "rarity",
801872
x_var = "evenness",
802873
y_var = "rmse"
@@ -815,16 +886,24 @@ We look at trends in CV error measures related to:
815886
### Differences in rarity
816887

817888
```{r}
818-
prevalence_cv %>%
819-
ggplot(aes(x = rarity, y = mre, colour = rarity)) +
889+
p_mre_rarity <- prevalence_cv %>%
890+
mutate(
891+
rarity = fct_recode(
892+
rarity,
893+
"Extremely\ncommon" = "Extremely common"
894+
)
895+
) %>%
896+
ggplot(aes(x = rarity, y = mre)) +
820897
geom_boxplot() +
898+
labs(x = "Rarity", y = "MRE") +
821899
theme_bw(base_size = 12)
900+
p_mre_rarity
822901
```
823902

824903
### Number of datasets
825904

826905
```{r}
827-
trend_dataset <- birdcube_dataset_filtered %>%
906+
trend_dataset_mre <- birdcube_dataset_filtered %>%
828907
left_join(prevalence_cv %>% distinct(species, rarity, mre),
829908
by = join_by(species, rarity)) %>%
830909
select(mgrscode, year, species, datasetname, rarity, mre) %>%
@@ -844,7 +923,7 @@ trend_dataset <- birdcube_dataset_filtered %>%
844923
Does MRE change with number of datasets?
845924

846925
```{r}
847-
trend_dataset %>%
926+
trend_dataset_mre %>%
848927
ggplot(aes(x = n_datasets, y = mre, colour = rarity)) +
849928
geom_point() +
850929
geom_smooth(method = "lm", formula = "y ~ x") +
@@ -853,7 +932,7 @@ trend_dataset %>%
853932

854933
```{r}
855934
grouped_lm(
856-
data = trend_dataset,
935+
data = trend_dataset_mre,
857936
group_var = "rarity",
858937
x_var = "n_datasets",
859938
y_var = "mre"
@@ -874,7 +953,7 @@ $$
874953
where $D_j$ the total number of datasets where species $j$ is present, and $p_i$ the proportion of entries (rows) of species $j$ in dataset $i$.
875954

876955
```{r}
877-
trend_dataset %>%
956+
trend_dataset_mre %>%
878957
ggplot(aes(x = neff_datasets, y = mre, colour = rarity)) +
879958
geom_point() +
880959
geom_smooth(method = "lm", formula = "y ~ x") +
@@ -883,7 +962,7 @@ trend_dataset %>%
883962

884963
```{r}
885964
grouped_lm(
886-
data = trend_dataset,
965+
data = trend_dataset_mre,
887966
group_var = "rarity",
888967
x_var = "neff_datasets",
889968
y_var = "mre"
@@ -902,22 +981,81 @@ $$
902981
<!-- spell-check: ignore:end -->
903982

904983
```{r}
905-
trend_dataset %>%
984+
p_mre_evenness <- trend_dataset_mre %>%
906985
ggplot(aes(x = evenness, y = mre, colour = rarity)) +
907986
geom_point() +
908987
geom_smooth(method = "lm", formula = "y ~ x") +
909-
theme_bw(base_size = 12)
988+
labs(x = "Dataset evenness", y = "MRE", colour = "Rarity") +
989+
theme_bw(base_size = 12) +
990+
theme(
991+
legend.position = c(0.05, 0.05), # bottom-left inside
992+
legend.justification = c(0, 0), # anchor legend at bottom-left
993+
legend.background = element_rect(
994+
fill = "white", colour = "black", linewidth = 0.3
995+
),
996+
legend.key = element_blank()
997+
)
998+
p_mre_evenness
910999
```
9111000

9121001
```{r}
9131002
grouped_lm(
914-
data = trend_dataset,
1003+
data = trend_dataset_mre,
9151004
group_var = "rarity",
9161005
x_var = "evenness",
9171006
y_var = "mre"
9181007
)
9191008
```
9201009

1010+
```{r, echo=FALSE}
1011+
col_rmse <- plot_grid(
1012+
p_rmse_rarity +
1013+
coord_cartesian(ylim = c(0, 0.12)) +
1014+
theme(
1015+
plot.margin = margin(t = 10, r = 5, b = 5, l = 20),
1016+
axis.title.x = element_blank(),
1017+
plot.title = element_text(hjust = 0.5)
1018+
),
1019+
p_rmse_evenness +
1020+
coord_cartesian(ylim = c(0, 0.12)) +
1021+
theme(plot.margin = margin(t = 5, r = 5, b = 5, l = 20)) +
1022+
guides(colour = "none"),
1023+
labels = c("A.", "C."),
1024+
label_size = 20,
1025+
ncol = 1
1026+
)
1027+
1028+
col_mre <- plot_grid(
1029+
p_mre_rarity +
1030+
coord_cartesian(ylim = c(0, 0.027)) +
1031+
theme(
1032+
plot.margin = margin(t = 10, r = 5, b = 5, l = 20),
1033+
axis.title.x = element_blank(),
1034+
plot.title = element_text(hjust = 0.5)
1035+
),
1036+
p_mre_evenness +
1037+
coord_cartesian(ylim = c(0, 0.027)) +
1038+
theme(plot.margin = margin(t = 5, r = 5, b = 5, l = 5)),
1039+
labels = c("B.", "D."),
1040+
label_size = 20,
1041+
ncol = 1
1042+
)
1043+
1044+
p_error_trends <- plot_grid(
1045+
ggdraw() + draw_label("RMSE", fontface = "bold", size = 16),
1046+
ggdraw() + draw_label("MRE", fontface = "bold", size = 16),
1047+
col_rmse,
1048+
col_mre,
1049+
ncol = 2,
1050+
rel_heights = c(0.08, 1)
1051+
)
1052+
1053+
ggsave(file.path(out_path, "error_trends.png"),
1054+
p_error_trends,
1055+
width = 10, height = 10, dpi = 300)
1056+
```
1057+
1058+
9211059
## Error of prevalence estimates relative to ABV reference values
9221060
Calculate the CV error compared to ABV prevalence (= "true" prevalence).
9231061

0 commit comments

Comments
 (0)