@@ -97,7 +97,8 @@ We read in the data cube and add dataset names.
9797``` {r}
9898birdcubeflanders_dataset_raw <- read_csv(
9999 here::here("data", "raw", "birdcubeflanders_dataset.csv"),
100- show_col_types = FALSE)
100+ show_col_types = FALSE
101+ )
101102
102103# Add dataset names
103104birdcubeflanders_dataset <- get_dataset_names(birdcubeflanders_dataset_raw) %>%
@@ -136,14 +137,14 @@ birdcubeflanders_dataset %>%
136137 .by = "datasetname") %>%
137138 mutate(datasetname = reorder(datasetname, n_obs)) %>%
138139 ggplot(aes(x = datasetname, y = n_obs)) +
139- geom_bar(stat = "identity",
140- fill = "cornflowerblue") +
141- geom_text(aes(label = n_obs), vjust = 0.3, hjust = -0.3, size = 3) +
142- scale_x_discrete(label = function(x) stringr::str_trunc(x, 40)) +
143- scale_y_continuous(expand = expansion(mult = c(0.05, 0.2))) +
144- labs(x = "", y = "Number of observations (sum)") +
145- theme_minimal() +
146- coord_flip()
140+ geom_bar(stat = "identity",
141+ fill = "cornflowerblue") +
142+ geom_text(aes(label = n_obs), vjust = 0.3, hjust = -0.3, size = 3) +
143+ scale_x_discrete(label = function(x) stringr::str_trunc(x, 40)) +
144+ scale_y_continuous(expand = expansion(mult = c(0.05, 0.2))) +
145+ labs(x = "", y = "Number of observations (sum)") +
146+ theme_minimal() +
147+ coord_flip()
147148```
148149
149150# Comparing species prevalence
@@ -162,7 +163,8 @@ We categorise the species according to rarity:
162163
163164``` {r, message=FALSE}
164165abv_data_total_sf <- read_sf(
165- here::here("data", "interim", "abv_data_total.gpkg"))
166+ here::here("data", "interim", "abv_data_total.gpkg")
167+ )
166168
167169# Cut rarity
168170abv_data <- abv_data_total_sf %>%
@@ -186,7 +188,7 @@ abv_data <- abv_data_total_sf %>%
186188 species == "Dendrocopus major" ~ "Dendrocopos major",
187189 species == "Saxicola torquatus" ~ "Saxicola rubicola",
188190 TRUE ~ species
189- )
191+ )
190192 ) %>%
191193 arrange(year, mgrscode, species)
192194
@@ -251,21 +253,21 @@ Rare species are more prevalent in the cube dataset while more common species ar
251253``` {r}
252254prevalence_df %>%
253255 ggplot(aes(x = abv, y = birdcube)) +
254- geom_abline(slope = 1, intercept = 0, colour = "firebrick",
255- linewidth = 1) +
256- annotate("label", x = 0.8, y = 0.6, size = 3,
257- label = "Higher prevalence in ABV",
258- color = "black") +
259- annotate("label", x = 0.4, y = 0.6, size = 3,
260- label = "Higher prevalence in cube",
261- color = "black") +
262- geom_smooth(method = "loess", formula = "y ~ x",
263- colour = "darkgrey", linetype = "dashed") +
264- geom_point(aes(shape = rarity), size = 2) +
265- labs(x = "Proportion of occupied grid cells\nin ABV dataset",
266- y = "Proportion of occupied grid cells\nin cube dataset",
267- shape = "Rarity") +
268- theme_minimal()
256+ geom_abline(slope = 1, intercept = 0, colour = "firebrick",
257+ linewidth = 1) +
258+ annotate("label", x = 0.8, y = 0.6, size = 3,
259+ label = "Higher prevalence in ABV",
260+ color = "black") +
261+ annotate("label", x = 0.4, y = 0.6, size = 3,
262+ label = "Higher prevalence in cube",
263+ color = "black") +
264+ geom_smooth(method = "loess", formula = "y ~ x",
265+ colour = "darkgrey", linetype = "dashed") +
266+ geom_point(aes(shape = rarity), size = 2) +
267+ labs(x = "Proportion of occupied grid cells\nin ABV dataset",
268+ y = "Proportion of occupied grid cells\nin cube dataset",
269+ shape = "Rarity") +
270+ theme_minimal()
269271```
270272
271273We calculate error measures for the indicator based on leave-one-dataset-out cross-validation (use ` remotes::install_github("b-cubed-eu/dubicube#25") ` ).
@@ -298,7 +300,8 @@ prevalence_cv %>%
298300 plot_cross_validation(
299301 prevalence_df,
300302 measure = "max_error",
301- quant = quantile)
303+ quant = quantile
304+ )
302305```
303306
304307We look at the ` r (1 - quantile) * 100 ` % species with the highest maximum absolute error.
@@ -327,14 +330,14 @@ birdcube_dataset_filtered %>%
327330 mutate(species = reorder(species, max_error, decreasing = TRUE)) %>%
328331 mutate(datasetname = tidytext::reorder_within(datasetname, n, species)) %>%
329332 ggplot(aes(x = datasetname, y = n)) +
330- geom_bar(stat = "identity") +
331- geom_text(aes(label = n), vjust = 0.3, hjust = -0.3, size = 3) +
332- scale_x_discrete(label = function(x) stringr::str_trunc(x, 40)) +
333- labs(x = "", y = "Number of observations (count)") +
334- scale_y_continuous(expand = expansion(mult = c(0.05, 0.1))) +
335- theme_minimal() +
336- coord_flip() +
337- facet_wrap(~species, ncol = 1, scales = "free")
333+ geom_bar(stat = "identity") +
334+ geom_text(aes(label = n), vjust = 0.3, hjust = -0.3, size = 3) +
335+ scale_x_discrete(label = function(x) stringr::str_trunc(x, 40)) +
336+ labs(x = "", y = "Number of observations (count)") +
337+ scale_y_continuous(expand = expansion(mult = c(0.05, 0.1))) +
338+ theme_minimal() +
339+ coord_flip() +
340+ facet_wrap(~species, ncol = 1, scales = "free")
338341```
339342
340343### Maximum relative error
@@ -347,7 +350,8 @@ prevalence_cv %>%
347350 plot_cross_validation(
348351 prevalence_df,
349352 measure = "max_rel_error",
350- quant = quantile)
353+ quant = quantile
354+ )
351355```
352356
353357We look at the ` r (1 - quantile) * 100 ` % species with the highest maximum relative error.
@@ -376,14 +380,14 @@ birdcube_dataset_filtered %>%
376380 mutate(species = reorder(species, max_rel_error, decreasing = TRUE)) %>%
377381 mutate(datasetname = tidytext::reorder_within(datasetname, n, species)) %>%
378382 ggplot(aes(x = datasetname, y = n)) +
379- geom_bar(stat = "identity") +
380- geom_text(aes(label = n), vjust = 0.3, hjust = -0.3, size = 3) +
381- scale_x_discrete(label = function(x) stringr::str_trunc(x, 40)) +
382- labs(x = "", y = "Number of observations (count)") +
383- scale_y_continuous(expand = expansion(mult = c(0.05, 0.1))) +
384- theme_minimal() +
385- coord_flip() +
386- facet_wrap(~species, ncol = 1, scales = "free")
383+ geom_bar(stat = "identity") +
384+ geom_text(aes(label = n), vjust = 0.3, hjust = -0.3, size = 3) +
385+ scale_x_discrete(label = function(x) stringr::str_trunc(x, 40)) +
386+ labs(x = "", y = "Number of observations (count)") +
387+ scale_y_continuous(expand = expansion(mult = c(0.05, 0.1))) +
388+ theme_minimal() +
389+ coord_flip() +
390+ facet_wrap(~species, ncol = 1, scales = "free")
387391```
388392
389393### Mean relative error
@@ -394,14 +398,15 @@ plot_cross_validation(
394398 prevalence_cv,
395399 prevalence_df,
396400 measure = "mre",
397- quant = quantile)
401+ quant = quantile
402+ )
398403```
399404
400405``` {r}
401406prevalence_cv %>%
402407 distinct(species, rarity, mre) %>%
403408 ggplot() +
404- geom_histogram(aes(x = mre, fill = rarity))
409+ geom_histogram(aes(x = mre, fill = rarity))
405410```
406411
407412We look at the ` r (1 - quantile) * 100 ` % species with the highest mean relative error.
@@ -428,14 +433,14 @@ birdcube_dataset_filtered %>%
428433 mutate(species = reorder(species, mre, decreasing = TRUE)) %>%
429434 mutate(datasetname = tidytext::reorder_within(datasetname, n, species)) %>%
430435 ggplot(aes(x = datasetname, y = n)) +
431- geom_bar(stat = "identity") +
432- geom_text(aes(label = n), vjust = 0.3, hjust = -0.3, size = 3) +
433- scale_x_discrete(label = function(x) stringr::str_trunc(x, 40)) +
434- labs(x = "", y = "Number of observations (count)") +
435- scale_y_continuous(expand = expansion(mult = c(0.05, 0.1))) +
436- theme_minimal() +
437- coord_flip() +
438- facet_wrap(~species, ncol = 1, scales = "free")
436+ geom_bar(stat = "identity") +
437+ geom_text(aes(label = n), vjust = 0.3, hjust = -0.3, size = 3) +
438+ scale_x_discrete(label = function(x) stringr::str_trunc(x, 40)) +
439+ labs(x = "", y = "Number of observations (count)") +
440+ scale_y_continuous(expand = expansion(mult = c(0.05, 0.1))) +
441+ theme_minimal() +
442+ coord_flip() +
443+ facet_wrap(~species, ncol = 1, scales = "free")
439444```
440445
441446### Root mean squared error
@@ -446,14 +451,15 @@ plot_cross_validation(
446451 prevalence_cv,
447452 prevalence_df,
448453 measure = "rmse",
449- quant = quantile)
454+ quant = quantile
455+ )
450456```
451457
452458``` {r}
453459prevalence_cv %>%
454460 distinct(species, rarity, rmse) %>%
455461 ggplot() +
456- geom_histogram(aes(x = rmse, fill = rarity))
462+ geom_histogram(aes(x = rmse, fill = rarity))
457463```
458464
459465We look at the ` r (1 - quantile) * 100 ` % species with the highest RMSE.
@@ -480,14 +486,14 @@ birdcube_dataset_filtered %>%
480486 mutate(species = reorder(species, rmse, decreasing = TRUE)) %>%
481487 mutate(datasetname = tidytext::reorder_within(datasetname, n, species)) %>%
482488 ggplot(aes(x = datasetname, y = n)) +
483- geom_bar(stat = "identity") +
484- geom_text(aes(label = n), vjust = 0.3, hjust = -0.3, size = 3) +
485- scale_x_discrete(label = function(x) stringr::str_trunc(x, 40)) +
486- labs(x = "", y = "Number of observations (count)") +
487- scale_y_continuous(expand = expansion(mult = c(0.05, 0.1))) +
488- theme_minimal() +
489- coord_flip() +
490- facet_wrap(~species, ncol = 1, scales = "free")
489+ geom_bar(stat = "identity") +
490+ geom_text(aes(label = n), vjust = 0.3, hjust = -0.3, size = 3) +
491+ scale_x_discrete(label = function(x) stringr::str_trunc(x, 40)) +
492+ labs(x = "", y = "Number of observations (count)") +
493+ scale_y_continuous(expand = expansion(mult = c(0.05, 0.1))) +
494+ theme_minimal() +
495+ coord_flip() +
496+ facet_wrap(~species, ncol = 1, scales = "free")
491497```
492498
493499We look at the ` r (1 - quantile) * 100 ` % species with the lowest RMSE.
@@ -514,14 +520,14 @@ birdcube_dataset_filtered %>%
514520 mutate(species = reorder(species, rmse, decreasing = FALSE)) %>%
515521 mutate(datasetname = tidytext::reorder_within(datasetname, n, species)) %>%
516522 ggplot(aes(x = datasetname, y = n)) +
517- geom_bar(stat = "identity") +
518- geom_text(aes(label = n), vjust = 0.3, hjust = -0.3, size = 3) +
519- scale_x_discrete(label = function(x) stringr::str_trunc(x, 40)) +
520- labs(x = "", y = "Number of observations (count)") +
521- scale_y_continuous(expand = expansion(mult = c(0.05, 0.1))) +
522- theme_minimal() +
523- coord_flip() +
524- facet_wrap(~species, ncol = 1, scales = "free")
523+ geom_bar(stat = "identity") +
524+ geom_text(aes(label = n), vjust = 0.3, hjust = -0.3, size = 3) +
525+ scale_x_discrete(label = function(x) stringr::str_trunc(x, 40)) +
526+ labs(x = "", y = "Number of observations (count)") +
527+ scale_y_continuous(expand = expansion(mult = c(0.05, 0.1))) +
528+ theme_minimal() +
529+ coord_flip() +
530+ facet_wrap(~species, ncol = 1, scales = "free")
525531```
526532
527533## Trends in error
@@ -538,7 +544,7 @@ We look at trends in CV error measures related to:
538544``` {r}
539545prevalence_cv %>%
540546 ggplot(aes(x = rarity, y = rmse, colour = rarity)) +
541- geom_boxplot()
547+ geom_boxplot()
542548```
543549
544550### Number of datasets
@@ -557,16 +563,17 @@ trend_dataset <- birdcube_dataset_filtered %>%
557563 shannon = -sum(p * log(p)),
558564 neff_datasets = exp(shannon),
559565 evenness = shannon / log(n_datasets),
560- .groups = "drop")
566+ .groups = "drop"
567+ )
561568```
562569
563570Does RMSE change with number of datasets?
564571
565572``` {r}
566573trend_dataset %>%
567574 ggplot(aes(x = n_datasets, y = rmse, colour = rarity)) +
568- geom_point() +
569- geom_smooth(method = "lm", formula = "y ~ x")
575+ geom_point() +
576+ geom_smooth(method = "lm", formula = "y ~ x")
570577```
571578
572579### Effective number of datasets
@@ -585,8 +592,8 @@ where $D_j$ the total number of datasets where species $j$ is present, and $p_i$
585592``` {r}
586593trend_dataset %>%
587594 ggplot(aes(x = neff_datasets, y = rmse, colour = rarity)) +
588- geom_point() +
589- geom_smooth(method = "lm", formula = "y ~ x")
595+ geom_point() +
596+ geom_smooth(method = "lm", formula = "y ~ x")
590597```
591598
592599### Dataset evenness
603610``` {r}
604611trend_dataset %>%
605612 ggplot(aes(x = evenness, y = rmse, colour = rarity)) +
606- geom_point() +
607- geom_smooth(method = "lm", formula = "y ~ x")
613+ geom_point() +
614+ geom_smooth(method = "lm", formula = "y ~ x")
608615```
609616
610617# Species specific indicators
0 commit comments